Skip to content

Commit

Permalink
add functionality to support .doc input, using LibreOffice to convert…
Browse files Browse the repository at this point in the history
… to .docx
  • Loading branch information
ChrisMuir committed May 12, 2018
1 parent 7ad518f commit 7fa2ab9
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 6 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Expand Up @@ -31,3 +31,5 @@ Imports:
utils,
httr
RoxygenNote: 6.0.1.9000
SystemRequirements: LibreOffice (<https://www.libreoffice.org/>) required to extract
data from .doc files.
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -12,6 +12,7 @@ export(docx_extract_tbl)
export(docx_tbl_count)
export(mcga)
export(read_docx)
export(set_libreoffice_path)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_cols)
importFrom(dplyr,count)
Expand Down
54 changes: 49 additions & 5 deletions R/read_docs.r
@@ -1,6 +1,8 @@
#' Read in a Word document for table extraction
#'
#' Local file path or URL pointing to a \code{.docx} file.
#' Local file path or URL pointing to a \code{.docx} file. Can also take
#' \code{.doc} file as input if \code{LibreOffice} is installed
#' (see \url{https://www.libreoffice.org/} for more info and to download).
#'
#' @param path path to the Word document
#' @importFrom xml2 read_xml
Expand All @@ -15,24 +17,66 @@
#' }
read_docx <- function(path) {

stopifnot(is.character(path))

# make temporary things for us to work with
tmpd <- tempdir()
tmpf <- tempfile(tmpdir=tmpd, fileext=".zip")

# Check to see if input is a .doc file
is_input_doc <- is_doc(path)

# If input is a .doc file, create a temp .doc file
if (is_input_doc) {
tmpf_doc <- tempfile(tmpdir = tmpd, fileext = ".doc")
tmpf_docx <- gsub("\\.doc$", ".docx", tmpf_doc)
} else {
tmpf_doc <- NULL
tmpf_docx <- NULL
}

on.exit({ #cleanup
unlink(tmpf)
unlink(tmpf_doc)
unlink(tmpf_docx)
unlink(sprintf("%s/docdata", tmpd), recursive=TRUE)
})

if (is_url(path)) {
res <- httr::GET(path, write_disk(tmpf))
httr::stop_for_status(res)
if (is_input_doc) {
# If input is a url pointing to a .doc file, write file to disk
res <- httr::GET(path, write_disk(tmpf_doc))
httr::stop_for_status(res)

# Save .doc file as a .docx file using LibreOffice command-line tools.
convert_doc_to_docx(tmpd, tmpf_doc)

# copy output of LibreOffice to zip (not entirely necessary)
file_copy(tmpf_docx, tmpf)
} else {
# If input is a url pointing to a .docx file, write file to disk
res <- httr::GET(path, write_disk(tmpf))
httr::stop_for_status(res)
}
} else {
path <- path.expand(path)
if (!file.exists(path)) stop(sprintf("Cannot find '%s'", path), call.=FALSE)
# copy docx to zip (not entirely necessary)
file.copy(path, tmpf)

# If input is a .doc file, save it as a .docx file using LibreOffice
# command-line tools.
if (is_input_doc) {
file_copy(path, tmpf_doc)
convert_doc_to_docx(tmpd, tmpf_doc)

# copy output of LibreOffice to zip (not entirely necessary)
file_copy(tmpf_docx, tmpf)
} else {
# Otherwise, if input is a .docx file, just copy docx to zip
# (not entirely necessary)
file_copy(path, tmpf)
}
}

# unzip it
unzip(tmpf, exdir=sprintf("%s/docdata", tmpd))

Expand Down
51 changes: 51 additions & 0 deletions R/utils.r
Expand Up @@ -32,3 +32,54 @@ has_header <- function(tbl, rows, ns) {
is_url <- function(path) { grepl("^(http|ftp)s?://", path) }

is_docx <- function(path) { tolower(tools::file_ext(path)) == "docx" }

is_doc <- function(path) { tolower(tools::file_ext(path)) == "doc" }

# Copy a file to a new location, throw an error if the copy fails.
file_copy <- function(from, to) {
fc <- file.copy(from, to)
if (!fc) stop(sprintf("file copy failure for file %s", from), call.=FALSE)
}

# Save a .doc file as a new .docx file, using the LibreOffice command line
# tools.
convert_doc_to_docx <- function(docx_dir, doc_file) {
lo_path <- getOption("path_to_libreoffice")
if (is.null(lo_path)) {
stop("Cannot determine file path to LibreOffice. ",
"To download LibreOffice, visit: https://www.libreoffice.org/ \n",
"If you've already downloaded the software, use function ",
"'set_libreoffice_path()' to point R to your local 'soffice.exe' file",
call. = FALSE)
}
cmd <- sprintf('"%s" -convert-to docx:"MS Word 2007 XML" -headless -outdir "%s" "%s"',
lo_path,
docx_dir,
doc_file)
system(cmd, show.output.on.console = FALSE)
}


#' Point to Local soffice.exe File
#'
#' Function to set an option that points to the local LibreOffice file
#' \code{soffice.exe}.
#'
#' @param path
#'
#' @details For a list of possible file path locations for \code{soffice.exe},
#' see \url{https://github.com/hrbrmstr/docxtractr/issues/5#issuecomment-233181976}
#'
#' @return Returns nothing, function sets the option variable
#' \code{path_to_libreoffice}.
#' @export
#'
#' @examples \dontrun{
#' set_libreoffice_path("local/path/to/soffice.exe")
#' }
set_libreoffice_path <- function(path) {
stopifnot(is.character(path))

if (!file.exists(path)) stop(sprintf("Cannot find '%s'", path), call.=FALSE)
options("path_to_libreoffice" = path)
}
4 changes: 3 additions & 1 deletion man/read_docx.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions man/set_libreoffice_path.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 7fa2ab9

Please sign in to comment.