diff --git a/DESCRIPTION b/DESCRIPTION index 39f5884..d8810f1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,3 +31,5 @@ Imports: utils, httr RoxygenNote: 6.0.1.9000 +SystemRequirements: LibreOffice (<https://www.libreoffice.org/>) required to extract + data from .doc files. diff --git a/NAMESPACE b/NAMESPACE index 0d1b0c3..e035672 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(docx_extract_tbl) export(docx_tbl_count) export(mcga) export(read_docx) +export(set_libreoffice_path) importFrom(dplyr,arrange) importFrom(dplyr,bind_cols) importFrom(dplyr,count) diff --git a/R/read_docs.r b/R/read_docs.r index be64791..bde8839 100644 --- a/R/read_docs.r +++ b/R/read_docs.r @@ -1,6 +1,8 @@ #' Read in a Word document for table extraction #' -#' Local file path or URL pointing to a \code{.docx} file. +#' Local file path or URL pointing to a \code{.docx} file. Can also take +#' \code{.doc} file as input if \code{LibreOffice} is installed +#' (see \url{https://www.libreoffice.org/} for more info and to download). 
#' #' @param path path to the Word document #' @importFrom xml2 read_xml @@ -15,24 +17,66 @@ #' } read_docx <- function(path) { + stopifnot(is.character(path)) + # make temporary things for us to work with tmpd <- tempdir() tmpf <- tempfile(tmpdir=tmpd, fileext=".zip") + # Check to see if input is a .doc file + is_input_doc <- is_doc(path) + + # If input is a .doc file, create a temp .doc file + if (is_input_doc) { + tmpf_doc <- tempfile(tmpdir = tmpd, fileext = ".doc") + tmpf_docx <- gsub("\\.doc$", ".docx", tmpf_doc) + } else { + tmpf_doc <- NULL + tmpf_docx <- NULL + } + on.exit({ #cleanup unlink(tmpf) + unlink(tmpf_doc) + unlink(tmpf_docx) unlink(sprintf("%s/docdata", tmpd), recursive=TRUE) }) if (is_url(path)) { - res <- httr::GET(path, write_disk(tmpf)) - httr::stop_for_status(res) + if (is_input_doc) { + # If input is a url pointing to a .doc file, write file to disk + res <- httr::GET(path, write_disk(tmpf_doc)) + httr::stop_for_status(res) + + # Save .doc file as a .docx file using LibreOffice command-line tools. + convert_doc_to_docx(tmpd, tmpf_doc) + + # copy output of LibreOffice to zip (not entirely necessary) + file_copy(tmpf_docx, tmpf) + } else { + # If input is a url pointing to a .docx file, write file to disk + res <- httr::GET(path, write_disk(tmpf)) + httr::stop_for_status(res) + } } else { path <- path.expand(path) if (!file.exists(path)) stop(sprintf("Cannot find '%s'", path), call.=FALSE) - # copy docx to zip (not entirely necessary) - file.copy(path, tmpf) + + # If input is a .doc file, save it as a .docx file using LibreOffice + # command-line tools. 
+ if (is_input_doc) { + file_copy(path, tmpf_doc) + convert_doc_to_docx(tmpd, tmpf_doc) + + # copy output of LibreOffice to zip (not entirely necessary) + file_copy(tmpf_docx, tmpf) + } else { + # Otherwise, if input is a .docx file, just copy docx to zip + # (not entirely necessary) + file_copy(path, tmpf) + } } + # unzip it unzip(tmpf, exdir=sprintf("%s/docdata", tmpd)) diff --git a/R/utils.r b/R/utils.r index df2275c..4ed09c1 100644 --- a/R/utils.r +++ b/R/utils.r @@ -32,3 +32,54 @@ has_header <- function(tbl, rows, ns) { is_url <- function(path) { grepl("^(http|ftp)s?://", path) } is_docx <- function(path) { tolower(tools::file_ext(path)) == "docx" } + +is_doc <- function(path) { tolower(tools::file_ext(path)) == "doc" } + +# Copy a file to a new location, throw an error if the copy fails. +file_copy <- function(from, to) { + fc <- file.copy(from, to) + if (!fc) stop(sprintf("file copy failure for file %s", from), call.=FALSE) +} + +# Save a .doc file as a new .docx file in `docx_dir`, using the LibreOffice +# command line tools (double-dash options; single-dash forms are deprecated). +convert_doc_to_docx <- function(docx_dir, doc_file) { + lo_path <- getOption("path_to_libreoffice") + if (is.null(lo_path)) { + stop("Cannot determine file path to LibreOffice. ", + "To download LibreOffice, visit: https://www.libreoffice.org/ \n", + "If you've already downloaded the software, use function ", + "'set_libreoffice_path()' to point R to your local 'soffice.exe' file", + call. = FALSE) + } + cmd <- sprintf('"%s" --convert-to docx:"MS Word 2007 XML" --headless --outdir "%s" "%s"', + lo_path, + docx_dir, + doc_file) + system(cmd, ignore.stdout = TRUE) +} + + +#' Point to Local soffice.exe File +#' +#' Function to set an option that points to the local LibreOffice file +#' \code{soffice.exe}. 
+#' +#' @param path path to the local \code{soffice.exe} file +#' +#' @details For a list of possible file path locations for \code{soffice.exe}, +#' see \url{https://github.com/hrbrmstr/docxtractr/issues/5#issuecomment-233181976} +#' +#' @return Returns nothing, function sets the option variable +#' \code{path_to_libreoffice}. +#' @export +#' +#' @examples \dontrun{ +#' set_libreoffice_path("local/path/to/soffice.exe") +#' } +set_libreoffice_path <- function(path) { + stopifnot(is.character(path)) + + if (!file.exists(path)) stop(sprintf("Cannot find '%s'", path), call.=FALSE) + options("path_to_libreoffice" = path) +} diff --git a/man/read_docx.Rd b/man/read_docx.Rd index e9a64ca..52211e6 100644 --- a/man/read_docx.Rd +++ b/man/read_docx.Rd @@ -10,7 +10,9 @@ read_docx(path) \item{path}{path to the Word document} } \description{ -Local file path or URL pointing to a \code{.docx} file. +Local file path or URL pointing to a \code{.docx} file. Can also take +\code{.doc} file as input if \code{LibreOffice} is installed +(see \url{https://www.libreoffice.org/} for more info and to download). } \examples{ doc <- read_docx(system.file("examples/data.docx", package="docxtractr")) diff --git a/man/set_libreoffice_path.Rd b/man/set_libreoffice_path.Rd new file mode 100644 index 0000000..f95ef45 --- /dev/null +++ b/man/set_libreoffice_path.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.r +\name{set_libreoffice_path} +\alias{set_libreoffice_path} +\title{Point to Local soffice.exe File} +\usage{ +set_libreoffice_path(path) +} +\arguments{ +\item{path}{path to the local \code{soffice.exe} file} +} +\value{ +Returns nothing, function sets the option variable + \code{path_to_libreoffice}. +} +\description{ +Function to set an option that points to the local LibreOffice file +\code{soffice.exe}. 
+} +\details{ +For a list of possible file path locations for \code{soffice.exe}, + see \url{https://github.com/hrbrmstr/docxtractr/issues/5#issuecomment-233181976} +} +\examples{ +\dontrun{ +set_libreoffice_path("local/path/to/soffice.exe") +} +}