1 change: 1 addition & 0 deletions .Rbuildignore
@@ -10,3 +10,4 @@
^docs
^.*\.Rproj$
^\.Rproj\.user$
^stanford-corenlp-full-2018-10-05.zip$
57 changes: 53 additions & 4 deletions .travis.yml
@@ -1,7 +1,56 @@
# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r

language: R
sudo: false
language: r
r:
- release
dist: trusty
sudo: required
cache: packages
warnings_are_errors: false

# install debian libraries to match R-servers
# update pre-installed packages to latest versions
before_install:
- sudo add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable
- sudo apt-get -qq update
- sudo apt-get install -y libgdal-dev libproj-dev python-protobuf libprotoc-dev libprotobuf-dev libv8-dev librsvg2-dev libmpfr-dev
- sudo apt install default-jdk
- wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
- unzip -d $HOME stanford-corenlp-full-2018-10-05.zip
- export CORENLP=$HOME/stanford-corenlp-full-2018-10-05
- wget http://nlp.stanford.edu/software/stanford-english-corenlp-2018-10-05-models.jar -P $CORENLP/
- rcode="tfile <- tempfile(); capture.output(res<-devtools::test(), file = tfile); out <- readLines(tfile); cat(out, sep = '\n'); "
- rcode+="n.fail <- as.numeric(sub('Failed:[[:space:]]', '', out[grep('Failed:[[:space:]]', out)])); "
- rcode+="res <- as.data.frame(res); out <- data.frame(file = unlist(res[['file']]), warning = unlist(res[['warning']])); "
- rcode+="write.csv(out, file='test_results.csv'); "
- rcode+="quit(status = !identical(n.fail, 0), save='no');"
- Rscript tools/travis_run_before_install.R

r_packages:
- devtools
- roxygen2
- covr

r_github_packages:
- Displayr/flipDevTools

script:
- R CMD build --no-manual --no-build-vignettes --no-resave-data .
- R CMD check --as-cran --no-manual --ignore-vignettes --no-tests *.tar.gz
- if [ -d tests/testthat ]; then
Rscript --default-packages='datasets,utils,grDevices,graphics,stats,methods' -e "$rcode";
fi

notifications:
slack:
rooms:
- displayr:FTgSTNHC2rpanhJMGTKMwZXM#github-notifications
template:
- "Build <%{build_url}|#%{build_number}> %{result} in %{repository_name}@%{branch} by %{author}: <%{compare_url}|%{commit_message}>"
on_success: change
on_failure: always

# Warning notifications and downstream package builds are implemented
# by calling R functions so they can be updated in this package without
# committing a new change to .travis.yml in each repository
after_success:
- Rscript -e "require(flipDevTools); NotifyWarnings(); TriggerDownstreamBuilds()"
- travis_wait Rscript -e "flipDevTools::CheckCoverage()"
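For readability, the R one-liner assembled piecewise into `$rcode` above is equivalent to the following script (a sketch; it assumes `devtools` and `testthat` are available on the build image):

```r
# Run the package tests, capturing testthat's console summary to a file.
tfile <- tempfile()
capture.output(res <- devtools::test(), file = tfile)
out <- readLines(tfile)
cat(out, sep = "\n")

# Extract the failure count from the "Failed: N" summary line.
n.fail <- as.numeric(sub("Failed:[[:space:]]", "",
                         out[grep("Failed:[[:space:]]", out)]))

# Record per-file warning counts for later inspection.
res <- as.data.frame(res)
out <- data.frame(file = unlist(res[["file"]]),
                  warning = unlist(res[["warning"]]))
write.csv(out, file = "test_results.csv")

# Exit non-zero (failing the build) when any test failed.
quit(status = !identical(n.fail, 0), save = "no")
```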
14 changes: 7 additions & 7 deletions DESCRIPTION
@@ -1,10 +1,9 @@
Package: cleanNLP
Type: Package
Title: A Tidy Data Model for Natural Language Processing
Version: 2.4.1
Authors@R: c(person(given = "Taylor B.", family = "Arnold", email = "taylor.arnold@acm.org", role = c("aut", "cre")))
Author: Taylor B. Arnold [aut, cre]
Maintainer: Taylor B. Arnold <taylor.arnold@acm.org>
Version: 2.4.2
Authors@R: c(person(given = "Justin", family = "Wishart", email = "justin.wishart@displayr.com", role = "ctb"),
person(given = "Taylor B.", family = "Arnold", email = "taylor.arnold@acm.org", role = c("aut", "cre")))
Description: Provides a set of fast tools for converting a textual corpus into a set of normalized
tables. Users may make use of the 'udpipe' back end with no external dependencies, a Python back
end with 'spaCy' <https://spacy.io> or the Java back end 'CoreNLP'
@@ -17,6 +16,8 @@ Depends:
R (>= 2.10)
Imports:
dplyr (>= 0.7.4),
jsonlite,
rJava (>= 0.9-8),
Matrix (>= 1.2),
stringi,
stats,
@@ -25,16 +26,15 @@ Imports:
Suggests:
udpipe (>= 0.3),
reticulate (>= 1.4),
rJava (>= 0.9-8),
RCurl (>= 1.95),
knitr (>= 1.15),
rmarkdown (>= 1.4),
testthat (>= 1.0.1),
covr (>= 2.2.2)
SystemRequirements: Python (>= 2.7.0); spaCy <https://spacy.io/> (>= 2.0); Java (>= 7.0); Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml> (>= 3.9.2)
License: LGPL-2
URL: https://statsmaths.github.io/cleanNLP/
BugReports: http://github.com/statsmaths/cleanNLP/issues
BugReports: http://github.com/Displayr/cleanNLP/issues
LazyData: true
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 6.1.1
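As the Description field notes, the package exposes three interchangeable back ends. A minimal sketch of selecting one (assuming the relevant optional dependency, udpipe, reticulate, or rJava, is installed):

```r
library(cleanNLP)

# Pure-R back end, no external dependencies.
cnlp_init_udpipe()
# Alternatives: cnlp_init_spacy() for the Python/spaCy back end,
# cnlp_init_corenlp() for the Java/CoreNLP back end.

anno <- cnlp_annotate("The quick brown fox jumps over the lazy dog.")
cnlp_get_token(anno)
```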
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

S3method(print,annotation)
export(NERAnnotate)
export(cnlp_annotate)
export(cnlp_combine_documents)
export(cnlp_download_corenlp)
@@ -46,3 +47,5 @@ export(run_annotators)
export(tidy_pca)
export(to_CoNNL)
export(write_annotation)
importFrom(jsonlite,fromJSON)
importFrom(rJava,.jcall)
2 changes: 1 addition & 1 deletion R/accessors.R
@@ -105,7 +105,7 @@ cnlp_get_token <- function(annotation, include_root = FALSE,
res <- res[res$tid > 0,]

if (spaces) {
res <- dplyr::group_by_(res, "id")
res <- dplyr::group_by(res, "id")
res <- dplyr::mutate(res,
spaces = dplyr::lead(cid, default = 0) - cid -
stringi::stri_length(word))
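Context for the change above: the underscore-suffixed verbs such as `group_by_()` were deprecated in dplyr 0.7. A sketch of the tidy-eval style, with a toy data frame standing in for `res` (note that passing the bare string `"id"` to `group_by()` groups by a constant, so the pronoun form is the safer equivalent):

```r
library(dplyr)

res <- tibble(id = c(1, 1, 2), word = c("a", "b", "c"))

group_by(res, .data$id)  # programmatic replacement for group_by_(res, "id")
group_by(res, id)        # plain interactive form
```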
2 changes: 1 addition & 1 deletion R/annotate.R
@@ -201,4 +201,4 @@ cnlp_quick <- function(input, ...) {
names(df)[c(1L, 4L)] <- c("doc_id", "token")

return(df)
}
}
34 changes: 18 additions & 16 deletions R/backend_corenlp.R
@@ -394,6 +394,8 @@ cnlp_init_corenlp <- function(language, anno_level = 2, lib_location = NULL,
#' written to the console or suppressed?
#' @param keys vector string of flags to add to the corenlp calls
#' @param values vector string of values paired with the flags.
#' @param corenlp.only Logical to specify if only CoreNLP should be initialised
#' and not the custom Annotator.
#'
#' The example below shows how to initialise corenlp to run named entity
#' recognition (ner) with its respective dependencies (tokenize, ssplit, pos
@@ -408,31 +410,32 @@ cnlp_init_corenlp <- function(language, anno_level = 2, lib_location = NULL,
#'
#' @export

cnlp_init_corenlp_custom <- function(language, lib_location = NULL,
mem = "6g", verbose = FALSE, keys, values) {
cnlp_init_corenlp_custom <- function(language, lib_location = NULL, mem = "6g", verbose = FALSE,
keys, values, corenlp.only = FALSE) {
if (missing(language)) language <- "en"
language_list <- c("en", "de", "fr", "es", "ar", "zh")
language <- match.arg(arg = language, choices = language_list)
cnlp_init_corenlp_volatiles(language, lib_location, mem, verbose)
setup_corenlp_backend_raw(keys, values)
invisible(init_corenlp_backend())
invisible(init_corenlp_backend(corenlp.only = corenlp.only))
}

cnlp_init_corenlp_volatiles <- function(language, lib_location, mem, verbose) {
if (is.null(lib_location))
lib_location <- file.path(system.file("extdata", package="cleanNLP"),
"/stanford-corenlp-full-2018-10-05")
if (is.null(lib_location)) {
if(Sys.getenv("CORENLP") == ""){
lib_location <- file.path(system.file("extdata", package="cleanNLP"),
"stanford-corenlp-full-2018-10-05")
} else {
lib_location <- Sys.getenv("CORENLP")
}
}

# set properties
volatiles$corenlp$language <- language
volatiles$corenlp$lib_location <- lib_location
volatiles$corenlp$mem <- mem
volatiles$corenlp$verbose <- verbose
volatiles$corenlp$properties <- list()

fin <- file.path(lib_location, "/properties.rds")
if (file.exists(fin))
volatiles$corenlp$properties <- readRDS(fin)
}

setup_corenlp_backend_raw <- function(keys, values, clear = FALSE) {
@@ -454,7 +457,7 @@ setup_corenlp_backend_raw <- function(keys, values, clear = FALSE) {
volatiles$corenlp$properties[[keys[i]]] <- values[i]
}

init_corenlp_backend <- function() {
init_corenlp_backend <- function(corenlp.only = FALSE) {

if (!requireNamespace("rJava")) {
stop("The rJava package is required to use the corenlp backend")
@@ -542,8 +545,9 @@ init_corenlp_backend <- function() {

volatiles$corenlp$corenlp <-
rJava::.jnew("edu.stanford.nlp.pipeline.StanfordCoreNLP", prop)
volatiles$corenlp$AnnotationProcessor <-
rJava::.jnew("edu.richmond.nlp.AnnotationProcessor")
if(!corenlp.only)
volatiles$corenlp$AnnotationProcessor <-
rJava::.jnew("edu.richmond.nlp.AnnotationProcessor")
if (!volatiles$corenlp$verbose)
rJava::.jcall("java/lang/System", "V", "setErr", err)

@@ -624,6 +628,4 @@ annotate_with_corenlp <- function(input, as_strings, verbose) {
out <- cnlp_read_csv(output_dir)

return(out)
}


}
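The new fallback above lets an existing CoreNLP installation be reused through the `CORENLP` environment variable (as the Travis config sets). A sketch, with an assumed install path:

```r
# Point the back end at an unpacked CoreNLP directory (the path is an assumption).
Sys.setenv(CORENLP = "~/stanford-corenlp-full-2018-10-05")

keys   <- c("annotators", "outputFormat")
values <- c("tokenize,ssplit,pos,lemma,ner", "json")

# corenlp.only = TRUE skips loading the custom edu.richmond.nlp AnnotationProcessor.
cnlp_init_corenlp_custom(language = "en", mem = "2g",
                         keys = keys, values = values,
                         corenlp.only = TRUE)
```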
16 changes: 8 additions & 8 deletions R/io.R
@@ -177,10 +177,10 @@ cnlp_read_conll <- function(file) {
source <- x$X7

# create token table
token <- dplyr::data_frame(id = id + 1, sid = sid, tid = tid,
word = word, lemma = lemma,
upos = upos, pos = pos,
cid = NA_integer_)
token <- dplyr::tibble(id = id + 1, sid = sid, tid = tid,
word = word, lemma = lemma,
upos = upos, pos = pos,
cid = NA_integer_)

roots <- token[tid == 1,]
roots$tid <- 0L
Expand All @@ -191,10 +191,10 @@ cnlp_read_conll <- function(file) {

# create dependency table
tid_target <- tid
dep <- dplyr::data_frame(id = id + 1, sid = sid, tid = source,
tid_target = tid_target,
relation = relation,
relation_full = relation)
dep <- dplyr::tibble(id = id + 1, sid = sid, tid = source,
tid_target = tid_target,
relation = relation,
relation_full = relation)

# create annotation object
anno <- list()
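Context for the change above: `dplyr::data_frame()` is deprecated in favour of `tibble()`; both construct the same tibble, so the swap is behaviour-preserving:

```r
library(dplyr)

old <- data_frame(id = 1:2, word = c("a", "b"))  # deprecated spelling, emits a warning
new <- tibble(id = 1:2, word = c("a", "b"))      # current spelling
identical(old, new)  # TRUE
```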
58 changes: 58 additions & 0 deletions R/ner_annotate.R
@@ -0,0 +1,58 @@
#' Run the annotation pipeline on a set of documents to extract entities
#'
#' Runs the entity detection algorithms from CoreNLP using the CoreNLP Java library via rJava.
#' It expects the CoreNLP Java object to already be initialised through a call to
#' \code{cnlp_init_corenlp_custom} with the appropriate annotators set up for named entity
#' recognition and a path to an input file whose input text is separated by new lines.
#' The input file must have Unix-style line endings, otherwise the CoreNLP Java call will crash
#' with a NullPointerException. The function returns a \code{data.frame} showing the location
#' in the document where each entity occurs and the entity type. If no entities are detected,
#' a data.frame with zero rows is returned.
#'
#' @param input.file a character string giving the path to the file to be processed. The file
#'   should contain text with Unix-style line endings (a NullPointerException is thrown otherwise)
#' @return data.frame with the details of the detected entities. The output data.frame has three
#' columns. \itemize{
#' \item \code{id} integer: the row index of the input file that has an extracted entity.
#' \item \code{entity} character: The extracted entity word (e.g. William)
#' \item \code{entity.type} character: The entity type of the extracted entity (e.g. Person)
#' }
#' @importFrom jsonlite fromJSON
#' @importFrom rJava .jcall
#' @examples
#' \dontrun{
#' con <- file(input.file, "wb") # need Unix-style line endings
#' writeLines(simple.input.test, con = con)
#' close(con)
#' keys <- c("ssplit.eolonly", "annotators", "outputFormat", "file", "outputDirectory")
#' values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json", input.file, dirname(input.file))
#'
#' cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values)
#' simple.output <- NERAnnotate(input.file)
#' }
#' @export
NERAnnotate <- function(input.file) {

if (!volatiles$corenlp$init)
  stop("Java CoreNLP not initialized. Named Entity Recognition cannot be executed.")

.jcall(volatiles$corenlp$corenlp, "V", "run")

output <- fromJSON(paste0(input.file, ".json"))
relevant.cols <- c("text", "ner")
ner.mentions <- output$sentences$entitymentions
response <- sapply(ner.mentions, nrow)
if (all(sapply(response, is.null))) {
  out <- data.frame(id = character(), entity = character(), entity.type = character())
} else {
  response <- rep(seq_along(ner.mentions), response)
  # Keep only the text and entity type columns from each non-empty mention table
  ner.mentions <- lapply(ner.mentions, function(x) {
    if (nrow(x) != 0) subset(x, select = relevant.cols)
  })
  # Remove the NULL list elements
  ner.mentions <- Filter(Negate(is.null), ner.mentions)
  out <- cbind(response, do.call(rbind.data.frame, ner.mentions))
  names(out) <- c("id", "entity", "entity.type")
}
out
}
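A hypothetical end-to-end run of `NERAnnotate()`, expanding the roxygen example above (the file contents and the example entities in the comments are illustrative, not package output):

```r
input.file <- tempfile(fileext = ".txt")
con <- file(input.file, "wb")  # binary mode forces Unix-style line endings
writeLines("William visited Paris in June.", con = con)
close(con)

keys   <- c("ssplit.eolonly", "annotators", "outputFormat", "file", "outputDirectory")
values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json",
            input.file, dirname(input.file))
cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values)

entities <- NERAnnotate(input.file)
# A data.frame with columns id, entity, entity.type, e.g.
#   1  William  PERSON
#   1  Paris    LOCATION
#   1  June     DATE
entities
```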