50 changes: 43 additions & 7 deletions R/ner_annotate.R
@@ -11,6 +11,16 @@
#'
#' @param input.file a character string giving the path to the file to be processed. The file should
#' contain text with Unix-style line endings (a NullPointerException is thrown otherwise).
#' @param entity.mentions.only Logical specifying whether only the entity-mention output from CoreNLP
#' is used in the extraction. If TRUE, personal pronouns are extracted as well as standard entities.
#' The benefit of the entity-mention output is that it groups words belonging to the same entity,
#' e.g. 'John Smith' is a single person entity and 'New York City' is a single location entity.
#' The entity-mention output also classifies personal pronouns as entities, so they are extracted.
#' If FALSE, the entity-mention output from CoreNLP is validated against the CoreNLP token output.
#' The token output identifies entities at the single-word level and does not classify personal
#' pronouns as entities. The net effect of FALSE is that entity mentions are still extracted, but
#' personal pronouns and other candidate mentions that are not entities at the token level are
#' dropped.
#' @return data.frame with the details of the detected entities. The output data.frame has three
#' columns. \itemize{
#' \item \code{id} integer: the row index of the input file that has an extracted entity.
@@ -21,38 +31,64 @@
#' @importFrom rJava .jcall
#' @examples
#' \dontrun{
#' simple.input.test <- c("John is a person", "Google is a company", "This is nothing")
#' input.file <- tempfile()
#' file <- file(input.file, "wb") # need Unix-style line endings
#' writeLines(simple.input.test, con = file)
#' close(file)
#' keys <- c("ssplit.eolonly", "annotators", "outputFormat", "file", "outputDirectory")
#' values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json", input.file, dirname(tmp.file))
#' values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json", input.file, dirname(input.file))
#'
#' cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values)
#' simple.output <- NERAnnotate(input.file)
#' }
#' @export
NERAnnotate <- function(input.file) {
NERAnnotate <- function(input.file, entity.mentions.only = FALSE) {

if(!volatiles$corenlp$init)
stop("Java CoreNLP not initialized. Named Entity Recognition cannot be executed.")

.jcall(volatiles$corenlp$corenlp, "V", "run")

output <- fromJSON(paste0(input.file, ".json"))
relevant.cols = c("text", "ner")
ner.mentions = output$sentences$entitymentions
response = sapply(ner.mentions, function(x) nrow(x))
if(all(sapply(response, is.null))) {
out <- data.frame(id = character(), entity = character(), entity.type = character())
} else {
response = rep(1:length(ner.mentions), response)
ner.mentions = lapply(ner.mentions, function(x) {if(nrow(x) != 0) {
subset(x, select = relevant.cols)
if(!entity.mentions.only) {
# Validate ner.mentions against the token output
ner.mentions <- mapply(function(x, y) {
if(nrow(y) != 0) {
idx <- sapply(1:nrow(y), function(i) {
z <- y[i, ]
ind <- c(z$tokenBegin + 1, z$tokenEnd)
all(x[ind, ]$ner != "O")})
if(any(idx))
{
y <- y[idx, ]
} else
{
return(data.frame())
}
}
y
}, x = output$sentences$tokens, y = ner.mentions)
response <- sapply(ner.mentions, function(x) nrow(x))
# If all entity mentions were filtered out, return an empty result
if(all(sapply(response, is.null))) {
return(data.frame(id = character(), entity = character(), entity.type = character()))
}
}
response <- rep(1:length(ner.mentions), response)
ner.mentions = lapply(ner.mentions, function(x) {
if(nrow(x) != 0) {
subset(x, select = c("text", "ner"))
}})
# Remove the NULL list elements
ner.mentions <- Filter(Negate(is.null), ner.mentions)
out = cbind(response, do.call(rbind.data.frame, ner.mentions))
names(out) <- c("id", "entity", "entity.type")
}
out
}
}
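
A standalone sketch of the token-level check introduced above may help readers of this diff. The tokens and mentions data frames below are hand-made stand-ins for CoreNLP's per-sentence token and entitymentions output (the real structures carry more columns); the tokenBegin/tokenEnd handling mirrors the code in the hunk, assuming zero-based CoreNLP token offsets.

# Toy illustration of the validation step: a mention is kept only when the tokens
# it spans carry a non-"O" NER tag ("O" marks tokens that are not entities).
tokens <- data.frame(word = c("Julie", "went", "down", ".", "She", "likes", "bubbles"),
                     ner  = c("PERSON", "O", "O", "O", "O", "O", "O"))
mentions <- data.frame(text = c("Julie", "She"),
                       ner  = c("PERSON", "PERSON"),
                       tokenBegin = c(0L, 4L),   # assumed zero-based, as in the diff
                       tokenEnd   = c(1L, 5L))
keep <- sapply(seq_len(nrow(mentions)), function(i) {
  z <- mentions[i, ]
  ind <- c(z$tokenBegin + 1, z$tokenEnd)         # first and last token row (1-based)
  all(tokens[ind, ]$ner != "O")
})
mentions[keep, ]  # only "Julie" survives; the pronoun "She" is dropped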
13 changes: 12 additions & 1 deletion man/NERAnnotate.Rd

Some generated files are not rendered by default.

19 changes: 15 additions & 4 deletions tests/testthat/test-entity.R
@@ -1,7 +1,7 @@
library(testthat)

# Input has variety of entities
simple.input.test <- c("There is a person called Julie that went down the lane.", # Person, Julie,
# Input has a variety of entities; the first entry has a person and a personal pronoun
simple.input.test <- c("There is a person called Julie that went down the lane. She likes bubbles",
"Toys are fine", #No entities
"There is trouble brewing in Hong Kong", #Location Hong Kong
"There are two people caled Jane and John")
@@ -17,6 +17,14 @@ simple.expected <- structure(list(id = c(1L, 3L, 4L, 4L, 4L),
class = "data.frame",
row.names = c(NA, -5L))

simple.with.pronouns.expected <- structure(list(id = c(1L, 1L, 3L, 4L, 4L, 4L),
entity = c("Julie", "She", "Hong Kong", "two", "Jane", "John"),
entity.type = c("PERSON", "PERSON", "CITY", "NUMBER", "PERSON",
"PERSON")),
class = "data.frame",
row.names = c(NA, -6L))


none.expected <- data.frame(id = character(), entity = character(), entity.type = character())

# If this is throwing errors that you need to download Core NLP then the way to get testthat to
@@ -42,14 +50,17 @@ test_that("NERAnnotate consistency", {
cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values,
corenlp.only = TRUE)

simple.output <- NERAnnotate(tmp.file)
expect_error(simple.output <- NERAnnotate(tmp.file), NA)
expect_identical(simple.output, simple.expected)

expect_error(simple.output.with.pronouns <- NERAnnotate(tmp.file, entity.mentions.only = TRUE), NA)
expect_identical(simple.output.with.pronouns, simple.with.pronouns.expected)

file <- file(tmp.file, "wb")
writeLines(none.input, con = file)
close(file)

none.output <- NERAnnotate(tmp.file)
expect_identical(none.output, none.expected)

})
})
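
For readers comparing the two expected data frames above, here is a short usage sketch of the difference, assuming CoreNLP has been initialised and tmp.file written as in the test body; the commented results are taken from the expected objects defined earlier, not from re-run output.

default.out  <- NERAnnotate(tmp.file)                               # token-validated mentions
pronouns.out <- NERAnnotate(tmp.file, entity.mentions.only = TRUE)  # raw entity-mention output

subset(default.out,  id == 1)$entity   # "Julie"        -- pronoun filtered out by the token check
subset(pronouns.out, id == 1)$entity   # "Julie" "She"  -- pronouns kept as PERSON mentions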