diff --git a/R/ner_annotate.R b/R/ner_annotate.R index e79a0f0..e44ab32 100644 --- a/R/ner_annotate.R +++ b/R/ner_annotate.R @@ -11,6 +11,16 @@ #' #' @param input.file a character string showing the path to the file to be processed. The file should #' have text with Unix style line endings (will throw Nullpointer exception if not) +#' @param entity.mentions.only Logical to specify if only entity mention output from CoreNLP is used +#' in the extraction. If TRUE, this will extract personal pronouns as well as standard entities. +#' The benefit of the entity.mention output is it groups words that are from the same entity. E.g. +#' 'John Smith' is a single person entity and 'New York City' is a single location entity. The +#' entity mention output also classifies personal pronouns as entities and will be extracted. +#' If FALSE, the entity.mentions output from CoreNLP is validated against the CoreNLP token output. +#' The token output also identifies entities on the single word level and it doesn't classify personal +#' pronouns as entities. The net effect if set to FALSE is that entity mentions are extracted but +#' personal pronouns and other potential entity mentions that are not entities on the token level +#' are not extracted. #' @return data.frame with the details of the detected entities. The output data.frame has three #' columns. \itemize{ #' \item \code{id} integer: the row index of the input file that has an extracted entity. 
@@ -21,17 +31,19 @@ #' @importFrom rJava .jcall #' @examples #' \dontrun{ +#' simple.input.test <- c("John is a person", "Google is a company", "This is nothing") +#' input.file <- tempfile() #' file <- file(input.file, "wb") # need linux style line endings #' writeLines(simple.input.test, con = file) #' close(file) #' keys <- c("ssplit.eolonly", "annotators", "outputFormat", "file", "outputDirectory") -#' values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json", input.file, dirname(tmp.file)) +#' values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json", input.file, dirname(input.file)) #' #' cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values) #' simple.output <- NERAnnotate(input.file) #' } #' @export -NERAnnotate <- function(input.file) { +NERAnnotate <- function(input.file, entity.mentions.only = FALSE) { if(!volatiles$corenlp$init) stop("Java CoreNLP not initialized. Named Entity Recognition cannot be executed.") @@ -39,15 +51,39 @@ NERAnnotate <- function(input.file) { .jcall(volatiles$corenlp$corenlp, "V", "run") output <- fromJSON(paste0(input.file, ".json")) - relevant.cols = c("text", "ner") ner.mentions = output$sentences$entitymentions response = sapply(ner.mentions, function(x) nrow(x)) if(all(sapply(response, is.null))) { out <- data.frame(id = character(), entity = character(), entity.type = character()) } else { - response = rep(1:length(ner.mentions), response) - ner.mentions = lapply(ner.mentions, function(x) {if(nrow(x) != 0) { - subset(x, select = relevant.cols) + if(!entity.mentions.only) { + # Validate ner.mentions against the token output. SIMPLIFY = FALSE keeps one data frame per sentence; the default would collapse equal-width data frames into a list-matrix + ner.mentions <- mapply(function(x, y) { + if(nrow(y) != 0) { + idx <- sapply(1:nrow(y), function(i) { + z <- y[i, ] + ind <- c(z$tokenBegin + 1, z$tokenEnd) + all(x[ind, ]$ner != "O")}) + if(any(idx)) + { + y <- y[idx, ] + } else + { + return(data.frame()) + } + } + y + }, x = output$sentences$tokens, y = ner.mentions, SIMPLIFY = FALSE) + response <- sapply(ner.mentions, function(x) 
nrow(x)) + # Return the empty result when no mention survives validation (rejected sentences are 0-row data frames, so their row count is 0 rather than NULL) + if(sum(unlist(response)) == 0) { + return(data.frame(id = character(), entity = character(), entity.type = character())) + } + } + response <- rep(1:length(ner.mentions), response) + ner.mentions = lapply(ner.mentions, function(x) { + if(nrow(x) != 0) { + subset(x, select = c("text", "ner")) }}) # Remove the NULL list elements ner.mentions <- Filter(Negate(is.null), ner.mentions) @@ -55,4 +91,4 @@ NERAnnotate <- function(input.file) { names(out) <- c("id", "entity", "entity.type") } out -} \ No newline at end of file +} diff --git a/man/NERAnnotate.Rd b/man/NERAnnotate.Rd index 9f58dc8..8c0af28 100644 --- a/man/NERAnnotate.Rd +++ b/man/NERAnnotate.Rd @@ -4,11 +4,22 @@ \alias{NERAnnotate} \title{Run the annotation pipeline on a set of documents to extract entities} \usage{ -NERAnnotate(input.file) +NERAnnotate(input.file, entity.mentions.only = FALSE) } \arguments{ \item{input.file}{a character string showing the path to the file to be processed. The file should have text with Unix style line endings (will throw Nullpointer exception if not)} + +\item{entity.mentions.only}{Logical to specify if only entity mention output from CoreNLP is used +in the extraction. If TRUE, this will extract personal pronouns as well as standard entities. +The benefit of the entity.mention output is it groups words that are from the same entity. E.g. +'John Smith' is a single person entity and 'New York City' is a single location entity. The +entity mention output also classifies personal pronouns as entities and will be extracted. +If FALSE, the entity.mentions output from CoreNLP is validated against the CoreNLP token output. +The token output also identifies entities on the single word level and it doesn't classify personal +pronouns as entities. 
The net effect if set to FALSE is that entity mentions are extracted but +personal pronouns and other potential entity mentions that are not entities on the token level +are not extracted.} } \value{ data.frame with the details of the detected entities. The output data.frame has three diff --git a/tests/testthat/test-entity.R b/tests/testthat/test-entity.R index 4ba5763..ca4f978 100644 --- a/tests/testthat/test-entity.R +++ b/tests/testthat/test-entity.R @@ -1,7 +1,7 @@ library(testthat) -# Input has variety of entities -simple.input.test <- c("There is a person called Julie that went down the lane.", # Person, Julie, +# Input has variety of entities, first entry has person and personal pronoun +simple.input.test <- c("There is a person called Julie that went down the lane. She likes bubbles", "Toys are fine", #No entities "There is trouble brewing in Hong Kong", #Location Hong Kong "There are two people caled Jane and John") @@ -17,6 +17,14 @@ simple.expected <- structure(list(id = c(1L, 3L, 4L, 4L, 4L), class = "data.frame", row.names = c(NA, -5L)) +simple.with.pronouns.expected <- structure(list(id = c(1L, 1L, 3L, 4L, 4L, 4L), + entity = c("Julie", "She", "Hong Kong", "two", "Jane", "John"), + entity.type = c("PERSON", "PERSON", "CITY", "NUMBER", "PERSON", + "PERSON")), + class = "data.frame", + row.names = c(NA, -6L)) + + none.expected <- data.frame(id = character(), entity = character(), entity.type = character()) # If this is throwing errors that you need to download Core NLP then the way to get testthat to @@ -42,9 +50,12 @@ test_that("NERAnnotate consistency", { cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values, corenlp.only = TRUE) - simple.output <- NERAnnotate(tmp.file) + expect_error(simple.output <- NERAnnotate(tmp.file), NA) expect_identical(simple.output, simple.expected) + expect_error(simple.output.with.pronouns <- NERAnnotate(tmp.file, entity.mentions.only = TRUE), NA) + 
expect_identical(simple.output.with.pronouns, simple.with.pronouns.expected) + file <- file(tmp.file, "wb") writeLines(none.input, con = file) close(file) @@ -52,4 +63,4 @@ test_that("NERAnnotate consistency", { none.output <- NERAnnotate(tmp.file) expect_identical(none.output, none.expected) -}) \ No newline at end of file +})