50 changes: 43 additions & 7 deletions R/ner_annotate.R
@@ -11,6 +11,16 @@
#'
#' @param input.file a character string giving the path to the file to be processed. The file should
#' contain text with Unix-style line endings (a NullPointerException is thrown otherwise).
#' @param entity.mentions.only Logical specifying whether only the entity-mention output from CoreNLP
#' is used in the extraction. If TRUE, personal pronouns are extracted as well as standard entities.
#' The benefit of the entity-mention output is that it groups words belonging to the same entity,
#' e.g. 'John Smith' is a single person entity and 'New York City' is a single location entity.
#' The entity-mention output also classifies personal pronouns as entities, so they are extracted.
#' If FALSE, the entity-mention output from CoreNLP is validated against the CoreNLP token output.
#' The token output identifies entities at the single-word level and does not classify personal
#' pronouns as entities. The net effect of FALSE is that entity mentions are still extracted, but
#' personal pronouns and other candidate mentions that are not entities at the token level are
#' dropped.
#' @return data.frame with the details of the detected entities. The output data.frame has three
#' columns. \itemize{
#' \item \code{id} integer: the row index of the input file that has an extracted entity.
@@ -21,38 +31,64 @@
#' @importFrom rJava .jcall
#' @examples
#' \dontrun{
#' simple.input.test <- c("John is a person", "Google is a company", "This is nothing")
#' input.file <- tempfile()
#' file <- file(input.file, "wb") # need Unix-style line endings
#' writeLines(simple.input.test, con = file)
#' close(file)
#' keys <- c("ssplit.eolonly", "annotators", "outputFormat", "file", "outputDirectory")
#' values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json", input.file, dirname(tmp.file))
#' values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json", input.file, dirname(input.file))
#'
#' cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values)
#' simple.output <- NERAnnotate(input.file)
#' }
#' @export
NERAnnotate <- function(input.file) {
NERAnnotate <- function(input.file, entity.mentions.only = FALSE) {

if(!volatiles$corenlp$init)
stop("Java CoreNLP not initialized. Named Entity Recognition cannot be executed.")

.jcall(volatiles$corenlp$corenlp, "V", "run")

output <- fromJSON(paste0(input.file, ".json"))
relevant.cols = c("text", "ner")
ner.mentions = output$sentences$entitymentions
response = sapply(ner.mentions, function(x) nrow(x))
if(all(sapply(response, is.null))) {
out <- data.frame(id = character(), entity = character(), entity.type = character())
} else {
response = rep(1:length(ner.mentions), response)
ner.mentions = lapply(ner.mentions, function(x) {if(nrow(x) != 0) {
subset(x, select = relevant.cols)
if(!entity.mentions.only) {
# Validate ner.mentions against the token output
ner.mentions <- mapply(function(x, y) {
if(nrow(y) != 0) {
idx <- sapply(1:nrow(y), function(i) {
z <- y[i, ]
ind <- c(z$tokenBegin + 1, z$tokenEnd)
all(x[ind, ]$ner != "O")})
if(any(idx))
{
y <- y[idx, ]
} else
{
return(data.frame())
}
}
y
}, x = output$sentences$tokens, y = ner.mentions)
response <- sapply(ner.mentions, function(x) nrow(x))
# If all entity mentions were filtered out, return an empty result
if(all(sapply(response, is.null))) {
return(data.frame(id = character(), entity = character(), entity.type = character()))
}
}
response <- rep(1:length(ner.mentions), response)
ner.mentions = lapply(ner.mentions, function(x) {
if(nrow(x) != 0) {
subset(x, select = c("text", "ner"))
}})
# Remove the NULL list elements
ner.mentions <- Filter(Negate(is.null), ner.mentions)
out = cbind(response, do.call(rbind.data.frame, ner.mentions))
names(out) <- c("id", "entity", "entity.type")
}
out
}
}
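
A standalone sketch of the token-level check introduced above may help readers of this diff. The tokens and mentions data frames below are hand-made stand-ins for CoreNLP's per-sentence token and entitymentions output (the real structures carry more columns); the tokenBegin/tokenEnd handling mirrors the code in the hunk, assuming zero-based CoreNLP token offsets.

# Toy illustration of the validation step: a mention is kept only when the tokens
# it spans carry a non-"O" NER tag ("O" marks tokens that are not entities).
tokens <- data.frame(word = c("Julie", "went", "down", ".", "She", "likes", "bubbles"),
                     ner  = c("PERSON", "O", "O", "O", "O", "O", "O"))
mentions <- data.frame(text = c("Julie", "She"),
                       ner  = c("PERSON", "PERSON"),
                       tokenBegin = c(0L, 4L),   # assumed zero-based, as in the diff
                       tokenEnd   = c(1L, 5L))
keep <- sapply(seq_len(nrow(mentions)), function(i) {
  z <- mentions[i, ]
  ind <- c(z$tokenBegin + 1, z$tokenEnd)         # first and last token row (1-based)
  all(tokens[ind, ]$ner != "O")
})
mentions[keep, ]  # only "Julie" survives; the pronoun "She" is dropped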
13 changes: 12 additions & 1 deletion man/NERAnnotate.Rd

Some generated files are not rendered by default.

19 changes: 15 additions & 4 deletions tests/testthat/test-entity.R
@@ -1,7 +1,7 @@
library(testthat)

# Input has variety of entities
simple.input.test <- c("There is a person called Julie that went down the lane.", # Person, Julie,
# Input has a variety of entities; the first entry has a person and a personal pronoun
simple.input.test <- c("There is a person called Julie that went down the lane. She likes bubbles",
"Toys are fine", #No entities
"There is trouble brewing in Hong Kong", #Location Hong Kong
"There are two people caled Jane and John")
@@ -17,6 +17,14 @@ simple.expected <- structure(list(id = c(1L, 3L, 4L, 4L, 4L),
class = "data.frame",
row.names = c(NA, -5L))

simple.with.pronouns.expected <- structure(list(id = c(1L, 1L, 3L, 4L, 4L, 4L),
entity = c("Julie", "She", "Hong Kong", "two", "Jane", "John"),
entity.type = c("PERSON", "PERSON", "CITY", "NUMBER", "PERSON",
"PERSON")),
class = "data.frame",
row.names = c(NA, -6L))


none.expected <- data.frame(id = character(), entity = character(), entity.type = character())

# If this is throwing errors that you need to download Core NLP then the way to get testthat to
@@ -42,14 +50,17 @@ test_that("NERAnnotate consistency", {
cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values,
corenlp.only = TRUE)

simple.output <- NERAnnotate(tmp.file)
expect_error(simple.output <- NERAnnotate(tmp.file), NA)
expect_identical(simple.output, simple.expected)

expect_error(simple.output.with.pronouns <- NERAnnotate(tmp.file, entity.mentions.only = TRUE), NA)
expect_identical(simple.output.with.pronouns, simple.with.pronouns.expected)

file <- file(tmp.file, "wb")
writeLines(none.input, con = file)
close(file)

none.output <- NERAnnotate(tmp.file)
expect_identical(none.output, none.expected)

})
})
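
For readers comparing the two expected data frames above, here is a short usage sketch of the difference, assuming CoreNLP has been initialised and tmp.file written as in the test body; the commented results are taken from the expected objects defined earlier, not from re-run output.

default.out  <- NERAnnotate(tmp.file)                               # token-validated mentions
pronouns.out <- NERAnnotate(tmp.file, entity.mentions.only = TRUE)  # raw entity-mention output

subset(default.out,  id == 1)$entity   # "Julie"        -- pronoun filtered out by the token check
subset(pronouns.out, id == 1)$entity   # "Julie" "She"  -- pronouns kept as PERSON mentions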