Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: cleanNLP
Type: Package
Title: A Tidy Data Model for Natural Language Processing
Version: 2.4.2
Version: 2.4.3
Authors@R: c(person(given = "Justin", family = "Wishart", email = "justin.wishart@displayr.com", role = "ctb"),
person(given = "Taylor B.", family = "Arnold", email = "taylor.arnold@acm.org", role = c("aut", "cre")))
Description: Provides a set of fast tools for converting a textual corpus into a set of normalized
Expand Down
16 changes: 8 additions & 8 deletions R/ner_annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
#' in the document where each entity occurs and the entity type. If no entities are detected for a
#' document then an empty data.frame with no rows is returned.
#'
#' @param input.file a character string showing the path to the file to be processed. The file should
#' have text with Unix style line endings (will throw Nullpointer exception if not)
#' @param entity.mentions.only Logical to specify if only entity mention output from CoreNLP is used
#' in the extraction. If TRUE, this will extract personal pronouns as well as standard entities.
#' The benefit of the entity.mention output is it groups words that are from the same entity. E.g.
Expand Down Expand Up @@ -42,17 +40,19 @@
#' "true")
#'
#' cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values)
#' simple.output <- NERAnnotate(input.file)
#' simple.output <- NERAnnotate()
#' }
#' @export
NERAnnotate <- function(input.file, entity.mentions.only = FALSE) {
NERAnnotate <- function(entity.mentions.only = FALSE)
{
if(!volatiles$corenlp$init)
stop("Java CoreNLP not initialized. Named Entity Recognition cannot be executed.")

if(is.null(volatiles$corenlp$properties$file))
stop("Java CoreNLP properties doesn't have an input file path.",
"Please set the input file path via cnlp_init_corenlp_custom")
.jcall(volatiles$corenlp$corenlp, "V", "run")

output <- fromJSON(paste0(input.file, ".json"))
output <- fromJSON(paste0(volatiles$corenlp$properties$file, ".json"))
ner.mentions = output$sentences$entitymentions
response = sapply(ner.mentions, nrow)
if(all(sapply(response, is.null)))
Expand All @@ -77,7 +77,7 @@ NERAnnotate <- function(input.file, entity.mentions.only = FALSE) {
return(data.frame())
}
y
}, x = output$sentences$tokens, y = ner.mentions)
}, x = output$sentences$tokens, y = ner.mentions, SIMPLIFY = FALSE)
# Check if filtered ner is not empty
if(all(sapply(ner.mentions, nrow) == 0))
return(data.frame(id = character(), entity = character(), entity.type = character()))
Expand Down
7 changes: 2 additions & 5 deletions man/NERAnnotate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 19 additions & 5 deletions tests/testthat/test-entity.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,15 @@ simple.with.pronouns.expected <- structure(list(id = c(1L, 1L, 3L, 4L, 4L, 4L),

pronouns <- c("he's", "hes", "he is", "He is", "He Is", "she's", "She is")

all.single.entity <- as.character(1:3)

none.expected <- data.frame(id = character(), entity = character(), entity.type = character())

all.single.output <- structure(list(id = 1:3,
entity = c("1", "2", "3"),
entity.type = c("NUMBER", "NUMBER", "NUMBER")),
class = "data.frame", row.names = c(NA, -3L))

# If this throws errors saying that you need to download CoreNLP, set CORENLP as a system
# environment variable containing the path to CoreNLP so that testthat can find it.
# The CoreNLP directories in the package installation cannot be located by testthat.
Expand All @@ -45,29 +52,36 @@ test_that("NERAnnotate consistency", {
values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json", tmp.file, dirname(tmp.file))

# Expect error if NERAnnotate is called before corenlp is initialised.
expect_error(NERAnnotate(tmp.file),
expect_error(NERAnnotate(),
"^Java CoreNLP not initialized. Named Entity Recognition cannot be executed.$")

cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values,
corenlp.only = TRUE)

expect_error(simple.output <- NERAnnotate(tmp.file), NA)
expect_error(simple.output <- NERAnnotate(), NA)
expect_identical(simple.output, simple.expected)

expect_error(simple.output.with.pronouns <- NERAnnotate(tmp.file, entity.mentions.only = TRUE), NA)
expect_error(simple.output.with.pronouns <- NERAnnotate(entity.mentions.only = TRUE), NA)
expect_identical(simple.output.with.pronouns, simple.with.pronouns.expected)

file <- file(tmp.file, "wb")
writeLines(none.input, con = file)
close(file)

none.output <- NERAnnotate(tmp.file)
none.output <- NERAnnotate()
expect_identical(none.output, none.expected)

file <- file(tmp.file, "wb")
writeLines(pronouns, con = file)
close(file)

expect_error(pronoun.output.after.validation <- NERAnnotate(tmp.file, entity.mentions.only = FALSE), NA)
expect_error(pronoun.output.after.validation <- NERAnnotate(entity.mentions.only = FALSE), NA)
expect_identical(pronoun.output.after.validation, none.expected)

file <- file(tmp.file, "wb")
writeLines(all.single.entity, con = file)
close(file)

expect_error(all.single.entity.output <- NERAnnotate(entity.mentions.only = FALSE), NA)
expect_identical(all.single.entity.output, all.single.output)
})