1 change: 1 addition & 0 deletions .Rbuildignore
@@ -10,3 +10,4 @@
^docs
^.*\.Rproj$
^\.Rproj\.user$
^stanford-corenlp-full-2018-10-05.zip$
57 changes: 53 additions & 4 deletions .travis.yml
@@ -1,7 +1,56 @@
# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r

language: R
sudo: false
language: r
r:
- release
dist: trusty
sudo: required
cache: packages
warnings_are_errors: false

# install debian libraries to match R-servers
# update pre-installed packages to latest versions
before_install:
- sudo add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable
- sudo apt-get -qq update
- sudo apt-get install -y libgdal-dev libproj-dev python-protobuf libprotoc-dev libprotobuf-dev libv8-dev librsvg2-dev libmpfr-dev
- sudo apt install default-jdk
- wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
- unzip -d $HOME stanford-corenlp-full-2018-10-05.zip
- export CORENLP=$HOME/stanford-corenlp-full-2018-10-05
- wget http://nlp.stanford.edu/software/stanford-english-corenlp-2018-10-05-models.jar -P $CORENLP/
- rcode="tfile <- tempfile(); capture.output(res<-devtools::test(), file = tfile); out <- readLines(tfile); cat(out, sep = '\n'); "
- rcode+="n.fail <- as.numeric(sub('Failed:[[:space:]]', '', out[grep('Failed:[[:space:]]', out)])); "
- rcode+="res <- as.data.frame(res); out <- data.frame(file = unlist(res[['file']]), warning = unlist(res[['warning']])); "
- rcode+="write.csv(out, file='test_results.csv'); "
- rcode+="quit(status = !identical(n.fail, 0), save='no');"
- Rscript tools/travis_run_before_install.R

r_packages:
- devtools
- roxygen2
- covr

r_github_packages:
- Displayr/flipDevTools

script:
- R CMD build --no-manual --no-build-vignettes --no-resave-data .
- R CMD check --as-cran --no-manual --ignore-vignettes --no-tests *.tar.gz
- if [ -d tests/testthat ]; then
Rscript --default-packages='datasets,utils,grDevices,graphics,stats,methods' -e "$rcode";
fi

notifications:
slack:
rooms:
- displayr:FTgSTNHC2rpanhJMGTKMwZXM#github-notifications
template:
- "Build <%{build_url}|#%{build_number}> %{result} in %{repository_name}@%{branch} by %{author}: <%{compare_url}|%{commit_message}>"
on_success: change
on_failure: always

# Warning notifications and downstream package builds are implemented
# by calling R functions so they can be updated in this package without
# committing a new change to .travis.yml in each repository
after_success:
- Rscript -e "require(flipDevTools); NotifyWarnings(); TriggerDownstreamBuilds()"
- travis_wait Rscript -e "flipDevTools::CheckCoverage()"
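For readability, the R one-liner assembled piecewise into `$rcode` above is equivalent to the following script (a sketch; it assumes `devtools` and `testthat` are available on the build image):

```r
# Run the package tests, capturing testthat's console summary to a file.
tfile <- tempfile()
capture.output(res <- devtools::test(), file = tfile)
out <- readLines(tfile)
cat(out, sep = "\n")

# Extract the failure count from the "Failed: N" summary line.
n.fail <- as.numeric(sub("Failed:[[:space:]]", "",
                         out[grep("Failed:[[:space:]]", out)]))

# Record per-file warning counts for later inspection.
res <- as.data.frame(res)
out <- data.frame(file = unlist(res[["file"]]),
                  warning = unlist(res[["warning"]]))
write.csv(out, file = "test_results.csv")

# Exit non-zero (failing the build) when any test failed.
quit(status = !identical(n.fail, 0), save = "no")
```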
14 changes: 7 additions & 7 deletions DESCRIPTION
@@ -1,10 +1,9 @@
Package: cleanNLP
Type: Package
Title: A Tidy Data Model for Natural Language Processing
Version: 2.4.1
Authors@R: c(person(given = "Taylor B.", family = "Arnold", email = "taylor.arnold@acm.org", role = c("aut", "cre")))
Author: Taylor B. Arnold [aut, cre]
Maintainer: Taylor B. Arnold <taylor.arnold@acm.org>
Version: 2.4.2
Authors@R: c(person(given = "Justin", family = "Wishart", email = "justin.wishart@displayr.com", role = "ctb"),
person(given = "Taylor B.", family = "Arnold", email = "taylor.arnold@acm.org", role = c("aut", "cre")))
Description: Provides a set of fast tools for converting a textual corpus into a set of normalized
tables. Users may make use of the 'udpipe' back end with no external dependencies, a Python back
end with 'spaCy' <https://spacy.io> or the Java back end 'CoreNLP'
@@ -17,6 +16,8 @@ Depends:
R (>= 2.10)
Imports:
dplyr (>= 0.7.4),
jsonlite,
rJava (>= 0.9-8),
Matrix (>= 1.2),
stringi,
stats,
@@ -25,16 +26,15 @@ Imports:
Suggests:
udpipe (>= 0.3),
reticulate (>= 1.4),
rJava (>= 0.9-8),
RCurl (>= 1.95),
knitr (>= 1.15),
rmarkdown (>= 1.4),
testthat (>= 1.0.1),
covr (>= 2.2.2)
SystemRequirements: Python (>= 2.7.0); spaCy <https://spacy.io/> (>= 2.0); Java (>= 7.0); Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml> (>= 3.9.2)
License: LGPL-2
URL: https://statsmaths.github.io/cleanNLP/
BugReports: http://github.com/statsmaths/cleanNLP/issues
BugReports: http://github.com/Displayr/cleanNLP/issues
LazyData: true
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 6.1.1
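As the Description field notes, the package exposes three interchangeable back ends. A minimal sketch of selecting one (assuming the relevant optional dependency, udpipe, reticulate, or rJava, is installed):

```r
library(cleanNLP)

# Pure-R back end, no external dependencies.
cnlp_init_udpipe()
# Alternatives: cnlp_init_spacy() for the Python/spaCy back end,
# cnlp_init_corenlp() for the Java/CoreNLP back end.

anno <- cnlp_annotate("The quick brown fox jumps over the lazy dog.")
cnlp_get_token(anno)
```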
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

S3method(print,annotation)
export(NERAnnotate)
export(cnlp_annotate)
export(cnlp_combine_documents)
export(cnlp_download_corenlp)
@@ -46,3 +47,5 @@ export(run_annotators)
export(tidy_pca)
export(to_CoNNL)
export(write_annotation)
importFrom(jsonlite,fromJSON)
importFrom(rJava,.jcall)
2 changes: 1 addition & 1 deletion R/accessors.R
@@ -105,7 +105,7 @@ cnlp_get_token <- function(annotation, include_root = FALSE,
res <- res[res$tid > 0,]

if (spaces) {
res <- dplyr::group_by_(res, "id")
res <- dplyr::group_by(res, "id")
res <- dplyr::mutate(res,
spaces = dplyr::lead(cid, default = 0) - cid -
stringi::stri_length(word))
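Context for the change above: the underscore-suffixed verbs such as `group_by_()` were deprecated in dplyr 0.7. A sketch of the tidy-eval style, with a toy data frame standing in for `res` (note that passing the bare string `"id"` to `group_by()` groups by a constant, so the pronoun form is the safer equivalent):

```r
library(dplyr)

res <- tibble(id = c(1, 1, 2), word = c("a", "b", "c"))

group_by(res, .data$id)  # programmatic replacement for group_by_(res, "id")
group_by(res, id)        # plain interactive form
```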
2 changes: 1 addition & 1 deletion R/annotate.R
@@ -201,4 +201,4 @@ cnlp_quick <- function(input, ...) {
names(df)[c(1L, 4L)] <- c("doc_id", "token")

return(df)
}
}
34 changes: 18 additions & 16 deletions R/backend_corenlp.R
@@ -394,6 +394,8 @@ cnlp_init_corenlp <- function(language, anno_level = 2, lib_location = NULL,
#' written to the console or suppressed?
#' @param keys vector string of flags to add to the corenlp calls
#' @param values vector string of values paired with the flags.
#' @param corenlp.only Logical to specify if only CoreNLP should be initialised
#' and not the custom Annotator.
#'
#' The example below shows how to initialise corenlp to run named entity
#' recognition (ner) with its respective dependencies (tokenize, ssplit, pos
@@ -408,31 +410,32 @@ cnlp_init_corenlp <- function(language, anno_level = 2, lib_location = NULL,
#'
#' @export

cnlp_init_corenlp_custom <- function(language, lib_location = NULL,
mem = "6g", verbose = FALSE, keys, values) {
cnlp_init_corenlp_custom <- function(language, lib_location = NULL, mem = "6g", verbose = FALSE,
keys, values, corenlp.only = FALSE) {
if (missing(language)) language <- "en"
language_list <- c("en", "de", "fr", "es", "ar", "zh")
language <- match.arg(arg = language, choices = language_list)
cnlp_init_corenlp_volatiles(language, lib_location, mem, verbose)
setup_corenlp_backend_raw(keys, values)
invisible(init_corenlp_backend())
invisible(init_corenlp_backend(corenlp.only = corenlp.only))
}

cnlp_init_corenlp_volatiles <- function(language, lib_location, mem, verbose) {
if (is.null(lib_location))
lib_location <- file.path(system.file("extdata", package="cleanNLP"),
"/stanford-corenlp-full-2018-10-05")
if (is.null(lib_location)) {
if(Sys.getenv("CORENLP") == ""){
lib_location <- file.path(system.file("extdata", package="cleanNLP"),
"stanford-corenlp-full-2018-10-05")
} else {
lib_location <- Sys.getenv("CORENLP")
}
}

# set properties
volatiles$corenlp$language <- language
volatiles$corenlp$lib_location <- lib_location
volatiles$corenlp$mem <- mem
volatiles$corenlp$verbose <- verbose
volatiles$corenlp$properties <- list()

fin <- file.path(lib_location, "/properties.rds")
if (file.exists(fin))
volatiles$corenlp$properties <- readRDS(fin)
}

setup_corenlp_backend_raw <- function(keys, values, clear = FALSE) {
@@ -454,7 +457,7 @@ setup_corenlp_backend_raw <- function(keys, values, clear = FALSE) {
volatiles$corenlp$properties[[keys[i]]] <- values[i]
}

init_corenlp_backend <- function() {
init_corenlp_backend <- function(corenlp.only = FALSE) {

if (!requireNamespace("rJava")) {
stop("The rJava package is required to use the corenlp backend")
@@ -542,8 +545,9 @@ init_corenlp_backend <- function() {

volatiles$corenlp$corenlp <-
rJava::.jnew("edu.stanford.nlp.pipeline.StanfordCoreNLP", prop)
volatiles$corenlp$AnnotationProcessor <-
rJava::.jnew("edu.richmond.nlp.AnnotationProcessor")
if(!corenlp.only)
volatiles$corenlp$AnnotationProcessor <-
rJava::.jnew("edu.richmond.nlp.AnnotationProcessor")
if (!volatiles$corenlp$verbose)
rJava::.jcall("java/lang/System", "V", "setErr", err)

@@ -624,6 +628,4 @@ annotate_with_corenlp <- function(input, as_strings, verbose) {
out <- cnlp_read_csv(output_dir)

return(out)
}


}
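The new fallback above lets an existing CoreNLP installation be reused through the `CORENLP` environment variable (as the Travis config sets). A sketch, with an assumed install path:

```r
# Point the back end at an unpacked CoreNLP directory (the path is an assumption).
Sys.setenv(CORENLP = "~/stanford-corenlp-full-2018-10-05")

keys   <- c("annotators", "outputFormat")
values <- c("tokenize,ssplit,pos,lemma,ner", "json")

# corenlp.only = TRUE skips loading the custom edu.richmond.nlp AnnotationProcessor.
cnlp_init_corenlp_custom(language = "en", mem = "2g",
                         keys = keys, values = values,
                         corenlp.only = TRUE)
```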
16 changes: 8 additions & 8 deletions R/io.R
@@ -177,10 +177,10 @@ cnlp_read_conll <- function(file) {
source <- x$X7

# create token table
token <- dplyr::data_frame(id = id + 1, sid = sid, tid = tid,
word = word, lemma = lemma,
upos = upos, pos = pos,
cid = NA_integer_)
token <- dplyr::tibble(id = id + 1, sid = sid, tid = tid,
word = word, lemma = lemma,
upos = upos, pos = pos,
cid = NA_integer_)

roots <- token[tid == 1,]
roots$tid <- 0L
Expand All @@ -191,10 +191,10 @@ cnlp_read_conll <- function(file) {

# create dependency table
tid_target <- tid
dep <- dplyr::data_frame(id = id + 1, sid = sid, tid = source,
tid_target = tid_target,
relation = relation,
relation_full = relation)
dep <- dplyr::tibble(id = id + 1, sid = sid, tid = source,
tid_target = tid_target,
relation = relation,
relation_full = relation)

# create annotation object
anno <- list()
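Context for the change above: `dplyr::data_frame()` is deprecated in favour of `tibble()`; both construct the same tibble, so the swap is behaviour-preserving:

```r
library(dplyr)

old <- data_frame(id = 1:2, word = c("a", "b"))  # deprecated spelling, emits a warning
new <- tibble(id = 1:2, word = c("a", "b"))      # current spelling
identical(old, new)  # TRUE
```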
58 changes: 58 additions & 0 deletions R/ner_annotate.R
@@ -0,0 +1,58 @@
#' Run the annotation pipeline on a set of documents to extract entities
#'
#' Runs the entity detection algorithms from CoreNLP using the CoreNLP Java library via rJava.
#' It expects the CoreNLP Java object to already be initialised through a call to
#' \code{cnlp_init_corenlp_custom} with the appropriate annotators set up for named entity
#' recognition and a path to an input file whose input text is separated by new lines.
#' The input file must have Unix-style line endings, otherwise the CoreNLP Java call will crash
#' with a NullPointerException. The function returns a \code{data.frame} showing the location
#' in the document where each entity occurs and the entity type. If no entities are detected,
#' a data.frame with zero rows is returned.
#'
#' @param input.file a character string giving the path to the file to be processed. The file
#'   should contain text with Unix-style line endings (a NullPointerException is thrown otherwise)
#' @return data.frame with the details of the detected entities. The output data.frame has three
#' columns. \itemize{
#' \item \code{id} integer: the row index of the input file that has an extracted entity.
#' \item \code{entity} character: The extracted entity word (e.g. William)
#' \item \code{entity.type} character: The entity type of the extracted entity (e.g. Person)
#' }
#' @importFrom jsonlite fromJSON
#' @importFrom rJava .jcall
#' @examples
#' \dontrun{
#' con <- file(input.file, "wb") # need Unix-style line endings
#' writeLines(simple.input.test, con = con)
#' close(con)
#' keys <- c("ssplit.eolonly", "annotators", "outputFormat", "file", "outputDirectory")
#' values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json", input.file, dirname(input.file))
#'
#' cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values)
#' simple.output <- NERAnnotate(input.file)
#' }
#' @export
NERAnnotate <- function(input.file) {

if (!volatiles$corenlp$init)
  stop("Java CoreNLP not initialized. Named Entity Recognition cannot be executed.")

.jcall(volatiles$corenlp$corenlp, "V", "run")

output <- fromJSON(paste0(input.file, ".json"))
relevant.cols <- c("text", "ner")
ner.mentions <- output$sentences$entitymentions
response <- sapply(ner.mentions, nrow)
if (all(sapply(response, is.null))) {
  out <- data.frame(id = character(), entity = character(), entity.type = character())
} else {
  response <- rep(seq_along(ner.mentions), response)
  # Keep only the text and entity type columns from each non-empty mention table
  ner.mentions <- lapply(ner.mentions, function(x) {
    if (nrow(x) != 0) subset(x, select = relevant.cols)
  })
  # Remove the NULL list elements
  ner.mentions <- Filter(Negate(is.null), ner.mentions)
  out <- cbind(response, do.call(rbind.data.frame, ner.mentions))
  names(out) <- c("id", "entity", "entity.type")
}
out
}
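A hypothetical end-to-end run of `NERAnnotate()`, expanding the roxygen example above (the file contents and the example entities in the comments are illustrative, not package output):

```r
input.file <- tempfile(fileext = ".txt")
con <- file(input.file, "wb")  # binary mode forces Unix-style line endings
writeLines("William visited Paris in June.", con = con)
close(con)

keys   <- c("ssplit.eolonly", "annotators", "outputFormat", "file", "outputDirectory")
values <- c("true", "tokenize,ssplit,pos,lemma,ner", "json",
            input.file, dirname(input.file))
cnlp_init_corenlp_custom(language = "en", mem = "2g", keys = keys, values = values)

entities <- NERAnnotate(input.file)
# A data.frame with columns id, entity, entity.type, e.g.
#   1  William  PERSON
#   1  Paris    LOCATION
#   1  June     DATE
entities
```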