## Load required libraries

In [1]:
# Set working directory
# setwd()

# Start the JVM with a 4 GB maximum heap to avoid "OutOfMemoryError: Java heap space"
# NOTE(review): presumably this has to run before the openNLP packages are loaded — confirm
library(rJava)
.jinit(parameters="-Xmx4g")
# If memory problems persist, invoke gc() after the POS tagging step

# The openNLPmodels.en library is not in CRAN; it has to be installed from another repository
#install.packages("openNLPmodels.en", repos = "http://datacube.wu.ac.at")

library(NLP)
library(openNLP) 
library(openNLPmodels.en)
library(tm)

## Auxiliary Functions

In [2]:
# Annotate a text document with the openNLP annotators.
#
# doc: the document to annotate; it is coerced with as.String().
# Returns the annotations obtained after applying, in order, the sentence
# and word tokenizers, the part-of-speech tagger and the parse annotator
# (word, sentence, POS and Penn Treebank parse annotations).

getAnnotationsFromDocument <- function(doc) {
  text <- as.String(doc)

  # Stage 1: sentence and word boundaries, computed in a single pipeline.
  tokenizers <- list(
    Maxent_Sent_Token_Annotator(),
    Maxent_Word_Token_Annotator()
  )
  token_annotations <- annotate(text, tokenizers)

  # Stage 2: part-of-speech tags layered on top of the token annotations.
  tagged_annotations <- annotate(
    text,
    Maxent_POS_Tag_Annotator(),
    token_annotations
  )

  # Stage 3: parse annotations layered on top of the tagged annotations.
  annotate(text, Parse_Annotator(), tagged_annotations)
}

In [3]:
# Render a document as a single string of "word/TAG" pairs.
#
# doc:         the document text; it is coerced with as.String().
# annotations: annotations for `doc`; its "word" annotations are expected to
#              carry a "POS" entry in their features.
# Returns one character string in which every word is followed by "/" and
# its POS tag, with the pairs separated by single spaces.

getAnnotatedMergedDocument <- function(doc, annotations) {
  text <- as.String(doc)
  words <- subset(annotations, type == "word")

  # vapply() enforces exactly one character tag per word, so a word without
  # a "POS" feature fails here instead of handing sprintf() a list.
  tags <- vapply(words$features, `[[`, character(1), "POS")

  paste(sprintf("%s/%s", text[words], tags), collapse = " ")
}

In [4]:
# Bundle a document and its annotations into a single object.
#
# doc:         the document text; it is coerced with as.String().
# annotations: the annotations to attach to the text.
# Returns the value of AnnotatedPlainTextDocument() for that pair.

getAnnotatedPlainTextDocument <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}

## Load the corpus

In [5]:
# Load the corpus; only a small subset of 15 documents is used.

source.pos <- DirSource("txt_sentoken/pos/smallSet", encoding = "UTF-8")
corpus <- Corpus(source.pos)

## Annotate corpus

In [6]:
# Annotate every document in the corpus: lapply() calls
# getAnnotationsFromDocument once per document and collects the results.

annotations <- lapply(corpus, getAnnotationsFromDocument)

In [7]:
# Pair each document with its annotations, producing one
# AnnotatedPlainTextDocument per document. The result is stored in a separate
# variable because this step destroys the corpus metadata.

corpus.tagged <- Map(getAnnotatedPlainTextDocument, corpus, annotations)
corpus.tagged[[1]]

<<AnnotatedPlainTextDocument>>
Metadata:  0
Annotations:  length: 849
Content:  chars: 4226

In [9]:
# Accessor functions exist for the parts of an AnnotatedPlainTextDocument;
# take the first annotated document to try them on.

doc <- corpus.tagged[[1]]
doc

<<AnnotatedPlainTextDocument>>
Metadata:  0
Annotations:  length: 849
Content:  chars: 4226

In [10]:
# The two sentences chosen for the analysis are the third and the fifth of the document:
sents(doc)[[3]]
sents(doc)[[5]]

In [11]:
# Their tags (the same two sentences, shown as word/TAG pairs) are:
tagged_sents(doc)[[3]]
tagged_sents(doc)[[5]]

to/TO
say/VB
moore/NN
and/CC
campbell/NN
thoroughly/RB
researched/VBD
the/DT
subject/NN
of/IN
jack/NN
the/DT
ripper/NN
would/MD
be/VB
like/IN
saying/VBG
michael/NN
jackson/NN
is/VBZ
starting/VBG
to/TO
look/VB
a/DT
little/JJ
odd/JJ
./.

if/IN
you/PRP
can/MD
get/VB
past/IN
the/DT
whole/JJ
comic/JJ
book/NN
thing/NN
,/,
you/PRP
might/MD
find/VB
another/DT
stumbling/JJ
block/NN
in/IN
from/IN
hell/NN
's/POS
directors/NNS
,/,
albert/NN
and/CC
allen/JJ
hughes/NNS
./.

## Going word by word and applying the Penn Treebank tagset:

### First sentence:

* to -> TO
* say -> VB
* moore -> NN
* and -> CC
* campbell -> NN
* thoroughly -> RB
* researched -> VBD
* the -> DT
* subject -> NN
* of -> IN
* jack -> NN
* the -> DT
* ripper -> NN
* would -> MD
* be -> VB
* like -> IN
* saying -> VBG
* michael -> NN
* jackson -> NN
* is -> VBZ
* starting -> VBG
* to -> TO
* look -> VB
* a -> DT
* little -> JJ
* odd -> JJ
* . -> .

#### Mistakes: 0, Accuracy: 100%

### Second sentence:

* if -> IN
* you -> PRP
* can -> MD
* get -> VB
* past -> IN
* the -> DT
* whole -> JJ
* comic -> NN
* book -> NN
* thing -> NN
* , -> ,
* you -> PRP
* might -> MD
* find -> VB
* another -> DT
* stumbling -> JJ
* block -> NN
* in -> IN
* from -> IN
* hell -> NN
* ’s -> POS
* directors -> NNS
* , -> ,
* albert -> NN
* and -> CC
* allen -> NN
* hughes -> NN
* . -> .

#### Mistakes: 3 (comic: tagged JJ, expected NN; allen: tagged JJ, expected NN; hughes: tagged NNS, expected NN), Accuracy: 25 out of 28 ≈ 89.29%