## Load required libraries

In [1]:
# Raise the JVM heap limit up front to avoid "OutOfMemoryError: Java heap space"
library(rJava)
.jinit(parameters="-Xmx4g")
# If there are more memory problems, invoke gc() after the POS tagging

library(NLP)
library(openNLP) 
library(openNLPmodels.en)
library(tm)
library(stringr)
library(SPARQL)
library(parallel)

Loading required package: XML
Loading required package: RCurl
Loading required package: bitops

Attaching package: ‘RCurl’

The following object is masked from ‘package:rJava’:

    clone



## Auxiliary Functions

In [2]:
# Builds sentence, word and part-of-speech annotations for one text document.
# The Penn Treebank parse step is left disabled (see the commented-out lines),
# so the returned annotations stop at the POS-tagging pass.
#
# doc: the document to annotate; it is converted with as.String().
# Returns: the annotation object produced by the POS-tagging pass.

getAnnotationsFromDocument <- function(doc) {
  text <- as.String(doc)
  tokenizers <- list(Maxent_Sent_Token_Annotator(),
                     Maxent_Word_Token_Annotator())
  tagger <- Maxent_POS_Tag_Annotator()
  # First pass: sentence and word boundaries. The tagger runs on top of them.
  tokens <- annotate(text, tokenizers)
  # parse_annotator <- Parse_Annotator()
  # parsed <- annotate(text, parse_annotator, tagged)
  annotate(text, tagger, tokens)
}

# Renders a document as a single string of "token/POS" pairs separated by
# spaces, using the word annotations found in `annotations`.
#
# doc:         the document text; it is converted with as.String().
# annotations: annotations for `doc` that include word entries carrying a
#              "POS" feature.
# Returns: one character string with every word followed by "/" and its tag.

getAnnotatedMergedDocument <- function(doc, annotations) {
  text <- as.String(doc)
  words <- subset(annotations, type == "word")
  pos <- sapply(words$features, `[[`, "POS")
  tagged <- sprintf("%s/%s", text[words], pos)
  paste(tagged, collapse = " ")
}

# Bundles a document and its annotations into an AnnotatedPlainTextDocument.
#
# doc:         the document text; it is converted with as.String().
# annotations: the annotations computed for `doc`.
# Returns: the AnnotatedPlainTextDocument built from both.

getAnnotatedPlainTextDocument <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}

# detectPatternOnDocument returns the first match of `pattern` in a document.
#
# doc:     the document to search; it is converted with as.String().
# pattern: a regular expression handed to str_match().
# Returns:
#   * the str_match() result itself when it holds a single cell (pattern
#     without capture groups): the matched text, or NA when nothing matched;
#   * NA when the pattern has capture groups and none of them matched;
#   * otherwise the captured groups joined with single spaces. The string
#     starts with a space, and a group that did not participate appears as the
#     text "NA"; callers that compare the result are expected to trim it.

detectPatternOnDocument <- function(doc, pattern) {
  hit <- str_match(as.String(doc), pattern)
  n <- length(hit)

  # No capture groups: hand back the whole-match cell untouched.
  if (n == 1) {
    return(hit)
  }

  # Capture groups present but all empty: report "no match".
  if (all(is.na(hit[, 2:n]))) {
    return(NA)
  }

  # The leading "" reproduces the space that precedes the first group.
  paste(c("", hit[2:n]), collapse = " ")
}

# detectPatternOnDocumentWithContext returns the first match of `pattern` in a
# document together with the characters that surround it.
#
# doc:     the document to search; it is converted with as.String().
# pattern: a regular expression handed to str_locate().
# context: number of characters to keep on each side of the match. Defaults
#          to 50, the value that used to be hard-coded.
# Returns: the substring from `context` characters before the match start to
#          `context` characters after the match end. When the pattern does not
#          match, the located coordinates are NA and so is the result.

detectPatternOnDocumentWithContext <- function(doc, pattern, context = 50) {
  txt <- as.String(doc)
  # coord[1] is the start and coord[2] the end of the first match.
  coord <- str_locate(txt, pattern)
  substr(txt, coord[1] - context, coord[2] + context)
}

# detectPatternsInCorpus returns a data frame with all the patterns detected in
# a corpus.
#
# corpus:   the documents to search; each one is passed to
#           detectPatternOnDocument() and its id is read with meta().
# patterns: the regular expressions to look for, one output column each.
# Returns: a data frame with one row per document. The first column, "File",
#          holds the document id; every further column is named after a
#          pattern and holds the detected text (NA when nothing was found).
#
# Empty `patterns` or an empty `corpus` yield a data frame with the matching
# zero dimension instead of failing.

detectPatternsInCorpus <- function(corpus, patterns) {
  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = length(corpus)))
  names(vallEntities) <- c("File", patterns)

  # seq_along() iterates zero times on empty input, unlike 1:length(x).
  for (i in seq_along(patterns)) {
    # vapply() always returns a character vector (length 0 for an empty
    # corpus), so the assignment can never delete the column the way a NULL
    # from unlist(lapply(...)) would.
    vallEntities[[i + 1]] <- vapply(
      corpus,
      function(doc) {
        as.character(detectPatternOnDocument(doc, pattern = patterns[i]))
      },
      character(1),
      USE.NAMES = FALSE
    )
  }

  for (i in seq_len(length(corpus))) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }

  vallEntities
}

# detectPatternsInTaggedCorpus returns a data frame with all the patterns
# detected in an annotated corpus.
#
# corpus:       the original documents; only their ids (via meta()) and their
#               number are used.
# taggedCorpus: the annotated counterparts of `corpus`, in the same order;
#               these are the texts actually searched.
# patterns:     the regular expressions to look for, one output column each.
# Returns: a data frame with one row per document. The first column, "File",
#          holds the document id; every further column is named after a
#          pattern and holds the detected text (NA when nothing was found).
#
# Empty `patterns` or empty corpora yield a data frame with the matching zero
# dimension instead of failing.

detectPatternsInTaggedCorpus <- function(corpus, taggedCorpus, patterns) {
  vallEntities <- data.frame(matrix(NA, ncol = length(patterns) + 1,
                                    nrow = length(corpus)))
  names(vallEntities) <- c("File", patterns)

  # seq_along() iterates zero times on empty input, unlike 1:length(x).
  for (i in seq_along(patterns)) {
    # vapply() always returns a character vector (length 0 for an empty
    # corpus), so the assignment can never delete the column the way a NULL
    # from unlist(lapply(...)) would.
    vallEntities[[i + 1]] <- vapply(
      taggedCorpus,
      function(doc) {
        as.character(detectPatternOnDocument(doc, pattern = patterns[i]))
      },
      character(1),
      USE.NAMES = FALSE
    )
  }

  for (i in seq_len(length(corpus))) {
    vallEntities$File[i] <- meta(corpus[[i]])$id
  }

  vallEntities
}

# countMatchesPerColumn returns the number of matches per pattern/column: for
# every column after the first it counts the rows holding a non-NA value.
#
# df: a data frame whose first column identifies the file and whose remaining
#     columns are named after patterns.
# Returns: a data frame with columns "Entity" (the pattern column name) and
#          "Count" (number of non-NA rows), one row per pattern column. A data
#          frame without pattern columns gives a zero-row result.

countMatchesPerColumn <- function(df) {
  patternCols <- df[-1]
  counts <- vapply(patternCols, function(col) sum(!is.na(col)), integer(1),
                   USE.NAMES = FALSE)
  data.frame(Entity = names(patternCols), Count = counts,
             stringsAsFactors = FALSE)
}

# countMatchesPerRow returns the number of entities per file/row: for every
# row it counts the columns after the first that hold a non-NA value.
#
# df: a data frame with a "File" column first, followed by one column per
#     pattern.
# Returns: a data frame with columns "File" and "Count", restricted to the
#          files that have at least one match. Row names keep the position of
#          the file in `df`. Empty input gives a zero-row result.

countMatchesPerRow <- function(df) {
  # Per-row count of non-NA cells over every column except the first.
  counts <- as.integer(rowSums(!is.na(df[-1])))
  entityCountPerFile <- data.frame(File = df$File, Count = counts,
                                   stringsAsFactors = FALSE)
  entityCountPerFile[entityCountPerFile$Count != 0, ]
}

# printMatchesPerPattern prints the matches found per pattern.
#
# patterns: the patterns, in the same order as the pattern columns of
#           `matches`.
# matches:  a data frame whose column i + 1 holds the matches of pattern i
#           (the first column is the file id).
# For each pattern it prints a "PATTERN: " header, the non-NA matches and a
# blank separator. Nothing is printed when `patterns` is empty.

printMatchesPerPattern <- function(patterns, matches) {
  # seq_along() iterates zero times on empty input, unlike 1:length(x).
  for (i in seq_along(patterns)) {
    print(paste("PATTERN: ", patterns[i]))
    column <- matches[, i + 1]
    strings <- column[!is.na(unlist(column))]
    print(strings)
    print(" ")
  }
}

# mergeAllMatchesInLists returns a data frame with all the files and their
# matches in a single list per file.
#
# df: a data frame whose first column identifies the file and whose remaining
#     columns hold matches (NA where a pattern did not match).
# Returns: a data frame with columns "Files" (first column of `df`) and
#          "Matches" (a list column; each entry is the unnamed list of non-NA
#          values of that row). Zero-row input and input without match
#          columns are handled without error.

mergeAllMatchesInLists <- function(df) {
  # seq_len() iterates zero times for an empty data frame; the negative index
  # with drop = FALSE also works when there are no match columns at all.
  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    found <- Filter(Negate(is.na), df[i, -1, drop = FALSE])
    as.list(unname(unlist(found)))
  })

  allMatches <- data.frame(matrix(NA, ncol = 2, nrow = nrow(df)))
  names(allMatches) <- c("Files", "Matches")
  allMatches$Files <- df[, 1]
  allMatches$Matches <- matchesPerFile

  allMatches
}

# mergeGoldStandardInLists returns a data frame with all the files and the gold
# standard matches in a single list per file.
#
# df: the gold standard as a data frame whose first column identifies the file
#     and whose remaining columns hold the expected entities (NA where unused).
# Returns: a data frame with columns "Files" (first column of `df`) and
#          "Matches" (a list column; each entry is the list of non-NA values
#          of that row, named after the columns they came from). Zero-row
#          input and input without entity columns are handled without error.

mergeGoldStandardInLists <- function(df) {
  # seq_len() iterates zero times for an empty data frame; the negative index
  # with drop = FALSE also works when there are no entity columns at all.
  matchesPerFile <- lapply(seq_len(nrow(df)), function(i) {
    found <- Filter(Negate(is.na), df[i, -1, drop = FALSE])
    as.list(unlist(found))
  })

  allMatches <- data.frame(matrix(NA, ncol = 2, nrow = nrow(df)))
  names(allMatches) <- c("Files", "Matches")
  allMatches$Files <- df[, 1]
  allMatches$Matches <- matchesPerFile

  allMatches
}

# calculateMetrics calculates precision, recall and f-measure according to a
# gold standard.
#
# matches:    data frame of detected entities; column 2 is a list column with
#             the matches of each file.
# matches.gs: data frame of gold-standard entities with a "Matches" list
#             column, row-aligned with `matches`.
# beta:       weight of recall relative to precision in the F-measure.
#             Defaults to 1 (balanced F1), the value that used to be fixed.
# Returns: a one-row data frame with columns "Precision", "Recall" and
#          "Fmeasure". Files without gold-standard entries are ignored. When a
#          denominator is zero the corresponding value is NaN.

calculateMetrics <- function(matches, matches.gs, beta = 1) {
  numCorrect <- 0
  allAnswers <- 0
  possibleAnswers <- 0

  # seq_len() iterates zero times for an empty data frame, unlike 1:nrow(x).
  for (i in seq_len(nrow(matches))) {
    # Only files that have a gold standard take part in the evaluation.
    if (length(matches.gs$Matches[[i]]) != 0) {
      # Detected strings carry surrounding spaces; trim before comparing.
      found <- str_trim(unlist(matches[i, 2]))
      expected <- unname(unlist(matches.gs[i, 2]))
      numCorrect <- numCorrect + length(intersect(found, expected))
      allAnswers <- allAnswers + length(found)
      possibleAnswers <- possibleAnswers + length(expected)
    }
  }

  precision <- numCorrect / allAnswers
  recall <- numCorrect / possibleAnswers

  # F-beta weights recall by beta^2 (not sqrt(beta)); both coincide at beta = 1.
  betaSq <- beta^2
  fmeasure <- ((betaSq + 1) * precision * recall) /
    ((betaSq * precision) + recall)

  data.frame(Precision = precision, Recall = recall, Fmeasure = fmeasure)
}

## Load the corpus

In [3]:
# Read every file of the positive-review directory as UTF-8 and build the
# corpus from it.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# re-running the notebook elsewhere.
source.pos = DirSource("/Users/franlosada/Documents/EIT 1st year/Intelligent Systems/naturalLenguage/review_polarity/txt_sentoken/pos", encoding = "UTF-8")
corpus = Corpus(source.pos)

In [4]:
# Show the first document of the corpus as a sanity check.
inspect(corpus[[1]])

<<PlainTextDocument>>
Metadata:  7
Content:  chars: 4226

films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . 
the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . 
in other words , don't dismiss this film because of its source . 
if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . 
g

## Annotate corpus

In [5]:
# Annotate every document of the corpus; one annotation object per document.
annotations = lapply(corpus, getAnnotationsFromDocument)

In [6]:
# Pair each document with its annotations as an annotated document.
corpus.tagged = Map(getAnnotatedPlainTextDocument, corpus, annotations)

In [7]:
# Render each document as a single "token/POS" string.
corpus.taggedText = Map(getAnnotatedMergedDocument, corpus, annotations)

## Actors from DBpedia

In [8]:
# Namespace (prefix, URI) pair handed to SPARQL() through its `ns` argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# Prefix header prepended to the query text.
# NOTE(review): the query below uses the rdfs: prefix, which is not declared
# here; presumably the endpoint predefines it - confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"

# Labels of resources typed as yago Actor. Only the first 10000 results are
# requested; there is no paging over further offsets.
qT <- paste(sparql_prefixT,"
SELECT DISTINCT ?label where {
  ?actor a <http://dbpedia.org/class/yago/Actor109765278> .
  ?actor rdfs:label ?label .
} 
LIMIT 10000
OFFSET 0
")

In [9]:
# Run the query against the public DBpedia endpoint (requires network access)
# and keep only the `results` element of what SPARQL() returns.
endpointT <- "http://dbpedia.org/sparql"
optionsT=""

actors <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

## Clean the query result

### We need to clean the output of the query. We need to:

* Remove everything outside the quotes
* Remove parenthesized qualifiers
* Remove duplicates
* Replace “.” with a space so that it is not treated as a wildcard in the regular expression
* Convert all letters to lowercase

In [10]:
# Keep the text between the first pair of double quotes of each label.
# NOTE(review): `[[1]]` only looks at the first string of each element, so
# this presumably relies on every element of `actors` holding one label - verify.
actors.2 <- mclapply(actors, function(x) strsplit(x,'"')[[1]][2])
# Drop a parenthesized qualifier by keeping what precedes the first " (".
actors.3 <- mclapply(actors.2, function(x) strsplit(x,' \\(')[[1]][1])
# De-duplicate, replace literal dots with spaces, and lower-case the names.
actor.names <- unique(actors.3)
actor.names <- mclapply(actor.names, gsub, pattern="\\.", replacement=" ")
actor.names <- mclapply(actor.names, tolower)
# Number of distinct names kept.
length(actor.names)

## ScreenWriter from DBpedia

In [11]:
# Namespace (prefix, URI) pair handed to SPARQL() through its `ns` argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# Prefix header prepended to the query text.
# NOTE(review): the query below uses the rdfs: prefix, which is not declared
# here; presumably the endpoint predefines it - confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"

# Labels of resources typed as yago Screenwriter. Only the first 10000 results
# are requested; there is no paging over further offsets.
qT <- paste(sparql_prefixT,"
SELECT DISTINCT ?label where {
  ?actor a <http://dbpedia.org/class/yago/Screenwriter110564400> .
  ?actor rdfs:label ?label .
} 
LIMIT 10000
OFFSET 0
")



In [12]:
# Run the query against the public DBpedia endpoint (requires network access)
# and keep only the `results` element of what SPARQL() returns.
endpointT <- "http://dbpedia.org/sparql"
optionsT=""

screenwriters <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

In [13]:
# Keep the text between the first pair of double quotes of each label.
screenwriters.2 <- mclapply(screenwriters, function(x) strsplit(x,'"')[[1]][2])
# Drop a parenthesized qualifier by keeping what precedes the first " (".
screenwriters.3 <- mclapply(screenwriters.2, function(x) strsplit(x,' \\(')[[1]][1])
# De-duplicate, replace literal dots with spaces, and lower-case the names.
screenwriter.names <- unique(screenwriters.3)
screenwriter.names <- mclapply(screenwriter.names, gsub, pattern="\\.", replacement=" ")
screenwriter.names <- mclapply(screenwriter.names, tolower)
# Number of distinct names kept.
length(screenwriter.names)

## FilmMaker from DBpedia

In [14]:
# Namespace (prefix, URI) pair handed to SPARQL() through its `ns` argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# Prefix header prepended to the query text.
# NOTE(review): the query below uses the rdfs: prefix, which is not declared
# here; presumably the endpoint predefines it - confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"

# Labels of resources typed as yago FilmMaker. Only the first 10000 results
# are requested; there is no paging over further offsets.
qT <- paste(sparql_prefixT,"
SELECT DISTINCT ?label where {
  ?actor a <http://dbpedia.org/class/yago/FilmMaker110088390> .
  ?actor rdfs:label ?label .
} 
LIMIT 10000
OFFSET 0
")

In [15]:
# Run the query against the public DBpedia endpoint (requires network access)
# and keep only the `results` element of what SPARQL() returns.
endpointT <- "http://dbpedia.org/sparql"
optionsT=""

filmmakers <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

In [16]:
# Keep the text between the first pair of double quotes of each label.
filmmakers.2 <- mclapply(filmmakers, function(x) strsplit(x,'"')[[1]][2])
# Drop a parenthesized qualifier by keeping what precedes the first " (".
filmmakers.3 <- mclapply(filmmakers.2, function(x) strsplit(x,' \\(')[[1]][1])
# De-duplicate, replace literal dots with spaces, and lower-case the names.
filmmaker.names <- unique(filmmakers.3)
filmmaker.names <- mclapply(filmmaker.names, gsub, pattern="\\.", replacement=" ")
filmmaker.names <- mclapply(filmmaker.names, tolower)
# Number of distinct names kept.
length(filmmaker.names)

## Scriptwriter from DBpedia

In [17]:
# Namespace (prefix, URI) pair handed to SPARQL() through its `ns` argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# Prefix header prepended to the query text.
# NOTE(review): the query below uses the rdfs: prefix, which is not declared
# here; presumably the endpoint predefines it - confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"

# Labels of resources typed as yago Scriptwriter. Only the first 10000 results
# are requested; there is no paging over further offsets.
qT <- paste(sparql_prefixT,"
SELECT DISTINCT ?label where {
  ?actor a <http://dbpedia.org/class/yago/Scriptwriter110564905> .
  ?actor rdfs:label ?label .
} 
LIMIT 10000
OFFSET 0
")

In [18]:
# Run the query against the public DBpedia endpoint (requires network access)
# and keep only the `results` element of what SPARQL() returns.
endpointT <- "http://dbpedia.org/sparql"
optionsT=""

scriptwriters <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

In [19]:
# Keep the text between the first pair of double quotes of each label.
scriptwriters.2 <- mclapply(scriptwriters, function(x) strsplit(x,'"')[[1]][2])
# Drop a parenthesized qualifier by keeping what precedes the first " (".
scriptwriters.3 <- mclapply(scriptwriters.2, function(x) strsplit(x,' \\(')[[1]][1])
# De-duplicate, replace literal dots with spaces, and lower-case the names.
scriptwriter.names <- unique(scriptwriters.3)
scriptwriter.names <- mclapply(scriptwriter.names, gsub, pattern="\\.", replacement=" ")
scriptwriter.names <- mclapply(scriptwriter.names, tolower)
# Number of distinct names kept.
length(scriptwriter.names)

## American Actors from DBpedia

In [20]:
# Namespace (prefix, URI) pair handed to SPARQL() through its `ns` argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# Prefix header prepended to the query text.
# NOTE(review): the query below uses the rdfs: prefix, which is not declared
# here; presumably the endpoint predefines it - confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"

# Labels of resources typed as yago WikicatAmericanActors. Only the first
# 10000 results are requested; there is no paging over further offsets.
qT <- paste(sparql_prefixT,"
SELECT DISTINCT ?label where {
  ?actor a <http://dbpedia.org/class/yago/WikicatAmericanActors> .
  ?actor rdfs:label ?label .
} 
LIMIT 10000
OFFSET 0
")

In [21]:
# Run the query against the public DBpedia endpoint (requires network access)
# and keep only the `results` element of what SPARQL() returns.
endpointT <- "http://dbpedia.org/sparql"
optionsT=""

americanactors <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

In [22]:
# Keep the text between the first pair of double quotes of each label.
americanactors.2 <- mclapply(americanactors, function(x) strsplit(x,'"')[[1]][2])
# Drop a parenthesized qualifier by keeping what precedes the first " (".
americanactors.3 <- mclapply(americanactors.2, function(x) strsplit(x,' \\(')[[1]][1])
# De-duplicate, replace literal dots with spaces, and lower-case the names.
americanactor.names <- unique(americanactors.3)
americanactor.names <- mclapply(americanactor.names, gsub, pattern="\\.", replacement=" ")
americanactor.names <- mclapply(americanactor.names, tolower)
# Number of distinct names kept.
length(americanactor.names)

## American Actresses

In [23]:
# Namespace (prefix, URI) pair handed to SPARQL() through its `ns` argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# Prefix header prepended to the query text.
# NOTE(review): the query below uses the rdfs: prefix, which is not declared
# here; presumably the endpoint predefines it - confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"

# Labels of resources typed as yago WikicatAmericanActresses. Only the first
# 10000 results are requested; there is no paging over further offsets.
qT <- paste(sparql_prefixT,"
SELECT DISTINCT ?label where {
  ?actor a <http://dbpedia.org/class/yago/WikicatAmericanActresses> .
  ?actor rdfs:label ?label .
} 
LIMIT 10000
OFFSET 0
")

In [24]:
# Run the query against the public DBpedia endpoint (requires network access)
# and keep only the `results` element of what SPARQL() returns.
endpointT <- "http://dbpedia.org/sparql"
optionsT=""

americanactresses <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

In [25]:
# Keep the text between the first pair of double quotes of each label.
americanactresses.2 <- mclapply(americanactresses, function(x) strsplit(x,'"')[[1]][2])
# Drop a parenthesized qualifier by keeping what precedes the first " (".
americanactresses.3 <- mclapply(americanactresses.2, function(x) strsplit(x,' \\(')[[1]][1])
# De-duplicate, replace literal dots with spaces, and lower-case the names.
americanactress.names <- unique(americanactresses.3)
americanactress.names <- mclapply(americanactress.names, gsub, pattern="\\.", replacement=" ")
americanactress.names <- mclapply(americanactress.names, tolower)
# Number of distinct names kept.
length(americanactress.names)

## 21st Century Actors 

In [26]:
# Namespace (prefix, URI) pair handed to SPARQL() through its `ns` argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# Prefix header prepended to the query text.
# NOTE(review): the query below uses the rdfs: prefix, which is not declared
# here; presumably the endpoint predefines it - confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"

# NOTE(review): this section is titled "21st Century Actors", but the class
# queried here is WikicatAmericanActresses, the same one as the American
# Actresses query. It looks like a copy-paste leftover, so the result adds no
# new names - replace the class URI with the intended one.
# Only the first 10000 results are requested; there is no paging.
qT <- paste(sparql_prefixT,"
SELECT DISTINCT ?label where {
  ?actor a <http://dbpedia.org/class/yago/WikicatAmericanActresses> .
  ?actor rdfs:label ?label .
} 
LIMIT 10000
OFFSET 0
")

In [27]:
# Run the query against the public DBpedia endpoint (requires network access)
# and keep only the `results` element of what SPARQL() returns.
endpointT <- "http://dbpedia.org/sparql"
optionsT=""

firstactors <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

In [28]:
# Keep the text between the first pair of double quotes of each label.
firstactors.2 <- mclapply(firstactors, function(x) strsplit(x,'"')[[1]][2])
# Drop a parenthesized qualifier by keeping what precedes the first " (".
firstactors.3 <- mclapply(firstactors.2, function(x) strsplit(x,' \\(')[[1]][1])
# De-duplicate, replace literal dots with spaces, and lower-case the names.
firstactor.names <- unique(firstactors.3)
firstactor.names <- mclapply(firstactor.names, gsub, pattern="\\.", replacement=" ")
firstactor.names <- mclapply(firstactor.names, tolower)
# Number of distinct names kept.
length(firstactor.names)

## Fictional Characters

In [29]:
####### FICTIONAL CHARACTER

# Namespace (prefix, URI) pair handed to SPARQL() through its `ns` argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# Prefix header prepended to the query text.
# NOTE(review): the query below uses the rdfs: prefix, which is not declared
# here; presumably the endpoint predefines it - confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"

# Labels of resources typed as yago FictionalCharacter. Only the first 10000
# results are requested; there is no paging over further offsets.
qT <- paste(sparql_prefixT,"
            SELECT DISTINCT ?label where {
            ?actor a <http://dbpedia.org/class/yago/FictionalCharacter109587565> .
            ?actor rdfs:label ?label .
            } 
            LIMIT 10000
            OFFSET 0
            ")

# Run the query against the public DBpedia endpoint (requires network access)
# and keep only the `results` element of what SPARQL() returns.
endpointT <- "http://dbpedia.org/sparql"
optionsT=""

fictionalcharacters <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

## CLEAN

# Keep the text between the first pair of double quotes of each label.
fictionalcharacters.2 <- mclapply(fictionalcharacters, function(x) strsplit(x,'"')[[1]][2])
# Drop a parenthesized qualifier by keeping what precedes the first " (".
fictionalcharacters.3 <- mclapply(fictionalcharacters.2, function(x) strsplit(x,' \\(')[[1]][1])
# De-duplicate, replace literal dots with spaces, and lower-case the names.
fictionalcharacter.names <- unique(fictionalcharacters.3)
fictionalcharacter.names <- mclapply(fictionalcharacter.names, gsub, pattern="\\.", replacement=" ")
fictionalcharacter.names <- mclapply(fictionalcharacter.names, tolower)
# Number of distinct names kept.
length(fictionalcharacter.names)

## Musicians

In [30]:
# Namespace (prefix, URI) pair handed to SPARQL() through its `ns` argument.
prefixT <- c("skos","http://www.w3.org/2004/02/skos/core#")

# Prefix header prepended to the query text.
# NOTE(review): the query below uses the rdfs: prefix, which is not declared
# here; presumably the endpoint predefines it - confirm.
sparql_prefixT <- "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
"

# Labels of resources typed as yago Musician. Only the first 10000 results are
# requested; there is no paging over further offsets.
qT <- paste(sparql_prefixT,"
            SELECT DISTINCT ?label where {
            ?actor a <http://dbpedia.org/class/yago/Musician110339966> .
            ?actor rdfs:label ?label .
            } 
            LIMIT 10000
            OFFSET 0
            ")

# Run the query against the public DBpedia endpoint (requires network access)
# and keep only the `results` element of what SPARQL() returns.
endpointT <- "http://dbpedia.org/sparql"
optionsT=""

musicians <- SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

## CLEAN

# Keep the text between the first pair of double quotes of each label.
musicians.2 <- mclapply(musicians, function(x) strsplit(x,'"')[[1]][2])
# Drop a parenthesized qualifier by keeping what precedes the first " (".
musicians.3 <- mclapply(musicians.2, function(x) strsplit(x,' \\(')[[1]][1])
# De-duplicate, replace literal dots with spaces, and lower-case the names.
musician.names <- unique(musicians.3)
musician.names <- mclapply(musician.names, gsub, pattern="\\.", replacement=" ")
musician.names <- mclapply(musician.names, tolower)
# Number of distinct names kept.
length(musician.names)

## Detect patterns

In [31]:
###### CREATE PATTERNS

# Each name is wrapped in single spaces so that it only matches as a whole,
# space-delimited token sequence in the review text.
# NOTE(review): the names are later used as regular expressions without
# escaping; any remaining metacharacter in a name is interpreted as regex.
pattern.an <- mclapply(actor.names, function(x) return(paste(" ",x," ",sep = "")))
# There is some actor named "you" that is spoiling our results; we remove it
pattern.an = pattern.an[grep("^ you $", pattern.an, invert = TRUE)]

pattern.sc <- mclapply(screenwriter.names, function(x) return(paste(" ",x," ",sep = ""))) 

pattern.fm <- mclapply(filmmaker.names, function(x) return(paste(" ",x," ",sep = ""))) 

pattern.sw <- mclapply(scriptwriter.names, function(x) return(paste(" ",x," ",sep = ""))) 

pattern.aa <- mclapply(americanactor.names, function(x) return(paste(" ",x," ",sep = ""))) 

pattern.aas <- mclapply(americanactress.names, function(x) return(paste(" ",x," ",sep = ""))) 

pattern.fa <- mclapply(firstactor.names, function(x) return(paste(" ",x," ",sep = ""))) 

pattern.fc <- mclapply(fictionalcharacter.names, function(x) return(paste(" ",x," ",sep = "")))  

pattern.m <- mclapply(musician.names, function(x) return(paste(" ",x," ",sep = ""))) 

In [32]:
###### JOIN PATTERNS

# Concatenate the per-category pattern lists into one gazetteer list, in the
# order the categories were queried. Duplicates across categories are still
# present at this point.
pattern <- append(pattern.an,pattern.sc)
pattern <- append(pattern, pattern.fm)
pattern <- append(pattern, pattern.sw)
pattern <- append(pattern, pattern.aa)
pattern <- append(pattern, pattern.aas)
pattern <- append(pattern, pattern.fa)
pattern <- append(pattern, pattern.fc)
pattern <- append(pattern, pattern.m)

In [48]:
# Remove the entry " you " from the merged list as well: the earlier removal
# only covered the actor patterns, and other categories may contribute it too.
pattern = pattern[grep("^ you $", pattern, invert = TRUE)]

In [49]:
####### CLEAN COPIES

# Drop names that appear in more than one category.
pattern <- unique(pattern)

In [50]:
###### UNLIST PATTERN

# Flatten the list of patterns into a plain vector.
pattern=unlist(pattern)

## Write gazetteer to a file


In [52]:
# Write the gazetteer, one quoted pattern per line, without row or column
# names. FALSE is spelled out because F is an ordinary, reassignable variable.
write.table(pattern, file = "gazetteer.txt", row.names = FALSE, col.names = FALSE, na="", sep=";")

In [53]:
######## DETECT PATTERNS

# Search every document for every gazetteer entry: one row per file, one
# column per pattern.
# NOTE(review): each pattern is applied to each document in turn, so the
# runtime grows with (number of documents) x (number of patterns).
matches = detectPatternsInCorpus(corpus, pattern)

## Write the results to a file.

In [54]:
# Write the file-by-pattern match table as a ";"-separated file; NA cells are
# written as empty fields. FALSE is spelled out because F is an ordinary,
# reassignable variable.
write.table(matches, file = "allEntitiesGazetteer.csv", row.names = FALSE, na="", sep=";")

## Evaluate using gold standard

In [39]:
# Let’s see which patterns we have found: count the files matched by each
# pattern and show only the patterns with at least one match.
countColum = countMatchesPerColumn(matches) 
countColum[countColum$Count != 0,]

Unnamed: 0,Entity,Count
3,al pacino,6
8,alan rickman,4
13,albert finney,1
18,alex cox,1
25,andie macdowell,3
34,antonio banderas,5
43,ashley judd,6
53,ava gardner,1
69,blake edwards,1
75,brad pitt,11


In [55]:
# Collapse the match table into one list of matches per file and preview the
# first ten files.
allMatches = mergeAllMatchesInLists(matches)
head(allMatches,10)

Files,Matches
cv000_29590.txt,"tim burton , heather graham , spawn , jack , batman , casper , set , ghost"
cv001_18431.txt,"bill murray , jock"
cv002_15918.txt,meg ryan
cv003_11664.txt,"paul newman , roy scheider , mayor , nemesis , chief , presence , late"
cv004_11636.txt,"bruce lee , spawn , general , set , scream , era"
cv005_29443.txt,"eriq ebouaney , prophet , raoul , death , justice"
cv006_15448.txt,"jennifer lien , avery brooks , problem"
cv007_4968.txt,"woody allen , alec baldwin , madeline kahn , clown , presence , riot , lord"
cv008_29435.txt,"general , dog , late"
cv009_29592.txt,"gem , chief , death , tiger , drama"


In [44]:
# Load the gold standard: ";"-separated, no quoting, every column read as
# character and empty fields treated as NA.
goldStandard = read.table(file = "goldStandard.csv", quote = "", na.strings=c(""), colClasses="character", sep=";")

# Collapse it into one list of expected entities per file and preview the
# first ten files.
allMatchesGold = mergeGoldStandardInLists(goldStandard)
head(allMatchesGold,10)

Files,Matches
cv000_29590.txt,"alan moore , eddie campbell , moore , campbell , jack , michael jackson , albert , allen hughes , peter godley , robbie coltrane , frederick abberline, johnny depp , abberline , mary kelly , heather graham , terry hayes , rafael yglesias , steve guttenberg , tim burton , marilyn manson , peter deming , martin childs , depp , ians holm , joe gould , richardson , graham"
cv001_18431.txt,"matthew broderick , reese witherspoon , george washington carver, tracy flick , paul , max fischer , bill murray , broderick , witherspoon , jessica campbell , tammy , rooney , campbell , alexander payne , tracy , m"
cv002_15918.txt,"ryan , hanks , tom hanks , joe fox , meg ryan , kathleen kelley, fox , kelley"
cv003_11664.txt,"john williams , steven spielberg, spielberg , williams , martin brody , roy scheider , larry vaughn , murray hamilton , brody , matt hooper , richard dreyfuss, hooper , vaughn , quint , robert shaw , hitchcock , scheider , dreyfuss , shaw , robert redford , paul newman , duddy kravitz , ahab"
cv004_11636.txt,"herb , jackie chan , barry sanders , sanders , jackie , chan , bruce lee , tim allen , lawrence kazdan, john williams , spielberg , george lucas"
cv005_29443.txt,"raoul peck , lumumba , patrice lumumba , eriq ebouaney , helmer peck , peck , pascal bonitzer , patrice , joseph kasa vubu, maka kotto , moise tschombe , pascal nzonzi"
cv006_15448.txt,"tony kaye , edward norton , norton , derek vinyard , danny , edward furlong , beverly dangelo, davin , jennifer lien , derek , kaye , avery brooks , furlong , dangelo , lien"
cv007_4968.txt,"betsy , molly ringwald , alan alda , ringwald , alda , dylan walsh , walsh , madeline kahn , ally sheedy , sheedy , anthony lapaglia, lapaglia , stevie dee , robert de niro , alec baldwin , de niro , joe pesci , catherine ohara , woody allen"
cv008_29435.txt,"lumumba , janssens , rudi delhem , moise tshombe , pascal nzonzi , mobutu , joseph kasa vubu, maka kotto , peck , bonitzer , ebouaney"
cv009_29592.txt,"schwartznager, stallone , van damme , rongguang yu , wong fei-hong, jackie chan , fei-hong , sze-man tsang, wong kei-ying, yen chi dan , yuen wo ping , fox"


In [45]:
# Score the gazetteer matches against the gold standard. The two data frames
# are compared row by row, so they must list the files in the same order.
metrics = calculateMetrics(allMatches, allMatchesGold)
metrics

Precision,Recall,Fmeasure
0.2494759,0.09803041,0.1407527


### Using a gazetteer for named entity recognition is a kind of brute-force search whose effectiveness is improved by using DBpedia dictionaries of specific keywords.

### Benefits:

* Search tools like DBpedia allow a more specific search
* Easier to automate
* A higher recall than using lexical-syntactic patterns

### Disadvantages

* Slower search because it demands a lot of resources
* A lower precision
* Not always easy to find the best DBpedia resource