Implementing Named Entity Recognition Using NLTK CHP07

In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:
# Paragraph is first broken into sentences using "sent_tokenize", then sentence is broken into words using "word_tokenize" and finally POS tags of each word is found
def ie_preprocess(document):
  sentences = nltk.sent_tokenize(document) 
  sentences = [nltk.word_tokenize(sent) for sent in sentences]
  sentences = [nltk.pos_tag(sent) for sent in sentences]
  return(sentences)


In [3]:
sentence = ie_preprocess('the little dog barked at the cat')
print(sentence)

[[('the', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('cat', 'NN')]]


In [4]:
# Defining grammar to chunk the sentence
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar) 
result = cp.parse(sentence[0])
print('chunked sentence', result)
print('data type of result', type(result))

chunked sentence (S (NP the/DT little/JJ dog/NN) barked/VBD at/IN (NP the/DT cat/NN))
data type of result <class 'nltk.tree.Tree'>


In [0]:
# N-grams - are continous sequence of n-items in a sentence. Depending on the input n, the following function returns n-grams of the sentence.

# Implementing N-grams using Regex
def generate_ngrams(s, n):
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    print([tokens[:i] for i in range(n)])
#     ngrams = zip(*[tokens[:i] for i in range(n)])
#     return([" ".join(ngram) for ngram in ngrams])

In [21]:
s = "Existing Natural Language Techniques (NLP) focus mostly on transcribing what humans say, rather than understanding what’s being said. Even with the release of advanced chatbot technologies like Google Duplex and Microsoft’s Xiaoice, this is a challenge that has eluded researchers so far."
generate_ngrams(s, 3)

[[], ['existing'], ['existing', 'natural']]


In [8]:
# Implementing N-grams using NLTK
import re
from nltk.util import ngrams

s = s.lower()
s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
tokens = [token for token in s.split(" ") if token != ""]
output = list(ngrams(tokens, 2))
print(output)
print(len(output),len(s))

[('existing', 'natural'), ('natural', 'language'), ('language', 'techniques'), ('techniques', 'nlp'), ('nlp', 'focus'), ('focus', 'mostly'), ('mostly', 'on'), ('on', 'transcribing'), ('transcribing', 'what'), ('what', 'humans'), ('humans', 'say'), ('say', 'rather'), ('rather', 'than'), ('than', 'understanding'), ('understanding', 'what'), ('what', 's'), ('s', 'being'), ('being', 'said'), ('said', 'even'), ('even', 'with'), ('with', 'the'), ('the', 'release'), ('release', 'of'), ('of', 'advanced'), ('advanced', 'chatbot'), ('chatbot', 'technologies'), ('technologies', 'like'), ('like', 'google'), ('google', 'duplex'), ('duplex', 'and'), ('and', 'microsoft'), ('microsoft', 's'), ('s', 'xiaoice'), ('xiaoice', 'this'), ('this', 'is'), ('is', 'a'), ('a', 'challenge'), ('challenge', 'that'), ('that', 'has'), ('has', 'eluded'), ('eluded', 'researchers'), ('researchers', 'so'), ('so', 'far')]
43 288


In [9]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('conll2000')
from nltk.corpus import conll2000
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
      train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]                                   
                    for sent in train_sents]
      self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
      pos_tags = [pos for (word,pos) in sentence]
      tagged_pos_tags = self.tagger.tag(pos_tags)
      chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
      conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                   in zip(sentence, chunktags)]
      return nltk.chunk.conlltags2tree(conlltags)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.


In [15]:
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%
