# Ngram POS Tagging and evaluation
## Import library and corpus

In [1]:
import nltk                                                                 #import library

In [2]:
from nltk.corpus import brown                                               #import corpus

## 1. Editorial

In [3]:
editorial_tagged_sents = brown.tagged_sents(categories='editorial')         #set editorial tagged sentences

In [4]:
editorial_sents = brown.sents(categories='editorial')                       #set editorial sentences

In [5]:
editorial_unigram_tagger = nltk.UnigramTagger(editorial_tagged_sents)       #set tagger

In [6]:
editorial_unigram_tagger.tag(editorial_sents[2006])                         #tag sentence #2006

[('Southfield', 'NP'),
 ('schools', 'NNS'),
 ('furnish', 'VB'),
 ('an', 'AT'),
 ('old', 'JJ'),
 ('45-passenger', 'JJ'),
 ('bus', 'NN'),
 ('(', '('),
 ('the', 'AT'),
 ('heater', 'NN'),
 ('in', 'IN'),
 ('which', 'WDT'),
 ('needs', 'VBZ'),
 ('repair', 'VB'),
 ('since', 'IN'),
 ('some', 'DTI'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('children', 'NNS'),
 ('ride', 'VB'),
 ('a', 'AT'),
 ('long', 'JJ'),
 ('distance', 'NN'),
 ('and', 'CC'),
 ('need', 'VB'),
 ('the', 'AT'),
 ('heat', 'NN'),
 (')', ')'),
 ('.', '.')]

In [7]:
# The tagger was trained on editorial_tagged_sents, so scoring it on the same
# sentences measures fit to the training data, not held-out accuracy.
editorial_unigram_tagger.evaluate(editorial_tagged_sents)                    #evaluate on the training sentences themselves

0.9314654892539446

In [8]:
size = int(len(editorial_tagged_sents) * 0.8)                                #set the data size = 80%

In [9]:
size                                                                         # #of data points

2397

In [10]:
editorial_train_sents = editorial_tagged_sents[:size]                        #divide the dataset into training set

In [11]:
editorial_test_sents = editorial_tagged_sents[size:]                         #divide the dataset into test set

In [12]:
editorial_unigram_tagger = nltk.UnigramTagger(editorial_train_sents)         #set unigram tagger

In [13]:
editorial_unigram_tagger.evaluate(editorial_test_sents)                      #evaluate unigram performance

0.7849585141204233

In [14]:
editorial_bigram_tagger = nltk.BigramTagger(editorial_train_sents)           #set bigram tagger

In [15]:
editorial_bigram_tagger.tag(editorial_sents[2006])                           #tag sentence #2006

[('Southfield', 'NP'),
 ('schools', 'NNS'),
 ('furnish', 'VB'),
 ('an', 'AT'),
 ('old', 'JJ'),
 ('45-passenger', 'JJ'),
 ('bus', 'NN'),
 ('(', '('),
 ('the', 'AT'),
 ('heater', 'NN'),
 ('in', 'IN'),
 ('which', 'WDT'),
 ('needs', 'VBZ'),
 ('repair', 'NN'),
 ('since', 'IN'),
 ('some', 'DTI'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('children', 'NNS'),
 ('ride', 'VB'),
 ('a', 'AT'),
 ('long', 'JJ'),
 ('distance', 'NN'),
 ('and', 'CC'),
 ('need', 'VB'),
 ('the', 'AT'),
 ('heat', 'NN'),
 (')', ')'),
 ('.', '.')]

In [16]:
# NOTE(review): index 1222 is below the training cut-off (size == 2397), so,
# assuming brown.sents and brown.tagged_sents are aligned, this sentence is part
# of editorial_train_sents and is not actually unseen. An index >= size would be.
editorial_unseen_sent = editorial_sents[1222]                                 #pick sentence #1222 as the "unseen" example

In [17]:
editorial_bigram_tagger.tag(editorial_unseen_sent)                            #tag sentence unseen sent

[('No', 'AT'),
 ('warden', 'NN'),
 ('or', 'CC'),
 ('guard', 'VB'),
 ('to', 'TO'),
 ('touch', 'VB'),
 ('lock', 'NN'),
 (',', ','),
 ('key', 'NN'),
 ('or', 'CC'),
 ('doorknob', 'NN'),
 ('except', 'IN'),
 ('when', 'WRB'),
 ('accompanied', 'VBN'),
 ('by', 'IN'),
 ('a', 'AT'),
 ("prisoners'", 'NNS$'),
 ('committee', 'NN'),
 ('with', 'IN'),
 ('powers', 'NNS'),
 ('of', 'IN'),
 ('veto', 'NN'),
 ('.', '.')]

In [18]:
editorial_bigram_tagger.evaluate(editorial_test_sents)                       #evaluate bigram tagger

0.1020019791428789

In [19]:
t0 = nltk.DefaultTagger('NN')                                                #set default tagger to tag tokens as Noun

In [21]:
t1 = nltk.UnigramTagger(editorial_train_sents, backoff=t0)                   #set unigram tagger and put first backoff

In [23]:
t2 = nltk.BigramTagger(editorial_train_sents, backoff=t1)                    #set bigram tagger and put second backoff

In [24]:
t2.evaluate(editorial_test_sents)                                            #evaluate the N-Gram tagger

0.8216487782598767

### Confusion Matrix

In [25]:
def tag_list(editorial_tagged_sents):
    """Flatten tagged sentences into one flat list of tags.

    Args:
        editorial_tagged_sents: iterable of sentences, each an iterable of
            (word, tag) pairs. Nothing here is specific to one category.

    Returns:
        list: the tag of every token, sentence by sentence, in input order.
    """
    return [tag for sent in editorial_tagged_sents for (_word, tag) in sent]

In [26]:
def apply_tagger(tagger, corpus):
    """Re-tag every sentence of a tagged corpus with ``tagger``.

    Args:
        tagger: object exposing ``tag(words)``.
        corpus: iterable of tagged sentences, i.e. sequences of (word, tag) pairs.

    Returns:
        list: one entry per sentence, holding whatever ``tagger.tag`` returns
        for that sentence's words after ``nltk.tag.untag`` removed the tags.
    """
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [27]:
gold = tag_list(brown.tagged_sents(categories='editorial'))                  #gold corpus

In [28]:
# NOTE(review): t2 was trained on editorial_train_sents, but here it re-tags the
# whole 'editorial' category (training and test sentences alike), so the
# resulting confusion matrix is not a held-out evaluation.
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))#tags predicted by t2 for the same sentences as gold

In [29]:
cm = nltk.ConfusionMatrix(gold, test)                                        #generate confusion matrix

In [30]:
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=15)) #print the confusion matrix

      |                                                                                                 N        |
      |                                                                                                 N        |
      |                                         N                                         V             -        |
      |      N      I      A      J             N             V      N      C      R      B      C      T      T |
      |      N      N      T      J      .      S      ,      B      P      C      B      N      S      L      O |
------+----------------------------------------------------------------------------------------------------------+
   NN | <12.3%>     .      .   0.1%      .      .      .   0.1%   0.0%      .   0.0%   0.0%      .   0.0%      . |
   IN |   0.0%  <9.1%>     .      .      .      .      .      .      .   0.0%   0.0%      .   0.0%      .   0.8% |
   AT |      .      .  <8.6%>     .      .      .      .      .      .      .   

## 2. Reviews

In [31]:
reviews_tagged_sents = brown.tagged_sents(categories='reviews')               #set reviews tagged sentences

In [32]:
reviews_sents = brown.sents(categories='reviews')                             #set reviews sentences

In [33]:
reviews_unigram_tagger = nltk.UnigramTagger(reviews_tagged_sents)             #set tagger

In [35]:
reviews_unigram_tagger.tag(reviews_sents[1006])                               #tag sentence #1006

[('And', 'CC'),
 ('Django', 'NP'),
 ('owed', 'VBD'),
 ('much', 'AP'),
 ('to', 'TO'),
 ('Louis', 'NP'),
 ('Armstrong', 'NP'),
 ('.', '.')]

In [36]:
# The tagger was trained on reviews_tagged_sents, so scoring it on the same
# sentences measures fit to the training data, not held-out accuracy.
reviews_unigram_tagger.evaluate(reviews_tagged_sents)                         #evaluate on the training sentences themselves

0.9425363600628931

In [37]:
size = int(len(reviews_tagged_sents) * 0.8)                                   #set the data size = 80%

In [38]:
size                                                                          # #of data points

1400

In [39]:
reviews_train_sents = reviews_tagged_sents[:size]                             #divide the dataset into training set

In [40]:
reviews_test_sents = reviews_tagged_sents[size:]                              #divide the dataset into test set

In [41]:
reviews_unigram_tagger = nltk.UnigramTagger(reviews_train_sents)              #set unigram tagger

In [42]:
reviews_unigram_tagger.evaluate(reviews_test_sents)                           #evaluate unigram performance

0.7022086824067022

In [43]:
reviews_bigram_tagger = nltk.BigramTagger(reviews_train_sents)                #set bigram tagger

In [44]:
reviews_bigram_tagger.tag(reviews_sents[1006])                                #tag sentence #1006

[('And', 'CC'),
 ('Django', 'NP'),
 ('owed', 'VBD'),
 ('much', 'AP'),
 ('to', 'TO'),
 ('Louis', None),
 ('Armstrong', None),
 ('.', None)]

In [45]:
# NOTE(review): index 1203 is below the training cut-off (size == 1400), so,
# assuming brown.sents and brown.tagged_sents are aligned, this sentence is part
# of reviews_train_sents and is not actually unseen. An index >= size would be.
reviews_unseen_sent = reviews_sents[1203]                                     #pick sentence #1203 as the "unseen" example

In [46]:
reviews_bigram_tagger.tag(reviews_unseen_sent)                                #tag sentence unseen sent

[('The', 'AT'),
 ('fact', 'NN'),
 ('that', 'WPS'),
 ('Sloan', None),
 ('was', None),
 ('an', None),
 ('extrovert', None),
 (',', None),
 ('concerned', None),
 ('primarily', None),
 ('with', None),
 ('what', None),
 ('he', None),
 ('saw', None),
 (',', None),
 ('adds', None),
 ('greatly', None),
 ('to', None),
 ('the', None),
 ('value', None),
 ('of', None),
 ('his', None),
 ('art', None),
 ('as', None),
 ('a', None),
 ('human', None),
 ('chronicle', None),
 ('.', None)]

In [47]:
reviews_bigram_tagger.evaluate(reviews_test_sents)                            #evaluate bigram tagger

0.05991368367605991

In [48]:
t0 = nltk.DefaultTagger('NN')                                                 #set default tagger to tag tokens as Noun

In [49]:
t1 = nltk.UnigramTagger(reviews_train_sents, backoff=t0)                      #set unigram tagger and put first backoff

In [50]:
t2 = nltk.BigramTagger(reviews_train_sents, backoff=t1)                       #set bigram tagger and put second backoff

In [51]:
t2.evaluate(reviews_test_sents)                                               #evaluate the N-Gram tagger

0.7627570449352628

### Confusion Matrix

In [52]:
def tag_list(reviews_tagged_sents):
    """Flatten tagged sentences into one flat list of tags.

    Args:
        reviews_tagged_sents: iterable of sentences, each an iterable of
            (word, tag) pairs. Nothing here is specific to one category.

    Returns:
        list: the tag of every token, sentence by sentence, in input order.
    """
    return [tag for sent in reviews_tagged_sents for (_word, tag) in sent]

In [53]:
def apply_tagger(tagger, corpus):
    """Re-tag every sentence of a tagged corpus with ``tagger``.

    Args:
        tagger: object exposing ``tag(words)``.
        corpus: iterable of tagged sentences, i.e. sequences of (word, tag) pairs.

    Returns:
        list: one entry per sentence, holding whatever ``tagger.tag`` returns
        for that sentence's words after ``nltk.tag.untag`` removed the tags.
    """
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [54]:
gold = tag_list(brown.tagged_sents(categories='reviews'))                     #gold corpus

In [55]:
# NOTE(review): t2 was trained on reviews_train_sents, but here it re-tags the
# whole 'reviews' category (training and test sentences alike), so the
# resulting confusion matrix is not a held-out evaluation.
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='reviews')))   #tags predicted by t2 for the same sentences as gold

In [56]:
cm = nltk.ConfusionMatrix(gold, test)                                         #generate confusion matrix

In [57]:
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=15))  #print the confusion matrix

      |                                                                                                 N        |
      |                                                                                                 N        |
      |                                                N                           V                    -      V |
      |      N      I      A      J             N      N             C      R      B      V      C      T      B |
      |      N      N      T      J      ,      P      S      .      C      B      N      B      S      L      Z |
------+----------------------------------------------------------------------------------------------------------+
   NN | <12.2%>     .      .   0.1%      .   0.0%   0.0%      .      .      .      .   0.0%      .   0.0%      . |
   IN |   0.0%  <9.4%>     .      .      .      .      .      .      .   0.0%      .      .   0.0%      .      . |
   AT |      .      .  <8.4%>     .      .      .      .      .      .      .   

## 3. Government

In [61]:
government_tagged_sents = brown.tagged_sents(categories='government')         #set government tagged sentences

In [62]:
government_sents = brown.sents(categories='government')                       #set government sentences

In [64]:
government_unigram_tagger = nltk.UnigramTagger(government_tagged_sents)       #set tagger

In [66]:
government_unigram_tagger.tag(government_sents[2006])                         #tag sentence #2006

[('Hunter', 'NN'),
 ('and', 'CC'),
 ('fisherman', 'NN'),
 ('visits', 'NNS'),
 ('since', 'IN'),
 ('1949', 'CD'),
 ('have', 'HV'),
 ('increased', 'VBN'),
 ('8', 'CD'),
 ('times', 'NNS'),
 ('faster', 'RBR'),
 ('than', 'CS'),
 ('the', 'AT'),
 ('nationwide', 'JJ'),
 ('sale', 'NN'),
 ('of', 'IN'),
 ('hunting', 'VBG'),
 ('and', 'CC'),
 ('fishing', 'VBG'),
 ('licenses', 'NNS'),
 ('.', '.')]

In [67]:
# The tagger was trained on government_tagged_sents, so scoring it on the same
# sentences measures fit to the training data, not held-out accuracy.
government_unigram_tagger.evaluate(government_tagged_sents)                    #evaluate on the training sentences themselves

0.9254531711282571

In [68]:
size = int(len(government_tagged_sents) * 0.8)                                 #set the data size = 80%

In [69]:
size                                                                           # #of data points

2425

In [70]:
government_train_sents = government_tagged_sents[:size]                        #divide the dataset into training set

In [71]:
government_test_sents = government_tagged_sents[size:]                         #divide the dataset into test set

In [72]:
government_unigram_tagger = nltk.UnigramTagger(government_train_sents)         #set unigram tagger

In [73]:
government_unigram_tagger.evaluate(government_test_sents)                      #evaluate unigram performance

0.7801597309794032

In [74]:
government_bigram_tagger = nltk.BigramTagger(government_train_sents)           #set bigram tagger

In [75]:
government_bigram_tagger.tag(government_sents[2006])                           #tag sentence #2006

[('Hunter', 'NN'),
 ('and', 'CC'),
 ('fisherman', 'NN'),
 ('visits', 'NNS'),
 ('since', 'IN'),
 ('1949', 'CD'),
 ('have', 'HV'),
 ('increased', 'VBN'),
 ('8', 'CD'),
 ('times', 'NNS'),
 ('faster', 'RBR'),
 ('than', 'CS'),
 ('the', 'AT'),
 ('nationwide', 'JJ'),
 ('sale', 'NN'),
 ('of', 'IN'),
 ('hunting', 'VBG'),
 ('and', 'CC'),
 ('fishing', 'VBG'),
 ('licenses', 'NNS'),
 ('.', '.')]

In [76]:
# NOTE(review): index 1203 is below the training cut-off (size == 2425), so,
# assuming brown.sents and brown.tagged_sents are aligned, this sentence is part
# of government_train_sents and is not actually unseen. An index >= size would be.
government_unseen_sent = government_sents[1203]                             #pick sentence #1203 as the "unseen" example

In [77]:
government_bigram_tagger.tag(government_unseen_sent)                        #tag sentence unseen sent

[('The', 'AT'),
 ('action', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('Commission', 'NN-TL'),
 ('in', 'IN'),
 ('allowing', 'VBG'),
 ('or', 'CC'),
 ('denying', 'VBG'),
 ('any', 'DTI'),
 ('claim', 'NN'),
 ('under', 'IN'),
 ('this', 'DT'),
 ('title', 'NN'),
 ('shall', 'MD'),
 ('be', 'BE'),
 ('final', 'JJ'),
 ('and', 'CC'),
 ('conclusive', 'JJ'),
 ('on', 'IN'),
 ('all', 'ABN'),
 ('questions', 'NNS'),
 ('of', 'IN'),
 ('law', 'NN'),
 ('and', 'CC'),
 ('fact', 'NN'),
 ('and', 'CC'),
 ('not', '*'),
 ('subject', 'JJ'),
 ('to', 'TO'),
 ('review', 'VB'),
 ('by', 'IN'),
 ('the', 'AT'),
 ('Secretary', 'NN-TL'),
 ('of', 'IN-TL'),
 ('State', 'NN-TL'),
 ('or', 'CC'),
 ('any', 'DTI'),
 ('other', 'AP'),
 ('official', 'NN'),
 (',', ','),
 ('department', 'NN'),
 (',', ','),
 ('agency', 'NN'),
 (',', ','),
 ('or', 'CC'),
 ('establishment', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('United', 'VBN-TL'),
 ('States', 'NNS-TL'),
 ('or', 'CC'),
 ('by', 'IN'),
 ('any', 'DTI'),
 ('court', 'NN'),
 ('by', 'IN'),
 ('manda

In [78]:
government_bigram_tagger.evaluate(government_test_sents)                       #evaluate bigram tagger

0.08245761524450049

In [79]:
t0 = nltk.DefaultTagger('NN')                                                  #set default tagger to tag tokens as Noun

In [80]:
t1 = nltk.UnigramTagger(government_train_sents, backoff=t0)                    #set unigram tagger and put first backoff

In [81]:
t2 = nltk.BigramTagger(government_train_sents, backoff=t1)                     #set bigram tagger and put second backoff

In [82]:
t2.evaluate(government_test_sents)                                             #evaluate the N-Gram tagger

0.8311615524730279

### Confusion Matrix

In [83]:
def tag_list(government_tagged_sents):
    """Flatten tagged sentences into one flat list of tags.

    Args:
        government_tagged_sents: iterable of sentences, each an iterable of
            (word, tag) pairs. Nothing here is specific to one category.

    Returns:
        list: the tag of every token, sentence by sentence, in input order.
    """
    return [tag for sent in government_tagged_sents for (_word, tag) in sent]

In [84]:
def apply_tagger(tagger, corpus):
    """Re-tag every sentence of a tagged corpus with ``tagger``.

    Args:
        tagger: object exposing ``tag(words)``.
        corpus: iterable of tagged sentences, i.e. sequences of (word, tag) pairs.

    Returns:
        list: one entry per sentence, holding whatever ``tagger.tag`` returns
        for that sentence's words after ``nltk.tag.untag`` removed the tags.
    """
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [85]:
gold = tag_list(brown.tagged_sents(categories='government'))                   #gold corpus

In [86]:
# NOTE(review): t2 was trained on government_train_sents, but here it re-tags the
# whole 'government' category (training and test sentences alike), so the
# resulting confusion matrix is not a held-out evaluation.
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='government'))) #tags predicted by t2 for the same sentences as gold

In [87]:
cm = nltk.ConfusionMatrix(gold, test)                                          #generate confusion matrix

In [88]:
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=15))   #print the confusion matrix

      |                                                                     N                                    |
      |                                                                     N                                    |
      |                           N                                  V      -                                    |
      |      N      I      A      N      J                    C      B      T      V      R      C      N      C |
      |      N      N      T      S      J      ,      .      C      N      L      B      B      D      P      S |
------+----------------------------------------------------------------------------------------------------------+
   NN | <13.8%>     .      .   0.0%   0.1%      .      .      .   0.0%   0.0%   0.0%   0.0%   0.0%   0.0%      . |
   IN |   0.0% <11.4%>     .      .   0.0%      .      .   0.0%      .      .      .   0.0%      .      .   0.0% |
   AT |      .      .  <8.1%>     .      .      .      .      .      .      .   

## 4. News

In [89]:
news_tagged_sents = brown.tagged_sents(categories='news')                        #set news tagged sentences
news_sents = brown.sents(categories='news')                                      #set news sentences
news_unigram_tagger = nltk.UnigramTagger(news_tagged_sents)                      #set tagger

In [90]:
news_unigram_tagger.tag(news_sents[2006])                                        #tag sentence #2006

[('Ultimately', 'RB'),
 ('the', 'AT'),
 ('development', 'NN'),
 ('will', 'MD'),
 ('comprise', 'VB'),
 ('300', 'CD'),
 ('units', 'NNS'),
 (',', ','),
 ('in', 'IN'),
 ('two-story', 'JJ'),
 ('and', 'CC'),
 ('three-story', 'JJ'),
 ('structures', 'NNS'),
 ('.', '.')]

In [91]:
# The tagger was trained on news_tagged_sents, so scoring it on the same
# sentences measures fit to the training data, not held-out accuracy.
news_unigram_tagger.evaluate(news_tagged_sents)                                 #evaluate on the training sentences themselves

0.9349006503968017

In [92]:
size = int(len(news_tagged_sents) * 0.8)                                        #set the data size = 80%

In [93]:
size                                                                            # #of data points

3698

In [94]:
news_train_sents = news_tagged_sents[:size]                                     #divide the dataset into training set
news_test_sents = news_tagged_sents[size:]                                      #divide the dataset into test set
news_unigram_tagger = nltk.UnigramTagger(news_train_sents)                      #set unigram tagger

In [95]:
news_unigram_tagger.evaluate(news_test_sents)                                   #evaluate unigram performance

0.8018690688376126

In [96]:
 news_bigram_tagger = nltk.BigramTagger(news_train_sents)                       #set bigram tagger (no backoff); NOTE(review): this line starts with a stray space, which is an IndentationError if run as a plain .py script

In [97]:
news_bigram_tagger.tag(news_sents[2006])                                        #tag sentence #2006

[('Ultimately', 'RB'),
 ('the', 'AT'),
 ('development', 'NN'),
 ('will', 'MD'),
 ('comprise', 'VB'),
 ('300', 'CD'),
 ('units', 'NNS'),
 (',', ','),
 ('in', 'IN'),
 ('two-story', 'JJ'),
 ('and', 'CC'),
 ('three-story', 'JJ'),
 ('structures', 'NNS'),
 ('.', '.')]

In [98]:
# NOTE(review): index 1203 is below the training cut-off (size == 3698), so,
# assuming brown.sents and brown.tagged_sents are aligned, this sentence is part
# of news_train_sents and is not actually unseen. An index >= size would be.
news_unseen_sent = news_sents[1203]                                             #pick sentence #1203 as the "unseen" example

In [99]:
news_bigram_tagger.tag(news_unseen_sent)                                        #tag sentence unseen sent

[('``', '``'),
 ('Buster', 'NP'),
 ('would', 'MD'),
 ('solve', 'VB'),
 ('that', 'CS'),
 ('quarterback', None),
 ('problem', None),
 ('just', None),
 ('as', None),
 ('we', None),
 ('head', None),
 ('that', None),
 ('way', None),
 ("''", None),
 ('.', None)]

In [100]:
news_bigram_tagger.evaluate(news_test_sents)                                    #evaluate bigram tagger

0.09147839491304977

In [101]:
t0 = nltk.DefaultTagger('NN')                                                   #set default tagger to tag tokens as Noun
t1 = nltk.UnigramTagger(news_train_sents, backoff=t0)                           #set unigram tagger and put first backoff
t2 = nltk.BigramTagger(news_train_sents, backoff=t1)                            #set bigram tagger and put second backoff

In [102]:
t2.evaluate(news_test_sents)                                                    #evaluate the N-Gram tagger

0.8354930391637362

### Confusion Matrix

In [103]:
def tag_list(news_tagged_sents):
    """Flatten tagged sentences into one flat list of tags.

    Args:
        news_tagged_sents: iterable of sentences, each an iterable of
            (word, tag) pairs. Nothing here is specific to one category.

    Returns:
        list: the tag of every token, sentence by sentence, in input order.
    """
    return [tag for sent in news_tagged_sents for (_word, tag) in sent]

In [104]:
def apply_tagger(tagger, corpus):
    """Re-tag every sentence of a tagged corpus with ``tagger``.

    Args:
        tagger: object exposing ``tag(words)``.
        corpus: iterable of tagged sentences, i.e. sequences of (word, tag) pairs.

    Returns:
        list: one entry per sentence, holding whatever ``tagger.tag`` returns
        for that sentence's words after ``nltk.tag.untag`` removed the tags.
    """
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [105]:
# Build the two parallel tag sequences the confusion matrix compares.
# NOTE(review): t2 was trained on news_train_sents, but both sequences cover the
# whole 'news' category (training and test sentences alike), so the matrix is
# not a held-out evaluation.
gold = tag_list(brown.tagged_sents(categories='news'))                          #reference tags from the corpus
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='news')))        #tags predicted by t2 for the same sentences
cm = nltk.ConfusionMatrix(gold, test)                                           #confusion matrix built from (gold, test)

In [106]:
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=15))    #print the confusion matrix

      |                                                                            N                             |
      |                                                                            N                             |
      |                                         N                           V      -             V               |
      |      N      I      A      N             N             J      C      B      T      V      B      R      C |
      |      N      N      T      P      ,      S      .      J      C      D      L      B      N      B      D |
------+----------------------------------------------------------------------------------------------------------+
   NN | <12.8%>     .   0.0%   0.0%      .   0.0%      .   0.1%      .      .   0.0%   0.1%      .   0.0%      . |
   IN |   0.0%  <9.6%>     .      .      .      .      .   0.0%   0.0%      .      .      .      .   0.0%      . |
   AT |      .      .  <8.8%>     .      .      .      .      .      .      .   

# End
## References:
### Bird, S., Klein, E. and Loper, E. (2009). Natural Language Processing with Python, O'Reilly Media, Inc.