# POS Tagging using rule based and machine learning
## Import libraries and corpus

In [1]:
from nltk.corpus import brown                                                   #import corpus

In [2]:
import nltk                                                                     #import library

## 1. News

In [3]:
suffix_fdist = nltk.FreqDist()                                                  #set suffix frequency distribution

In [4]:
for word in brown.words():
...     word = word.lower()
...     suffix_fdist[word[-1:]] += 1                                            #last letter in the word
...     suffix_fdist[word[-2:]] += 1                                            #second last letter in the word
...     suffix_fdist[word[-3:]] += 1                                            #third last letter in the word

In [5]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)] #100 topmost suffixes

In [6]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [7]:
def pos_features(word): #features extractor function
...     features = {}
...     for suffix in common_suffixes:
...         features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
...     return features

In [8]:
news_tagged_words = brown.tagged_words(categories='news')                      #select news corpus

In [10]:
news_featuresets = [(pos_features(n), g) for (n,g) in news_tagged_words]       #define features for words

In [12]:
news_size = int(len(news_featuresets) * 0.1)                                   #divide corpus into training and test

In [17]:
news_train_set, news_test_set = news_featuresets[news_size:], news_featuresets[:news_size] #form training and test

In [18]:
news_classifier = nltk.DecisionTreeClassifier.train(news_train_set)            #Decision tree classification

In [20]:
nltk.classify.accuracy(news_classifier, news_test_set)                         #evaluation

0.6270512182993535

In [21]:
print (news_classifier.pseudocode(depth=5))                                    #pseudocode

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: 
        if endswith(of) == False: return '.'
        if endswith(of) == True: return 'IN'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: 
        if endswith(was) == False: return 'PP$'
        if endswith(was) == True: return 'BEDZ'
      if endswith(is) == True: 
        if endswith(his) == False: return 'BEZ'
        if endswith(his) == True: return 'PP$'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [22]:
def pos_features(sentence, i):                                                   #feature detector function for sentence
    features = {"suffix(1)": sentence[i][-1:],                                   #last word in the sentence
                "suffix(2)": sentence[i][-2:],                                   #second last word in the sentence
                "suffix(3)": sentence[i][-3:]}                                   #third last word in the sentence
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [23]:
pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [24]:
news_tagged_sents = brown.tagged_sents(categories='news')                        #set tagged sentences

In [25]:
featuresets = []                                                                 #features

In [28]:
for news_tagged_sent in news_tagged_sents:
...     untagged_sent = nltk.tag.untag(news_tagged_sent)
...     for i, (word, tag) in enumerate(news_tagged_sent):
...         featuresets.append( (pos_features(untagged_sent, i), tag) )

In [29]:
news_size = int(len(featuresets) * 0.1)                                           #setting size

In [31]:
news_train_set, news_test_set = featuresets[news_size:], featuresets[:news_size]  #declaring test and training set

In [33]:
news_classifier = nltk.NaiveBayesClassifier.train(news_train_set)                 #define classifier

In [35]:
nltk.classify.accuracy(news_classifier, news_test_set)                            #evaluation

0.7891596220785678

## 2. Reviews

In [37]:
suffix_fdist = nltk.FreqDist()                                                    #set suffix frequency distribution

In [38]:
for word in brown.words():
...     word = word.lower()
...     suffix_fdist[word[-1:]] += 1                                              #last letter in the word
...     suffix_fdist[word[-2:]] += 1                                              #second last letter in the word
...     suffix_fdist[word[-3:]] += 1                                              #third last letter in the word

In [39]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)] #100 topmost suffixes

In [40]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [41]:
def pos_features(word):                                                          #features extractor function
...     features = {}
...     for suffix in common_suffixes:
...         features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
...     return features

In [42]:
reviews_tagged_words = brown.tagged_words(categories='reviews')               #select reviews corpus

In [44]:
reviews_featuresets = [(pos_features(n), g) for (n,g) in reviews_tagged_words]#define features for words

In [45]:
reviews_size = int(len(reviews_featuresets) * 0.1)                            #divide corpus into training and test

In [47]:
reviews_train_set, reviews_test_set = reviews_featuresets[reviews_size:], reviews_featuresets[:reviews_size] #form training and test

In [48]:
reviews_classifier = nltk.DecisionTreeClassifier.train(reviews_train_set)     #Decision tree classification

In [50]:
nltk.classify.accuracy(reviews_classifier, reviews_test_set)                  #evaluation

0.6071253071253071

In [52]:
print (reviews_classifier.pseudocode(depth=5))                                #pseudocode

if endswith(,) == False: 
  if endswith(the) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: 
        if endswith(of) == False: return 'NN'
        if endswith(of) == True: return 'IN'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: 
        if endswith(was) == False: return 'VBZ'
        if endswith(was) == True: return 'BEDZ'
      if endswith(is) == True: 
        if endswith(his) == False: return 'BEZ'
        if endswith(his) == True: return 'PP$'
  if endswith(the) == True: return 'AT'
if endswith(,) == True: return ','



In [53]:
def pos_features(sentence, i):                                                #feature detector function for sentence
    features = {"suffix(1)": sentence[i][-1:],                                #last word in the sentence
                "suffix(2)": sentence[i][-2:],                                #second last word in the sentence
                "suffix(3)": sentence[i][-3:]}                                #third last word in the sentence
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [88]:
pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [89]:
reviews_tagged_sents = brown.tagged_sents(categories='reviews')                #set tagged sentences

In [90]:
featuresets = []                                                               #features

In [91]:
for reviews_tagged_sent in reviews_tagged_sents:
...     untagged_sent = nltk.tag.untag(reviews_tagged_sent)
...     for i, (word, tag) in enumerate(reviews_tagged_sent):
...         featuresets.append( (pos_features(untagged_sent, i), tag) )

In [96]:
size = int(len(featuresets) * 0.1)                                             #setting size

In [97]:
reviews_train_set, reviews_test_set = reviews_featuresets[reviews_size:], featuresets[:reviews_size] #declaring test and training set

In [98]:
classifier = nltk.NaiveBayesClassifier.train(reviews_train_set)                #define classifier

In [101]:
nltk.classify.accuracy(classifier, reviews_test_set)                          #evaluation

0.12776412776412777

## 3. Editorial

In [102]:
suffix_fdist = nltk.FreqDist()                                                  #set suffix frequency distribution

In [103]:
for word in brown.words():
...     word = word.lower()
...     suffix_fdist[word[-1:]] += 1                                            #last letter in the word
...     suffix_fdist[word[-2:]] += 1                                            #second last letter in the word
...     suffix_fdist[word[-3:]] += 1                                            #third last letter in the word

In [104]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)] #100 topmost suffixes

In [105]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [106]:
def pos_features(word):                                                          #features extractor function
...     features = {}
...     for suffix in common_suffixes:
...         features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
...     return features

In [107]:
editorial_tagged_words = brown.tagged_words(categories='editorial')               #select editorial corpus

In [108]:
editorial_featuresets = [(pos_features(n), g) for (n,g) in editorial_tagged_words]#define features for words

In [109]:
editorial_size = int(len(editorial_featuresets) * 0.1)                            #divide corpus into training and test

In [110]:
editorial_train_set, editorial_test_set = editorial_featuresets[editorial_size:], editorial_featuresets[:editorial_size] #form training and test

In [111]:
editorial_classifier = nltk.DecisionTreeClassifier.train(editorial_train_set)     #Decision tree classification

In [112]:
nltk.classify.accuracy(classifier, editorial_test_set)                            #evaluation

0.5581168831168831

In [113]:
print (editorial_classifier.pseudocode(depth=5))                                  #pseudocode

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: 
        if endswith(of) == False: return 'PPO'
        if endswith(of) == True: return 'IN'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: 
        if endswith(as) == False: return 'VBZ'
        if endswith(as) == True: return 'BEDZ'
      if endswith(is) == True: 
        if endswith(his) == False: return 'BEZ'
        if endswith(his) == True: return 'DT'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [114]:
def pos_features(sentence, i):                                                     #feature detector function for sentence
    features = {"suffix(1)": sentence[i][-1:],                                     #last word in the sentence
                "suffix(2)": sentence[i][-2:],                                     #second last word in the sentence
                "suffix(3)": sentence[i][-3:]}                                     #third last word in the sentence
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [115]:
pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [116]:
editorial_tagged_sents = brown.tagged_sents(categories='editorial')                 #set tagged sentences

In [117]:
featuresets = []                                                                    #features

In [118]:
for tagged_sent in tagged_sents:
...     untagged_sent = nltk.tag.untag(tagged_sent)
...     for i, (word, tag) in enumerate(tagged_sent):
...         featuresets.append( (pos_features(untagged_sent, i), tag) )

In [119]:
size = int(len(featuresets) * 0.1)                                                   #setting size

In [121]:
editorial_train_set, editorial_test_set = featuresets[editorial_size:], featuresets[:editorial_size] #declaring test and training set

In [122]:
classifier = nltk.NaiveBayesClassifier.train(editorial_train_set)                    #define classifier

In [124]:
nltk.classify.accuracy(classifier, editorial_test_set)                               #evaluation

0.7803571428571429

## 4. Government

In [125]:
suffix_fdist = nltk.FreqDist() #set suffix frequency distribution

In [126]:
for word in brown.words():
...     word = word.lower()
...     suffix_fdist[word[-1:]] += 1                                                   #last letter in the word
...     suffix_fdist[word[-2:]] += 1                                                   #second last letter in the word
...     suffix_fdist[word[-3:]] += 1                                                   #third last letter in the word

In [127]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]        #100 topmost suffixes

In [128]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [129]:
def pos_features(word):                                                                 #features extractor function
...     features = {}
...     for suffix in common_suffixes:
...         features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
...     return features

In [130]:
government_tagged_words = brown.tagged_words(categories='government')                    #select government corpus

In [131]:
government_featuresets = [(pos_features(n), g) for (n,g) in government_tagged_words]     #define features for words

In [132]:
government_size = int(len(government_featuresets) * 0.1)                                 #divide corpus into training and test

In [133]:
government_train_set, government_test_set = government_featuresets[government_size:], government_featuresets[:government_size] #form training and test

In [134]:
government_classifier = nltk.DecisionTreeClassifier.train(government_train_set)          #Decision tree classification

In [135]:
nltk.classify.accuracy(government_classifier, government_test_set)                        #evaluation

0.62444729710455

In [136]:
print (government_classifier.pseudocode(depth=5))                                         #pseudocode

if endswith(the) == False: 
  if endswith(s) == False: 
    if endswith(,) == False: 
      if endswith(of) == False: 
        if endswith(.) == False: return 'NN'
        if endswith(.) == True: return '.'
      if endswith(of) == True: return 'IN'
    if endswith(,) == True: return ','
  if endswith(s) == True: 
    if endswith(is) == False: 
      if endswith(as) == False: 
        if endswith(ss) == False: return 'NNS'
        if endswith(ss) == True: return 'NN'
      if endswith(as) == True: 
        if endswith(was) == False: return 'CS'
        if endswith(was) == True: return 'BEDZ'
    if endswith(is) == True: 
      if endswith(his) == False: return 'BEZ'
      if endswith(his) == True: return 'DT'
if endswith(the) == True: return 'AT'



In [137]:
def pos_features(sentence, i):                                                             #feature detector function for sentence
    features = {"suffix(1)": sentence[i][-1:],                                             #last word in the sentence
                "suffix(2)": sentence[i][-2:],                                             #second last word in the sentence
                "suffix(3)": sentence[i][-3:]}                                             #third last word in the sentence
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [138]:
pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [139]:
government_tagged_sents = brown.tagged_sents(categories='government')                      #set tagged sentences

In [140]:
featuresets = [] #features

In [141]:
for tagged_sent in tagged_sents:
...     untagged_sent = nltk.tag.untag(tagged_sent)
...     for i, (word, tag) in enumerate(tagged_sent):
...         featuresets.append( (pos_features(untagged_sent, i), tag) )

In [142]:
government_size = int(len(featuresets) * 0.1)                                              #setting size

In [143]:
government_train_set, government_test_set = featuresets[government_size:], featuresets[:government_size] #declaring test and training set

In [144]:
government_classifier = nltk.NaiveBayesClassifier.train(government_train_set)              #define classifier

In [145]:
nltk.classify.accuracy(government_classifier, government_test_set)                         #evaluation

0.7803571428571429

# End
## Reference: 
### Bird,  S.,  Klein,  E.  and  Loper,  E.  (2009).   Natural  language  processing  with  python,O’Reilly Media, Inc.