**Custom PoS Tagging with scikit-learn**

In [None]:
pip install sklearn_crfsuite



In [None]:
import nltk
from nltk.corpus import gutenberg as cg
from nltk.tokenize import sent_tokenize as st
from nltk import word_tokenize,pos_tag
import re
from nltk.tbl import demo as brill_demo
import numpy as np
import pandas as pd
from subprocess import check_output
from nltk.corpus import brown
import pprint
from sklearn_crfsuite import metrics

In [None]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Package cess_esp is already up-to-date!
[nltk_data]    | Downloading packag

True

In [None]:
sentence = "the little brown dog barked at the cat"

In [None]:
tagged = nltk.pos_tag(['I',  'watched',  'the', 'first',  'early','morning',  'sunbeams', 'creeping',  'through',  'the', 'Green','Forest'])
tagged

[('I', 'PRP'),
 ('watched', 'VBD'),
 ('the', 'DT'),
 ('first', 'JJ'),
 ('early', 'JJ'),
 ('morning', 'NN'),
 ('sunbeams', 'NN'),
 ('creeping', 'VBG'),
 ('through', 'IN'),
 ('the', 'DT'),
 ('Green', 'NNP'),
 ('Forest', 'NNP')]

In [22]:
# Data collection
tagged_sentences = nltk.corpus.treebank.tagged_sents()

# Function for detailed feature extraction
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }


In [23]:
pprint.pprint(features(['This', 'is', 'Dipanjana'], 2))


{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': False,
 'is_capitalized': True,
 'is_first': False,
 'is_last': True,
 'is_numeric': False,
 'next_word': '',
 'prefix-1': 'D',
 'prefix-2': 'Di',
 'prefix-3': 'Dip',
 'prev_word': 'is',
 'suffix-1': 'a',
 'suffix-2': 'na',
 'suffix-3': 'ana',
 'word': 'Dipanjana'}


In [24]:
# Helper function to remove tag from each word to make the data set prepared for custom PoS tagger building
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

In [25]:
# Split the dataset for training and testing
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

In [29]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
 
    return X, y
 
X, y = transform_to_dataset(training_sentences)
X[1], y[1]

({'capitals_inside': False,
  'has_hyphen': False,
  'is_all_caps': False,
  'is_all_lower': False,
  'is_capitalized': True,
  'is_first': False,
  'is_last': False,
  'is_numeric': False,
  'next_word': ',',
  'prefix-1': 'V',
  'prefix-2': 'Vi',
  'prefix-3': 'Vin',
  'prev_word': 'Pierre',
  'suffix-1': 'n',
  'suffix-2': 'en',
  'suffix-3': 'ken',
  'word': 'Vinken'},
 'NNP')

In [30]:
# Training the PoS Tagger with DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])
clf.fit(X, y)

Pipeline(memory=None,
         steps=[('vectorizer',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=False)),
                ('classifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='entropy', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=None,
                                        splitter='best'))],
         verbose=False)

In [None]:
# Validating on test dataset
X_test, y_test = transform_to_dataset(test_sentences)
 
print ("Accuracy:", clf.score(X_test, y_test))


Accuracy: 0.9451631046119235


In [None]:
# Validating on new/ random sentences

# Helper function to break the input sentence into tokens and tag each token using the trained model
def pos_tag(sentence):
  tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
  #return zip(sentence, tags)
  return (sentence, tags)

In [None]:
# Validation sentence 1
test_sentence = ['Buster',  'Bear',  'yawned',  'as',  'he',  'lay',  'on',  'his',  'comfortable',  'bed',  'of',  'leaves',  'and',  'watched',  'the', 'first',  'early','morning',  'sunbeams', 'creeping',  'through',  'the', 'Green','Forest',  'to',  'chase',  'out',  'the','Black','Shadows','.']

pos_tag(test_sentence)

# O/P: Unsatisfactory
# Mistakes in the above output:
# 1. 'the black shadow' -- 'black' tagged as 'NNP' i.e proper noun where it is 'JJ' i.e Adjective
# 2. 'first early morning sunbeams' -- as per the context 'morning' should be 'JJ' but tagged as 'NN' and accordingly\
# 'first' and 'early' should be adverb (RB) but wrongly tagged as 'JJ'(adjective)


(['Buster',
  'Bear',
  'yawned',
  'as',
  'he',
  'lay',
  'on',
  'his',
  'comfortable',
  'bed',
  'of',
  'leaves',
  'and',
  'watched',
  'the',
  'first',
  'early',
  'morning',
  'sunbeams',
  'creeping',
  'through',
  'the',
  'Green',
  'Forest',
  'to',
  'chase',
  'out',
  'the',
  'Black',
  'Shadows',
  '.'],
 array(['NNP', 'NN', 'VBD', 'IN', 'PRP', 'VBD', 'IN', 'PRP$', 'JJ', 'VBD',
        'IN', 'NNS', 'CC', 'VBD', 'DT', 'JJ', 'JJ', 'NN', 'NNS', 'VBG',
        'IN', 'DT', 'NNP', 'NNP', 'TO', 'VB', 'RP', 'DT', 'NNP', 'NNPS',
        '.'], dtype='<U6'))

In [None]:
# Validation sentence 2
pos_tag(word_tokenize('we lost tickets to this lost game'))

# O/P: Unsatisfactory
# Mistake:
# 2nd 'lost' should be 'JJ' but tagged wrongly as 'VBD'

(['we', 'lost', 'tickets', 'to', 'this', 'lost', 'game'],
 array(['PRP', 'VBD', 'NNS', 'TO', 'DT', 'VBD', 'NN'], dtype='<U6'))

**POS tagger using a Conditional Random Field**

In [None]:
from nltk.tag.util import untag
 
# Split the dataset for training and testing
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
 
    return X, y
 
X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)
 
print(len(X_train))     
print(len(X_test))         
print(X_train[0])
print(y_train[0])

2935
979
[{'word': 'Pierre', 'is_first': True, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'P', 'prefix-2': 'Pi', 'prefix-3': 'Pie', 'suffix-1': 'e', 'suffix-2': 're', 'suffix-3': 'rre', 'prev_word': '', 'next_word': 'Vinken', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': 'Vinken', 'is_first': False, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'V', 'prefix-2': 'Vi', 'prefix-3': 'Vin', 'suffix-1': 'n', 'suffix-2': 'en', 'suffix-3': 'ken', 'prev_word': 'Pierre', 'next_word': ',', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': ',', 'is_first': False, 'is_last': False, 'is_capitalized': True, 'is_all_caps': True, 'is_all_lower': True, 'prefix-1': ',', 'prefix-2': ',', 'prefix-3': ',', 'suffix-1': ',', 'suffix-2': ',', 'suffix-3': ',', 'prev_word': 'Vinken', 'next_word': '61', 'has_hyphen': False, 'is_numeric': False, 'c

In [None]:
pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
[?25l  Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)
[K     |████████████████████████████████| 757kB 5.2MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.6 sklearn-crfsuite-0.3.6


In [None]:
from sklearn_crfsuite import CRF

model = CRF()
model.fit(X_train, y_train)


CRF(algorithm=None, all_possible_states=None, all_possible_transitions=None,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=None,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [None]:
# Helper function to break the input sentence into tokens and tag each token using the trained model
def pos_tag(sentence):
    sentence_features = [features(sentence, index) for index in range(len(sentence))]
    return list((sentence, model.predict([sentence_features])[0]))

In [None]:
# Validation of model performance
 
y_pred = model.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))

0.9602683593122289


In [None]:
# Validation sentence 1
sentence = word_tokenize('bob made a book collector happy the other day')
 
print(pos_tag(sentence))  

# O/P: Wrong
# # 'book' should be adjective but tagged as 'NN' (Noun)
# 'happy' should be adverb but tagged as 'IN' (Preposition or subordinating conjunction)

[['bob', 'made', 'a', 'book', 'collector', 'happy', 'the', 'other', 'day'], ['NN', 'VBD', 'DT', 'NN', 'NN', 'IN', 'DT', 'JJ', 'NN']]


['bob', 'made', 'a', 'book', 'collector', 'happy', 'the', 'other', 'day']

In [None]:
# Validation sentence 2
sentence = word_tokenize('we lost tickets to this lost game')
 
print(pos_tag(sentence)) 
# O/P: Wrong

# Validation sentence 3

sentence = ['Buster',
  'Bear',
  'yawned',
  'as',
  'he',
  'lay',
  'on',
  'his',
  'comfortable',
  'bed',
  'of',
  'leaves',
  'and',
  'watched',
  'the',
  'first',
  'early',
  'morning',
  'sunbeams',
  'creeping',
  'through',
  'the',
  'Green',
  'Forest',
  'to',
  'chase',
  'out',
  'the',
  'Black',
  'Shadows',
  '.']

print(pos_tag(sentence)) 
# O/P: Wrong

[['we', 'lost', 'tickets', 'to', 'this', 'lost', 'game'], ['PRP', 'VBD', 'NNS', 'TO', 'DT', 'NN', 'NN']]
[['Buster', 'Bear', 'yawned', 'as', 'he', 'lay', 'on', 'his', 'comfortable', 'bed', 'of', 'leaves', 'and', 'watched', 'the', 'first', 'early', 'morning', 'sunbeams', 'creeping', 'through', 'the', 'Green', 'Forest', 'to', 'chase', 'out', 'the', 'Black', 'Shadows', '.'], ['NNP', 'NNP', 'VBD', 'IN', 'PRP', 'VBP', 'IN', 'PRP$', 'NN', 'VBD', 'IN', 'NNS', 'CC', 'VBD', 'DT', 'JJ', 'JJ', 'NN', 'NNS', 'VBG', 'IN', 'DT', 'NNP', 'NNP', 'TO', 'VB', 'RP', 'DT', 'NNP', 'NNP', '.']]


**Auto Tagging**

In [None]:
#tags = [tag for (word, tag) in tagged_sentences]
#print("Most common tag is : ", nltk.FreqDist(tags).max())

#tagged_sentences

for tagged_sentence in tagged_sentences:
  tags = [tag for (word, tag) in tagged_sentence]

print("Most common tag is : ", nltk.FreqDist(tags).max())

Most common tag is :  NN


In [None]:
# Working with another data set : Brown Corpus
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

In [None]:
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print("Most common tag is : ", nltk.FreqDist(tags).max())


Most common tag is :  NN


In [None]:
# Default Tagger

default_tagger = nltk.DefaultTagger('NN')
print("\nCheck results : ", default_tagger.tag(word_tokenize('bob made a book collector happy the other day')))

# Performances : 
print("\nPerformance with default tagger : ", default_tagger.evaluate(brown_tagged_sents))


Check results :  [('bob', 'NN'), ('made', 'NN'), ('a', 'NN'), ('book', 'NN'), ('collector', 'NN'), ('happy', 'NN'), ('the', 'NN'), ('other', 'NN'), ('day', 'NN')]

Performance with default tagger :  0.13089484257215028


In [None]:
# Regex Tagging

# Regex pattern
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'(The|the|A|a|An|an)$', 'AT'),   # articles 
    (r'.*able$', 'JJ'),                # adjectives 
    (r'.*ness$', 'NN'),                # nouns formed from adjectives
    (r'.*ly$', 'RB'),                  # adverbs
    (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
    (r'(His|his|Her|her|Its|its)$', 'PRP$'),    # possesive
    (r'(my|Your|your|Yours|yours)$', 'PRP$'),   # possesive
    (r'(on|On|in|In|at|At|since|Since)$', 'IN'),# time prepopsitions
    (r'(for|For|ago|Ago|before|Before)$', 'IN'),# time prepopsitions
    (r'(till|Till|until|Until)$', 'IN'),        # time prepopsitions
    (r'(by|By|beside|Beside)$', 'IN'),          # space prepopsitions
    (r'(under|Under|below|Below)$', 'IN'),      # space prepopsitions
    (r'(over|Over|above|Above)$', 'IN'),        # space prepopsitions
    (r'(across|Across|through|Through)$', 'IN'),# space prepopsitions
    (r'(into|Into|towards|Towards)$', 'IN'),    # space prepopsitions
    (r'(onto|Onto|from|From)$', 'IN'),          # space prepopsitions    
    (r'\.$','.'), (r'\,$',','), (r'\?$','?'),    # fullstop, comma, Qmark
    (r'\($','('), (r'\)$',')'),             # round brackets
    (r'\[$','['), (r'\]$',']'),             # square brackets
    (r'(Sam)$', 'NAM'),
    # WARNING : Put the default value in the end
    (r'.*', 'NN')                      # nouns (default)
]

# Construct Tagger
regexp_tagger = nltk.RegexpTagger(patterns)


['``',
 'Only',
 'a',
 'relative',
 'handful',
 'of',
 'such',
 'reports',
 'was',
 'received',
 "''",
 ',',
 'the',
 'jury',
 'said',
 ',',
 '``',
 'considering',
 'the',
 'widespread',
 'interest',
 'in',
 'the',
 'election',
 ',',
 'the',
 'number',
 'of',
 'voters',
 'and',
 'the',
 'size',
 'of',
 'this',
 'city',
 "''",
 '.']

In [None]:
# Evaluation
print(regexp_tagger.tag(word_tokenize('bob made a book collector happy the other day')))
print(regexp_tagger.evaluate(brown_tagged_sents))

# O/P: Very unsatisfactory

[('bob', 'NN'), ('made', 'NN'), ('a', 'AT'), ('book', 'NN'), ('collector', 'NN'), ('happy', 'NN'), ('the', 'AT'), ('other', 'NN'), ('day', 'NN')]
0.4461085585854367


**Combining N-Gram Tagging**

In [None]:
# Creating train and test set
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

In [None]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t3 = nltk.TrigramTagger(train_sents, backoff=t2)

print ("\nEvaluation mix default/1G/2G/3G on train set ", t3.evaluate(train_sents))
print ("Evaluation mix default/1G/2G/3G on test set ", t3.evaluate(test_sents))


Evaluation mix default/1G/2G/3G on train set  0.9829321372941086
Evaluation mix default/1G/2G/3G on test set  0.843317053722715


In [None]:
t3.tag(word_tokenize('bob made a book collector happy the other day'))

# O/P: Wrong
# # 'book' should be adjective but tagged as 'NN' 
# 'happy' should be adverb but tagged as 'AP' 

[('bob', 'NN'),
 ('made', 'VBN'),
 ('a', 'AT'),
 ('book', 'NN'),
 ('collector', 'NN'),
 ('happy', 'JJ'),
 ('the', 'AT'),
 ('other', 'AP'),
 ('day', 'NN')]