## Part-of-Speech tagging using CRF

### Data Preparation

In [1]:
##################
#Code :Part-of-Speech tagging using CRF
#Author : Ambrish Gupta
#Date : Aug 01, 2018
###################

#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import pprint, time
import matplotlib.pyplot as plt 
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import sklearn
import requests
import seaborn as sns
nltk.download('universal_tagset')
from collections import Counter
nltk.download('conll2000')
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import label_binarize
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from sklearn.model_selection import GridSearchCV
import pickle

[nltk_data] Downloading package universal_tagset to C:\Users\Ambrish
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package conll2000 to C:\Users\Ambrish
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


In [2]:
# reading the Treebank tagged sentences
wsj = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
# first few tagged sentences
print(wsj[:2])
print(len(wsj))

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]
3914


In [4]:
# reading the Brown tagged sentences
brown= list(nltk.corpus.brown.tagged_sents(tagset='universal'))

In [5]:
# first few tagged sentences
print(brown[:2])
print(len(brown))

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [6]:
# reading the conll2000 tagged sentences
conll2000= list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))

In [7]:
# first few tagged sentences
print(conll2000[:2])
print(len(conll2000))

[[('Confidence', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('pound', 'NOUN'), ('is', 'VERB'), ('widely', 'ADV'), ('expected', 'VERB'), ('to', 'PRT'), ('take', 'VERB'), ('another', 'DET'), ('sharp', 'ADJ'), ('dive', 'NOUN'), ('if', 'ADP'), ('trade', 'NOUN'), ('figures', 'NOUN'), ('for', 'ADP'), ('September', 'NOUN'), (',', '.'), ('due', 'ADJ'), ('for', 'ADP'), ('release', 'NOUN'), ('tomorrow', 'NOUN'), (',', '.'), ('fail', 'VERB'), ('to', 'PRT'), ('show', 'VERB'), ('a', 'DET'), ('substantial', 'ADJ'), ('improvement', 'NOUN'), ('from', 'ADP'), ('July', 'NOUN'), ('and', 'CONJ'), ('August', 'NOUN'), ("'s", 'PRT'), ('near-record', 'ADJ'), ('deficits', 'NOUN'), ('.', '.')], [('Chancellor', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('Exchequer', 'NOUN'), ('Nigel', 'NOUN'), ('Lawson', 'NOUN'), ("'s", 'PRT'), ('restated', 'VERB'), ('commitment', 'NOUN'), ('to', 'PRT'), ('a', 'DET'), ('firm', 'NOUN'), ('monetary', 'ADJ'), ('policy', 'NOUN'), ('has', 'VERB'), ('helped', 'VERB'), ('to', 'PRT'), ('prev

In [8]:
nltk_data = wsj + brown + conll2000

In [9]:
print(nltk_data[1])
print(len(nltk_data))

[('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]
72202


In [10]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
     

    features = {
        'word':word,
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
         
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
             'word':word,
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            'word':word,
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [postag for token, postag in sent]

def sent2tokens(sent):
    return [token for token, postag in sent]


In [11]:
# Splitting into train and test
random.seed(1000)
train_set, val_set = train_test_split(nltk_data,test_size=0.2)

print(len(train_set))
print(len(val_set))
train_set[1]


57761
14441


[('Still', 'ADV'),
 (',', '.'),
 ('says', 'VERB'),
 ('Mr.', 'NOUN'),
 ('Lee', 'NOUN'),
 (':', '.'),
 ('``', '.'),
 ('We', 'PRON'),
 ('need', 'VERB'),
 ('to', 'PRT'),
 ('educate', 'VERB'),
 ('people', 'NOUN'),
 ('that', 'ADP'),
 ('they', 'PRON'),
 ('need', 'VERB'),
 ('to', 'PRT'),
 ('get', 'VERB'),
 ('to', 'PRT'),
 ('a', 'DET'),
 ('phone', 'NOUN'),
 ('somehow', 'ADV'),
 (',', '.'),
 ('some', 'DET'),
 ('way', 'NOUN'),
 (',', '.'),
 ('to', 'PRT'),
 ('let', 'VERB'),
 ('someone', 'NOUN'),
 ('know', 'NOUN'),
 ('what', 'PRON'),
 ('their', 'PRON'),
 ('status', 'NOUN'),
 ('is', 'NOUN'),
 ('.', '.'),
 ("''", '.')]

In [12]:
print(train_set[1][6])
word2features(train_set[6],i=1)

('``', '.')


{'+1:postag': 'VERB',
 '+1:postag[:2]': 'VE',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'would',
 '-1:postag': '.',
 '-1:postag[:2]': '.',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:word.lower()': '``',
 'bias': 1.0,
 'postag': 'NOUN',
 'postag[:2]': 'NO',
 'word': 'Buster',
 'word.isdigit()': False,
 'word.istitle()': True,
 'word.isupper()': False,
 'word.lower()': 'buster',
 'word[-2:]': 'er',
 'word[-3:]': 'ter'}

In [13]:
### Creating Features from sentence
x_train= [sent2features(s) for s in train_set]
y_train= [sent2labels(s) for s in train_set]

x_val= [sent2features(s) for s in val_set]
y_val= [sent2labels(s) for s in val_set]

### 1. Build your CRF

In [14]:
crf= sklearn_crfsuite.CRF(
     algorithm= 'lbfgs',
     c1=.01,
     c2=.1,
     max_iterations=5,
     all_possible_transitions=True)

crf.fit(x_train,y_train)


CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.01, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=5,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [15]:
labels=list(crf.classes_)
labels

['DET',
 'NOUN',
 'VERB',
 'ADJ',
 '.',
 'X',
 'ADV',
 'PRT',
 'CONJ',
 'ADP',
 'PRON',
 'NUM']

In [16]:
y_pred=crf.predict(x_val)
metrics.flat_f1_score(y_val,y_pred,average='weighted',labels=labels)

0.9952961627962746

In [17]:
sorted_labels=sorted(labels,key=lambda name: (name[1:],name[0]))
print(metrics.flat_classification_report(y_val,y_pred,labels=sorted_labels,digits=3))

             precision    recall  f1-score   support

          .      1.000     1.000     1.000     38354
          X      1.000     0.347     0.516      1638
        ADJ      0.992     1.000     0.996     21336
        ADP      0.996     1.000     0.998     36407
        ADV      0.990     0.993     0.992     13774
       VERB      0.998     1.000     0.999     46549
        DET      1.000     1.000     1.000     33939
       CONJ      0.998     1.000     0.999      9435
       NOUN      0.996     1.000     0.998     76657
       PRON      0.978     1.000     0.989     11976
        PRT      0.999     0.999     0.999      8378
        NUM      0.995     1.000     0.997      5639

avg / total      0.996     0.996     0.995    304082



In [18]:
# Hyperparameter Tuning

crf= sklearn_crfsuite.CRF(
     algorithm= 'lbfgs',
     max_iterations=5,
     all_possible_transitions=True)

params_space= {
      'c1': [0.01,0.05,0.1],
      'c2': [0.01,0.05,0.1]
}

f1_scorer=scorers.make_scorer(metrics.flat_f1_score,average='weighted',labels=labels)


In [19]:
crf_gs=GridSearchCV(crf,
                   params_space,
                   cv=3,
                   verbose=1,
                   n_jobs=2,
                   scoring=f1_scorer,
                   return_train_score=True)

print(crf_gs.fit(x_train,y_train))

cv_results=pd.DataFrame(crf_gs.cv_results_)
cv_results

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed: 20.2min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'c1': [0.01, 0.05, 0.1], 'c2': [0.01, 0.05, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(flat_f1_score, average=weighted, labels=['DET', 'NOUN', 'VERB', 'ADJ', '.', 'X', 'ADV', 'PRT', 'CONJ', 'ADP', 'PRON', 'NUM']),
       verbose=1)


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_c1,param_c2,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,57.852669,13.355213,0.995414,0.995429,0.01,0.01,"{'c1': 0.01, 'c2': 0.01}",7,0.995304,0.995439,0.995134,0.99543,0.995805,0.995419,0.913434,0.117434,0.000285,8e-06
1,56.291654,12.56861,0.995414,0.995429,0.01,0.05,"{'c1': 0.01, 'c2': 0.05}",7,0.995304,0.995439,0.995134,0.99543,0.995805,0.995419,0.797873,0.582431,0.000285,8e-06
2,50.29564,11.112801,0.995414,0.995429,0.01,0.1,"{'c1': 0.01, 'c2': 0.1}",7,0.995304,0.995439,0.995134,0.99543,0.995805,0.995419,1.28177,0.544614,0.000285,8e-06
3,45.347105,10.37767,0.995422,0.995432,0.05,0.01,"{'c1': 0.05, 'c2': 0.01}",1,0.995304,0.995439,0.995134,0.99543,0.995828,0.995429,2.494879,1.030107,0.000295,5e-06
4,42.401132,9.716918,0.995422,0.995432,0.05,0.05,"{'c1': 0.05, 'c2': 0.05}",1,0.995304,0.995439,0.995134,0.99543,0.995828,0.995429,0.819362,0.173702,0.000295,5e-06
5,42.500057,10.251196,0.995422,0.995432,0.05,0.1,"{'c1': 0.05, 'c2': 0.1}",1,0.995304,0.995439,0.995134,0.99543,0.995828,0.995429,0.146832,0.350189,0.000295,5e-06
6,42.445702,9.97101,0.995422,0.995432,0.1,0.01,"{'c1': 0.1, 'c2': 0.01}",1,0.995304,0.995439,0.995134,0.99543,0.995828,0.995429,0.316402,0.020139,0.000295,5e-06
7,42.011517,9.677885,0.995422,0.995432,0.1,0.05,"{'c1': 0.1, 'c2': 0.05}",1,0.995304,0.995439,0.995134,0.99543,0.995828,0.995429,0.622285,0.067819,0.000295,5e-06
8,39.394069,9.15308,0.995422,0.995432,0.1,0.1,"{'c1': 0.1, 'c2': 0.1}",1,0.995304,0.995439,0.995134,0.99543,0.995828,0.995429,4.534944,1.429428,0.000295,5e-06


In [20]:
print('best params:', crf_gs.best_params_)
print('model size: {:0.2f}M'.format(crf_gs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.05, 'c2': 0.01}
model size: 21.82M


### 2. Evaluate the model performance

In [21]:
crf_best = crf_gs.best_estimator_
y_pred = crf_best.predict(x_val)
print(metrics.flat_classification_report(
    y_val, y_pred, labels=sorted_labels, digits=3
))
print(y_val[1])
y_pred[1]


             precision    recall  f1-score   support

          .      1.000     1.000     1.000     38354
          X      1.000     0.347     0.516      1638
        ADJ      0.992     1.000     0.996     21336
        ADP      0.996     1.000     0.998     36407
        ADV      0.990     0.993     0.992     13774
       VERB      0.998     1.000     0.999     46549
        DET      1.000     1.000     1.000     33939
       CONJ      0.998     1.000     0.999      9435
       NOUN      0.996     1.000     0.998     76657
       PRON      0.978     1.000     0.989     11976
        PRT      0.999     0.999     0.999      8378
        NUM      0.995     1.000     0.997      5639

avg / total      0.996     0.996     0.995    304082

['DET', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', '.', 'DET', 'VERB', 'DET', 'NOUN', 'NOUN', 'ADP', 'DET', 'ADV', 'ADJ', 'NOUN', 'NOUN', '.', 'ADV', 'PRON', 'VERB', 'VERB', 'VERB', 'ADP', 'ADJ', 'NOUN', '.']


['DET',
 'VERB',
 'DET',
 'NOUN',
 'ADP',
 'DET',
 'ADJ',
 'ADJ',
 'NOUN',
 'ADP',
 'DET',
 'NOUN',
 'NOUN',
 '.',
 'DET',
 'VERB',
 'DET',
 'NOUN',
 'NOUN',
 'ADP',
 'DET',
 'ADV',
 'ADJ',
 'NOUN',
 'NOUN',
 '.',
 'ADV',
 'PRON',
 'VERB',
 'VERB',
 'VERB',
 'ADP',
 'ADJ',
 'NOUN',
 '.']

### 3. Interpret the model (enlist important state and transition features)

In [22]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf_best.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf_best.transition_features_).most_common()[-20:])


Top likely transitions:
ADJ    -> NOUN    0.612460
PRON   -> VERB    0.605355
DET    -> ADJ     0.460127
NUM    -> NOUN    0.455877
PRT    -> VERB    0.427282
NOUN   -> CONJ    0.388977
VERB   -> PRT     0.331743
ADP    -> PRON    0.299310
ADP    -> DET     0.270844
ADV    -> VERB    0.263639
DET    -> NOUN    0.257396
NOUN   -> PRT     0.248911
NOUN   -> ADP     0.220568
VERB   -> PRON    0.186248
VERB   -> ADV     0.181149
ADV    -> .       0.174083
ADP    -> ADJ     0.156679
.      -> CONJ    0.156562
CONJ   -> NOUN    0.148155
ADP    -> NUM     0.146557

Top unlikely transitions:
NOUN   -> PRON    -0.109935
ADJ    -> PRON    -0.113782
DET    -> PRT     -0.119776
DET    -> CONJ    -0.129186
PRON   -> NOUN    -0.137379
ADP    -> CONJ    -0.147783
NOUN   -> X       -0.149131
DET    -> .       -0.160485
ADJ    -> DET     -0.167330
ADJ    -> VERB    -0.171769
X      -> NOUN    -0.183606
.      -> NOUN    -0.188875
ADP    -> ADP     -0.207769
DET    -> DET     -0.210586
NOUN   -> ADJ    

In [23]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(crf_best.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf_best.state_features_).most_common()[-30:])

Top positive:
3.742310 NOUN     postag:NOUN
3.742310 NOUN     postag[:2]:NO
3.113762 ADP      postag:ADP
3.025177 VERB     postag:VERB
3.025177 VERB     postag[:2]:VE
2.937924 ADJ      postag:ADJ
2.587936 ADV      postag:ADV
2.452678 .        postag:.
2.452678 .        postag[:2]:.
2.306960 DET      postag:DET
2.306960 DET      postag[:2]:DE
2.146285 ADJ      postag[:2]:AD
2.024966 PRON     postag:PRON
1.954380 ADP      postag[:2]:AD
1.741926 PRT      postag:PRT
1.717605 PRON     postag[:2]:PR
1.623815 NUM      postag:NUM
1.623815 NUM      postag[:2]:NU
1.616619 ADV      postag[:2]:AD
1.528669 CONJ     postag:CONJ
1.528669 CONJ     postag[:2]:CO
1.520245 PRT      postag[:2]:PR
0.968373 CONJ     word.lower():and
0.958756 DET      word.lower():the
0.957710 CONJ     word[-2:]:nd
0.927130 CONJ     word:and
0.923813 CONJ     word[-3:]:and
0.862887 .        word:,
0.862887 .        word.lower():,
0.862887 .        word[-3:]:,

Top negative:
-0.201805 VERB     -1:postag[:2]:AD
-0.202286 DET  

In [24]:
import pickle
filename = 'CRF_based_POS_tagging_Assignment_AG.sav'
pickle.dump(crf_best, open(filename, 'wb'))