## Part-of-Speech tagging using CRF

### Data Preparation

In [1]:
#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import pprint, time
import random
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
import pycrfsuite
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import sklearn



In [2]:
# reading the Treebank tagged sentences
wsj = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
# first few tagged sentences
print(wsj[:2])
print(len(wsj))

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]
3914


In [4]:
# reading the Brown tagged sentences
brown= list(nltk.corpus.brown.tagged_sents(tagset='universal'))

In [5]:
# first few tagged sentences
print(brown[:2])
print(len(brown))

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [6]:
# reading the conll2000 tagged sentences
conll2000= list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))

In [7]:
# first few tagged sentences
print(conll2000[:2])
print(len(conll2000))

[[('Confidence', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('pound', 'NOUN'), ('is', 'VERB'), ('widely', 'ADV'), ('expected', 'VERB'), ('to', 'PRT'), ('take', 'VERB'), ('another', 'DET'), ('sharp', 'ADJ'), ('dive', 'NOUN'), ('if', 'ADP'), ('trade', 'NOUN'), ('figures', 'NOUN'), ('for', 'ADP'), ('September', 'NOUN'), (',', '.'), ('due', 'ADJ'), ('for', 'ADP'), ('release', 'NOUN'), ('tomorrow', 'NOUN'), (',', '.'), ('fail', 'VERB'), ('to', 'PRT'), ('show', 'VERB'), ('a', 'DET'), ('substantial', 'ADJ'), ('improvement', 'NOUN'), ('from', 'ADP'), ('July', 'NOUN'), ('and', 'CONJ'), ('August', 'NOUN'), ("'s", 'PRT'), ('near-record', 'ADJ'), ('deficits', 'NOUN'), ('.', '.')], [('Chancellor', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('Exchequer', 'NOUN'), ('Nigel', 'NOUN'), ('Lawson', 'NOUN'), ("'s", 'PRT'), ('restated', 'VERB'), ('commitment', 'NOUN'), ('to', 'PRT'), ('a', 'DET'), ('firm', 'NOUN'), ('monetary', 'ADJ'), ('policy', 'NOUN'), ('has', 'VERB'), ('helped', 'VERB'), ('to', 'PRT'), ('prev

In [8]:
nltk_data = wsj + brown + conll2000

In [9]:
train_set, val_set = train_test_split(nltk_data,test_size=0.4,random_state=100)
print(len(train_set))

43321


### 1. Build your CRF

In [10]:
def word2features(data, i):
    word = data[i][0]
    postag = data[i][1]
    # Common features for all words
    features={
        'lowercase_word':word.lower(),
        'word[-3:]':word[-3:],
        'word[-2:]':word[-2:],
        'word.isupper':word.isupper(),
        'word.istitle':word.istitle(),
        'word.isdigit':word.isdigit(),
    }

    # Features for words that are not
    # at the beginning of a sentence
    if i > 0:
        word1 = data[i-1][0]
        postag1 = data[i-1][1]
        features.update({'previous.word.lower':word1.lower()})
        features.update({'previous.word.istitle':word1.istitle()})
        features.update({'previous.word.isupper':word1.isupper()})
        features.update({'previous.word.isdigit':word1.isdigit()})
    else:
        # Indicate that it is the 'beginning of a document'
        
        features.update({'position':'START'})
        
        
    # Features for words that are not
    # at the end of a sentence
    if i < len(data)-1:
        word1 = data[i+1][0]
        postag1 = data[i+1][1]
        features.update({'next.word.lower':word1.lower()})
        features.update({'next.word.istitle':word1.istitle()})
        features.update({'next.word.isupper':word1.isupper()})
        features.update({'next.word.isdigit':word1.isdigit()})
    else:
        # Indicate that it is the 'end of a document'
        features.update({'position':'END'})
    return features

In [11]:
from sklearn.model_selection import train_test_split
# A function for extracting features in documents
def extract_features(data):
    all_features=[]
    for i in data:
        each_feature=[]
        for j in range(len(i)):
            each_feature.append(word2features(i, j))
        all_features.append(each_feature)
    return all_features

# A function for generating the list of labels for each document
def get_labels(data):
    all_pos=[]
    for i in data:
        each_pos=[]
        for (word,postag) in i:
            each_pos.append(postag)
        all_pos.append(each_pos)
    return all_pos        
X_train = extract_features(train_set)
y_train = get_labels(train_set)
X_test = extract_features(val_set)
y_test = get_labels(val_set)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [12]:
from sklearn_crfsuite import CRF
model = CRF()
model.fit(X_train, y_train)

CRF(algorithm=None, all_possible_states=None, all_possible_transitions=None,
  averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
  calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=None,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

### 2. Evaluate the model performance

In [13]:
y_pred = model.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))

0.9648513416473967


In [14]:
# hyperparameter tuning

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=50,
    all_possible_transitions=True
)

# parameters to tune
params_space = {
    'c1': [0.01, 0.1],
    'c2': [0.01, 0.1]
}

# use the same metric for evaluation
f1_scorer = scorers.make_scorer(metrics.flat_f1_score,
                        average='weighted')

In [15]:
# instantiate a GridSearchCV object
rs = GridSearchCV(crf, 
                  params_space,
                  cv=3,
                  verbose=1,
                  n_jobs=-1,
                  scoring=f1_scorer, 
                  return_train_score=True)
# fit
rs.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 13.2min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'c1': [0.01, 0.1], 'c2': [0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

In [16]:
# store CV results in a DF
cv_results = pd.DataFrame(rs.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_c1,param_c2,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,213.047466,17.599016,16.345589,2.209197,0.01,0.01,"{'c1': 0.01, 'c2': 0.01}",0.966229,0.967119,0.967332,0.966893,0.000478,3,0.989999,0.989785,0.990228,0.990004,0.000181
1,207.419033,3.08912,15.155391,0.802297,0.01,0.1,"{'c1': 0.01, 'c2': 0.1}",0.966433,0.967088,0.967639,0.967053,0.000493,2,0.987212,0.986748,0.987258,0.987073,0.00023
2,191.043543,1.624542,13.425253,0.694842,0.1,0.01,"{'c1': 0.1, 'c2': 0.01}",0.966723,0.967036,0.967464,0.967074,0.000303,1,0.989342,0.98931,0.98918,0.989277,7e-05
3,183.629638,2.607905,10.086579,0.51766,0.1,0.1,"{'c1': 0.1, 'c2': 0.1}",0.966261,0.966866,0.967368,0.966832,0.000453,4,0.985903,0.985514,0.985766,0.985728,0.000161


In [17]:
# building a model with optimal hyperparams
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.01,
    max_iterations=50,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.01, c2=0.01,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=50,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [18]:
# save the model to a pickle file
import _pickle as cPickle
# import pickle
# model_name='final_crf'
# pickle.dump(model, open('C:\Users\vibhav\Desktop\Crf_Model', 'wb'))

with open('Vibhav_crf.pkl', 'wb') as clf:
    try:
        cPickle.dump(crf, clf)
    except Exception as e:
        print(e)
    finally:
        clf.close()   

In [19]:
# load the trained model
import _pickle as cPickle

with open('Vibhav_crf.pkl', 'rb') as fid:
    crf = cPickle.load(fid)

In [21]:
labels =list(crf.classes_)
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))

0.969074253414234


In [None]:
# class-wise scores on validation data
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_val, y_pred, labels=sorted_labels, digits=3
))

### 3. Interpret the model (enlist important state and transition features)

In [22]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
X      -> X       6.250391
ADJ    -> NOUN    5.882493
NOUN   -> NOUN    4.418739
VERB   -> NOUN    4.364898
DET    -> NOUN    3.828006
ADP    -> NOUN    3.737994
NUM    -> NOUN    3.295938
ADV    -> ADJ     2.646997
NOUN   -> VERB    2.376938
ADJ    -> ADJ     2.253906
VERB   -> ADV     2.224495
VERB   -> ADJ     2.148090
VERB   -> PRT     2.116329
ADV    -> ADV     1.885738
NOUN   -> PRON    1.884802
ADV    -> VERB    1.884006
PRT    -> VERB    1.817723
PRON   -> NOUN    1.757976
X      -> NOUN    1.722016
PRT    -> NOUN    1.607191

Top unlikely transitions:
PRT    -> PRT     -1.386880
PRON   -> CONJ    -1.391091
.      -> PRON    -1.447736
PRON   -> PRT     -1.455765
NUM    -> VERB    -1.494329
X      -> NUM     -1.495900
.      -> X       -1.509421
DET    -> PRON    -1.520167
DET    -> CONJ    -1.549758
DET    -> DET     -1.633011
ADP    -> CONJ    -1.637209
DET    -> ADP     -1.645952
DET    -> .       -1.653441
NUM    -> PRON    -1.728971
NUM    -> DET    

In [23]:
# important features
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
17.220056 NUM      lowercase_word:million
17.015641 DET      lowercase_word:these
16.687589 VERB     lowercase_word:were
16.099488 ADV      lowercase_word:n't
14.830224 NUM      lowercase_word:billion
14.680093 CONJ     lowercase_word:and
14.476164 ADP      lowercase_word:during
14.165553 DET      lowercase_word:which
13.829237 DET      lowercase_word:each
13.768926 ADJ      lowercase_word:willing
13.684326 ADP      lowercase_word:than
13.610947 NOUN     lowercase_word:something
13.517312 PRON     lowercase_word:themselves
13.385888 DET      lowercase_word:those
13.368296 NOUN     lowercase_word:anything
13.347438 ADV      lowercase_word:not
13.317618 ADV      lowercase_word:o'clock
13.243296 NUM      lowercase_word:hundred
13.173118 NUM      lowercase_word:five
13.043313 PRON     lowercase_word:hers
12.956917 ADJ      lowercase_word:outstanding
12.956593 ADV      lowercase_word:often
12.760283 ADV      lowercase_word:sometimes
12.703839 DET      lowercase_word:both
12.65