## Part-of-Speech tagging using CRF

### Data Preparation

In [2]:
#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import pprint, time
import random
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
import pycrfsuite
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import sklearn



In [3]:
# reading the Treebank tagged sentences
wsj = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [4]:
# first few tagged sentences
print(wsj[:2])
print(len(wsj))

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]
3914


In [6]:
#Brown tagged sentences
brown= list(nltk.corpus.brown.tagged_sents(tagset='universal'))
#conll2000 tagged sentences
conll2000= list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))
nltk_data = wsj + brown + conll2000

In [30]:
train_set, val_set = train_test_split(nltk_data,test_size=0.4,random_state=100)



### 1. Build your CRF

In [27]:
def word2features(data, i):
    word = data[i][0]
    postag = data[i][1]
    # Common features for all words
    features={
        'lowercase_word':word.lower(),
        'word[-3:]':word[-3:],
        'word[-2:]':word[-2:],
        'word.isupper':word.isupper(),
        'word.istitle':word.istitle(),
        'word.isdigit':word.isdigit(),
    }

    # Features for words that are not
    # at the beginning of a sentence
    if i > 0:
        word1 = data[i-1][0]
        postag1 = data[i-1][1]
        features.update({'previous.word.lower':word1.lower()})
        features.update({'previous.word.istitle':word1.istitle()})
        features.update({'previous.word.isupper':word1.isupper()})
        features.update({'previous.word.isdigit':word1.isdigit()})
    else:
        # Indicate that it is the 'beginning of a document'
        
        features.update({'position':'START'})
        
        
    # Features for words that are not
    # at the end of a sentence
    if i < len(data)-1:
        word1 = data[i+1][0]
        postag1 = data[i+1][1]
        features.update({'next.word.lower':word1.lower()})
        features.update({'next.word.istitle':word1.istitle()})
        features.update({'next.word.isupper':word1.isupper()})
        features.update({'next.word.isdigit':word1.isdigit()})
    else:
        # Indicate that it is the 'end of a document'
        features.update({'position':'END'})
    return features

In [9]:
from sklearn.model_selection import train_test_split
# A function for extracting features in documents
def extract_features(data):
    all_features=[]
    for i in data:
        each_feature=[]
        for j in range(len(i)):
            each_feature.append(word2features(i, j))
        all_features.append(each_feature)
    return all_features

# A function for generating the list of labels for each document
def get_labels(data):
    all_pos=[]
    for i in data:
        each_pos=[]
        for (word,postag) in i:
            each_pos.append(postag)
        all_pos.append(each_pos)
    return all_pos        
X_train = extract_features(train_set)
y_train = get_labels(train_set)
X_test = extract_features(val_set)
y_test = get_labels(val_set)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [10]:
from sklearn_crfsuite import CRF
model = CRF()
model.fit(X_train, y_train)

CRF(algorithm=None, all_possible_states=None, all_possible_transitions=None,
  averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
  calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=None,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

### 2. Evaluate the model performance

In [11]:
y_pred = model.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))

0.9648513416473967


In [12]:
# hyperparameter tuning

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=50,
    all_possible_transitions=True
)

# parameters to tune
params_space = {
    'c1': [0.01, 0.1],
    'c2': [0.01, 0.1]
}

# use the same metric for evaluation
f1_scorer = scorers.make_scorer(metrics.flat_f1_score,
                        average='weighted')

In [13]:
# instantiate a GridSearchCV object
rs = GridSearchCV(crf, 
                  params_space,
                  cv=3,
                  verbose=1,
                  n_jobs=-1,
                  scoring=f1_scorer, 
                  return_train_score=True)
# fit
rs.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  7.9min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'c1': [0.01, 0.1], 'c2': [0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

In [14]:
# store CV results in a DF
cv_results = pd.DataFrame(rs.cv_results_)
cv_results.to_csv('results.csv')

In [15]:
# building a model with optimal hyperparams
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.01,
    max_iterations=50,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.01, c2=0.01,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=50,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [21]:
# save the model to a pickle file
import _pickle as cPickle

with open('Alvi_CRF.pkl', 'wb') as clf:
    try:
        cPickle.dump(crf, clf)
    except Exception as e:
        print(e)
    finally:
        clf.close()   

In [22]:
# load the trained model
import _pickle as cPickle

with open('Alvi_CRF.pkl', 'rb') as fid:
    crf = cPickle.load(fid)

In [23]:
labels =list(crf.classes_)
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))

0.969074253414234


In [24]:
# class-wise scores on validation data
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_val, y_pred, labels=sorted_labels, digits=3
))

NameError: name 'y_val' is not defined

### 3. Interpret the model (enlist important state and transition features)

In [28]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-10:])

Top likely transitions:
X      -> X       6.250391
ADJ    -> NOUN    5.882493
NOUN   -> NOUN    4.418739
VERB   -> NOUN    4.364898
DET    -> NOUN    3.828006
ADP    -> NOUN    3.737994
NUM    -> NOUN    3.295938
ADV    -> ADJ     2.646997
NOUN   -> VERB    2.376938
ADJ    -> ADJ     2.253906

Top unlikely transitions:
ADP    -> CONJ    -1.637209
DET    -> ADP     -1.645952
DET    -> .       -1.653441
NUM    -> PRON    -1.728971
NUM    -> DET     -1.928503
PRT    -> CONJ    -2.047916
DET    -> PRT     -2.612784
CONJ   -> CONJ    -3.387113
CONJ   -> X       -3.636751
CONJ   -> .       -4.429760


In [29]:
# important features
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(10))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-10:])

Top positive:
17.220056 NUM      lowercase_word:million
17.015641 DET      lowercase_word:these
16.687589 VERB     lowercase_word:were
16.099488 ADV      lowercase_word:n't
14.830224 NUM      lowercase_word:billion
14.680093 CONJ     lowercase_word:and
14.476164 ADP      lowercase_word:during
14.165553 DET      lowercase_word:which
13.829237 DET      lowercase_word:each
13.768926 ADJ      lowercase_word:willing

Top negative:
-5.930749 VERB     lowercase_word:subject
-6.216390 PRON     next.word.lower:things
-6.246356 VERB     lowercase_word:down
-6.565561 .        previous.word.lower:c.
-6.795203 NOUN     lowercase_word:'s
-6.948302 ADV      previous.word.lower:asia
-7.063754 ADP      next.word.lower:down
-7.353129 NOUN     previous.word.lower:employees
-7.356359 DET      next.word.lower:ill
-7.696527 ADP      next.word.lower:rear
