# Q5_ErblinMarku_210762815_NLP_ECS763P_ASSIGNMENT_1_SEQUENCE_CLASSIFICATION
# Assignment 1: CRF sequence tagging for Movie Queries

**NOTEBOOK/CODE SUBMISSION:

In [None]:
conda install -c conda-forge sklearn-crfsuite

In [1]:
import os
import sys


from copy import deepcopy
from collections import Counter
from nltk.tag import CRFTagger

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report


from matplotlib import pyplot as plt
import numpy as np

import re
import unicodedata

In [2]:
def get_raw_data_from_bio_file(fpath):
    f = open(fpath)
    data = []  
    current_sent = [] 
    for line in f:
        if line == "\n": 
            data.append(current_sent)
            current_sent = []
            continue
        line_data = line.strip("\n").split("\t")
        current_sent.append((line_data[1], line_data[0]))
    f.close()
    return data

In [3]:
raw_training_data = get_raw_data_from_bio_file("trivia10k13train.bio.txt") 

In [4]:
# have a look at the first example
print(raw_training_data[0], "\n")

[('steve', 'B-Actor'), ('mcqueen', 'I-Actor'), ('provided', 'O'), ('a', 'O'), ('thrilling', 'B-Plot'), ('motorcycle', 'I-Plot'), ('chase', 'I-Plot'), ('in', 'I-Plot'), ('this', 'I-Plot'), ('greatest', 'B-Opinion'), ('of', 'I-Opinion'), ('all', 'I-Opinion'), ('ww', 'B-Plot'), ('2', 'I-Plot'), ('prison', 'I-Plot'), ('escape', 'I-Plot'), ('movies', 'I-Plot')] 



In [5]:
# here I initialize the POS tagger
posttagger = CRFTagger()
posttagger.set_model_file("crf_pos.tagger")

In [6]:
def preProcess(example):
    """Function takes in list of (word, bio-tag) pairs, e.g.:
        [('steve', 'B-Actor'), ('mcqueen', 'I-Actor'), ('provided', 'O'),
        ('a', 'O'), ('thrilling', 'B-Plot'), ('motorcycle', 'I-Plot'),
        ('chase', 'I-Plot'), ('in', 'I-Plot'), ('this', 'I-Plot'),
        ('greatest', 'B-Opinion'), ('of', 'I-Opinion'), ('all', 'I-Opinion'),
        ('ww', 'B-Plot'), ('2', 'I-Plot'), ('prison', 'I-Plot'),
        ('escape', 'I-Plot'), ('movies', 'I-Plot')]
        
    returns new (token, bio-tag) pairs with preprocessing applied to the words"""

    preprocessed_example = example  # trivial- no preprocessing
    return preprocessed_example

In [7]:
training_data = [preProcess(example) for example in raw_training_data]

In [8]:
# check the effect of pre-processing with the POS tagger
print(training_data[0],"\n")
print(training_data[1],"\n")

[('steve', 'B-Actor'), ('mcqueen', 'I-Actor'), ('provided', 'O'), ('a', 'O'), ('thrilling', 'B-Plot'), ('motorcycle', 'I-Plot'), ('chase', 'I-Plot'), ('in', 'I-Plot'), ('this', 'I-Plot'), ('greatest', 'B-Opinion'), ('of', 'I-Opinion'), ('all', 'I-Opinion'), ('ww', 'B-Plot'), ('2', 'I-Plot'), ('prison', 'I-Plot'), ('escape', 'I-Plot'), ('movies', 'I-Plot')] 

[('liza', 'B-Actor'), ('minnelli', 'I-Actor'), ('and', 'O'), ('joel', 'B-Actor'), ('gray', 'I-Actor'), ('won', 'B-Award'), ('oscars', 'I-Award'), ('for', 'O'), ('their', 'O'), ('roles', 'O'), ('in', 'O'), ('this', 'O'), ('1972', 'B-Year'), ('movie', 'O'), ('that', 'B-Plot'), ('follows', 'I-Plot'), ('nightclub', 'I-Plot'), ('entertainers', 'I-Plot'), ('in', 'I-Plot'), ('berlin', 'I-Plot'), ('as', 'I-Plot'), ('the', 'I-Plot'), ('nazis', 'I-Plot'), ('come', 'I-Plot'), ('to', 'I-Plot'), ('power', 'I-Plot')] 



In [9]:
#Here I will insert the POS tag as part of the tuple
import nltk
data = []
for i, doc in enumerate(training_data):

    # get the list of tokens
    tokens = [t for t, label in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # get the word, the POS tag, and its label into the data array
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])

In [10]:
#check the data structure
data[0]

[('steve', 'NN', 'B-Actor'),
 ('mcqueen', 'NN', 'I-Actor'),
 ('provided', 'VBD', 'O'),
 ('a', 'DT', 'O'),
 ('thrilling', 'JJ', 'B-Plot'),
 ('motorcycle', 'NN', 'I-Plot'),
 ('chase', 'NN', 'I-Plot'),
 ('in', 'IN', 'I-Plot'),
 ('this', 'DT', 'I-Plot'),
 ('greatest', 'JJS', 'B-Opinion'),
 ('of', 'IN', 'I-Opinion'),
 ('all', 'DT', 'I-Opinion'),
 ('ww', '$', 'B-Plot'),
 ('2', 'CD', 'I-Plot'),
 ('prison', 'NN', 'I-Plot'),
 ('escape', 'NN', 'I-Plot'),
 ('movies', 'NNS', 'I-Plot')]

In [11]:
#feature extraction proccess
def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        '''These features are:
            The word itself (converted to lowercase for normalisation)
            The prefixes and suffixes for the word
            The words surrounding, so previous and the next word
            word is in uppercase or lowercase
            word is a number, or contains digits
            The POS tag of word, and those of the surrounding words
            word is or contains a special character (e.g. hypen, dollar sign)'''
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [12]:
from sklearn.model_selection import train_test_split

# extracting features
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# get a list pf labels for the document
def get_labels(doc):
    return [label for (token, postag, label) in doc]
#split the data
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(y_test[0])
print(y_train[0])

['O', 'O', 'B-Year', 'O', 'O', 'O', 'B-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'O', 'O', 'B-Origin', 'I-Origin', 'I-Origin']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Actor', 'I-Actor', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Genre', 'O', 'B-Plot', 'I-Plot', 'I-Plot']


In [13]:
#using pycrfsuite trainer
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=True)

# feed the data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for Lasso regression L1 penalty
    'c1': 0.1,

    # coefficient for Ridge regression L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 100,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': False
    #I set it to True first to see the iterations and understand the behaviour of the trainer
    #now is set to False so when you run it, it will show the lond output
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 61163
Seconds required: 0.413

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 249831.899624
Feature norm: 1.000000
Error norm: 95184.596324
Active features: 60456
Line search trials: 1
Line search step: 0.000005
Seconds required for this iteration: 0.893

***** Iteration #2 *****
Loss: 220426.741470
Feature norm: 1.968560
Error norm: 92454.501037
Active features: 55935
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.455

***** Iteration #3 *****
Loss: 215355.258387
Feature norm: 1.988709
Error norm: 86275.414008
Active features: 60392
Line search trials: 2
Line search step: 0.500000
Seconds required 

In [14]:
#Testing tagger
print("testing tagger..")
import pycrfsuite
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]
print(y_pred[0])
print(y_test[0])
print("done.")

testing tagger..
['O', 'O', 'B-Year', 'O', 'O', 'B-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot']
['O', 'O', 'B-Year', 'O', 'O', 'O', 'B-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'O', 'O', 'B-Origin', 'I-Origin', 'I-Origin']
done.


In [15]:
print("printing classification report.\n labels are converted to numbers")
import numpy as np
from sklearn.metrics import classification_report
# Convert the sequences of tags into a 1-dimensional array
predictions=[]
truths=[]
for x in y_pred:
    predictions.extend(y_pred)
for z in y_test:
    truths.extend(y_test)
#Use the MultiLabelBinarizer to get the classification report
from sklearn.preprocessing import MultiLabelBinarizer
ml=MultiLabelBinarizer()
x = ml.fit_transform(truths)
y = ml.fit_transform(predictions)
# Print out the classification report
print(classification_report(x,y))
print(precision_recall_fscore_support(x, y, average='macro'))  # print out accurate macro score

printing classification report.
 labels are converted to numbers
              precision    recall  f1-score   support

           0       0.96      0.95      0.96   1107312
           1       0.98      0.96      0.97     79764
           2       0.78      0.56      0.65    258060
           3       0.82      0.86      0.84    536452
           4       0.95      0.88      0.91   1046316
           5       0.53      0.46      0.49    247112
           6       0.79      0.70      0.75    231472
           7       0.92      0.94      0.93   1881492
           8       1.00      0.47      0.64     29716
           9       0.85      0.58      0.69    189244
          10       0.60      0.27      0.37     17204
          11       0.99      0.97      0.98    782000
          12       0.96      0.96      0.96   1097928
          13       0.83      0.81      0.82     67252
          14       0.77      0.66      0.71    156400
          15       0.87      0.87      0.87    441048
          16    

In [16]:
import pandas as pd
from sklearn.metrics import classification_report
report_dict3 = classification_report(x,y, output_dict=True)
pd.DataFrame(report_dict3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,micro avg,macro avg,weighted avg,samples avg
precision,0.9587482,0.98,0.781513,0.823529,0.9454254,0.528986,0.793893,0.9215686,1.0,0.853659,...,0.9189853,1.0,0.705882,0.6,0.703704,0.9967192,0.9186855,0.8220382,0.9139233,0.9241519
recall,0.9519774,0.960784,0.563636,0.857143,0.8804185,0.462025,0.702703,0.9376559,0.473684,0.578512,...,0.9389632,0.5,0.467532,0.272727,0.575758,0.9941099,0.8908013,0.6952318,0.8908013,0.89874
f1-score,0.9553508,0.970297,0.65493,0.84,0.9117647,0.493243,0.74552,0.9295426,0.642857,0.689655,...,0.9288668,0.666667,0.5625,0.375,0.633333,0.9954128,0.9045285,0.7420994,0.9003878,0.9023139
support,1107312.0,79764.0,258060.0,536452.0,1046316.0,247112.0,231472.0,1881492.0,29716.0,189244.0,...,1870544.0,28152.0,120428.0,17204.0,51612.0,2389792.0,13448840.0,13448840.0,13448840.0,13448840.0


In [17]:
pr= pd.DataFrame(report_dict3)
data1=pr.iloc[0,:]
sorted_precision = round(data1.sort_values(),3)
sorted_precision.iloc[0:5]

17    0.407
5     0.529
10    0.600
22    0.600
23    0.704
Name: precision, dtype: float64

In [18]:
#Get the recall values sorted for the analysis of False negatives in Error Analysis 2, we look at the 5 classes with lowest Recall values
rc= pd.DataFrame(report_dict3)
data2=rc.iloc[1,:]
sorted_recall = round(data2.sort_values(),3)
sorted_recall.iloc[0:5]

17    0.229
10    0.273
22    0.273
5     0.462
21    0.468
Name: recall, dtype: float64

In [19]:
def print_transition_weights(transitions):
    """Sort the transitions between states/labels from highest to
    lowest strengths and print out."""
    ranked_transitions = [x for x in sorted(transitions,
                                               key=lambda x:x[1], reverse=True)]
    for (label_from, label_to), weight in ranked_transitions:
        print("%0.6f %-8s -> %s" % (weight, label_from, label_to))

In [20]:
print_transition_weights(posttagger._tagger.info().transitions.items())

7.987627 GW       -> ^NN
7.883092 SYM      -> SYM
7.435817 GW       -> ^JJ
7.326343 GW       -> ^RB
6.729554 FW       -> FW
6.654484 GW       -> ^VB
6.432188 NNP      -> NNP
5.895129 GW       -> ^NNS
5.892003 GW       -> ^VBG
5.330292 GW       -> ^VBN
4.973696 PDT      -> DT
4.972408 TO       -> VB
4.560515 PRPMD    -> VB
4.459338 PRP      -> VBP
4.383430 JJ       -> NN
4.356466 PRPHVS   -> VBN
4.229960 NNP      -> NNPS
4.170384 DT       -> NN
4.159976 MD       -> VB
4.085463 EX       -> VBP
3.956153 EX       -> VBD
3.946131 MDRB     -> VB
3.889560 NNHVS    -> VBN
3.852103 PRP$     -> NN
3.840113 IN       -> NNP
3.837899 NNPOS    -> NN
3.717115 NNPPOS   -> NNP
3.682924 GW       -> ^VBD
3.564769 VBPRB    -> VB
3.541402 WDTHVS   -> VBN
3.512874 DTHVS    -> VBN
3.472506 NN       -> NN
3.395835 WDT      -> VBP
3.326896 NNP      -> NNPPOS
3.314201 PRPVBD   -> VBN
3.306448 PRP      -> VBD
3.189912 PRPHVS   -> PRPHVS
3.188859 VBPRP    -> VB
3.167770 DT       -> NNP
3.134258 CD       -> NN
3.1

In [21]:
def print_most_predictive_state_features(state_features,
                                         excluded_classes=["O"],
                                         top_k=None):
    """Takes in the dict of state_features from a python crf-suite tagger,
    orders them in terms of how predictive they are of different classes.
    We assume we are not interested in O labels."""
    if not top_k:
        top_k = len(state_features)
    ranked_state_features = [x for x in sorted(state_features, key=lambda x:x[1], reverse=True) \
         if x[0][1] not in excluded_classes][:top_k]
    n=20 #added this counter to see only the top predicted features
    for (attr, label), weight in ranked_state_features:
        n-=1
        if n>0:
            print("%0.6f %-8s %s" % (weight, label, attr))

In [22]:
print_most_predictive_state_features(posttagger._tagger.info().state_features.items())

10.119428 PRP      WORD_it
9.621445 CC       WORD_and
9.105461 PRP      WORD_i
8.682363 PRP      WORD_me
8.551747 PRP      WORD_we
8.532030 DT       WORD_a
8.494565 PRP$     WORD_my
8.397045 CC       WORD_or
8.341449 IN       WORD_at
8.250122 PRPBES   WORD_its
8.214729 CC       WORD_but
8.120318 UH       WORD_um
7.970469 UH       WORD_yes
7.898332 PRP      WORD_he
7.817139 VBD      WORD_was
7.815547 UH       WORD_uh
7.791442 TO       WORD_to
7.670905 VBZ      WORD_is
7.666048 PRP      WORD_us


In [23]:
#Write the above function to predict the least predictive features so I can see where to make improvements
def print_least_predictive_state_features(state_features,
                                         excluded_classes=["O"],
                                         top_k=None):
    """Takes in the dict of state_features from a python crf-suite tagger,
    orders them in terms of how predictive they are of different classes.
    We assume we are not interested in O labels."""
    if not top_k:
        top_k = len(state_features)
    ranked_state_features = [x for x in sorted(state_features, key=lambda x:x[1], reverse=False) \
         if x[0][1] not in excluded_classes][:top_k]
    n=20
    for (attr, label), weight in ranked_state_features:
        n-=1
        if n>0:
            print("%0.6f %-8s %s" % (weight, label, attr))

In [24]:
print_least_predictive_state_features(posttagger._tagger.info().state_features.items())

-2.609349 NN       WORD_little
-2.476698 VBN      SUF_eed
-2.340203 NNS      WORD_masters
-2.325738 NN       WORD_up
-2.164905 UH       SUF_ch
-2.116353 RB       WORD_silly
-2.071369 NN       WORD_like
-2.020407 NNS      SUF_ss
-1.937634 VBD      SUF_eed
-1.937321 JJ       WORD_bible
-1.922470 VBG      WORD_willing
-1.891080 VBP      SUF_ere
-1.858995 JJ       SUF_eed
-1.802833 NNS      WORD_seems
-1.769757 NN       WORD_talking
-1.698015 NN       WORD_short
-1.682806 NNS      WORD_wants
-1.664453 VBD      WORD_involved
-1.622473 NN       WORD_university
