# Assignment 4 on Natural Language Processing

### Instructor : Prof. Sudeshna Sarkar

### Teaching Assistants : Alapan Kuila, Aniruddha Roy, Anusha Potnuru, Uppada Vishnu


### -18QE30008, Divyanshu Sheth

In [1]:
# Delete the sample_data directory from the working directory
# (presumably the default folder a hosted notebook runtime ships with — confirm).
!rm -r sample_data
# Install the CRF library; pip's stdout is redirected to /dev/null to keep the cell output short.
!pip install sklearn-crfsuite 1> /dev/null

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from itertools import chain

import re
import pandas as pd
import math
import string
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [3]:
# Load the train and test splits into a dict keyed by split name.
data = {}
# NOTE(review): no `sep` is given here, so the train file is parsed with the
# default comma separator, while the test file below is parsed as tab-separated.
# Confirm that the two files really use different delimiters.
data['train'] = pd.read_csv('hi-ud-train.conllu')
data['test'] = pd.read_csv('hi-ud-test.conllu', sep = '\t')

In [4]:
# Print both frames, separated by a blank line, to check how the columns were parsed.
print(data['train'], data['test'], sep = '\n\n')

        ID    WORD POS_TAG
0      1.0    yaha     DET
1      2.0   eSiyA   PROPN
2      3.0      kI     ADP
3      4.0  sabase     ADV
4      5.0   badZI     ADJ
...    ...     ...     ...
8105   9.0   TaMdI     ADJ
8106  10.0      ho    VERB
8107  11.0    jAwI     AUX
8108  12.0      hE     AUX
8109  13.0       .   PUNCT

[8110 rows x 3 columns]

        ID      WORD    TAG
0      1.0  rAmAyaNa  PROPN
1      2.0      kAla  PROPN
2      3.0       meM    ADP
3      4.0  BagavAna   NOUN
4      5.0      rAma  PROPN
...    ...       ...    ...
1552  10.0     ISAna  PROPN
1553  11.0        kA    ADP
1554  12.0   maMxira   NOUN
1555  13.0        hE    AUX
1556  14.0         .  PUNCT

[1557 rows x 3 columns]


### Features Used:<br>
1.   The actual word itself
2.   The length of the word, i.e. its number of characters (shorter words are expected to be more likely to belong to particular POS classes, e.g. adpositions and pronouns)
3.   The word in lowercase
4.   A stemmed version of the lowercased word, which strips the trailing run of vowels and the letters g, y, n from the end of the word but always keeps a stem of at least 2 characters, so that words like ‘aayenga’ do not vanish completely.
5.   Prefixes and suffixes of the word, of lengths 2, 3 and 4 characters
6.   Whether or not the word is a digit
7.   Whether or not the word is a punctuation mark
8.   Whether the word is at the beginning of the sentence (BOS) or the end of the sentence (EOS) or neither
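9.   A reduced set of the features above (the word, its length, its lowercase form, 2- and 3-character prefixes and suffixes, and the digit and punctuation flags) for the previous word, the following word, and the words two places before and after; the stem is additionally included only for the previous word and for the word two places after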




In [5]:
# Stemming rule: strip a trailing run of vowels / g / y / n while keeping a
# stem of at least two characters. Compiled once instead of on every call.
_STEM_SUFFIX_RE = re.compile(r'(.{2,}?)([aeiougyn]+$)')

# Set of single punctuation characters for per-character membership tests.
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# Context positions relative to the current token, and whether the stem
# feature is emitted for that position.
_CONTEXT_OFFSETS = ((-1, True), (-2, False), (1, False), (2, True))


def _stem(word):
    """Return the lowercased word with its trailing vowel/g/y/n run removed."""
    return _STEM_SUFFIX_RE.sub(r'\1', word.lower())


def _is_punctuation(word):
    """Return True if word is non-empty and made up only of punctuation characters."""
    # `word in string.punctuation` would be a substring test: it accepts the
    # empty string and rejects tokens such as '...'. Check each character.
    return bool(word) and all(ch in _PUNCTUATION_CHARS for ch in word)


def _shared_features(word, prefix, with_stem):
    """Build the features common to the current word and its context words."""
    feats = {
        prefix + 'word': word,
        prefix + 'len(word)': len(word),
        prefix + 'word.lower()': word.lower(),
        prefix + 'word[:3]': word[:3],
        prefix + 'word[:2]': word[:2],
        prefix + 'word[-3:]': word[-3:],
        prefix + 'word[-2:]': word[-2:],
        prefix + 'word.isdigit()': word.isdigit(),
        prefix + 'word.ispunctuation': _is_punctuation(word),
    }
    if with_stem:
        feats[prefix + 'word.stemmed'] = _stem(word)
    return feats


def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: sequence of tokens; only element 0 of each token (the word
            string) is read.
        i: index of the token to describe.

    Returns:
        dict mapping feature names to values. Features of the current word
        are unprefixed; features of neighbouring words carry a '-1:', '-2:',
        '+1:' or '+2:' prefix. 'BOS' is set for the first token and 'EOS'
        for the last one.
    """
    word = sent[i][0]
    features = {
        'bias': 1.0,
        # 4-character affixes are only used for the current word.
        'word[:4]': word[:4],
        'word[-4:]': word[-4:],
    }
    features.update(_shared_features(word, '', True))

    last = len(sent) - 1
    for offset, with_stem in _CONTEXT_OFFSETS:
        # Same bounds as the explicit checks i > 0, i > 1, i < len - 1, i < len - 2.
        if offset < 0:
            in_range = i > -offset - 1
        else:
            in_range = i < last - offset + 1
        if in_range:
            neighbour = sent[i + offset][0]
            features.update(
                _shared_features(neighbour, '%+d:' % offset, with_stem)
            )

    if not i > 0:
        features['BOS'] = True
    if not i < last:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the tag (element 1) of every token in ``sent``, in order."""
    tags = []
    for token in sent:
        tags.append(token[1])
    return tags

def sent2tokens(sent):
    """Return the word (element 0) of every token in ``sent``, in order."""
    words = []
    for token in sent:
        words.append(token[0])
    return words

In [6]:
def format_data(csv_data):
    """Group a flat token table into sentences of ``[word, tag]`` pairs.

    Columns are read by position, not by name: column 0 is the token ID
    within its sentence, column 1 the word and column 2 the tag.

    Args:
        csv_data: pandas DataFrame with at least three columns laid out as
            described above. Rows whose ID is NaN are ignored.

    Returns:
        A list of sentences, each a list of ``[word, tag]`` lists. A row with
        ID 1 starts a new sentence. Tokens whose word is not a string (for
        example NaN from an empty cell) are dropped.
    """
    sents = []
    rows = zip(csv_data.iloc[:, 0], csv_data.iloc[:, 1], csv_data.iloc[:, 2])
    for token_id, word, tag in rows:
        if math.isnan(token_id):
            continue
        # Also open a sentence when none exists yet, so data whose first
        # valid row is not ID 1 does not fail on sents[-1].
        if token_id == 1.0 or not sents:
            sents.append([])
        sents[-1].append([word, tag])

    # Build filtered lists instead of deleting while iterating: in-place
    # deletion skips the element following each removed one, so consecutive
    # non-string words were not all removed.
    return [
        [token for token in sent if isinstance(token[0], str)]
        for sent in sents
    ]

In [7]:
# Group the flat token rows of each split into per-sentence lists of [word, tag] pairs.
train_sents = format_data(data['train'])
test_sents = format_data(data['test'])

In [8]:
# Per sentence: X_* holds one feature dict per token, y_* the matching tag per token.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [9]:
%%time
# Fit a CRF tagger on the training sentences; the cell magic above reports the time taken.
# NOTE(review): per the sklearn-crfsuite documentation, c1 and c2 are the L1 and L2
# regularization coefficients, and all_possible_transitions=True also creates
# transition features for label pairs never seen in training — confirm against
# the installed version.
crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    c1 = 0.2,
    c2 = 0.2,
    max_iterations = 100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 3.68 s, sys: 22.1 ms, total: 3.7 s
Wall time: 3.71 s


In [10]:
# Tags used for the F1 score and the classification reports: every tag the
# model learned except 'X' (presumably the UD "other" category — confirm).
# Filtering instead of list.remove() avoids a ValueError when the training
# data contains no 'X' tag.
labels = [label for label in crf.classes_ if label != 'X']

In [11]:
# Evaluate on the training data itself.
y_pred = crf.predict(X_train)
# The F1 score is restricted to `labels`; the accuracy below is computed over all tokens.
print('F1 score on the train set = {}'.format(metrics.flat_f1_score(y_train, y_pred,
                      average='weighted', labels=labels)))
print('Accuracy on the train set = {}'.format(metrics.flat_accuracy_score(y_train, y_pred)))

# NOTE(review): this key orders tags by everything after their first character,
# then by the first character. It looks carried over from BIO-style entity labels
# and gives a non-alphabetical row order for POS tags — consider plain sorted(labels).
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print('Train set classification report: \n\n{}'.format(metrics.flat_classification_report(
    y_train, y_pred, labels=sorted_labels, digits=3
)))

F1 score on the train set = 0.9993327137643729
Accuracy on the train set = 0.9993330665599574
Train set classification report: 

              precision    recall  f1-score   support

        PART      1.000     1.000     1.000       163
       CCONJ      1.000     1.000     1.000       150
       SCONJ      1.000     1.000     1.000        61
         ADJ      1.000     1.000     1.000       570
         ADP      1.000     1.000     1.000      1387
         ADV      1.000     1.000     1.000       111
        VERB      1.000     0.994     0.997       640
         DET      1.000     0.996     0.998       231
        NOUN      1.000     1.000     1.000      1597
        PRON      0.998     1.000     0.999       431
       PROPN      1.000     1.000     1.000       708
         NUM      1.000     1.000     1.000       152
       PUNCT      1.000     1.000     1.000       564
         AUX      0.995     1.000     0.997       730

   micro avg      0.999     0.999     0.999      7495
   ma

In [12]:
# Evaluate on the held-out test data; this rebinds y_pred.
y_pred = crf.predict(X_test)
# The F1 score is restricted to `labels`; the accuracy below is computed over all tokens.
print('F1 score on the test set = {}'.format(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)))

print('Accuracy on the test set = {}\n\n'.format(metrics.flat_accuracy_score(y_test, y_pred)))

# NOTE(review): this key orders tags by everything after their first character,
# then by the first character. It looks carried over from BIO-style entity labels
# and gives a non-alphabetical row order for POS tags — consider plain sorted(labels).
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print('Test set classification report: \n\n{}'.format(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
)))

F1 score on the test set = 0.8706746116675885
Accuracy on the test set = 0.8710562414266118


Test set classification report: 

              precision    recall  f1-score   support

        PART      1.000     0.879     0.935        33
       CCONJ      1.000     1.000     1.000        25
       SCONJ      0.667     0.667     0.667         3
         ADJ      0.689     0.777     0.730        94
         ADP      0.970     0.955     0.962       309
         ADV      0.667     0.381     0.485        21
        VERB      0.935     0.869     0.901        99
         DET      0.838     0.861     0.849        36
        NOUN      0.795     0.860     0.826       329
        PRON      0.931     0.831     0.878        65
       PROPN      0.667     0.634     0.650       145
         NUM      0.957     0.880     0.917        25
       PUNCT      1.000     0.993     0.996       135
         AUX      0.965     0.978     0.971       139

    accuracy                          0.871      1458
   mac

In [13]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs; they are printed in the order given.
    """
    for transition, weight in trans_features:
        source, target = transition
        line = "%-6s -> %-7s %0.6f" % (source, target, weight)
        print(line)

# The ten label-to-label transitions with the largest learned weights.
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

# The ten transitions with the smallest weights: the tail of the full descending ranking.
print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-10:])

Top likely transitions:
VERB   -> AUX     3.569762
PROPN  -> PROPN   2.304620
ADJ    -> NOUN    1.873184
AUX    -> SCONJ   1.670297
AUX    -> AUX     1.622267
NUM    -> NOUN    1.541117
DET    -> NOUN    1.518860
PART   -> NUM     1.495714
VERB   -> SCONJ   1.437782
PRON   -> ADP     1.174530

Top unlikely transitions:
VERB   -> ADJ     -0.949693
PROPN  -> NOUN    -0.952549
AUX    -> VERB    -0.964514
PROPN  -> PART    -1.065034
DET    -> ADP     -1.308058
PROPN  -> AUX     -1.328083
PROPN  -> DET     -1.375118
ADJ    -> PRON    -1.427365
VERB   -> VERB    -1.513029
ADJ    -> ADP     -2.308780
