In [44]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, fbeta_score, classification_report, precision_score, precision_recall_fscore_support
import numpy as np
import math
np.random.seed(42)

In [24]:
import nltk
import numpy as np

nltk.download('brown')
nltk.download('universal_tagset')

from nltk.corpus import brown

# Access the Brown Corpus with the Universal POS tagset
brown_corpus = brown.tagged_sents(tagset='universal')
brown_corpus = np.array(brown_corpus, dtype = object)

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [25]:
def training(train):
    # word_tag corresponds to tag - word prob
    word_tag = {}
    for i in train:
        for j in i:
            if j[1] not in word_tag:
                word_tag[j[1]] = {}
            if j[0].lower() not in word_tag[j[1]]:
                word_tag[j[1]][j[0].lower()] = 0
            word_tag[j[1]][j[0].lower()]+=1
    for x in word_tag:
        tot = 0
        for j in word_tag[x]:
            tot += word_tag[x][j]
        word_tag[x]['Total'] = tot


    trans = {}
    trans['^'] = {}
    word_tag['^'] = {}
    word_tag['^']['Total'] = len(train)

    for i in train:
        if i[0][1] not in trans['^']:
            trans['^'][i[0][1]] = 0
        trans['^'][i[0][1]]+=1
        if len(i) <= 1:
            continue
        for x in range(len(i)-1):
            if i[x][1] not in trans:
                trans[i[x][1]] = {}
            if i[x+1][1] not in trans[i[x][1]]:
                trans[i[x][1]][i[x+1][1]] = 0
            trans[i[x][1]][i[x+1][1]]+=1
    return word_tag, trans

In [26]:
def lex_prob(word, tag, word_tag):
    if word in word_tag[tag]:
        return word_tag[tag][word]/word_tag[tag]['Total']
    else:
        return 1e-9

In [27]:
def trans_prob(curr_tag, prev_tag, trans, total):
    if curr_tag not in trans[prev_tag]:
        return 1e-9
    else:
        return trans[prev_tag][curr_tag] / total

In [28]:
def Viterbi_predict(word_tag, trans, sentence):
    s = ""
    n = 0
    for i in sentence:
        s += i[0].lower()
        n += 1

    curr_tag = [['^', 0]]
    new_tag = []
    #curr_max = -1e10000
    max_entries = 3
    curr_entries = 1

    possible_tags = ['NOUN', 'VERB','ADV', 'ADJ', 'PRT', 'DET', 'CONJ', '.', 'ADP', 'PRON', 'X', 'NUM']
    ans_tag = []

    for i in range(n):
        word = sentence[i][0].lower()
        for poss in curr_tag:
            for j in possible_tags:
                prob = poss[-1]
                prev_tag = poss[-2]
                copy_poss = poss.copy()

                prob += math.log(lex_prob(word, j, word_tag)) + math.log(trans_prob(j, prev_tag, trans, word_tag[j]['Total']))
                copy_poss[-1] = j
                copy_poss.append(prob)
                new_tag.append(copy_poss)
        curr_tag = sorted(new_tag, key = lambda r:r[-1], reverse = True)[:max_entries]
        #print(word, '->', curr_tag)
        new_tag = []
    final_tag = curr_tag[0][:-1]
    final_tag = final_tag[1:]
    #print(len(final_tag))
    return final_tag


In [29]:
def get_confusion_matrix(all_preds, all_vals):
    possible_tags = ['NOUN', 'VERB','ADV', 'ADJ', 'PRT', 'DET', 'CONJ', '.', 'ADP', 'PRON', 'X', 'NUM']
    cm = confusion_matrix(all_vals, all_preds, labels = possible_tags)
    print(cm)

In [30]:
kf = KFold(n_splits=5, shuffle = True)
split = kf.split(brown_corpus)

In [31]:
def know_vals(d):
    l = []
    for i in d:
        l.append(i[1])
    return l

In [32]:
for train, test in split:
    #train_data, test_data = brown_corpus[train], brown_corpus[test]
    train_data, test_data = brown_corpus[train], brown_corpus[test]
    word_tag, trans = training(train_data)
    all_vals_split = []
    all_preds_split = []
    suc = 0
    tot = 0
    for d in test_data:
        vals = know_vals(d)
        pred = Viterbi_predict(word_tag, trans, d)
        if(vals == pred):
            suc += 1
        tot += 1
        #print(len(vals), len(pred))
        all_vals_split.extend(vals)
        all_preds_split.extend(pred)
    #all_preds.append(all_preds_split)
    #all_vals.append(all_vals_split)
    print(suc/tot)
    get_confusion_matrix(all_preds_split, all_vals_split)


0.48979769794209976
[[51391   698   295   844    29   318    71   372   141   283    96   284]
 [  991 35021   217   170     7    71    60    82    85    29     3     8]
 [   17    30 10185   433   162    42    19    21   341    21     0     0]
 [  358   116   583 15373    53   118    66    22    34    36     4     9]
 [   30     7    43     1  5392     2     2     2   355    18     1     0]
 [    0     0    26     0     0 27101     7     0   129   208     3     0]
 [    0     0    35     0     0    15  7476     0     3     0     0     0]
 [    0     0     0     0     0     0     0 29491     0     0    29     0]
 [    7    21   440    17   887    50    22     3 27246    62    13     0]
 [    4     2     1     1     0    56     0     1   116  9752     0     1]
 [   60    26     5    12     5    11     5    12    12    24   152     5]
 [   37    13    18    16     0    55    10    24     5    29     4  2700]]
0.4939832577607255
[[51656   734   309   834    28   355   114   430   159   22

In [35]:
#Precision, Recall, F_Score
f0_5_score, f1_score, f2_score, recall, precision = 0, 0, 0, 0, 0
for i in range(5):
    f0_5_score+=fbeta_score(all_vals[i],all_preds[i],beta=0.5,average='weighted',zero_division=0)
    f1_score+=fbeta_score(all_vals[i],all_preds[i],beta=1,average='weighted',zero_division=0)
    f2_score+=fbeta_score(all_vals[i],all_preds[i],beta=2,average='weighted',zero_division=0)
    precision+=precision_score(all_vals[i],all_preds[i],average='weighted',zero_division=0)
    recall+=recall_score(all_vals[i],all_preds[i],average='weighted',zero_division=0)

f0_5_score, f1_score, f2_score, recall, precision = f0_5_score/5.0, f1_score/5.0, f2_score/5.0, recall/5.0, precision/5.0
print(f'Precision : {precision}')
print(f'Recall : {recall}')
print(f'F0.5 Score : {f0_5_score}')
print(f'F1 Score : {f1_score}')
print(f'F2 Score : {f2_score}')

Precision : 0.9546080893951563
Recall : 0.9538759845493135
F0.5 Score : 0.9543224882476086
F1 Score : 0.9540233328995054
F2 Score : 0.9538815616660485


In [42]:
def get_classification_report(all_preds, all_vals):
  possible_tags = ['NOUN', 'VERB','ADV', 'ADJ', 'PRT', 'DET', 'CONJ', '.', 'ADP', 'PRON', 'X', 'NUM']
  cr = precision_recall_fscore_support(all_vals, all_preds, average=None,
                                labels= possible_tags)
  print(cr)

In [49]:
all_vals_1, all_preds_1 = [],[]
for i in range(5):
  all_vals_1.extend(all_vals[i])
  all_preds_1.extend(all_preds[i])

In [50]:
precision_recall_fscore_support(all_preds_1, all_vals_1)

(array([0.99913259, 0.91569618, 0.95076192, 0.90158075, 0.99433829,
        0.98622089, 0.93584654, 0.93155842, 0.98151376, 0.91364109,
        0.95181943, 0.55555556]),
 array([0.98123216, 0.91253526, 0.95602526, 0.86297336, 0.96404066,
        0.97180192, 0.97007907, 0.88905999, 0.93430065, 0.83785778,
        0.97456915, 0.49967554]),
 array([0.99010147, 0.91411299, 0.95338632, 0.88185471, 0.97895511,
        0.97895831, 0.95265538, 0.90981319, 0.95732545, 0.87410995,
        0.96305996, 0.52613598]),
 array([150257,  84011, 143969,  58755,  39350, 139052, 265834,  15585,
         51827,  32527, 178484,   1541]))

In [79]:
#Handling input sentences
def checker(input_sentence, word_tag, trans):
  input_sentence = input_sentence.lower()
  input_split = input_sentence.split()
  input_split = ['^'] + input_split
  print(Viterbi_predict(word_tag, trans, input_split))

In [81]:
input = 'He runs very fast'
checker(input, word_tag, trans)

['DET', 'NOUN', 'NOUN', 'NOUN', 'NOUN']


In [82]:
trans

{'^': {'DET': 9772,
  '.': 4133,
  'PRON': 7305,
  'NOUN': 6514,
  'ADV': 4148,
  'ADP': 5600,
  'VERB': 2056,
  'NUM': 772,
  'ADJ': 1612,
  'CONJ': 2259,
  'PRT': 1675,
  'X': 26},
 'DET': {'NOUN': 68319,
  'VERB': 6994,
  'DET': 657,
  'ADJ': 26171,
  'PRON': 1061,
  'NUM': 1057,
  'ADV': 1911,
  'ADP': 977,
  '.': 1403,
  'X': 154,
  'PRT': 209,
  'CONJ': 74},
 'NOUN': {'NOUN': 32912,
  'ADJ': 2809,
  'VERB': 35011,
  'DET': 3404,
  'ADP': 53576,
  '.': 62140,
  'ADV': 5800,
  'CONJ': 13141,
  'PRT': 3857,
  'NUM': 1757,
  'PRON': 4310,
  'X': 66},
 'ADJ': {'NOUN': 43560,
  'ADP': 5838,
  'CONJ': 2473,
  '.': 6746,
  'PRT': 1272,
  'ADJ': 3803,
  'DET': 400,
  'ADV': 640,
  'VERB': 1172,
  'NUM': 448,
  'X': 34,
  'PRON': 262},
 'VERB': {'NOUN': 14083,
  '.': 11769,
  'ADP': 24700,
  'ADJ': 8353,
  'DET': 23704,
  'VERB': 26762,
  'PRON': 7919,
  'CONJ': 2106,
  'ADV': 14998,
  'PRT': 9522,
  'NUM': 1314,
  'X': 30},
 'ADP': {'NOUN': 29692,
  'DET': 52412,
  'ADJ': 9452,
  'VERB': 