## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries
import nltk
import random
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [2]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [4]:
# first few tagged sentences
print(nltk_data[:5])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], [('Rudolph', 'NOUN'), ('Agnew', 'NOUN'), (',', '.'), ('55', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), ('and', 'CONJ'), ('former', 'ADJ'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Consolidated', 'NOUN'), ('Gold', 'NOUN'), ('Fields', 'NOUN'), ('PLC', 'NOUN'), (',', '.'), ('was', 'VERB'), ('named', 'VERB'), ('*-1', 'X'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('British', 'ADJ'), ('industrial', 'ADJ'), ('

In [5]:
# Splitting into train and test
random.seed(1234)
train_set, val_set = train_test_split(nltk_data,test_size=0.5)

print(len(train_set))
print(len(val_set))
print(train_set[:5])

1957
1957
[[('Year', 'NOUN'), ('ended', 'VERB'), ('Dec.', 'NOUN'), ('31', 'NUM'), (',', '.'), ('1988', 'NUM'), (':', '.'), ('Net', 'ADJ'), ('income', 'NOUN'), (':', '.'), ('$', '.'), ('65', 'NUM'), ('million', 'NUM'), ('*U*', 'X'), (';', '.'), ('or', 'CONJ'), ('$', '.'), ('1.49', 'NUM'), ('*U*', 'X'), ('a', 'DET'), ('share', 'NOUN')], [('It', 'PRON'), ('hopes', 'VERB'), ('*-1', 'X'), ('to', 'PRT'), ('speak', 'VERB'), ('to', 'PRT'), ('students', 'NOUN'), ('at', 'ADP'), ('theological', 'ADJ'), ('colleges', 'NOUN'), ('about', 'ADP'), ('the', 'DET'), ('joys', 'NOUN'), ('of', 'ADP'), ('bell', 'NOUN'), ('ringing', 'NOUN'), ('and', 'CONJ'), ('will', 'VERB'), ('shortly', 'ADV'), ('publish', 'VERB'), ('a', 'DET'), ('booklet', 'NOUN'), ('for', 'ADP'), ('every', 'DET'), ('vicar', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('country', 'NOUN'), ('entitled', 'VERB'), (',', '.'), ('``', '.'), ('The', 'DET'), ('Bells', 'NOUN'), ('in', 'ADP'), ('Your', 'PRON'), ('Care', 'NOUN'), ('.', '.'), ("''", '.')], 

In [14]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

50218

In [15]:
# tokens 
tokens = [pair[0] for pair in train_tagged_words]
tokens[:10]

['Their',
 'report',
 'appears',
 'in',
 'today',
 "'s",
 'issue',
 'of',
 'the',
 'journal']

In [16]:
# vocabulary
V = set(tokens)
print(len(V))

8490


In [17]:
# number of tags
T = set([pair[1] for pair in train_tagged_words])
len(T)

12

In [18]:
print(T)

{'PRON', '.', 'VERB', 'CONJ', 'ADJ', 'X', 'ADV', 'NUM', 'NOUN', 'DET', 'ADP', 'PRT'}


In [22]:
### Emission Probabilities

In [20]:
# computing P(w/t) and storing in T x V matrix
t = len(T)
v = len(V)
w_given_t = np.zeros((t, v))

In [21]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [23]:
### Transition Probabilities

In [25]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [26]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [27]:
tags_matrix

array([[  5.78452647e-03,   4.12147492e-02,   4.83731031e-01,
          4.33839485e-03,   7.66449720e-02,   9.61677507e-02,
          3.90455537e-02,   5.06146066e-03,   2.03181490e-01,
          7.95372389e-03,   2.45842375e-02,   1.22921187e-02],
       [  7.18080550e-02,   9.64867175e-02,   8.91173929e-02,
          6.06683791e-02,   4.30162810e-02,   2.53641810e-02,
          4.83290479e-02,   7.67780617e-02,   2.22793490e-01,
          1.72579259e-01,   9.08311903e-02,   2.05655536e-03],
       [  3.61570232e-02,   3.68949249e-02,   1.68536007e-01,
          6.34592678e-03,   6.19834699e-02,   2.14433298e-01,
          8.23494717e-02,   2.00708378e-02,   1.11717828e-01,
          1.36658803e-01,   9.45985839e-02,   3.02538369e-02],
       [  6.04255311e-02,   3.48936170e-02,   1.59999996e-01,
          0.00000000e+00,   1.07234046e-01,   5.95744699e-03,
          5.36170229e-02,   4.76595759e-02,   3.60851049e-01,
          1.08936168e-01,   5.36170229e-02,   6.80851052e-03],
    

In [None]:
len(train_tagged_words)

### Build the vanilla Viterbi based POS tagger

In [1]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

NameError: name 'train_tagged_words' is not defined

### Solve the problem of unknown words

#### Evaluating tagging accuracy

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications