## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Libraries
import nltk
import re
import pprint,time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import requests
import random
import seaborn as sns
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
nltk.download('universal_tagset')
from collections import Counter

In [2]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [4]:
train_set, test_set = train_test_split(nltk_data,test_size=0.03)
train_tagged_words = [tup for sent in train_set for tup in sent]

3796


In [6]:
# tokens 
tokens = [pair[0] for pair in train_tagged_words]
tokens[:10]

['The',
 'funding',
 'is',
 'attached',
 '*-11',
 'to',
 'an',
 'estimated',
 '$',
 '27.1']

In [7]:
# vocabulary
V = set(tokens)
print(len(V))

12211


In [8]:
# number of tags
T = set([pair[1] for pair in train_tagged_words])
print(len(T),T)

12 {'.', 'NOUN', 'ADJ', 'VERB', 'X', 'ADV', 'PRON', 'CONJ', 'DET', 'NUM', 'PRT', 'ADP'}


### Build the vanilla Viterbi based POS tagger

##### Emission Probabilities

In [9]:
# computing P(w/t) and storing in T x V matrix
t = len(T)
print(t)
v = len(V)
print(v)
w_given_t = np.zeros((t, v))
print(w_given_t)

12
12211
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]

    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

### Transition Probabilities

In [11]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [12]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [13]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))

In [14]:
tags_df

Unnamed: 0,.,NOUN,ADJ,VERB,X,ADV,PRON,CONJ,DET,NUM,PRT,ADP
.,0.093541,0.222193,0.043911,0.088349,0.026839,0.052622,0.065734,0.058958,0.17441,0.080341,0.002376,0.090637
NOUN,0.240171,0.264846,0.012141,0.146902,0.029031,0.017283,0.004642,0.042492,0.013212,0.009427,0.043956,0.175897
ADJ,0.064667,0.700693,0.065796,0.011934,0.020642,0.004838,0.000645,0.016933,0.004838,0.020964,0.010643,0.077407
VERB,0.0354,0.109997,0.065558,0.169401,0.218095,0.081814,0.035096,0.005318,0.133698,0.023017,0.031222,0.091386
X,0.163361,0.062002,0.016711,0.20506,0.074184,0.026238,0.056224,0.010464,0.054193,0.002811,0.184445,0.144307
ADV,0.134864,0.032018,0.128719,0.345084,0.022639,0.07859,0.015201,0.007115,0.069534,0.031695,0.014554,0.119987
PRON,0.041541,0.208459,0.072885,0.486027,0.092145,0.033988,0.007553,0.005287,0.009819,0.007553,0.012462,0.022281
CONJ,0.033998,0.349048,0.116954,0.157298,0.008613,0.05485,0.05893,0.000453,0.121487,0.042158,0.00408,0.052131
DET,0.018062,0.636407,0.205053,0.039665,0.046039,0.012749,0.00366,0.000472,0.005784,0.022548,0.000236,0.009326
NUM,0.115072,0.357681,0.032464,0.018261,0.207536,0.002899,0.001449,0.013913,0.003188,0.184348,0.027826,0.035362


In [15]:
tags_df.loc['.', :]

.       0.093541
NOUN    0.222193
ADJ     0.043911
VERB    0.088349
X       0.026839
ADV     0.052622
PRON    0.065734
CONJ    0.058958
DET     0.174410
NUM     0.080341
PRT     0.002376
ADP     0.090637
Name: ., dtype: float32

In [16]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
                
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p 
            p.append(state_probability)
            
        pmax = max(p)
        state_max = T[p.index(pmax)]
        # getting state for which probability is maximum
        state.append(state_max)
        
    return list(zip(words, state))

##### Evaluating Test Set

In [17]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 20 sents
rndom = [random.randint(1,len(test_set)) for x in range(2)]

# list of sents
test_run = [test_set[i] for i in rndom]
#print("list of sents",test_run)
# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]
#print("list of tagged words",test_run_base)
# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
print("len",len(test_tagged_words),"list of untagged words",test_tagged_words)
#test_run

len 83 list of untagged words ['The', 'company', 'is', 'operating', 'under', 'Chapter', '11', 'of', 'the', 'federal', 'Bankruptcy', 'Code', ',', '*', 'giving', 'it', 'court', 'protection', 'from', 'creditors', "'", 'lawsuits', 'while', 'it', 'attempts', '*-1', 'to', 'work', 'out', 'a', 'plan', '*', 'to', 'pay', 'its', 'debts', '.', 'Two', 'years', 'ago', ',', 'the', 'Rev.', 'Jeremy', 'Hummerstone', ',', 'vicar', 'of', 'Great', 'Torrington', ',', 'Devon', ',', 'got', 'so', 'fed', 'up', 'with', 'ringers', 'who', '*T*-228', 'did', "n't", 'attend', 'service', '0', 'he', 'sacked', 'the', 'entire', 'band', ';', 'the', 'ringers', 'promptly', 'set', 'up', 'a', 'picket', 'line', 'in', 'protest', '.']


In [18]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

In [19]:
print("Time taken in seconds: ", difference)
print(len(tagged_seq),tagged_seq)
#print(test_run_base)

Time taken in seconds:  14.263831377029419
83 [('The', 'DET'), ('company', 'NOUN'), ('is', 'VERB'), ('operating', 'VERB'), ('under', 'ADP'), ('Chapter', 'NOUN'), ('11', 'NUM'), ('of', 'ADP'), ('the', 'DET'), ('federal', 'ADJ'), ('Bankruptcy', 'NOUN'), ('Code', 'NOUN'), (',', '.'), ('*', 'X'), ('giving', 'VERB'), ('it', 'PRON'), ('court', 'NOUN'), ('protection', 'NOUN'), ('from', 'ADP'), ('creditors', 'NOUN'), ("'", 'PRT'), ('lawsuits', 'NOUN'), ('while', 'ADP'), ('it', 'PRON'), ('attempts', 'VERB'), ('*-1', 'X'), ('to', 'PRT'), ('work', 'VERB'), ('out', 'PRT'), ('a', 'DET'), ('plan', 'NOUN'), ('*', 'X'), ('to', 'PRT'), ('pay', 'VERB'), ('its', 'PRON'), ('debts', 'NOUN'), ('.', '.'), ('Two', 'NUM'), ('years', 'NOUN'), ('ago', 'ADP'), (',', '.'), ('the', 'DET'), ('Rev.', '.'), ('Jeremy', '.'), ('Hummerstone', 'NOUN'), (',', '.'), ('vicar', 'NOUN'), ('of', 'ADP'), ('Great', 'NOUN'), ('Torrington', 'NOUN'), (',', '.'), ('Devon', 'NOUN'), (',', '.'), ('got', 'VERB'), ('so', 'ADV'), ('fed', 

In [20]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]

In [21]:
accuracy = len(check)/len(tagged_seq)

In [22]:
accuracy

0.9156626506024096

In [23]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]

In [24]:
incorrect_tagged_cases

[[('years', 'NOUN'), (('ago', 'ADP'), ('ago', 'ADV'))],
 [('the', 'DET'), (('Rev.', '.'), ('Rev.', 'NOUN'))],
 [('Rev.', 'NOUN'), (('Jeremy', '.'), ('Jeremy', 'NOUN'))],
 [('fed', 'VERB'), (('up', 'ADV'), ('up', 'PRT'))],
 [('who', 'PRON'), (('*T*-228', '.'), ('*T*-228', 'X'))],
 [('he', 'PRON'), (('sacked', '.'), ('sacked', 'VERB'))],
 [('set', 'VERB'), (('up', 'ADV'), ('up', 'PRT'))]]

In [25]:
## Testing
sentence_test = 'Twitter is the best networking social site. Man is a social animal. Data science is an emerging field. Data science jobs are high in demand.'
words = nltk.word_tokenize(sentence_test)

#start = time.time()
tagged_seq = Viterbi(words)
print(tagged_seq)
# end = time.time()
# difference = end-start

[('Twitter', '.'), ('is', 'VERB'), ('the', 'DET'), ('best', 'ADJ'), ('networking', 'NOUN'), ('social', 'ADJ'), ('site', '.'), ('.', '.'), ('Man', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('social', 'ADJ'), ('animal', '.'), ('.', '.'), ('Data', 'NOUN'), ('science', 'NOUN'), ('is', 'VERB'), ('an', 'DET'), ('emerging', 'VERB'), ('field', 'NOUN'), ('.', '.'), ('Data', 'NOUN'), ('science', 'NOUN'), ('jobs', 'NOUN'), ('are', 'VERB'), ('high', 'ADJ'), ('in', 'ADP'), ('demand', 'NOUN'), ('.', '.')]


In [26]:
print(tagged_seq)
print(difference)

[('Twitter', '.'), ('is', 'VERB'), ('the', 'DET'), ('best', 'ADJ'), ('networking', 'NOUN'), ('social', 'ADJ'), ('site', '.'), ('.', '.'), ('Man', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('social', 'ADJ'), ('animal', '.'), ('.', '.'), ('Data', 'NOUN'), ('science', 'NOUN'), ('is', 'VERB'), ('an', 'DET'), ('emerging', 'VERB'), ('field', 'NOUN'), ('.', '.'), ('Data', 'NOUN'), ('science', 'NOUN'), ('jobs', 'NOUN'), ('are', 'VERB'), ('high', 'ADJ'), ('in', 'ADP'), ('demand', 'NOUN'), ('.', '.')]
14.263831377029419


# Solution1  for tagging Unknown Word

###### We can infer from the result that 'NOUN' has repeated maximum number of times.
######  Since for a new unknown word the 'emission probability' is zero so hence the 'state probability' will also be zero
######  So we consider by default that the tag for the word will be 'NOUN'.

##### Calculate the the tag which has maximum frequency in the training set

In [27]:
from collections import Counter
tags=[pos_tag for i in train_set for (word,pos_tag) in i]
print(Counter(tags))

Counter({'NOUN': 28005, 'VERB': 13164, '.': 11364, 'ADP': 9532, 'DET': 8471, 'X': 6403, 'ADJ': 6201, 'NUM': 3450, 'PRT': 3121, 'ADV': 3092, 'PRON': 2648, 'CONJ': 2206})


In [28]:
# # Viterbi Heuristic
def Viterbi_solution1(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p 
            p.append(state_probability)
            
        pmax = max(p)
        if (pmax==0):##Check if it is an unknown word
            state_max='NOUN'
        else:
            state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(words, state))

###### Evaluating tagging accuracy

In [29]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 20 sents
rndom = [random.randint(1,len(test_set)) for x in range(2)]

# list of sents
test_run = [test_set[i] for i in rndom]
#print("list of sents",test_run)
# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]
#print("list of tagged words",test_run_base)
# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
print("len",len(test_tagged_words),"list of untagged words",test_tagged_words)
#test_run

len 83 list of untagged words ['The', 'company', 'is', 'operating', 'under', 'Chapter', '11', 'of', 'the', 'federal', 'Bankruptcy', 'Code', ',', '*', 'giving', 'it', 'court', 'protection', 'from', 'creditors', "'", 'lawsuits', 'while', 'it', 'attempts', '*-1', 'to', 'work', 'out', 'a', 'plan', '*', 'to', 'pay', 'its', 'debts', '.', 'Two', 'years', 'ago', ',', 'the', 'Rev.', 'Jeremy', 'Hummerstone', ',', 'vicar', 'of', 'Great', 'Torrington', ',', 'Devon', ',', 'got', 'so', 'fed', 'up', 'with', 'ringers', 'who', '*T*-228', 'did', "n't", 'attend', 'service', '0', 'he', 'sacked', 'the', 'entire', 'band', ';', 'the', 'ringers', 'promptly', 'set', 'up', 'a', 'picket', 'line', 'in', 'protest', '.']


In [30]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi_solution1(test_tagged_words)
end = time.time()
difference = end-start

In [31]:
print("Time taken in seconds: ", difference)
print(len(tagged_seq),tagged_seq)
#print(test_run_base)

Time taken in seconds:  14.187083959579468
83 [('The', 'DET'), ('company', 'NOUN'), ('is', 'VERB'), ('operating', 'VERB'), ('under', 'ADP'), ('Chapter', 'NOUN'), ('11', 'NUM'), ('of', 'ADP'), ('the', 'DET'), ('federal', 'ADJ'), ('Bankruptcy', 'NOUN'), ('Code', 'NOUN'), (',', '.'), ('*', 'X'), ('giving', 'VERB'), ('it', 'PRON'), ('court', 'NOUN'), ('protection', 'NOUN'), ('from', 'ADP'), ('creditors', 'NOUN'), ("'", 'PRT'), ('lawsuits', 'NOUN'), ('while', 'ADP'), ('it', 'PRON'), ('attempts', 'VERB'), ('*-1', 'X'), ('to', 'PRT'), ('work', 'VERB'), ('out', 'PRT'), ('a', 'DET'), ('plan', 'NOUN'), ('*', 'X'), ('to', 'PRT'), ('pay', 'VERB'), ('its', 'PRON'), ('debts', 'NOUN'), ('.', '.'), ('Two', 'NUM'), ('years', 'NOUN'), ('ago', 'ADP'), (',', '.'), ('the', 'DET'), ('Rev.', 'NOUN'), ('Jeremy', 'NOUN'), ('Hummerstone', 'NOUN'), (',', '.'), ('vicar', 'NOUN'), ('of', 'ADP'), ('Great', 'NOUN'), ('Torrington', 'NOUN'), (',', '.'), ('Devon', 'NOUN'), (',', '.'), ('got', 'VERB'), ('so', 'ADV'), ('

In [32]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]

In [33]:
accuracy = len(check)/len(tagged_seq)

In [34]:
accuracy

0.9397590361445783

In [35]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]

In [36]:
incorrect_tagged_cases

[[('years', 'NOUN'), (('ago', 'ADP'), ('ago', 'ADV'))],
 [('fed', 'VERB'), (('up', 'ADV'), ('up', 'PRT'))],
 [('who', 'PRON'), (('*T*-228', 'NOUN'), ('*T*-228', 'X'))],
 [('he', 'PRON'), (('sacked', 'NOUN'), ('sacked', 'VERB'))],
 [('set', 'VERB'), (('up', 'ADV'), ('up', 'PRT'))]]

In [62]:
## Testing
sentence_test = 'Twitter is the best networking social site. Man is a social animal. Data science is an emerging field. Data science jobs are high in demand.'
words = nltk.word_tokenize(sentence_test)
#start = time.time()
tagged_seq = Viterbi_solution1(words)
print(tagged_seq)
# end = time.time()
# difference = end-start

[('Twitter', 'NOUN'), ('is', 'VERB'), ('the', 'DET'), ('best', 'ADJ'), ('networking', 'NOUN'), ('social', 'ADJ'), ('site', 'NOUN'), ('.', '.'), ('Man', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('social', 'ADJ'), ('animal', 'NOUN'), ('.', '.'), ('Data', 'NOUN'), ('science', 'NOUN'), ('is', 'VERB'), ('an', 'DET'), ('emerging', 'VERB'), ('field', 'NOUN'), ('.', '.'), ('Data', 'NOUN'), ('science', 'NOUN'), ('jobs', 'NOUN'), ('are', 'VERB'), ('high', 'ADJ'), ('in', 'ADP'), ('demand', 'NOUN'), ('.', '.')]


# Solution2  for tagging Unknown Word

###### For any unknown word which is not in the corpus the emission probability will be zero
###### So we ignore the emission probability and conisder only the transmission probability

In [73]:
# Viterbi Heuristic
def Viterbi_solution2(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = []
        t = []
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
            t.append(transition_p)
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p 
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        if pmax==0:
            pmax=max(t)
            state_max = T[t.index(pmax)]
        else:
            state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

#### Evaluating tagging accuracy

In [74]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 20 sents
rndom = [random.randint(1,len(test_set)) for x in range(2)]

# list of sents
test_run = [test_set[i] for i in rndom]
#print("list of sents",test_run)
# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]
#print("list of tagged words",test_run_base)
# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
print("len",len(test_tagged_words),"list of untagged words",test_tagged_words)
#test_run

len 83 list of untagged words ['The', 'company', 'is', 'operating', 'under', 'Chapter', '11', 'of', 'the', 'federal', 'Bankruptcy', 'Code', ',', '*', 'giving', 'it', 'court', 'protection', 'from', 'creditors', "'", 'lawsuits', 'while', 'it', 'attempts', '*-1', 'to', 'work', 'out', 'a', 'plan', '*', 'to', 'pay', 'its', 'debts', '.', 'Two', 'years', 'ago', ',', 'the', 'Rev.', 'Jeremy', 'Hummerstone', ',', 'vicar', 'of', 'Great', 'Torrington', ',', 'Devon', ',', 'got', 'so', 'fed', 'up', 'with', 'ringers', 'who', '*T*-228', 'did', "n't", 'attend', 'service', '0', 'he', 'sacked', 'the', 'entire', 'band', ';', 'the', 'ringers', 'promptly', 'set', 'up', 'a', 'picket', 'line', 'in', 'protest', '.']


In [75]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi_solution2(test_tagged_words)
end = time.time()
difference = end-start

In [76]:
print("Time taken in seconds: ", difference)
print(len(tagged_seq),tagged_seq)
#print(test_run_base)

Time taken in seconds:  14.121038675308228
83 [('The', 'DET'), ('company', 'NOUN'), ('is', 'VERB'), ('operating', 'VERB'), ('under', 'ADP'), ('Chapter', 'NOUN'), ('11', 'NUM'), ('of', 'ADP'), ('the', 'DET'), ('federal', 'ADJ'), ('Bankruptcy', 'NOUN'), ('Code', 'NOUN'), (',', '.'), ('*', 'X'), ('giving', 'VERB'), ('it', 'PRON'), ('court', 'NOUN'), ('protection', 'NOUN'), ('from', 'ADP'), ('creditors', 'NOUN'), ("'", 'PRT'), ('lawsuits', 'NOUN'), ('while', 'ADP'), ('it', 'PRON'), ('attempts', 'VERB'), ('*-1', 'X'), ('to', 'PRT'), ('work', 'VERB'), ('out', 'PRT'), ('a', 'DET'), ('plan', 'NOUN'), ('*', 'X'), ('to', 'PRT'), ('pay', 'VERB'), ('its', 'PRON'), ('debts', 'NOUN'), ('.', '.'), ('Two', 'NUM'), ('years', 'NOUN'), ('ago', 'ADP'), (',', '.'), ('the', 'DET'), ('Rev.', 'NOUN'), ('Jeremy', 'NOUN'), ('Hummerstone', 'NOUN'), (',', '.'), ('vicar', 'NOUN'), ('of', 'ADP'), ('Great', 'NOUN'), ('Torrington', 'NOUN'), (',', '.'), ('Devon', 'NOUN'), (',', '.'), ('got', 'VERB'), ('so', 'ADV'), ('

In [77]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]

In [78]:
accuracy = len(check)/len(tagged_seq)

In [79]:
accuracy

0.9518072289156626

In [80]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]

In [81]:
incorrect_tagged_cases

[[('years', 'NOUN'), (('ago', 'ADP'), ('ago', 'ADV'))],
 [('fed', 'VERB'), (('up', 'ADV'), ('up', 'PRT'))],
 [('who', 'PRON'), (('*T*-228', 'VERB'), ('*T*-228', 'X'))],
 [('set', 'VERB'), (('up', 'ADV'), ('up', 'PRT'))]]

In [82]:
## Testing
sentence_test = 'Twitter is the best networking social site. Man is a social animal. Data science is an emerging field. Data science jobs are high in demand.'
words = nltk.word_tokenize(sentence_test)
#start = time.time()
tagged_seq = Viterbi_solution2(words)
print(tagged_seq)
# end = time.time()
# difference = end-start

[('Twitter', 'NOUN'), ('is', 'VERB'), ('the', 'DET'), ('best', 'ADJ'), ('networking', 'NOUN'), ('social', 'ADJ'), ('site', 'NOUN'), ('.', '.'), ('Man', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('social', 'ADJ'), ('animal', 'NOUN'), ('.', '.'), ('Data', 'NOUN'), ('science', 'NOUN'), ('is', 'VERB'), ('an', 'DET'), ('emerging', 'VERB'), ('field', 'NOUN'), ('.', '.'), ('Data', 'NOUN'), ('science', 'NOUN'), ('jobs', 'NOUN'), ('are', 'VERB'), ('high', 'ADJ'), ('in', 'ADP'), ('demand', 'NOUN'), ('.', '.')]


### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

### Solution 1 Accuracy : 0.9397590361445783

### Solution 2 Accuracy : 0.9518072289156626

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [None]:
Case 1
Original Pos Tagger
('Twitter', '.')
Corrected Pos Tagger
('Twitter', 'NOUN')

In [None]:
Case2
Original Pos Tagger 
('site', '.')
Corrected Pos Tagger
('site', 'NOUN')

In [None]:
Case3
Original Pos Tagger
('animal', '.')
Corrected Pos Tagger
('animal', 'NOUN')