<a href="https://colab.research.google.com/github/Dhiru-py/NLP/blob/master/POS_TAGGER_using_HMM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
#======================== POS Tagger using HMM Model ======================#

In [2]:
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
 

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [47]:
#download the treebank corpus from nltk
nltk.download('treebank')
 
#download the universal tagset from nltk
nltk.download('universal_tagset')
 
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
 
#print the first two sentences along with tags
print(nltk_data[:2])

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [48]:
# As NLTK data is large in size, spliting data in small for fast computing...
split_data = nltk_data[:500]

In [49]:
# split data into training and validation set in the ratio 80:20
train_set,test_set =train_test_split(split_data,train_size=0.80,test_size=0.20,random_state = 101)

In [51]:
# create list of train and test tagged words
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]

In [96]:
#use set datatype to check how many unique tags and vocabs are present in training data
tags = {tag for word,tag in train_tagged_words}
print (tags)
 
# check total words in vocabulary
vocab = {word for word,tag in train_tagged_words}

{'PRT', 'CONJ', 'ADJ', 'X', 'PRON', 'ADP', 'NOUN', '.', 'VERB', 'ADV', 'NUM', 'DET'}


In [56]:
# compute Emission Probability
# How likely a word will be Noun, Verb, Adjective, Adverb and all.

def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)             #total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]

#now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
    return (count_w_given_tag, count_tag)

# Let's see how this Emission Probability does work !
word = "sales"
tag = "NOUN"
train_bag = train_tagged_words

emission_prob = word_given_tag(word, tag, train_bag)
emission_prob

(6, 2926)

In [57]:
# compute  Transition Probability
# How likely a sequence followed by other sequences
 
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)


# Let's see how transition probability does work !

t2 = "ADJ"
t1 = "NOUN"
train_bag = train_tagged_words

transition_prob = t2_given_t1(t2, t1, train_bag)
transition_prob

(38, 2926)

In [58]:
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
 
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')

for i, t1 in enumerate(tags):
    for j, t2 in enumerate(tags): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
 
print(tags_matrix)

[[0.         0.         0.06862745 0.01633987 0.0130719  0.0130719
  0.2777778  0.05228758 0.35947713 0.00980392 0.04901961 0.14052288]
 [0.         0.         0.12135922 0.         0.03883495 0.03398058
  0.43203884 0.01456311 0.17475729 0.04368932 0.02427184 0.11650486]
 [0.00764526 0.01681957 0.06880734 0.01529052 0.00152905 0.09480122
  0.6788991  0.06269113 0.01070336 0.00764526 0.02752294 0.00764526]
 [0.16666667 0.0046729  0.02180685 0.09345794 0.04517134 0.17133956
  0.05919003 0.16666667 0.19314642 0.02647975 0.00155763 0.04984424]
 [0.01793722 0.00896861 0.06278027 0.11659193 0.00896861 0.03139013
  0.24663678 0.01345291 0.4394619  0.03139013 0.02242152 0.        ]
 [0.00096805 0.         0.09777348 0.03291385 0.04356244 0.02129719
  0.34365925 0.03969022 0.00580833 0.01548887 0.07938045 0.3194579 ]
 [0.0430622  0.04203691 0.01298701 0.02289815 0.00615174 0.17737526
  0.2740943  0.22795625 0.1582365  0.01742994 0.00649351 0.0112782 ]
 [0.00093545 0.04677268 0.05238541 0.01683

In [60]:
# convert the matrix to a data frame for better readability
# the table is same as the transition table shown in section 3 of article

tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,PRT,CONJ,ADJ,X,PRON,ADP,NOUN,.,VERB,ADV,NUM,DET
PRT,0.0,0.0,0.068627,0.01634,0.013072,0.013072,0.277778,0.052288,0.359477,0.009804,0.04902,0.140523
CONJ,0.0,0.0,0.121359,0.0,0.038835,0.033981,0.432039,0.014563,0.174757,0.043689,0.024272,0.116505
ADJ,0.007645,0.01682,0.068807,0.015291,0.001529,0.094801,0.678899,0.062691,0.010703,0.007645,0.027523,0.007645
X,0.166667,0.004673,0.021807,0.093458,0.045171,0.17134,0.05919,0.166667,0.193146,0.02648,0.001558,0.049844
PRON,0.017937,0.008969,0.06278,0.116592,0.008969,0.03139,0.246637,0.013453,0.439462,0.03139,0.022422,0.0
ADP,0.000968,0.0,0.097773,0.032914,0.043562,0.021297,0.343659,0.03969,0.005808,0.015489,0.07938,0.319458
NOUN,0.043062,0.042037,0.012987,0.022898,0.006152,0.177375,0.274094,0.227956,0.158237,0.01743,0.006494,0.011278
.,0.000935,0.046773,0.052385,0.016838,0.057998,0.091674,0.227315,0.053321,0.086997,0.057998,0.090739,0.21609
VERB,0.039604,0.003808,0.054836,0.223915,0.035796,0.100533,0.105864,0.028941,0.185834,0.089109,0.01904,0.112719
ADV,0.015291,0.012232,0.140673,0.04893,0.012232,0.131498,0.027523,0.110092,0.30581,0.085627,0.042813,0.067278


In [61]:
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [68]:
# Let's test our Viterbi algorithm on a few sample sentences of test dataset
random.seed(500)      #define a random seed to get same sentences when run multiple times
 
# choose random 10 numbers
rndom = [random.randint(1,len(test_set)) for x in range(10)]
 
# list of 10 sents on which we test the model
test_sentences = [test_set[i] for i in rndom]
 
# list of tagged words
test_tagged_words = [tup for sent in test_sentences for tup in sent]
 
# list of untagged words
test_untagged_words = [tup[0] for sent in test_sentences for tup in sent]
print (test_untagged_words)

['The', 'new', 'company', 'said', '0', 'it', 'believes', '0', 'there', 'are', 'fewer', 'than', '100', 'potential', 'customers', 'for', 'supercomputers', 'priced', '*', 'between', '$', '15', 'million', 'and', '$', '30', 'million', '*U*', '--', 'presumably', 'the', 'Cray-3', 'price', 'range', '.', 'The', 'move', 'leaves', 'United', 'Illuminating', 'Co.', 'and', 'Northeast', 'Utilities', 'as', 'the', 'remaining', 'outside', 'bidders', 'for', 'PS', 'of', 'New', 'Hampshire', ',', 'which', '*T*-1', 'also', 'has', 'proposed', 'an', 'internal', 'reorganization', 'plan', 'in', 'Chapter', '11', 'bankruptcy', 'proceedings', 'under', 'which', 'it', 'would', 'remain', 'an', 'independent', 'company', '*T*-2', '.', 'The', 'commission', 'is', 'expected', '*-1', 'to', 'rule', 'on', 'the', 'Braidwood', '2', 'case', 'by', 'year', 'end', '.', 'The', 'new', 'company', 'said', '0', 'it', 'believes', '0', 'there', 'are', 'fewer', 'than', '100', 'potential', 'customers', 'for', 'supercomputers', 'priced', '*'

In [69]:
#Here We will only test 10 sentences to check the accuracy
#as testing the whole training set takes huge amount of time
start = time.time()
tagged_seq = Viterbi(test_untagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)


Time taken in seconds:  5.411680698394775


In [70]:
 # accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Viterbi Algorithm Accuracy:  83.07210031347962


In [71]:
#Code to test all the test sentences

test_tagged_words = [tup for sent in test_set for tup in sent]
test_untagged_words = [tup[0] for sent in test_set for tup in sent]
print (test_untagged_words)

['``', 'The', 'morbidity', 'rate', 'is', 'a', 'striking', 'finding', 'among', 'those', 'of', 'us', 'who', '*T*-5', 'study', 'asbestos-related', 'diseases', ',', "''", 'said', '*T*-1', 'Dr.', 'Talcott', '.', 'Coincident', 'with', 'the', 'talks', ',', 'the', 'State', 'Department', 'said', '0', 'it', 'has', 'permitted', 'a', 'Soviet', 'bank', 'to', 'open', 'a', 'New', 'York', 'branch', '.', 'Newsweek', 'said', '0', 'it', 'will', 'introduce', 'the', 'Circulation', 'Credit', 'Plan', ',', 'which', '*T*-1', 'awards', 'space', 'credits', '*ICH*-2', 'to', 'advertisers', 'on', '``', 'renewal', 'advertising', '.', "''", 'The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'even', 'brief', 'exposures', 'to', 'it', 'causing', 'symptoms', 'that', '*T*-1', 'show', 'up', 'decades', 'later', ',', 'researchers', 'said', '0', '*T*-2', '.', 'Last', 'year', 'Commonwealth', 'Edison', 'had', '*-1', 'to', 'refund', '$', '72.7

In [72]:
start = time.time()
model_predicted_output = Viterbi(test_untagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)

Time taken in seconds:  44.29931950569153


In [77]:
# accuracy
check = [i for i, j in zip(test_tagged_words, model_predicted_output) if i == j] 

In [78]:
len(check) # checcking lenth of all corrected tagged words
len(model_predicted_output) # checking length of all model_output

2653

In [79]:
accuracy = len(check)/len(model_predicted_output)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Viterbi Algorithm Accuracy:  79.41952506596306


In [24]:
#modified Viterbi to include rule based tagger in it
def Viterbi_rule_based(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        state_max = rule_based_tagger.tag([word])[0][1]
        if(pmax==0):
            state_max = rule_based_tagger.tag([word])[0][1] # assign based on rule based tagger
        else:
            if state_max != 'X':
                # getting state for which probability is maximum
                state_max = T[p.index(pmax)]                
             
         
        state.append(state_max)
    return list(zip(words, state))      

In [25]:
#test accuracy on subset of test data 
start = time.time()
tagged_seq = Viterbi_rule_based(test_untagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)

Time taken in seconds:  44.171035051345825


In [85]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_tagged_words) if i == j] 

In [27]:
accuracy = len(check)/len(tagged_seq)
accuracy
#print('Viterbi Algorithm Accuracy: ',accuracy*100)

0.9189596683000377