In [1]:
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

# Download the treebank corpus from nltk

In [2]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\pakhrins\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

# Download the universal tagset from nltk

In [3]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\pakhrins\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

# reading the Treebank tagged sentences

In [4]:
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# print the first two sentences along with tags

In [5]:
print(nltk_data[:2])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


# print each word with its respective tag for first two sentences

In [6]:
for sent in nltk_data[:2]:
  for tuple in sent:
    print(tuple)

('Pierre', 'NOUN')
('Vinken', 'NOUN')
(',', '.')
('61', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
(',', '.')
('will', 'VERB')
('join', 'VERB')
('the', 'DET')
('board', 'NOUN')
('as', 'ADP')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('Nov.', 'NOUN')
('29', 'NUM')
('.', '.')
('Mr.', 'NOUN')
('Vinken', 'NOUN')
('is', 'VERB')
('chairman', 'NOUN')
('of', 'ADP')
('Elsevier', 'NOUN')
('N.V.', 'NOUN')
(',', '.')
('the', 'DET')
('Dutch', 'NOUN')
('publishing', 'VERB')
('group', 'NOUN')
('.', '.')


# split data into training and validation set in the ratio 80:20

In [7]:
train_set,test_set =train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 101)

# create list of train and test tagged words

In [8]:
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]
print(len(train_tagged_words))
print(len(test_tagged_words))

80310
20366


In [33]:
train_tagged_words[:5]

[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN')]

# check some of the tagged words.

In [9]:
train_tagged_words[:5]

[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN')]

# use set datatype to check how many unique tags are present in training data

In [10]:
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)

12
{'PRON', 'ADV', 'DET', '.', 'NOUN', 'X', 'PRT', 'ADP', 'CONJ', 'VERB', 'ADJ', 'NUM'}


# check total words in vocabulary

In [11]:
vocab = {word for word,tag in train_tagged_words}

In [12]:
len(vocab)

11052

# Compute Emission Probability

In [69]:
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
#     print(tag_list)
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    #now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
 
     
    return (count_w_given_tag, count_tag)

In [29]:
check_values = word_given_tag('sold', 'VERB')
check_values



(28, 10860)

# Compute  Transition Probability

In [34]:
train_tagged_words[:5]

[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN')]

In [14]:
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [37]:
Verb_Follows_by_Noun = t2_given_t1('NOUN', 'VERB')
Verb_Follows_by_Noun

(1201, 10860)

In [38]:
Verb_Follows_by_Noun[0]/Verb_Follows_by_Noun[1]

0.11058931860036833

In [39]:
Noun_Follows_by_ADJ = t2_given_t1('ADJ', 'NOUN')
Noun_Follows_by_ADJ

(289, 22966)

In [40]:
Noun_Follows_by_ADJ[0]/Noun_Follows_by_ADJ[1]

0.012583819559348602

In [41]:
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)

12
{'PRON', 'ADV', 'DET', '.', 'NOUN', 'X', 'PRT', 'ADP', 'CONJ', 'VERB', 'ADJ', 'NUM'}


# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)

In [15]:
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
 
print(tags_matrix)

[[6.83371304e-03 3.69020514e-02 9.56719834e-03 4.19134386e-02
  2.12756261e-01 8.83826911e-02 1.41230067e-02 2.23234631e-02
  5.01138950e-03 4.84738052e-01 7.06150308e-02 6.83371304e-03]
 [1.20248254e-02 8.14584941e-02 7.13731572e-02 1.39255241e-01
  3.21955010e-02 2.28859577e-02 1.47401085e-02 1.19472459e-01
  6.98215654e-03 3.39022487e-01 1.30721495e-01 2.98681147e-02]
 [3.30602261e-03 1.20741697e-02 6.03708485e-03 1.73925534e-02
  6.35906279e-01 4.51343954e-02 2.87480245e-04 9.91806854e-03
  4.31220367e-04 4.02472317e-02 2.06410810e-01 2.28546783e-02]
 [6.87694475e-02 5.25694676e-02 1.72191828e-01 9.23720598e-02
  2.18538776e-01 2.56410260e-02 2.78940029e-03 9.29084867e-02
  6.00793920e-02 8.96899477e-02 4.61323895e-02 7.82104954e-02]
 [4.65906132e-03 1.68945398e-02 1.31063312e-02 2.40094051e-01
  2.62344331e-01 2.88252197e-02 4.39345129e-02 1.76826611e-01
  4.24540639e-02 1.49133503e-01 1.25838192e-02 9.14395228e-03]
 [5.41995019e-02 2.57543717e-02 5.68902567e-02 1.60868734e-01
  6

# convert the matrix to a df for better readability
# the table is same as the transition table shown in section 3 of article

In [16]:
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,PRON,ADV,DET,.,NOUN,X,PRT,ADP,CONJ,VERB,ADJ,NUM
PRON,0.006834,0.036902,0.009567,0.041913,0.212756,0.088383,0.014123,0.022323,0.005011,0.484738,0.070615,0.006834
ADV,0.012025,0.081458,0.071373,0.139255,0.032196,0.022886,0.01474,0.119472,0.006982,0.339022,0.130721,0.029868
DET,0.003306,0.012074,0.006037,0.017393,0.635906,0.045134,0.000287,0.009918,0.000431,0.040247,0.206411,0.022855
.,0.068769,0.052569,0.172192,0.092372,0.218539,0.025641,0.002789,0.092908,0.060079,0.08969,0.046132,0.07821
NOUN,0.004659,0.016895,0.013106,0.240094,0.262344,0.028825,0.043935,0.176827,0.042454,0.149134,0.012584,0.009144
X,0.0542,0.025754,0.05689,0.160869,0.061695,0.075726,0.185086,0.142226,0.010379,0.206419,0.017682,0.003075
PRT,0.017613,0.009393,0.10137,0.04501,0.250489,0.012133,0.001174,0.019569,0.002348,0.401174,0.082975,0.056751
ADP,0.069603,0.014553,0.320931,0.038724,0.323589,0.034548,0.001266,0.016958,0.001012,0.008479,0.107062,0.063275
CONJ,0.060373,0.05708,0.123491,0.035126,0.349067,0.00933,0.004391,0.055982,0.000549,0.150384,0.113611,0.040615
VERB,0.035543,0.083886,0.13361,0.034807,0.110589,0.21593,0.030663,0.092357,0.005433,0.167956,0.06639,0.022836


In [17]:
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

0.001785793392

In [46]:
train_tagged_words

[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN'),
 ('At', 'ADP'),
 ('last', 'ADJ'),
 ('count', 'NOUN'),
 (',', '.'),
 ('Candela', 'NOUN'),
 ('had', 'VERB'),
 ('sold', 'VERB'),
 ('$', '.'),
 ('4', 'NUM'),
 ('million', 'NUM'),
 ('*U*', 'X'),
 ('of', 'ADP'),
 ('its', 'PRON'),
 ('medical', 'ADJ'),
 ('devices', 'NOUN'),
 ('in', 'ADP'),
 ('Japan', 'NOUN'),
 ('.', '.'),
 ('Mrs.', 'NOUN'),
 ('Hills', 'NOUN'),
 ('lauded', 'VERB'),
 ('South', 'NOUN'),
 ('Korea', 'NOUN'),
 ('for', 'ADP'),
 ('*-1', 'X'),
 ('creating', 'VERB'),
 ('an', 'DET'),
 ('intellectual-property', 'ADJ'),
 ('task', 'NOUN'),
 ('force', 'NOUN'),
 ('and', 'CONJ'),
 ('special', 'ADJ'),
 ('enforcement', 'NOUN'),
 ('teams', 'NOUN'),
 ('of', 'ADP'),
 ('police', 'NOUN'),
 ('officers', 'NOUN'),
 ('and', 'CONJ'),
 ('prosecutors', 'NOUN'),
 ('trained', 'VERB'),
 ('*', 'X'),
 ('to', 'PRT'),
 ('pursue', 'VERB'),
 ('movie', 'NOUN'),
 ('and', 'CONJ'),
 ('book', 'NOUN'),
 ('pirates', 'NO

In [52]:
def Viterbi_check(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        print(key,word)
        p = [] 
        print()
        for tag in T:
            print(tag)

In [56]:
for tag in tags:
    print(tag,"\t",tags_df.loc['.', tag])

PRON 	 0.06876945
ADV 	 0.052569468
DET 	 0.17219183
. 	 0.09237206
NOUN 	 0.21853878
X 	 0.025641026
PRT 	 0.0027894003
ADP 	 0.09290849
CONJ 	 0.060079392
VERB 	 0.08968995
ADJ 	 0.04613239
NUM 	 0.078210495


In [73]:
for tag in tags:
    print(tag,"\t",tags_df.loc['PRON', tag])

PRON 	 0.006833713
ADV 	 0.03690205
DET 	 0.009567198
. 	 0.04191344
NOUN 	 0.21275626
X 	 0.08838269
PRT 	 0.014123007
ADP 	 0.022323463
CONJ 	 0.0050113895
VERB 	 0.48473805
ADJ 	 0.07061503
NUM 	 0.006833713


In [76]:
for tag in tags:
    print(tag,"\t",tags_df.loc['VERB', tag])

PRON 	 0.035543278
ADV 	 0.08388582
DET 	 0.13360958
. 	 0.03480663
NOUN 	 0.11058932
X 	 0.21593001
PRT 	 0.030662984
ADP 	 0.09235728
CONJ 	 0.005432781
VERB 	 0.1679558
ADJ 	 0.066390425
NUM 	 0.022836097


In [78]:
for tag in tags:
    print(tag,"\t",tags_df.loc['DET', tag])

PRON 	 0.0033060226
ADV 	 0.01207417
DET 	 0.006037085
. 	 0.017392553
NOUN 	 0.6359063
X 	 0.045134395
PRT 	 0.00028748024
ADP 	 0.009918069
CONJ 	 0.00043122037
VERB 	 0.04024723
ADJ 	 0.20641081
NUM 	 0.022854678


In [80]:
for tag in tags:
    print(tag,"\t",tags_df.loc['ADJ', tag])

PRON 	 0.00019417476
ADV 	 0.0052427184
DET 	 0.0052427184
. 	 0.066019416
NOUN 	 0.6968932
X 	 0.020970874
PRT 	 0.011456311
ADP 	 0.08058252
CONJ 	 0.016893204
VERB 	 0.011456311
ADJ 	 0.06330097
NUM 	 0.021747572


In [63]:
List_of_words_from_sentence = "He is a good boy".split()
print(List_of_words_from_sentence)
for value in List_of_words_from_sentence:
    print(value)

['He', 'is', 'a', 'good', 'boy']
He
is
a
good
boy


In [71]:
for tag in tags:
    print(tag)
    emission_p = word_given_tag("He", tag)[0]/word_given_tag("He", tag)[1]
    print(tag,"\t",emission_p)

PRON
PRON 	 0.025968109339407745
ADV
ADV 	 0.0
DET
DET 	 0.0
.
. 	 0.0
NOUN
NOUN 	 0.0
X
X 	 0.0
PRT
PRT 	 0.0
ADP
ADP 	 0.0
CONJ
CONJ 	 0.0
VERB
VERB 	 0.0
ADJ
ADJ 	 0.0
NUM
NUM 	 0.0


In [74]:
for tag in tags:
    print(tag)
    emission_p = word_given_tag("is", tag)[0]/word_given_tag("is", tag)[1]
    print(tag,"\t",emission_p)

PRON
PRON 	 0.0
ADV
ADV 	 0.0
DET
DET 	 0.0
.
. 	 0.0
NOUN
NOUN 	 0.0
X
X 	 0.0
PRT
PRT 	 0.0
ADP
ADP 	 0.0
CONJ
CONJ 	 0.0
VERB
VERB 	 0.04788213627992634
ADJ
ADJ 	 0.0
NUM
NUM 	 0.0


In [77]:
for tag in tags:
    print(tag)
    emission_p = word_given_tag("a", tag)[0]/word_given_tag("a", tag)[1]
    print(tag,"\t",emission_p)

PRON
PRON 	 0.0
ADV
ADV 	 0.0
DET
DET 	 0.2140290355038091
.
. 	 0.0
NOUN
NOUN 	 0.0
X
X 	 0.00019219680953296174
PRT
PRT 	 0.0
ADP
ADP 	 0.00012655024044545685
CONJ
CONJ 	 0.0
VERB
VERB 	 0.0
ADJ
ADJ 	 0.0003883495145631068
NUM
NUM 	 0.0


In [81]:
for tag in tags:
    print(tag)
    emission_p = word_given_tag("boy", tag)[0]/word_given_tag("boy", tag)[1]
    print(tag,"\t",emission_p)

PRON
PRON 	 0.0
ADV
ADV 	 0.0
DET
DET 	 0.0
.
. 	 0.0
NOUN
NOUN 	 4.3542628233040145e-05
X
X 	 0.0
PRT
PRT 	 0.0
ADP
ADP 	 0.0
CONJ
CONJ 	 0.0
VERB
VERB 	 0.0
ADJ
ADJ 	 0.0
NUM
NUM 	 0.0


In [None]:
for tag in tags:
    print(tag)
    emission_p = word_given_tag("good", tag)[0]/word_given_tag("good", tag)[1]
    print(tag,"\t",emission_p)

In [75]:
0.04788213627992634*0.48473805

0.023210293370165745

In [68]:
tags

{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

In [53]:
#Check how a sentence is tagged by Viterbi POS taggers
test_sent="He is a good boy"
Viterbi_check(test_sent.split())

0 He

PRON
ADV
DET
.
NOUN
X
PRT
ADP
CONJ
VERB
ADJ
NUM
1 is

PRON
ADV
DET
.
NOUN
X
PRT
ADP
CONJ
VERB
ADJ
NUM
2 a

PRON
ADV
DET
.
NOUN
X
PRT
ADP
CONJ
VERB
ADJ
NUM
3 good

PRON
ADV
DET
.
NOUN
X
PRT
ADP
CONJ
VERB
ADJ
NUM
4 boy

PRON
ADV
DET
.
NOUN
X
PRT
ADP
CONJ
VERB
ADJ
NUM


In [24]:
#Check how a sentence is tagged by Viterbi POS taggers
test_sent="He is a good boy"
pred_tags_withoutRules= Viterbi(test_sent.split())
print(pred_tags_withoutRules)

[('He', 'PRON'), ('is', 'VERB'), ('a', 'DET'), ('good', 'ADJ'), ('boy', 'NOUN')]


In [25]:
test_sent="Kids are playing in the ground"
pred_tags_withoutRules= Viterbi(test_sent.split())
print(pred_tags_withoutRules)

[('Kids', 'PRON'), ('are', 'VERB'), ('playing', 'VERB'), ('in', 'ADP'), ('the', 'DET'), ('ground', 'NOUN')]
