In [None]:
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

# Fetch the treebank sample and the universal tagset mapping, in that order.
for resource in ('treebank', 'universal_tagset'):
    nltk.download(resource)

# Materialise all tagged sentences; each sentence is a list of (word, tag) pairs.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# Show the first two sentences as a sanity check.
print(nltk_data[:2])

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [None]:
# Print every (word, tag) pair of the first two sentences, one pair per line.
# The loop variable is deliberately not called `tuple`: that would rebind the
# builtin for the rest of the kernel session.
for sent in nltk_data[:2]:
    for tagged_word in sent:
        print(tagged_word)

('Pierre', 'NOUN')
('Vinken', 'NOUN')
(',', '.')
('61', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
(',', '.')
('will', 'VERB')
('join', 'VERB')
('the', 'DET')
('board', 'NOUN')
('as', 'ADP')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('Nov.', 'NOUN')
('29', 'NUM')
('.', '.')
('Mr.', 'NOUN')
('Vinken', 'NOUN')
('is', 'VERB')
('chairman', 'NOUN')
('of', 'ADP')
('Elsevier', 'NOUN')
('N.V.', 'NOUN')
(',', '.')
('the', 'DET')
('Dutch', 'NOUN')
('publishing', 'VERB')
('group', 'NOUN')
('.', '.')


In [None]:
# Split the sentences 80/20; the fixed random_state keeps the split repeatable.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.80,
    test_size=0.20,
    random_state=101,
)

In [None]:
# Flatten the per-sentence lists into flat lists of (word, tag) pairs.
train_tagged_words = []
for sent in train_set:
    train_tagged_words.extend(sent)

test_tagged_words = []
for sent in test_set:
    test_tagged_words.extend(sent)

# Report how many tagged tokens ended up in each split.
print(len(train_tagged_words))
print(len(test_tagged_words))

80310
20366


In [None]:
# Peek at the first five (word, tag) pairs of the flattened training data.
train_tagged_words[:5]

[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN')]

In [None]:
# Distinct tags that occur in the training pairs.
tags = set(pair_tag for pair_word, pair_tag in train_tagged_words)
print(len(tags))
print(tags)

# Distinct word forms that occur in the training pairs.
vocab = set(pair_word for pair_word, pair_tag in train_tagged_words)

12
{'X', 'DET', 'PRON', 'PRT', 'ADP', '.', 'NUM', 'ADV', 'NOUN', 'CONJ', 'VERB', 'ADJ'}


In [None]:
# compute Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    """Return the counts behind the emission probability P(word | tag).

    Args:
        word: the word form to look for.
        tag: the tag to condition on.
        train_bag: sequence of pairs indexed as (word, tag).

    Returns:
        (count_w_given_tag, count_tag): how many pairs carry `tag` and have
        this `word`, and how many pairs carry `tag` at all. Callers divide
        the first by the second.
    """
    count_tag = 0
    count_w_given_tag = 0
    for pair in train_bag:
        if pair[1] != tag:
            continue
        # pair carries the requested tag
        count_tag += 1
        if pair[0] == word:
            count_w_given_tag += 1
    return (count_w_given_tag, count_tag)

In [None]:
# compute  Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    """Return the counts behind the transition probability P(t2 | t1).

    Args:
        t2: the tag that follows.
        t1: the tag that precedes.
        train_bag: sequence of pairs indexed as (word, tag).

    Returns:
        (count_t2_t1, count_t1): how many adjacent positions in `train_bag`
        have `t1` immediately followed by `t2`, and how many entries carry
        `t1` overall.
    """
    tag_sequence = [pair[1] for pair in train_bag]
    count_t1 = tag_sequence.count(t1)
    # pair every tag with its successor and count the matching bigrams
    count_t2_t1 = sum(
        1
        for current, following in zip(tag_sequence, tag_sequence[1:])
        if current == t1 and following == t2
    )
    return (count_t2_t1, count_t1)

In [None]:
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)

# Snapshot one ordering of the tag set and use it for both rows and columns.
tag_order = list(tags)
tags_matrix = np.zeros((len(tag_order), len(tag_order)), dtype='float32')
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        # Each call scans the whole training bag, so call once and unpack
        # instead of calling twice per cell.
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        tags_matrix[i, j] = count_t2_t1/count_t1

print(tags_matrix)

[[7.57255405e-02 5.68902567e-02 5.41995019e-02 1.85085520e-01
  1.42225638e-01 1.60868734e-01 3.07514891e-03 2.57543717e-02
  6.16951771e-02 1.03786280e-02 2.06419379e-01 1.76821072e-02]
 [4.51343954e-02 6.03708485e-03 3.30602261e-03 2.87480245e-04
  9.91806854e-03 1.73925534e-02 2.28546783e-02 1.20741697e-02
  6.35906279e-01 4.31220367e-04 4.02472317e-02 2.06410810e-01]
 [8.83826911e-02 9.56719834e-03 6.83371304e-03 1.41230067e-02
  2.23234631e-02 4.19134386e-02 6.83371304e-03 3.69020514e-02
  2.12756261e-01 5.01138950e-03 4.84738052e-01 7.06150308e-02]
 [1.21330721e-02 1.01369865e-01 1.76125243e-02 1.17416831e-03
  1.95694715e-02 4.50097844e-02 5.67514673e-02 9.39334650e-03
  2.50489235e-01 2.34833662e-03 4.01174158e-01 8.29745606e-02]
 [3.45482156e-02 3.20931405e-01 6.96026310e-02 1.26550242e-03
  1.69577319e-02 3.87243740e-02 6.32751212e-02 1.45532778e-02
  3.23588967e-01 1.01240189e-03 8.47886596e-03 1.07061505e-01]
 [2.56410260e-02 1.72191828e-01 6.87694475e-02 2.78940029e-03
  9

In [None]:
# Wrap the transition matrix in a DataFrame labelled by tag on both axes,
# so entries can be looked up as tags_df.loc[previous_tag, next_tag].
tag_labels = list(tags)
tags_df = pd.DataFrame(tags_matrix, index=tag_labels, columns=tag_labels)
display(tags_df)

Unnamed: 0,X,DET,PRON,PRT,ADP,.,NUM,ADV,NOUN,CONJ,VERB,ADJ
X,0.075726,0.05689,0.0542,0.185086,0.142226,0.160869,0.003075,0.025754,0.061695,0.010379,0.206419,0.017682
DET,0.045134,0.006037,0.003306,0.000287,0.009918,0.017393,0.022855,0.012074,0.635906,0.000431,0.040247,0.206411
PRON,0.088383,0.009567,0.006834,0.014123,0.022323,0.041913,0.006834,0.036902,0.212756,0.005011,0.484738,0.070615
PRT,0.012133,0.10137,0.017613,0.001174,0.019569,0.04501,0.056751,0.009393,0.250489,0.002348,0.401174,0.082975
ADP,0.034548,0.320931,0.069603,0.001266,0.016958,0.038724,0.063275,0.014553,0.323589,0.001012,0.008479,0.107062
.,0.025641,0.172192,0.068769,0.002789,0.092908,0.092372,0.07821,0.052569,0.218539,0.060079,0.08969,0.046132
NUM,0.202428,0.00357,0.001428,0.026062,0.037487,0.119243,0.18422,0.00357,0.35166,0.014281,0.020707,0.035345
ADV,0.022886,0.071373,0.012025,0.01474,0.119472,0.139255,0.029868,0.081458,0.032196,0.006982,0.339022,0.130721
NOUN,0.028825,0.013106,0.004659,0.043935,0.176827,0.240094,0.009144,0.016895,0.262344,0.042454,0.149134,0.012584
CONJ,0.00933,0.123491,0.060373,0.004391,0.055982,0.035126,0.040615,0.05708,0.349067,0.000549,0.150384,0.113611


In [None]:
def Viterbi(words, train_bag = train_tagged_words):
    """Tag `words` left to right, keeping the best-scoring tag at each step.

    For every word, each candidate tag is scored as
    emission P(word | tag) * transition P(tag | previous chosen tag), and the
    highest-scoring tag is kept before moving to the next word. The first
    word is scored as if it followed a '.' tag.

    Args:
        words: sequence of word strings to tag.
        train_bag: sequence of (word, tag) pairs used for the tag set and the
            emission counts. Transition scores are read from the module-level
            `tags_df`.

    Returns:
        List of (word, tag) pairs in input order. When every candidate scores
        zero (e.g. a word absent from `train_bag`), the first tag of the
        internal tag list is returned for that word.
    """
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = []
        # the previous tag is fixed for this word, so resolve it once
        previous_tag = '.' if key == 0 else state[-1]
        for tag in T:
            transition_p = tags_df.loc[previous_tag, tag]

            # compute emission and state probabilities; one scan of the
            # training bag per (word, tag), using the same bag T was built from
            count_w_given_tag, count_tag = word_given_tag(word, tag, train_bag)
            emission_p = count_w_given_tag/count_tag
            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(words, state))

In [None]:
# Let's test our Viterbi algorithm on a few sample sentences of test dataset
random.seed(1234)      #define a random seed to get same sentences when run multiple times

# choose 10 random sentence indices
# randrange(n) draws from 0..n-1, so every index is valid and sentence 0 can be
# chosen; randint(1, n) includes n itself, which is one past the end of the list.
rndom = [random.randrange(len(test_set)) for x in range(10)]

# list of 10 sents on which we test the model
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
# NOTE: this rebinds test_tagged_words, which earlier held the (word, tag)
# pairs of the whole test set; from here on it is the words of the sample only.
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [None]:
# Only the 10 sampled sentences are tagged here: every word triggers repeated
# scans of the training data, so tagging the whole test set takes a huge
# amount of time.
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end - start

print("Time taken in seconds: ", difference)

# accuracy: share of predicted (word, tag) pairs equal to the gold pairs
check = [
    predicted
    for predicted, gold in zip(tagged_seq, test_run_base)
    if predicted == gold
]

accuracy = len(check) / len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Time taken in seconds:  51.92244505882263
Viterbi Algorithm Accuracy:  94.25837320574163


In [None]:
#modified Viterbi to include rule based tagger in it
def Viterbi_rule_based(words, train_bag = train_tagged_words):
    """Tag `words` like `Viterbi`, falling back to `rule_based_tagger`.

    Each candidate tag is scored as emission P(word | tag) * transition
    P(tag | previous chosen tag); the first word is scored as if it followed
    a '.' tag. The tag from the module-level `rule_based_tagger` is used
    instead of the best-scoring tag when every score is zero, or when the
    rule-based tag itself is 'X'.

    Args:
        words: sequence of word strings to tag.
        train_bag: sequence of (word, tag) pairs used for the tag set and the
            emission counts. Transition scores are read from the module-level
            `tags_df`.

    Returns:
        List of (word, tag) pairs in input order.
    """
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = []
        # the previous tag is fixed for this word, so resolve it once
        previous_tag = '.' if key == 0 else state[-1]
        for tag in T:
            transition_p = tags_df.loc[previous_tag, tag]

            # compute emission and state probabilities; one scan of the
            # training bag per (word, tag), using the same bag T was built from
            count_w_given_tag, count_tag = word_given_tag(word, tag, train_bag)
            emission_p = count_w_given_tag/count_tag
            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        # ask the rule based tagger once per word
        rule_tag = rule_based_tagger.tag([word])[0][1]

        if pmax == 0 or rule_tag == 'X':
            # no usable evidence from the counts, or the rules say 'X':
            # assign based on rule based tagger
            state_max = rule_tag
        else:
            # getting state for which probability is maximum
            state_max = T[p.index(pmax)]

        state.append(state_max)
    return list(zip(words, state))

In [None]:
!pip install nltk
import nltk



In [None]:
# Fallback tagger constructed with the default tag 'NOUN'.
# Viterbi_rule_based looks this name up at call time, so this cell must run
# before that function is called.
rule_based_tagger = nltk.DefaultTagger('NOUN')

Time taken in seconds:  50.30930757522583
Viterbi Algorithm Accuracy:  95.69377990430623


In [None]:
# Tag one short sentence with both taggers and print the results,
# rule-assisted tagger first.
test_sent="Justin will spot Will"
demo_tokens = test_sent.split()

pred_tags_rule = Viterbi_rule_based(demo_tokens)
pred_tags_withoutRules = Viterbi(demo_tokens)

print(pred_tags_rule)
print(pred_tags_withoutRules)

[('Justin', 'NOUN'), ('will', 'VERB'), ('spot', 'NOUN'), ('Will', 'NOUN')]
[('Justin', 'X'), ('will', 'VERB'), ('spot', 'NOUN'), ('Will', 'X')]


In [None]:
# prompt: Maximum Entropy model using Corpus dataset

import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

nltk.download('treebank')

nltk.download('universal_tagset')

# Every sentence is a list of (word, tag) pairs.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

print(nltk_data[:2])
# The loop variable is deliberately not called `tuple`: that would rebind the
# builtin for the rest of the kernel session.
for sent in nltk_data[:2]:
    for tagged_word in sent:
        print(tagged_word)

# 80/20 split of the sentences with a fixed seed.
train_set,test_set =train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 101)

# Flatten the sentences into flat lists of (word, tag) pairs.
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]
print(len(train_tagged_words))
print(len(test_tagged_words))

# Distinct tags and distinct word forms seen in the training pairs.
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)

vocab = {word for word,tag in train_tagged_words}
# compute Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    """Return the counts behind the emission probability P(word | tag).

    Args:
        word: the word form to look for.
        tag: the tag to condition on.
        train_bag: sequence of pairs indexed as (word, tag).

    Returns:
        (count_w_given_tag, count_tag): how many pairs carry `tag` and have
        this `word`, and how many pairs carry `tag` at all. Callers divide
        the first by the second.
    """
    count_tag = 0
    count_w_given_tag = 0
    for pair in train_bag:
        if pair[1] != tag:
            continue
        # pair carries the requested tag
        count_tag += 1
        if pair[0] == word:
            count_w_given_tag += 1
    return (count_w_given_tag, count_tag)
# compute  Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    """Return the counts behind the transition probability P(t2 | t1).

    Args:
        t2: the tag that follows.
        t1: the tag that precedes.
        train_bag: sequence of pairs indexed as (word, tag).

    Returns:
        (count_t2_t1, count_t1): how many adjacent positions in `train_bag`
        have `t1` immediately followed by `t2`, and how many entries carry
        `t1` overall.
    """
    tag_sequence = [pair[1] for pair in train_bag]
    count_t1 = tag_sequence.count(t1)
    # pair every tag with its successor and count the matching bigrams
    count_t2_t1 = sum(
        1
        for current, following in zip(tag_sequence, tag_sequence[1:])
        if current == t1 and following == t2
    )
    return (count_t2_t1, count_t1)
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)

# Snapshot one ordering of the tag set and use it for rows, columns and labels.
tag_order = list(tags)
tags_matrix = np.zeros((len(tag_order), len(tag_order)), dtype='float32')
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        # Each call scans the whole training bag, so call once and unpack
        # instead of calling twice per cell.
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        tags_matrix[i, j] = count_t2_t1/count_t1

print(tags_matrix)
# convert the matrix to a df for better readability
#the table is same as the transition table shown in section 3 of article
tags_df = pd.DataFrame(tags_matrix, columns = tag_order, index=tag_order)
display(tags_df)
def Viterbi(words, train_bag = train_tagged_words):
    """Tag `words` left to right, keeping the best-scoring tag at each step.

    For every word, each candidate tag is scored as
    emission P(word | tag) * transition P(tag | previous chosen tag), and the
    highest-scoring tag is kept before moving to the next word. The first
    word is scored as if it followed a '.' tag.

    Args:
        words: sequence of word strings to tag.
        train_bag: sequence of (word, tag) pairs used for the tag set and the
            emission counts. Transition scores are read from the module-level
            `tags_df`.

    Returns:
        List of (word, tag) pairs in input order. When every candidate scores
        zero (e.g. a word absent from `train_bag`), the first tag of the
        internal tag list is returned for that word.
    """
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = []
        # the previous tag is fixed for this word, so resolve it once
        previous_tag = '.' if key == 0 else state[-1]
        for tag in T:
            transition_p = tags_df.loc[previous_tag, tag]

            # compute emission and state probabilities; one scan of the
            # training bag per (word, tag), using the same bag T was built from
            count_w_given_tag, count_tag = word_given_tag(word, tag, train_bag)
            emission_p = count_w_given_tag/count_tag
            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(words, state))
# Let's test our Viterbi algorithm on a few sample sentences of test dataset
random.seed(1234)      #define a random seed to get same sentences when run multiple times

# choose 10 random sentence indices
# randrange(n) draws from 0..n-1, so every index is valid and sentence 0 can be
# chosen; randint(1, n) includes n itself, which is one past the end of the list.
rndom = [random.randrange(len(test_set)) for x in range(10)]

# list of 10 sents on which we test the model
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
# NOTE: this rebinds test_tagged_words, which earlier in this cell held the
# (word, tag) pairs of the whole test set.
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
#Here We will only test 10 sentences to check the accuracy
#as testing the whole test set takes huge amount of time
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

print("Time taken in seconds: ", difference)

# accuracy: share of predicted (word, tag) pairs equal to the gold pairs
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]

accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)
#modified Viterbi to include rule based tagger in it
def Viterbi_rule_based(words, train_bag = train_tagged_words):
    """Tag `words` like `Viterbi`, falling back to `rule_based_tagger`.

    Each candidate tag is scored as emission P(word | tag) * transition
    P(tag | previous chosen tag); the first word is scored as if it followed
    a '.' tag. The tag from the module-level `rule_based_tagger` is used
    instead of the best-scoring tag when every score is zero, or when the
    rule-based tag itself is 'X'.

    Args:
        words: sequence of word strings to tag.
        train_bag: sequence of (word, tag) pairs used for the tag set and the
            emission counts. Transition scores are read from the module-level
            `tags_df`.

    Returns:
        List of (word, tag) pairs in input order.
    """
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = []
        # the previous tag is fixed for this word, so resolve it once
        previous_tag = '.' if key == 0 else state[-1]
        for tag in T:
            transition_p = tags_df.loc[previous_tag, tag]

            # compute emission and state probabilities; one scan of the
            # training bag per (word, tag), using the same bag T was built from
            count_w_given_tag, count_tag = word_given_tag(word, tag, train_bag)
            emission_p = count_w_given_tag/count_tag
            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        # ask the rule based tagger once per word
        rule_tag = rule_based_tagger.tag([word])[0][1]

        if pmax == 0 or rule_tag == 'X':
            # no usable evidence from the counts, or the rules say 'X':
            # assign based on rule based tagger
            state_max = rule_tag
        else:
            # getting state for which probability is maximum
            state_max = T[p.index(pmax)]

        state.append(state_max)
    return list(zip(words, state))
!pip install nltk
rule_based_tagger = nltk.DefaultTagger('NOUN')

test_sent="Justin will spot Will"
pred_tags_rule=Viterbi_rule_based(test_sent.split())
pred_tags_withoutRules= Viterbi(test_sent.split())
print(pred_tags_rule)
print(pred_tags_withoutRules)
# Maximum Entropy model using Corpus dataset
def extract_features(sentence, i):
    """Build the feature dict for the token at position `i` of `sentence`.

    Args:
        sentence: sequence of (word, tag) pairs.
        i: index of the token to describe (expected to be >= 0).

    Returns:
        Dict with a constant 'bias', the word itself, its 2- and 3-character
        prefixes and suffixes, and context features for preceding tokens:
        'tag-1'..'tag-3' and 'word-1'/'word-2'. A context feature is present
        only when that many tokens actually precede position `i`.
    """
    word = sentence[i][0]
    features = {
        'bias': 1.0,
        'word': word,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:2]': word[:2],
        'word[:3]': word[:3],
    }
    # Only look back as far as the sentence start. An unguarded sentence[i-k]
    # with i < k is a negative index: it reads from the END of the sentence,
    # or raises IndexError when the sentence is shorter than k.
    for offset in (1, 2, 3):
        if i >= offset:
            features['tag-%d' % offset] = sentence[i-offset][1]
    if i > 0:
        features['word-1'] = sentence[i-1][0]
    if i > 1:
        features['word-2'] = sentence[i-2][0]
    return features

# Extract the features for each tagged word in the training set:
# one feature dict per token, visiting each sentence once.
train_set_features = [
    extract_features(sentence, i)
    for sentence in train_set
    for i in range(len(sentence))
]
# Gold tags in the same token order; the corpus pairs are (word, tag).
train_set_labels = [tag for sentence in train_set for word, tag in sentence]

# Train the Maximum Entropy model
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# LogisticRegression needs a numeric matrix, so the feature dicts are encoded
# first: DictVectorizer one-hot encodes string values and passes numbers through.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_set_features)

model = LogisticRegression(random_state=0)
model.fit(X_train, train_set_labels)

#

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]
('Pierre', 'NOUN')
('Vinken', 'NOUN')
(',', '.')
('61', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
(',', '.')
('will', 'VERB')
('join', 'VERB')
('the', 'DET')
('board', 'NOUN')
('as', 'ADP')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('Nov.', 'NOUN')
('29', 'NUM')
('.', '.')
('Mr.', 'NOUN')
('Vinken', 'NOUN')
('is', 'VERB')
('chairman', 'NOUN')
('of', 'ADP')
('Elsevier', 'NOUN')
('N.V.', 'NOUN')
(',', '.')
('the', 'DET')
('Dutch', 'NOUN')
('

Unnamed: 0,DET,VERB,ADP,.,X,CONJ,PRON,NOUN,ADJ,PRT,ADV,NUM
DET,0.006037,0.040247,0.009918,0.017393,0.045134,0.000431,0.003306,0.635906,0.206411,0.000287,0.012074,0.022855
VERB,0.13361,0.167956,0.092357,0.034807,0.21593,0.005433,0.035543,0.110589,0.06639,0.030663,0.083886,0.022836
ADP,0.320931,0.008479,0.016958,0.038724,0.034548,0.001012,0.069603,0.323589,0.107062,0.001266,0.014553,0.063275
.,0.172192,0.08969,0.092908,0.092372,0.025641,0.060079,0.068769,0.218539,0.046132,0.002789,0.052569,0.07821
X,0.05689,0.206419,0.142226,0.160869,0.075726,0.010379,0.0542,0.061695,0.017682,0.185086,0.025754,0.003075
CONJ,0.123491,0.150384,0.055982,0.035126,0.00933,0.000549,0.060373,0.349067,0.113611,0.004391,0.05708,0.040615
PRON,0.009567,0.484738,0.022323,0.041913,0.088383,0.005011,0.006834,0.212756,0.070615,0.014123,0.036902,0.006834
NOUN,0.013106,0.149134,0.176827,0.240094,0.028825,0.042454,0.004659,0.262344,0.012584,0.043935,0.016895,0.009144
ADJ,0.005243,0.011456,0.080583,0.066019,0.020971,0.016893,0.000194,0.696893,0.063301,0.011456,0.005243,0.021748
PRT,0.10137,0.401174,0.019569,0.04501,0.012133,0.002348,0.017613,0.250489,0.082975,0.001174,0.009393,0.056751


Time taken in seconds:  55.11999750137329
Viterbi Algorithm Accuracy:  93.77990430622009
[('Justin', 'NOUN'), ('will', 'VERB'), ('spot', 'NOUN'), ('Will', 'NOUN')]
[('Justin', 'DET'), ('will', 'VERB'), ('spot', 'NOUN'), ('Will', 'DET')]


ValueError: too many values to unpack (expected 2)