# Idea:
    ## Import the treebank corpus.
    ## For each word in the corpus, assign a tag from the chosen tagset (here we use the universal tagset).

The tagset consists of the following 12 coarse tags:

VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation
    
    ## Using this as our base corpus, we have a list of tuples mapping each word of the corpus to its corresponding tag.
    ## Now we build the emission probability and transition probability tables.
    ## For emission probability, we are given the tag and the word. We calculate P(word|tag) = count(word, tag) / count(tag).
    ## For transition probability, we are given the current tag (t1) and the next tag (t2). We calculate P(t2|t1) = (number of times t2 occurs immediately after t1) / (number of times t1 occurs).
    ## Next we are given a sentence. We first split it into tokens using space as a delimiter (or we may tokenize it using built-in functions).
    ## Then we find the candidate tags of each word in the sentence by computing the emission probability matrix for that sentence.
    ## Next we calculate the probability of every possible tag sequence. The one with the highest probability gives the most probable POS tag sequence.
    
    

## Importing the corpus and assigning tags

In [1]:
# Importing essential libraries

import nltk as nl
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Download the NLTK resources used below: the treebank corpus and the
# universal tagset mapping.
for resource in ('treebank', 'universal_tagset'):
    nl.download(resource)

# Read the Treebank tagged sentences with tags mapped to the universal tagset
# and materialise them as a plain list.
tagged_sentences = nl.corpus.treebank.tagged_sents(tagset='universal')
nl_data = list(tagged_sentences)
print(type(nl_data))
print(nl_data[0])
print(len(nl_data))



[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Nehal\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Nehal\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


<class 'list'>
[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]
3914


## Split the corpus and tags into train and test

In [2]:
# Split the tagged sentences 75:25 into train and test sets.
# A fixed random_state makes the shuffle repeatable, so re-running the
# notebook produces the same split and the same probabilities downstream.
tr_set, ts_set = train_test_split(
    nl_data, train_size=0.75, test_size=0.25, random_state=42
)

# Flatten the sentences into lists of (word, tag) pairs.
tr_tg_word = [tup for sent in tr_set for tup in sent]
print(tr_tg_word[1])
ts_tg_word = [tup for sent in ts_set for tup in sent]

# Unique tags present in the training data.
tags = {tag for word, tag in tr_tg_word}
print(tags)


('Rep.', 'NOUN')
{'ADJ', 'ADV', 'X', '.', 'VERB', 'PRT', 'NOUN', 'PRON', 'NUM', 'ADP', 'CONJ', 'DET'}


## Emission Probability calculation

In [3]:
# defining Emission Probability

def word_given_tag(word, tag, tr_bag=None):
    """Return the counts needed for the emission probability P(word | tag).

    Args:
        word: Token to look for.
        tag: Tag the token must carry.
        tr_bag: Sequence of (word, tag) pairs to count over. When omitted,
            the module-level ``tr_tg_word`` is looked up at call time, so a
            re-run of the train/test split is picked up automatically
            instead of silently using the list captured when this function
            was defined.

    Returns:
        A tuple ``(count(word, tag), count(tag))``. Both are 0 when the tag
        never occurs in ``tr_bag``; the caller performs the division.
    """
    if tr_bag is None:
        tr_bag = tr_tg_word
    ct_tag = 0
    ct_w_given_tg = 0
    # Single pass: count the tag, and within it the word/tag co-occurrences.
    for pair in tr_bag:
        if pair[1] == tag:
            ct_tag += 1
            if pair[0] == word:
                ct_w_given_tg += 1
    return ct_w_given_tg, ct_tag





## Transition probability calculation

In [4]:
# computation of Transition probability

def t2_with_t1(t2, t1, tr_bag=None):
    """Return the counts needed for the transition probability P(t2 | t1).

    Args:
        t2: The following tag.
        t1: The preceding tag.
        tr_bag: Sequence of (word, tag) pairs to count over. When omitted,
            the module-level ``tr_tg_word`` is looked up at call time, so a
            re-run of the train/test split is picked up automatically
            instead of silently using the list captured when this function
            was defined.

    Returns:
        A tuple ``(count(t1 immediately followed by t2), count(t1))``. Both
        are 0 when ``t1`` never occurs; the caller performs the division.

    Note:
        Adjacent pairs are taken from the flat list exactly as given, so a
        pair that straddles two sentences in a flattened corpus is counted
        as a transition as well.
    """
    if tr_bag is None:
        tr_bag = tr_tg_word
    tag_seq = [pair[1] for pair in tr_bag]
    ct_t1 = tag_seq.count(t1)
    # zip(seq, seq[1:]) yields each (current, next) pair of adjacent tags.
    ct_t2_t1 = sum(
        1
        for current_tag, next_tag in zip(tag_seq, tag_seq[1:])
        if current_tag == t1 and next_tag == t2
    )
    return ct_t2_t1, ct_t1


## Transition probability matrix

In [5]:
# Transition table: one row per preceding tag plus one extra row for the
# sentence-start pseudo-tag, and one column per following tag.
# tgs_mtx[i][j] holds P(j-th tag comes right after the i-th tag).
n_tags = len(tags)
tgs_mtx = np.zeros(shape=(n_tags + 1, n_tags), dtype='float32')
print(tgs_mtx)
# Turn the tag set into an ordered list and add the pseudo-tag as its last entry.
tags = [*tags, "Start"]
print(tags)


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
['ADJ', 'ADV', 'X', '.', 'VERB', 'PRT', 'NOUN', 'PRON', 'NUM', 'ADP', 'CONJ', 'DET', 'Start']


In [6]:
# Fill the rows of the real tags; the last entry of `tags` ("Start") has no
# counts in the training data and is left for manual initialisation.
real_tags = tags[:-1]
for i, prev_tag in enumerate(real_tags):
    for j, next_tag in enumerate(real_tags):
        # t2_with_t1 scans the whole training list, so call it once per cell
        # and reuse both counts instead of scanning twice.
        ct_pair, ct_prev = t2_with_t1(next_tag, prev_tag)
        tgs_mtx[i][j] = ct_pair / ct_prev
# Column labels: every tag except the trailing "Start" pseudo-tag.
tags2 = tags[:-1]
print(tags2)

['ADJ', 'ADV', 'X', '.', 'VERB', 'PRT', 'NOUN', 'PRON', 'NUM', 'ADP', 'CONJ', 'DET']


In [7]:
print(type(tgs_mtx))
# Wrap the transition matrix in a labelled DataFrame: rows are the preceding
# tags (including "Start"), columns are the following tags.
# NOTE(review): whether the frame shares memory with tgs_mtx depends on the
# pandas version/settings — confirm before relying on edits to one showing
# up in the other.
tags_df = pd.DataFrame(data=tgs_mtx, index=tags, columns=tags2)
tags_df

<class 'numpy.ndarray'>


Unnamed: 0,ADJ,ADV,X,.,VERB,PRT,NOUN,PRON,NUM,ADP,CONJ,DET
ADJ,0.067277,0.00511,0.022993,0.066425,0.012348,0.010645,0.69917,0.000639,0.021077,0.073877,0.015329,0.00511
ADV,0.127288,0.079183,0.026394,0.140485,0.341422,0.012771,0.0298,0.013197,0.036611,0.1192,0.007663,0.065986
X,0.016321,0.024985,0.075559,0.165827,0.201088,0.190006,0.061656,0.05541,0.003425,0.142656,0.010276,0.052791
.,0.042914,0.0535,0.028458,0.094707,0.08856,0.002732,0.220831,0.067501,0.081047,0.087877,0.058167,0.173591
VERB,0.063903,0.079854,0.217704,0.035939,0.169555,0.030819,0.111954,0.035644,0.022647,0.092556,0.004923,0.134502
PRT,0.081395,0.010797,0.01412,0.040282,0.414452,0.002492,0.24294,0.017442,0.05897,0.017857,0.002076,0.097176
NOUN,0.012366,0.01666,0.029213,0.240422,0.147464,0.043119,0.265248,0.004807,0.009846,0.175883,0.041766,0.013206
PRON,0.074181,0.035164,0.086224,0.042389,0.486994,0.011561,0.213391,0.006262,0.006262,0.023603,0.004335,0.009634
NUM,0.033309,0.003331,0.208364,0.11695,0.019985,0.027017,0.34567,0.00185,0.189119,0.03664,0.014064,0.003701
ADP,0.107143,0.013462,0.034478,0.040659,0.007692,0.001511,0.317995,0.071703,0.063462,0.016484,0.000824,0.324588


In [8]:
# Hand-set sentence-start probabilities: P(DET | Start) = 0.8 and
# P(NOUN | Start) = 0.2; every other start transition stays 0.
# Use a single .loc[row, col] assignment: the chained form
# df.loc[row][col] = v can write to a temporary copy and leave the frame
# unchanged.
tags_df.loc["Start", "DET"] = 0.8
tags_df.loc["Start", "NOUN"] = 0.2
# The decoding step reads the start row from the raw tgs_mtx array, so set
# the same values there explicitly instead of relying on the DataFrame
# sharing memory with the array.
start_row = tags.index("Start")
tgs_mtx[start_row][tags2.index("DET")] = 0.8
tgs_mtx[start_row][tags2.index("NOUN")] = 0.2
tags_df

Unnamed: 0,ADJ,ADV,X,.,VERB,PRT,NOUN,PRON,NUM,ADP,CONJ,DET
ADJ,0.067277,0.00511,0.022993,0.066425,0.012348,0.010645,0.69917,0.000639,0.021077,0.073877,0.015329,0.00511
ADV,0.127288,0.079183,0.026394,0.140485,0.341422,0.012771,0.0298,0.013197,0.036611,0.1192,0.007663,0.065986
X,0.016321,0.024985,0.075559,0.165827,0.201088,0.190006,0.061656,0.05541,0.003425,0.142656,0.010276,0.052791
.,0.042914,0.0535,0.028458,0.094707,0.08856,0.002732,0.220831,0.067501,0.081047,0.087877,0.058167,0.173591
VERB,0.063903,0.079854,0.217704,0.035939,0.169555,0.030819,0.111954,0.035644,0.022647,0.092556,0.004923,0.134502
PRT,0.081395,0.010797,0.01412,0.040282,0.414452,0.002492,0.24294,0.017442,0.05897,0.017857,0.002076,0.097176
NOUN,0.012366,0.01666,0.029213,0.240422,0.147464,0.043119,0.265248,0.004807,0.009846,0.175883,0.041766,0.013206
PRON,0.074181,0.035164,0.086224,0.042389,0.486994,0.011561,0.213391,0.006262,0.006262,0.023603,0.004335,0.009634
NUM,0.033309,0.003331,0.208364,0.11695,0.019985,0.027017,0.34567,0.00185,0.189119,0.03664,0.014064,0.003701
ADP,0.107143,0.013462,0.034478,0.040659,0.007692,0.001511,0.317995,0.071703,0.063462,0.016484,0.000824,0.324588


In [9]:
# Display the flattened list of training (word, tag) pairs.
tr_tg_word

[('But', 'CONJ'),
 ('Rep.', 'NOUN'),
 ('Hammerschmidt', 'NOUN'),
 ('said', 'VERB'),
 ('that', 'ADP'),
 ('the', 'DET'),
 ('provision', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('he', 'PRON'),
 ('dubbed', 'VERB'),
 ('*T*-2', 'X'),
 ('a', 'DET'),
 ('``', '.'),
 ('special', 'ADJ'),
 ('interest', 'NOUN'),
 ("''", '.'),
 ('amendment', 'NOUN'),
 (',', '.'),
 ('was', 'VERB'),
 ('likely', 'ADJ'),
 ('*-1', 'X'),
 ('to', 'PRT'),
 ('make', 'VERB'),
 ('the', 'DET'),
 ('bill', 'NOUN'),
 ('even', 'ADV'),
 ('more', 'ADV'),
 ('controversial', 'ADJ'),
 ('.', '.'),
 ('CALL', 'NOUN'),
 ('MONEY', 'NOUN'),
 (':', '.'),
 ('9', 'NUM'),
 ('3\\/4', 'NUM'),
 ('%', 'NOUN'),
 ('.', '.'),
 ('Dan', 'NOUN'),
 ('E.', 'NOUN'),
 ('Nelms', 'NOUN'),
 (',', '.'),
 ('Valley', 'NOUN'),
 ('Federal', 'NOUN'),
 ("'s", 'PRT'),
 ('president', 'NOUN'),
 ('and', 'CONJ'),
 ('chief', 'ADJ'),
 ('executive', 'NOUN'),
 ('officer', 'NOUN'),
 (',', '.'),
 ('said', 'VERB'),
 ('0', 'X'),
 ('the', 'DET'),
 ('one-time', 'ADJ'),
 ('charge', '

In [10]:
# Display the flattened list of test (word, tag) pairs.
ts_tg_word

[('The', 'DET'),
 ('energy', 'NOUN'),
 ('segment', 'NOUN'),
 (',', '.'),
 ('with', 'ADP'),
 ('a', 'DET'),
 ('15', 'NUM'),
 ('%', 'NOUN'),
 ('rise', 'NOUN'),
 ('in', 'ADP'),
 ('operating', 'NOUN'),
 ('profit', 'NOUN'),
 (',', '.'),
 ('is', 'VERB'),
 ('clearly', 'ADV'),
 ('the', 'DET'),
 ('company', 'NOUN'),
 ("'s", 'PRT'),
 ('strongest', 'ADJ'),
 ('.', '.'),
 ('In', 'ADP'),
 ('reference', 'NOUN'),
 ('to', 'PRT'),
 ('your', 'PRON'),
 ('Oct.', 'NOUN'),
 ('9', 'NUM'),
 ('page-one', 'NOUN'),
 ('article', 'NOUN'),
 ('``', '.'),
 ('Barbara', 'NOUN'),
 ('Bush', 'NOUN'),
 ('Earns', 'VERB'),
 ('Even', 'ADV'),
 ('Higher', 'ADJ'),
 ('Ratings', 'NOUN'),
 ('Than', 'ADP'),
 ('the', 'DET'),
 ('President', 'NOUN'),
 (',', '.'),
 ("''", '.'),
 ('it', 'PRON'),
 ('*EXP*-1', 'X'),
 ('is', 'VERB'),
 ('regrettable', 'ADJ'),
 ('that', 'ADP'),
 ('you', 'PRON'),
 ('must', 'VERB'),
 ('continually', 'ADV'),
 ('define', 'VERB'),
 ('blacks', 'NOUN'),
 ('by', 'ADP'),
 ('our', 'PRON'),
 ('negatives', 'NOUN'),
 (':', 

## Emission probability matrix

In [11]:
# Sanity check: P("Mr." | NOUN). Fetch both counts with a single scan of the
# training data instead of calling word_given_tag twice.
ct_word_tag, ct_tag = word_given_tag("Mr.", "NOUN")
print(ct_word_tag / ct_tag)

0.013066405338559895


In [12]:
# Read the sentence to tag. input() already returns a str.
string = input()
# split() with no argument splits on runs of whitespace and ignores leading
# and trailing blanks, so stray double spaces do not create empty-string
# tokens (which would have no candidate tag and empty out the path search).
words = string.split()
print(words)
# One row per real tag (the trailing "Start" pseudo-tag is excluded) and one
# column per word of the sentence.
emission_mtx = np.zeros((len(tags)-1, len(words)), dtype='float32')


company is operating with profit .
['company', 'is', 'operating', 'with', 'profit', '.']


In [13]:
print(len(tags)-1)
# emission_mtx[i][j] = P(words[j] | tags[i]) for every real tag (the trailing
# "Start" entry of `tags` is skipped).
for i in range(len(tags)-1):
    for j, word in enumerate(words):
        # word_given_tag scans the whole training list, so call it once per
        # cell and reuse both counts instead of scanning twice.
        ct_word_tag, ct_tag = word_given_tag(word, tags[i])
        emission_mtx[i][j] = ct_word_tag / ct_tag
emission_mtx

12


array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.32737622],
       [0.        , 0.0497243 , 0.00078771, 0.        , 0.00039386,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.00881982, 0.        , 0.00037333, 0.        , 0.00167997,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.03763736, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0

In [14]:
# Label the emission matrix for display: rows are the real tags, columns are
# the words of the input sentence.
emission_df = pd.DataFrame(data=emission_mtx, index=tags2, columns=words)
emission_df

Unnamed: 0,company,is,operating,with,profit,.
ADJ,0.0,0.0,0.0,0.0,0.0,0.0
ADV,0.0,0.0,0.0,0.0,0.0,0.0
X,0.0,0.0,0.0,0.0,0.0,0.0
.,0.0,0.0,0.0,0.0,0.0,0.327376
VERB,0.0,0.049724,0.000788,0.0,0.000394,0.0
PRT,0.0,0.0,0.0,0.0,0.0,0.0
NOUN,0.00882,0.0,0.000373,0.0,0.00168,0.0
PRON,0.0,0.0,0.0,0.0,0.0,0.0
NUM,0.0,0.0,0.0,0.0,0.0,0.0
ADP,0.0,0.0,0.0,0.037637,0.0,0.0


In [15]:
# Display the transition probability table again for reference.
tags_df

Unnamed: 0,ADJ,ADV,X,.,VERB,PRT,NOUN,PRON,NUM,ADP,CONJ,DET
ADJ,0.067277,0.00511,0.022993,0.066425,0.012348,0.010645,0.69917,0.000639,0.021077,0.073877,0.015329,0.00511
ADV,0.127288,0.079183,0.026394,0.140485,0.341422,0.012771,0.0298,0.013197,0.036611,0.1192,0.007663,0.065986
X,0.016321,0.024985,0.075559,0.165827,0.201088,0.190006,0.061656,0.05541,0.003425,0.142656,0.010276,0.052791
.,0.042914,0.0535,0.028458,0.094707,0.08856,0.002732,0.220831,0.067501,0.081047,0.087877,0.058167,0.173591
VERB,0.063903,0.079854,0.217704,0.035939,0.169555,0.030819,0.111954,0.035644,0.022647,0.092556,0.004923,0.134502
PRT,0.081395,0.010797,0.01412,0.040282,0.414452,0.002492,0.24294,0.017442,0.05897,0.017857,0.002076,0.097176
NOUN,0.012366,0.01666,0.029213,0.240422,0.147464,0.043119,0.265248,0.004807,0.009846,0.175883,0.041766,0.013206
PRON,0.074181,0.035164,0.086224,0.042389,0.486994,0.011561,0.213391,0.006262,0.006262,0.023603,0.004335,0.009634
NUM,0.033309,0.003331,0.208364,0.11695,0.019985,0.027017,0.34567,0.00185,0.189119,0.03664,0.014064,0.003701
ADP,0.107143,0.013462,0.034478,0.040659,0.007692,0.001511,0.317995,0.071703,0.063462,0.016484,0.000824,0.324588


In [16]:
# For each word, collect the indices of the tags whose emission probability
# for that word is non-zero. A word with no non-zero emission gets an empty
# candidate list.
tags_possible = []
listt = []
n_real_tags = len(tags) - 1  # the trailing "Start" pseudo-tag is not a candidate
for word_pos in range(len(words)):
    tags_possible = [
        tag_idx
        for tag_idx in range(n_real_tags)
        if emission_mtx[tag_idx][word_pos] != 0
    ]
    listt.append(tags_possible)
print(listt)

[[6], [4], [4, 6], [9], [4, 6], [3]]


In [17]:
# Exhaustive decoding: build every tag path over the candidate lists in
# `listt` together with its joint probability. Each entry of `previous` is a
# single-item dict {('start', tag_idx, ...): probability}. The number of
# paths is the product of the candidate-list lengths.

# Locate the start row by name instead of hard-coding its index.
start_row = tags.index("Start")

# Probabilities are accumulated as Python floats (double precision): the
# matrices are float32, and a product of many small float32 factors
# underflows to 0 after relatively few words.
start = [
    {('start', j): float(tgs_mtx[start_row][j]) * float(emission_mtx[j][0])}
    for j in listt[0]
]
previous = start
for i in range(1, len(listt)):  # remaining words of the sentence
    current = []
    for j in listt[i]:  # candidate tags of the current word
        for m in previous:  # each m is a single-item dict: path -> probability
            for key, value in m.items():
                previous_tag = key[-1]  # last tag index on the path so far
                current_prob = (
                    value
                    * float(tgs_mtx[previous_tag][j])
                    * float(emission_mtx[j][i])
                )
                # Extend the path with the current tag index.
                current.append({key + (j,): current_prob})
    previous = current
                

                
        
        

In [18]:
# Display every complete tag path together with its probability.
previous

[{('start', 6, 4, 4, 9, 4, 3): 2.1451476e-19},
 {('start', 6, 4, 6, 9, 4, 3): 1.2756288e-19},
 {('start', 6, 4, 4, 9, 6, 3): 2.5303886e-16},
 {('start', 6, 4, 6, 9, 6, 3): 1.5047153e-16}]

In [19]:
# Pick the most probable complete path: remember the position in `previous`
# of the first entry with the strictly highest probability.
probability = 0
indexx = 0
for position, path_entry in enumerate(previous):
    for path_probability in path_entry.values():
        if path_probability > probability:
            indexx, probability = position, path_probability
        

In [20]:
# Display the probability of the best path found above.
probability

2.5303886e-16

In [21]:
# Display the position of the best path inside `previous`.
indexx

2

In [22]:
# Keys view of the winning single-item dict; its only key is the tag-index path.
sequence=previous[indexx].keys()
sequence

dict_keys([('start', 6, 4, 4, 9, 6, 3)])

In [23]:
# The view holds a single key; the loop just pulls that path tuple out of it.
for a in sequence:
    sequence_final=a

In [24]:
# Convert the path tuple into a list.
sequence_final=list(sequence_final)

In [25]:
# Display the best path: the 'start' marker followed by one tag index per word.
sequence_final

['start', 6, 4, 4, 9, 6, 3]

In [26]:
# Translate the tag indices of the best path into tag names, skipping the
# leading 'start' marker.
pos_sequence = [tags2[tag_idx] for tag_idx in sequence_final[1:]]

In [27]:
# Report the input words alongside the predicted tag for each of them.
print("pos tag sequence for",words,"is",pos_sequence)

pos tag sequence for ['company', 'is', 'operating', 'with', 'profit', '.'] is ['NOUN', 'VERB', 'VERB', 'ADP', 'NOUN', '.']


## End :)