In [1]:
%pwd

'D:\\Documents\\Projects\\POS TAGGER'

In [2]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [3]:
dataset = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [4]:
print(dataset[0])

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]


In [5]:
train_data,test_data =train_test_split(dataset,train_size=0.80,test_size=0.20,random_state = 26)

In [6]:
print("Total no. of sentences={}".format(len(dataset)))
print("Total no. of training sentences={}".format(len(train_data)))
print("Total no. of testing sentences={}".format(len(test_data)))

Total no. of sentences=3914
Total no. of training sentences=3131
Total no. of testing sentences=783


In [7]:
def sent_to_word(sent_list):
    words=[]
    for sent in sent_list:
        for word in sent:
            words.append(word)
    return words

In [8]:
train_word_tuples=sent_to_word(train_data)
test_word_tuples=sent_to_word(test_data)

In [9]:
print("Total no. of training words={}".format(len(train_word_tuples)))
print("Total no. of testing words={}".format(len(test_word_tuples)))
print(train_word_tuples[:3])

Total no. of training words=80581
Total no. of testing words=20095
[('Congress', 'NOUN'), ('learned', 'VERB'), ('during', 'ADP')]


In [10]:
unq_words=set()
unq_tags=set()
for word in train_word_tuples:
    unq_words.add(word[0])
    unq_tags.add(word[1])
print("No. of unique words in vocabulary: ", len(unq_words))
print("No. of unique tags: ", len(unq_tags))
print(unq_tags)

No. of unique words in vocabulary:  11042
No. of unique tags:  12
{'ADJ', 'PRT', 'X', 'PRON', 'DET', '.', 'NOUN', 'ADV', 'NUM', 'VERB', 'ADP', 'CONJ'}


In [11]:
def emission_prob(data):
    ems_dict={}
    for tuples in data:
        if tuples[0] not in ems_dict:
            ems_dict[tuples[0]]={}
        if tuples[0] in ems_dict:
            ems_dict[tuples[0]][tuples[1]]=ems_dict[tuples[0]].get(tuples[1], 0) + 1
    
    emission_df=pd.DataFrame(ems_dict).transpose()
    for tag in unq_tags:
        emission_df[tag].fillna(0, inplace=True)
        
    for tag in unq_tags:
        col_sum=emission_df[tag].sum()
        emission_df[tag] = emission_df[tag]/col_sum
        #print(tag, col_sum)
    return emission_df

emission_df=emission_prob(train_word_tuples)
print(emission_df.head(10))

                    NOUN      VERB       ADP       DET       ADJ       NUM  \
Congress        0.001775  0.000000  0.000000  0.000000  0.000000  0.000000   
learned         0.000000  0.000277  0.000000  0.000000  0.000000  0.000000   
during          0.000000  0.000000  0.004328  0.000000  0.000000  0.000000   
the             0.000043  0.000000  0.000000  0.460852  0.000975  0.000347   
Reagan          0.000303  0.000000  0.000000  0.000000  0.000000  0.000000   
administration  0.000996  0.000000  0.000000  0.000000  0.000000  0.000000   
that            0.000000  0.000000  0.051044  0.033967  0.000000  0.000000   
it              0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
could           0.000000  0.008313  0.000000  0.000000  0.000000  0.000000   
intimidate      0.000000  0.000092  0.000000  0.000000  0.000000  0.000000   

                     ADV      PRON  PRT    X  CONJ    .  
Congress        0.000000  0.000000  0.0  0.0   0.0  0.0  
learned         0.000000 

In [12]:
def create_empty_dict(data=train_data):
    tns_dict = {}
    tns_dict['START']={}
    for tag in unq_tags:
        tns_dict[tag]={}
    
    for item in tns_dict.items():
        for tag in unq_tags:
            item[1][tag]=0
        item[1]['END']=0
    return tns_dict

In [13]:
def transition_prob(data=train_data):
    tns_dict = create_empty_dict()
    for sent in data:
        for i, word in enumerate(sent):
            if i == 0:
                tns_dict['START'][word[1]] += 1
            elif i == len(sent)-1:
                tns_dict[word[1]]['END'] += 1
            else:
                tns_dict[word[1]][sent[i+1][1]] += 1
                
    for item in tns_dict.items():
        s = sum(item[1].values())
        for key in item[1]:
            tns_dict[item[0]][key] /= s
    return tns_dict

In [14]:
transition_df=pd.DataFrame(transition_prob()).transpose()
print(transition_df)

            ADJ       PRT         X      PRON       DET         .      NOUN  \
START  0.042159  0.001278  0.024912  0.069307  0.237304  0.083360  0.290003   
ADJ    0.065626  0.010004  0.021008  0.000600  0.004002  0.063425  0.701281   
PRT    0.087214  0.001973  0.012628  0.016180  0.100631  0.043410  0.245462   
X      0.016858  0.184483  0.075862  0.058621  0.054406  0.171264  0.062261   
PRON   0.080571  0.015298  0.089240  0.007139  0.011729  0.043855  0.221316   
DET    0.212732  0.000322  0.052055  0.003546  0.004996  0.018211  0.631426   
.      0.028624  0.001952  0.019191  0.032311  0.090968  0.070693  0.127290   
NOUN   0.012076  0.043079  0.029110  0.004641  0.013383  0.248603  0.252388   
ADV    0.137626  0.013889  0.023569  0.011785  0.068603  0.112374  0.029461   
NUM    0.032620  0.027008  0.219923  0.001403  0.002455  0.114697  0.354612   
VERB   0.067161  0.030570  0.217508  0.035387  0.133302  0.035387  0.107550   
ADP    0.104832  0.000940  0.034362  0.071275  0.321

In [15]:
reg_exp_list=[
          (r'.*ed$', 'VERB'),
          (r'.*er$', 'VERB'),
          (r'.*est$', 'VERB'),
         (r'.*ing$', 'VERB'),
         (r' \'s$', 'NOUN'),
         (r'.*ly$', 'ADV'),
         (r'.*able$', 'ADJ'),
         (r'.*ness$', 'NOUN'),
         (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), 
         (r'\*T?\*?-[0-9]+$', 'X'),
         (r'.*es$', 'VERB'),
         (r'.*s$', 'NOUN'),
         (r'.*$', 'NOUN')]

def find_reg_exp(word, exp_list):
    tag='NaN'
    for tuples in exp_list:
        if re.search(tuples[0], word) != None:
            tag=tuples[1]
            break
    return tag

In [16]:
def viterbi_algo(sent, tns_df=transition_df, ems_df=emission_df):
    print('\n',sent,'\n')
    dict_list=[]
    prev_info_dict={}
    new_info_dict={}
    final_tags=[]
    for i, word in enumerate(sent):
        new_info_dict={}
        if i==0:
            for tag in unq_tags:
                ems_p = 0
                if word in ems_df.index:
                    ems_p = ems_df[tag][word]
                    if ems_p == 0:
                        continue
                else:
                    ems_p = 1
                    reg_tag=find_reg_exp(word, reg_exp_list)
                    if reg_tag !='NaN' and reg_tag != tag:
                        continue
                prob=ems_p*tns_df[tag]['START']
                prev_info_dict[tag]=[prob, 'START']
                print(prev_info_dict)
            dict_list.append(prev_info_dict)
            
        else:
            prob=0
            p_tag='START'
            for tag in unq_tags:
                ems_p = 0
                if word in ems_df.index and ems_df[tag][word] == 0:
                    continue
                if word in ems_df.index:
                    ems_p = ems_df[tag][word]
                else:
                    ems_p = 1
                    reg_tag=find_reg_exp(word, reg_exp_list)
                    if reg_tag !='NaN' and reg_tag != tag:
                        continue
                for prev_tag in unq_tags:
                    if prev_tag not in prev_info_dict:
                        continue
                    
                    new_prob = ems_p*prev_info_dict[prev_tag][0]*tns_df[tag][prev_tag]
                    if new_prob > prob:
                        prob = new_prob
                        p_tag = prev_tag
                new_info_dict[tag] = [prob, p_tag]
            prev_info_dict = new_info_dict
            print(prev_info_dict)
            dict_list.append(prev_info_dict)
    
    prob=0
    for tag in unq_tags:
        if tag not in prev_info_dict:
            continue
        if tns_df['END'][tag] == 0:
            continue
        new_prob = prev_info_dict[tag][0]*tns_df['END'][tag]
        if new_prob > prob:
            prob = new_prob
            p_tag = tag
    final_tags.append(p_tag)
    i=-1
    prev_tag = final_tags[-1]
    final_list=[]
    try:
        while prev_tag != 'START':
            final_tags.append(dict_list[i][prev_tag][1])
            prev_tag = final_tags[-1]
            i -= 1
        final_tags.reverse()
 
        for i in range(len(sent)):
            tup=(sent[i], final_tags[i+1])
            final_list.append(tup)
    except:
        pass
    return final_list

In [17]:
correct, total=0,0
for sent in test_data:
    untagged_sent=[]
    for word in sent:
        untagged_sent.append(word[0])
    test_tagged_tuples=viterbi_algo(untagged_sent)
    try:
        for i in range(len(sent)):
            if sent[i] == test_tagged_tuples[i]:
                correct += 1
            total += 1
    except:
        pass
accuracy = correct*100/total
print("Accuracy is: ", accuracy)


 ['When', 'Warren', 'Winiarski', ',', 'proprietor', 'of', 'Stag', "'s", 'Leap', 'Wine', 'Cellars', 'in', 'Napa', 'Valley', ',', 'announced', 'a', '$', '75', '*U*', 'price', 'tag', 'for', 'his', '1985', 'Cask', '23', 'Cabernet', 'this', 'fall', '*T*-1', ',', 'few', 'wine', 'shops', 'and', 'restaurants', 'around', 'the', 'country', 'balked', '.'] 

{'ADV': [0.00039404862145200697, 'START']}
{'NOUN': [1.0051235115090476e-09, 'ADV']}
{'NOUN': [1.0981877147652317e-14, 'NOUN']}
{'.': [1.1376495827244326e-15, 'NOUN']}
{'NOUN': [1.448119494869873e-16, '.']}
{'ADP': [6.0790395330339885e-18, 'NOUN']}
{'NOUN': [1.6997785079728958e-22, 'ADP']}
{'PRT': [1.7079842879654564e-24, 'NOUN'], 'VERB': [1.7079842879654564e-24, 'NOUN']}
{'NOUN': [3.6298247799263825e-29, 'PRT']}
{'NOUN': [7.931819193207047e-34, 'NOUN']}
{'NOUN': [2.0018979497635485e-34, 'NOUN']}
{'PRT': [3.7377000876712674e-38, 'NOUN'], 'NOUN': [3.7377000876712674e-38, 'NOUN'], 'ADV': [3.7377000876712674e-38, 'NOUN'], 'ADP': [5.7252190740210

{'NOUN': [1.55452957578185e-10, 'NOUN']}
{'NOUN': [1.1889242306747368e-14, 'NOUN']}
{'VERB': [8.15024416598628e-17, 'NOUN']}
{'X': [2.971303929514048e-18, 'VERB']}
{'ADJ': [4.882158622878645e-23, 'X'], 'DET': [7.450002734529109e-20, 'X'], 'NOUN': [7.450002734529109e-20, 'X'], 'NUM': [7.450002734529109e-20, 'X']}
{'NOUN': [1.6291350412917973e-23, 'DET']}
{'ADJ': [2.1476628126275675e-27, 'NOUN'], 'ADV': [2.1476628126275675e-27, 'NOUN']}
{'VERB': [2.141405400635815e-31, 'ADV']}
{'PRT': [2.579280466176987e-36, 'VERB'], 'ADV': [6.994869098957514e-36, 'VERB'], 'ADP': [1.6545825116344124e-33, 'VERB']}
{'NOUN': [9.252855743612298e-38, 'ADP']}
{'PRT': [1.7275805556519086e-41, 'NOUN'], 'NOUN': [1.7275805556519086e-41, 'NOUN'], 'ADV': [1.7275805556519086e-41, 'NOUN'], 'ADP': [2.6462201132056463e-39, 'NOUN']}
{'NUM': [2.9589503419504094e-43, 'ADP']}
{'ADJ': [3.7630157716472806e-48, 'NUM'], 'NOUN': [4.5423399638189747e-48, 'NUM'], 'ADP': [4.5423399638189747e-48, 'NUM'], 'CONJ': [2.638019721051388e-

{'NOUN': [6.181066430795102e-91, 'NOUN']}
{'.': [4.967645046731255e-92, 'NOUN']}

 ['It', '*EXP*-3', "'s", 'probably', 'true', 'that', 'many', 'salarymen', 'put', 'in', 'unproductive', 'overtime', 'just', 'for', 'the', 'sake', 'of', 'solidarity', ',', 'that', 'the', 'system', 'is', 'so', 'hierarchical', 'that', 'only', 'the', 'assistant', 'manager', 'can', 'talk', 'to', 'the', 'manager', 'and', 'the', 'manager', 'to', 'the', 'general', 'manager', ',', 'and', 'that', 'Sony', 'was', 'chary', 'of', '*-4', 'letting', 'a', 'young', ',', 'short-term', 'American', 'employee', 'take', 'on', 'any', 'responsibility', '.'] 

{'PRON': [0.0024820663884570276, 'START']}
{'X': [8.361648175280172e-08, 'PRON']}
{'PRT': [3.598137564298166e-09, 'X'], 'VERB': [3.598137564298166e-09, 'X']}
{'ADV': [1.1753263186332957e-12, 'VERB']}
{'ADJ': [1.2612535563336014e-16, 'ADV']}
{'DET': [1.7143059831573963e-20, 'ADJ'], 'ADV': [1.7143059831573963e-20, 'ADJ'], 'ADP': [5.062226818000143e-19, 'ADJ']}
{'ADJ': [7.034413

{'NUM': [4.575067978739823e-77, 'ADP']}
{'.': [1.6964016725915645e-78, 'NUM']}

 ['The', 'Oct.', '13', 'plunge', 'was', 'triggered', '*-72', 'not', 'by', 'program', 'traders', ',', 'but', 'by', 'news', 'of', 'the', 'unraveling', 'of', 'the', '$', '6.79', 'billion', '*U*', 'buy-out', 'of', 'UAL', 'Corp', '.'] 

{'DET': [0.01963874726098717, 'START']}
{'DET': [0.01963874726098717, 'START'], 'NOUN': [5.0217003267146994e-05, 'START']}
{'NOUN': [1.0736295163242565e-05, 'DET']}
{'NUM': [7.647293647382999e-10, 'NOUN']}
{'NOUN': [3.521851014911246e-14, 'NUM']}
{'VERB': [1.4240472628480397e-16, 'NOUN']}
{'VERB': [4.4130826982221085e-21, 'VERB']}
{'X': [9.5988125756605e-22, 'VERB']}
{'ADV': [9.098740088885532e-25, 'X']}
{'PRT': [4.9791721876836156e-30, 'ADV'], 'ADV': [2.9236938074894845e-29, 'ADV'], 'ADP': [4.8714170382695794e-27, 'ADV']}
{'NOUN': [6.878663347579751e-30, 'ADP']}
{'NOUN': [2.8559118639769156e-33, 'NOUN']}
{'.': [2.95853513626827e-34, 'NOUN']}
{'ADV': [3.9513381290710414e-39, '.']

{'ADJ': [1.6653976468264976e-94, 'DET']}
{'NOUN': [5.561480546625197e-97, 'ADJ'], 'VERB': [5.561480546625197e-97, 'ADJ']}
{'NOUN': [3.645850211805322e-101, 'NOUN']}
{'.': [2.9301237818065415e-102, 'NOUN']}

 ['In', 'CAT', 'sections', 'where', 'students', "'", 'knowledge', 'of', 'two-letter', 'consonant', 'sounds', 'is', 'tested', '*-1', '*T*-2', ',', 'the', 'authors', 'noted', 'that', 'Scoring', 'High', 'concentrated', 'on', 'the', 'same', 'sounds', 'that', 'the', 'test', 'does', '*?*', '*T*-4', '--', 'to', 'the', 'exclusion', 'of', 'other', 'sounds', 'that', 'fifth', 'graders', 'should', 'know', '*T*-3', '.'] 

{'NOUN': [2.5108501633573497e-05, 'START']}
{'NOUN': [2.5108501633573497e-05, 'START'], 'ADP': [0.002739993794399478, 'START']}
{'NOUN': [1.5322757940265885e-07, 'ADP']}
{'NOUN': [3.8672840313369335e-08, 'NOUN']}
{'ADV': [7.184564545967708e-12, 'NOUN']}
{'NOUN': [1.8326100770247188e-16, 'ADV']}
{'PRT': [1.399756248526425e-19, 'NOUN'], '.': [1.399756248526425e-19, 'NOUN']}
{'NOU

{'ADJ': [2.926136221657644e-10, '.'], 'ADV': [4.046769658197813e-09, '.']}
{'PRON': [2.1895869938584416e-14, 'ADV'], 'NOUN': [6.218310025167668e-14, 'ADJ'], 'NUM': [4.627746995053578e-12, 'ADV']}
{'ADJ': [5.002497669624972e-16, 'NUM']}
{'NOUN': [4.556044322153674e-20, 'ADJ']}
{'X': [1.2516475827756865e-23, 'NOUN']}
{'VERB': [2.4921360301840123e-27, 'X']}
{'.': [1.078645917870833e-30, 'VERB']}
{'ADJ': [6.018569186785978e-36, '.']}
{'NOUN': [4.2207052820497903e-36, 'ADJ']}
{'NOUN': [9.222999234059962e-41, 'NOUN']}
{'.': [7.412408032506861e-42, 'NOUN']}

 ['But', 'in', 'recent', 'days', ',', 'Columbia', 'has', 'edged', 'up', ',', '*-1', 'closing', 'at', '5', '1\\/4', ',', 'up', '3\\/8', ',', 'yesterday', 'on', 'revived', 'speculation', 'that', 'the', 'thrift', 'might', 'restructure', '.'] 

{'CONJ': [0.0033278327832783282, 'START']}
{'PRT': [6.137545163063832e-08, 'CONJ'], 'NOUN': [6.137545163063832e-08, 'CONJ'], 'ADV': [7.56335584347261e-08, 'CONJ'], 'ADP': [2.114260200619385e-05, 'CONJ'

{'VERB': [9.364106939027784e-27, 'NOUN']}
{'ADJ': [5.148890429082773e-30, 'VERB']}
{'NOUN': [2.0320612435550546e-33, 'ADJ'], 'VERB': [2.0320612435550546e-33, 'ADJ']}
{'PRT': [3.449106651781586e-38, 'NOUN'], 'ADV': [6.637698025563997e-38, 'VERB'], 'ADP': [3.045889774639187e-35, 'NOUN']}
{'NOUN': [3.406681564383339e-39, 'ADP']}
{'.': [3.5290959897727892e-40, 'NOUN']}
{'ADV': [2.1210135781616804e-43, '.'], 'ADP': [5.302419395241211e-43, '.']}
{'VERB': [2.9607673356119747e-46, 'ADV']}
{'NOUN': [2.894817362116723e-50, 'VERB']}
{'NOUN': [1.0121113521796874e-53, 'NOUN']}
{'X': [2.113177670780645e-57, 'NOUN']}
{'.': [1.5080940253686079e-58, 'X']}
{'ADP': [3.596652456784401e-63, '.']}
{'PRT': [1.46467513484908e-68, 'ADP'], 'NOUN': [5.028353996933827e-68, 'ADP'], 'ADV': [5.028353996933827e-68, 'ADP'], 'ADP': [9.40299528625143e-66, 'ADP']}
{'NOUN': [2.103359810218397e-69, 'ADP']}
{'ADV': [3.907580668359724e-73, 'NOUN']}
{'ADJ': [1.0483152500915832e-77, 'ADV'], 'X': [1.0483152500915832e-77, 'ADV']

{'VERB': [7.320243319816732e-36, 'VERB']}
{'VERB': [1.2280649052513295e-36, 'VERB']}
{'NOUN': [2.2870670925441166e-41, 'VERB']}
{'VERB': [3.062142417559212e-46, 'NOUN']}
{'X': [9.667523072500854e-48, 'VERB']}
{'PRT': [7.729867840774372e-51, 'X'], 'NOUN': [7.729867840774372e-51, 'X'], 'ADV': [7.729867840774372e-51, 'X'], 'ADP': [2.1789759335978554e-49, 'X']}
{'ADJ': [2.226382778157077e-53, 'ADP'], 'DET': [3.228220539504564e-50, 'ADP'], 'NOUN': [3.228220539504564e-50, 'ADP'], 'NUM': [3.228220539504564e-50, 'ADP']}
{'NOUN': [2.6472509756017625e-54, 'DET']}
{'ADJ': [1.246367448258179e-59, 'NOUN'], 'NOUN': [2.892359462312668e-59, 'NOUN'], 'ADP': [6.085901392588598e-59, 'NOUN'], 'CONJ': [7.731725594836883e-56, 'NOUN']}
{'ADJ': [2.0625631050554035e-58, 'CONJ']}
{'VERB': [9.30020844827297e-64, 'ADJ']}
{'NOUN': [3.897021351315214e-68, 'VERB']}
{'VERB': [2.0870807718582132e-72, 'NOUN']}
{'X': [6.5891453645042664e-74, 'VERB']}
{'PRT': [4.789533939308419e-78, 'X'], 'ADV': [4.789533939308419e-78, '

{'VERB': [4.962462752857426e-41, 'PRON']}
{'VERB': [3.936904572029772e-43, 'VERB']}
{'X': [1.037656015046679e-44, 'VERB']}
{'.': [5.745145459477238e-46, 'X']}

 ['The', 'Los', 'Angeles', 'County', 'district', 'attorney', "'s", 'office', 'filed', 'seven', 'felony', 'and', 'five', 'misdemeanor', 'counts', 'charging', 'that', 'late', 'last', 'year', 'and', 'early', 'this', 'year', 'the', 'Irvine', ',', 'Calif.-based', 'circuit-board', 'manufacturer', 'illegally', 'disposed', 'of', 'acid', ',', 'caustic', 'and', 'heavy', 'metals', 'into', 'the', 'sewer', 'system', ',', 'and', 'stored', 'hazardous', 'materials', 'in', 'leaky', ',', 'unlabeled', 'or', 'open-top', 'containers', '.'] 

{'DET': [0.01963874726098717, 'START']}
{'DET': [0.01963874726098717, 'START'], 'NOUN': [5.0217003267146994e-05, 'START']}
{'NOUN': [4.831332823459154e-06, 'DET']}
{'NOUN': [4.75079854636889e-10, 'NOUN']}
{'NOUN': [1.55720223610545e-14, 'NOUN']}
{'NOUN': [1.020829984881252e-18, 'NOUN']}
{'NOUN': [3.3460453429572

{'.': [2.058901619960776e-88, 'X']}

 ['It', '*EXP*-1', 'is', "n't", 'clear', ',', 'however', ',', 'whether', 'support', 'for', 'the', 'proposal', 'will', 'be', 'broad', 'enough', '*', 'to', 'pose', 'a', 'serious', 'challenge', 'to', 'the', 'White', 'House', "'s", 'acid-rain', 'plan', '.'] 

{'PRON': [0.0024820663884570276, 'START']}
{'X': [1.0452060219100216e-06, 'PRON']}
{'VERB': [1.0424386281141328e-08, 'X']}
{'ADV': [8.921389183196373e-11, 'VERB']}
{'ADJ': [1.9147250693453138e-14, 'ADV'], 'ADV': [1.9147250693453138e-14, 'ADV'], 'VERB': [1.9147250693453138e-14, 'ADV']}
{'.': [8.965956741480184e-16, 'ADV']}
{'ADV': [2.035696474016781e-19, '.']}
{'.': [9.532421555988162e-21, 'ADV']}
{'ADP': [1.3640319568298613e-24, '.']}
{'NOUN': [1.3349037576297117e-28, 'ADP'], 'VERB': [1.3349037576297117e-28, 'ADP']}
{'PRT': [2.2657906815219132e-33, 'NOUN'], 'ADV': [4.360443399252615e-33, 'VERB'], 'ADP': [2.000909036766247e-30, 'NOUN']}
{'ADJ': [2.044441772589765e-34, 'ADP'], 'DET': [2.9644088998741

{'ADJ': [8.960414128641783e-25, 'VERB'], 'DET': [8.960414128641783e-25, 'VERB']}
{'ADJ': [3.7157189934568747e-29, 'DET'], 'X': [3.7157189934568747e-29, 'DET'], 'DET': [9.722488097925885e-28, 'DET']}
{'NOUN': [6.112458479975255e-31, 'DET']}
{'ADV': [8.411559757505385e-36, 'NOUN'], 'ADP': [1.3209110402895349e-33, 'NOUN']}
{'ADP': [5.068993183117405e-36, 'ADP']}
{'PRON': [6.635323041853167e-39, 'ADP'], 'DET': [6.635323041853167e-39, 'ADP']}
{'PRON': [8.438910163089894e-42, 'PRON']}
{'NOUN': [4.042560460120636e-46, 'PRON']}
{'X': [6.241471102430834e-49, 'NOUN']}
{'ADV': [2.6892305161174526e-52, 'X'], 'ADP': [2.8497341108700375e-51, 'X']}
{'DET': [3.349095199363189e-53, 'ADP']}
{'ADJ': [7.124586080837082e-54, 'DET']}
{'NOUN': [4.9963333760172014e-54, 'ADJ']}
{'ADP': [7.925566123695272e-57, 'NOUN']}
{'ADJ': [8.09799853815038e-61, 'ADP'], 'DET': [1.174197243448611e-57, 'ADP'], 'NOUN': [1.174197243448611e-57, 'ADP'], 'NUM': [1.174197243448611e-57, 'ADP']}
{'NOUN': [9.628817982631639e-62, 'DET'

{'NOUN': [6.761255228945137e-22, 'DET']}
{'NOUN': [5.17109503524635e-26, 'NOUN']}
{'PRT': [5.196058798454803e-28, 'NOUN'], 'VERB': [5.196058798454803e-28, 'NOUN']}
{'NOUN': [5.521357285743135e-33, 'PRT']}
{'X': [9.101044254645714e-38, 'NOUN'], 'NUM': [5.184113394508867e-37, 'NOUN']}
{'NOUN': [1.273316695586047e-40, 'NUM']}
{'VERB': [9.547088335365422e-44, 'NOUN']}
{'NOUN': [8.889933857753815e-49, 'VERB'], 'VERB': [3.106546813641812e-47, 'VERB']}
{'ADJ': [4.0670161532265106e-52, 'VERB'], 'X': [1.2753852631309172e-51, 'VERB'], 'DET': [8.993846680321103e-49, 'VERB']}
{'NOUN': [7.375261109808404e-53, 'DET']}
{'PRT': [1.2518354076751536e-57, 'NOUN'], 'ADV': [1.2518354076751536e-57, 'NOUN'], 'ADP': [1.105489929051484e-54, 'NOUN']}
{'ADJ': [2.259082995379748e-59, 'ADP'], 'X': [2.259082995379748e-59, 'ADP'], 'DET': [7.7185099000787455e-56, 'ADP']}
{'ADJ': [3.5207965578015394e-59, 'DET']}
{'ADJ': [1.351208636622959e-63, 'ADJ']}
{'NOUN': [1.4357216436160743e-66, 'ADJ'], 'VERB': [1.43572164361607

{'ADJ': [7.072461321108911e-54, 'ADP'], 'DET': [1.0254959356339648e-50, 'ADP'], 'NOUN': [1.0254959356339648e-50, 'ADP'], 'NUM': [1.0254959356339648e-50, 'ADP']}
{'ADJ': [1.743542670157223e-53, 'DET'], 'ADP': [1.743542670157223e-53, 'DET']}
{'NOUN': [1.5879383075050972e-57, 'ADJ']}
{'X': [2.50403066365098e-60, 'NOUN']}
{'ADJ': [8.228760965893893e-66, 'X'], 'PRT': [3.117892750001711e-61, 'X'], 'ADP': [3.117892750001711e-61, 'X']}
{'NOUN': [4.359016796829725e-66, 'ADP'], 'VERB': [2.318337137513538e-65, 'PRT']}
{'X': [9.898591991134863e-68, 'VERB']}
{'ADJ': [3.25288138745068e-73, 'X'], 'PRT': [1.2325227742772617e-68, 'X'], 'ADP': [1.2325227742772617e-68, 'X']}
{'ADJ': [1.2593381303695087e-72, 'ADP'], 'VERB': [1.8329067415398577e-72, 'PRT']}
{'ADJ': [1.19979864659781e-76, 'VERB'], 'DET': [1.1260042601980875e-73, 'VERB'], 'NOUN': [1.1260042601980875e-73, 'VERB'], 'NUM': [1.1260042601980875e-73, 'VERB']}
{'ADJ': [2.848292910708367e-76, 'DET'], 'NOUN': [2.848292910708367e-76, 'DET']}
{'NOUN': 

{'NOUN': [4.824541718423637e-15, 'ADJ']}
{'VERB': [1.291910839820197e-19, 'NOUN']}
{'X': [4.0787057388176504e-21, 'VERB']}
{'ADP': [2.3499917083494313e-23, 'X']}
{'ADJ': [2.4011192540535417e-27, 'ADP'], 'DET': [3.481585722717374e-24, 'ADP'], 'NOUN': [3.481585722717374e-24, 'ADP'], 'NUM': [3.481585722717374e-24, 'ADP']}
{'NOUN': [1.142007627833161e-27, 'DET']}
{'VERB': [6.11611265758552e-32, 'NOUN']}
{'ADJ': [5.92522969859996e-35, 'VERB'], 'NOUN': [5.92522969859996e-35, 'VERB'], 'ADV': [5.92522969859996e-35, 'VERB'], 'VERB': [5.92522969859996e-35, 'VERB']}
{'NOUN': [6.295830481766301e-38, 'ADJ']}
{'ADP': [4.776356783083345e-40, 'NOUN']}
{'NUM': [1.0681653023535386e-44, 'ADP']}
{'ADJ': [6.792143184182774e-50, 'NUM'], 'NOUN': [5.870342648865288e-47, 'NUM']}
{'.': [2.1389136237424945e-49, 'NOUN']}
{'ADJ': [1.1934592636723694e-54, '.'], 'X': [1.1934592636723694e-54, '.'], 'DET': [4.2258339599625635e-51, '.']}
{'NOUN': [1.0395984044162117e-54, 'DET']}
{'ADJ': [2.4472965020458063e-60, 'NOUN']

{'ADJ': [6.538740947148717e-42, 'ADP']}
{'NOUN': [1.7865551690616185e-45, 'ADJ'], 'VERB': [1.7865551690616185e-45, 'ADJ']}
{'PRT': [3.0323984264395804e-50, 'NOUN'], 'ADV': [5.835756060922174e-50, 'VERB'], 'ADP': [2.677896711298671e-47, 'NOUN']}
{'NOUN': [2.2463242351205791e-51, 'ADP']}
{'DET': [1.021138478175342e-54, 'NOUN'], 'ADV': [1.021138478175342e-54, 'NOUN'], 'ADP': [2.070840307400363e-53, 'NOUN']}
{'X': [8.62290052981534e-56, 'ADP']}
{'VERB': [1.5608111907688372e-60, 'X']}
{'ADV': [2.5491833457537756e-64, 'VERB']}
{'PRT': [2.7900176711253123e-69, 'ADV'], 'ADV': [8.191278670812686e-69, 'ADV'], 'ADP': [1.494614687691222e-66, 'ADV']}
{'.': [3.9913407299474096e-69, 'ADP']}
{'NUM': [1.1446120359349149e-73, '.']}
{'NUM': [2.253485198517398e-75, 'NUM']}
{'X': [5.8277528586332755e-77, 'NUM']}
{'PRT': [4.6597002218766924e-80, 'X'], 'NOUN': [4.6597002218766924e-80, 'X'], 'ADV': [4.6597002218766924e-80, 'X'], 'ADP': [1.3135249981496114e-78, 'X']}
{'NOUN': [2.93822936917211e-82, 'ADP']}
{'A

{'NOUN': [2.9143648941269608e-134, 'PRON']}
{'.': [2.3422382679059698e-135, 'NOUN']}

 ['First', 'of', 'America', ',', 'which', '*T*-1', 'now', 'has', '45', 'banks', 'and', '$', '12.5', 'billion', '*U*', 'in', 'assets', ',', 'announced', 'an', 'agreement', '*', 'to', 'acquire', 'the', 'Peoria', ',', 'Ill.', ',', 'bank', 'holding', 'company', 'in', 'January', '.'] 

{'ADJ': [8.218139301196673e-06, 'START']}
{'ADJ': [8.218139301196673e-06, 'START'], 'NOUN': [0.0003138562704196687, 'START']}
{'ADP': [1.3175326223636114e-05, 'NOUN']}
{'NOUN': [3.3155932949431e-09, 'ADP']}
{'.': [3.4347345884144325e-10, 'NOUN']}
{'DET': [8.139574467844484e-13, '.'], 'ADP': [8.139574467844484e-13, '.']}
{'X': [5.134350484223409e-15, 'DET']}
{'ADV': [2.507172643460657e-18, 'X']}
{'VERB': [2.1665520644753932e-20, 'ADV']}
{'NUM': [1.0658464815792055e-24, 'VERB']}
{'NOUN': [6.544803405361385e-28, 'NUM']}
{'ADJ': [3.08139651089842e-33, 'NOUN'], 'NOUN': [7.150785563189792e-33, 'NOUN'], 'ADP': [1.5046185090120978e-

{'NOUN': [6.024626365377952e-28, 'NUM']}
{'NOUN': [6.582445425612973e-33, 'NOUN'], 'VERB': [2.3473053436087347e-30, 'NOUN']}
{'ADV': [7.590763838020584e-33, 'VERB']}
{'VERB': [7.568647453461837e-37, 'ADV']}
{'X': [2.793453587130562e-38, 'VERB']}
{'ADJ': [9.179864356892795e-44, 'X'], 'PRT': [3.478267584024537e-39, 'X'], 'ADP': [3.478267584024537e-39, 'X']}
{'NOUN': [5.835413096601087e-43, 'ADP'], 'VERB': [7.758891239202327e-43, 'PRT']}
{'ADJ': [1.0157753252709204e-46, 'VERB'], 'ADV': [1.2672151792265549e-46, 'VERB']}
{'ADP': [2.854616678502083e-49, 'ADV']}
{'NUM': [1.1491110749789993e-52, 'ADP']}
{'NOUN': [4.233652523120229e-56, 'NUM'], 'VERB': [4.233652523120229e-56, 'NUM']}
{'ADP': [3.095081752173902e-58, 'NOUN']}
{'ADJ': [3.1624198339126566e-62, 'ADP'], 'DET': [4.585459770230572e-59, 'ADP'], 'NOUN': [4.585459770230572e-59, 'ADP'], 'NUM': [4.585459770230572e-59, 'ADP']}
{'ADJ': [2.2818070304383293e-62, 'DET']}
{'NOUN': [1.3854431195311621e-65, 'ADJ']}
{'PRT': [1.3921314270828544e-67, 

{'ADP': [1.2214875136261558e-84, 'NOUN']}
{'NOUN': [1.7077189678866502e-89, 'ADP']}
{'NOUN': [2.98533817081094e-93, 'NOUN']}
{'.': [2.399278594250449e-94, 'NOUN']}

 ['Santa', 'Ana', 'Community', 'Redevelopment', 'Agency', ',', 'Calif.', '--'] 

{'NOUN': [6.277125408393374e-05, 'START']}
{'NOUN': [1.3716646618262376e-09, 'NOUN']}
{'NOUN': [4.496000530721704e-14, 'NOUN']}
{'NOUN': [4.912284402743367e-19, 'NOUN']}
{'NOUN': [2.1468447691274886e-23, 'NOUN']}
{'.': [2.2239886887590438e-24, 'NOUN']}
{'NOUN': [1.4706107236725337e-28, '.']}
{'.': [7.131558545605141e-31, 'NOUN']}

 ['Edward', 'L.', 'Kane', 'succeeded', 'Mr.', 'Taylor', 'as', 'chairman', '.'] 

{'NOUN': [5.0217003267146994e-05, 'START']}
{'NOUN': [3.2919951883829708e-09, 'NOUN']}
{'NOUN': [3.596800424577364e-14, 'NOUN']}
{'VERB': [4.815737668343403e-19, 'NOUN']}
{'NOUN': [6.703962907515144e-22, 'VERB']}
{'NOUN': [1.4649363229443596e-26, 'NOUN']}
{'ADV': [4.5358833583818956e-30, 'NOUN'], 'ADP': [8.486899952771521e-29, 'NOUN']}
{'

{'X': [1.2695249353361513e-17, 'VERB']}
{'ADJ': [2.0859603248411287e-22, 'X'], 'DET': [3.1831022554983304e-19, 'X'], 'NOUN': [3.1831022554983304e-19, 'X'], 'NUM': [3.1831022554983304e-19, 'X']}
{'NOUN': [6.264605864332681e-22, 'DET']}
{'ADP': [2.6298096837358725e-23, 'NOUN']}
{'NOUN': [7.72095436127945e-27, 'ADP']}
{'NOUN': [8.435836122358785e-32, 'NOUN']}
{'NOUN': [2.129105899483217e-32, 'NOUN']}
{'.': [2.205612397185683e-33, 'NOUN']}
{'PRON': [4.0245792676075226e-36, '.']}
{'X': [3.5915419267277737e-37, 'PRON']}
{'VERB': [1.6252416492482217e-40, 'X']}
{'NOUN': [1.1350285755052457e-44, 'VERB']}
{'NOUN': [8.680844655192263e-49, 'NOUN']}
{'NUM': [8.71273132382148e-53, 'NOUN']}
{'.': [4.164187227808587e-54, 'NUM']}
{'NOUN': [5.300613472240357e-55, '.']}
{'X': [2.5862066383586346e-57, 'NOUN']}
{'ADJ': [8.498808159334685e-63, 'X'], 'X': [3.703189625485867e-62, 'X'], 'DET': [3.0559095042880747e-59, 'X']}
{'ADJ': [1.267229485640945e-62, 'DET']}
{'NOUN': [1.9235570189857382e-66, 'ADJ']}
{'ADP

{'NOUN': [3.3160600409091937e-29, 'ADP'], 'VERB': [1.3521255406271167e-28, 'PRT']}
{'ADP': [5.095491443587872e-31, 'VERB']}
{'NUM': [2.2790705994816107e-35, 'ADP']}
{'ADJ': [1.449192723676107e-40, 'NUM'], 'NOUN': [1.2525145041159329e-37, 'NUM']}
{'PRT': [2.338542567765059e-41, 'NOUN'], 'NOUN': [2.338542567765059e-41, 'NOUN'], 'ADV': [2.338542567765059e-41, 'NOUN'], 'ADP': [3.5820606791168133e-39, 'NOUN']}
{'NUM': [1.2016165569673588e-42, 'ADP']}
{'ADJ': [7.64071973637815e-48, 'NUM'], 'PRT': [2.1904087056751628e-44, 'NUM'], 'ADP': [2.1904087056751628e-44, 'NUM']}
{'NUM': [1.4112700385558096e-45, 'ADP']}
{'ADJ': [8.973843423202303e-51, 'NUM'], 'NOUN': [7.755951890728894e-48, 'NUM']}
{'PRT': [1.4480968955173547e-51, 'NOUN'], 'NOUN': [1.4480968955173547e-51, 'NOUN'], 'ADV': [1.4480968955173547e-51, 'NOUN'], 'ADP': [2.218121243754487e-49, 'NOUN']}
{'NUM': [9.921035033996434e-54, 'ADP']}
{'.': [3.67864707228827e-55, 'NUM']}

 ['In', 'Thailand', ',', 'for', 'example', ',', 'the', 'government'

{'NOUN': [1.7673026975034254e-25, 'DET']}
{'NOUN': [5.792810799209664e-30, 'NOUN']}
{'NOUN': [6.329166098304082e-35, 'NOUN'], 'VERB': [2.2569857313709442e-32, 'NOUN']}
{'ADJ': [1.4773956385879451e-36, 'VERB'], 'DET': [1.3865274708929953e-33, 'VERB'], 'NOUN': [1.3865274708929953e-33, 'VERB'], 'NUM': [1.3865274708929953e-33, 'VERB']}
{'.': [2.082562747035631e-35, 'NOUN']}
{'ADJ': [2.324033822627965e-40, '.']}
{'.': [8.688118129393117e-43, 'ADJ']}
{'NOUN': [3.830007395930471e-47, '.']}
{'.': [3.967633453943041e-48, 'NOUN']}
{'X': [1.2920530302499255e-50, '.']}
{'VERB': [2.532126915604739e-51, 'X']}
{'ADJ': [3.3149994787013984e-56, 'VERB'], 'X': [1.0395585665594885e-55, 'VERB'], 'DET': [7.330828286268732e-53, 'VERB']}
{'VERB': [1.626974580861643e-57, 'DET']}
{'NOUN': [2.6512238948732823e-61, 'VERB']}
{'ADP': [1.1129533802255238e-62, 'NOUN']}
{'NOUN': [1.4470625088032928e-65, 'ADP']}
{'DET': [6.57808514443895e-69, 'NOUN'], 'ADV': [6.57808514443895e-69, 'NOUN'], 'ADP': [1.3340172908729256e-6

{'ADV': [2.0739401129052997e-05, 'START']}
{'ADV': [1.3328363720625783e-09, 'ADV'], 'ADP': [3.007936348409217e-08, 'ADV']}
{'ADP': [1.1542949055823845e-10, 'ADP']}
{'VERB': [1.71725035745072e-16, 'ADP']}
{'PRT': [6.205182308580581e-21, 'VERB'], 'ADV': [1.1779683127525713e-19, 'VERB'], 'ADP': [2.9101246607653634e-19, 'VERB']}
{'ADJ': [5.352189709308251e-23, 'ADP'], 'NOUN': [5.352189709308251e-23, 'ADP']}
{'NOUN': [4.874527715274531e-27, 'ADJ']}
{'.': [5.0496871770654065e-28, 'NOUN']}
{'ADV': [1.1465179542615933e-31, '.']}
{'.': [5.368723972865008e-33, 'ADV']}
{'PRON': [1.4813926992713136e-35, '.']}
{'VERB': [1.101680335792167e-37, 'PRON']}
{'X': [4.0163546565497236e-39, 'VERB']}
{'ADJ': [6.599284685838237e-44, 'X'], 'DET': [1.0070276849473233e-40, 'X'], 'NOUN': [1.0070276849473233e-40, 'X'], 'NUM': [1.0070276849473233e-40, 'X']}
{'NOUN': [6.331110771675063e-44, 'DET'], 'VERB': [6.331110771675063e-44, 'DET']}
{'NOUN': [6.917307167387653e-49, 'NOUN'], 'VERB': [2.099333095567873e-46, 'VERB

In [18]:
sentence=input("Enter a sentence: ")
sent=sentence.split(" ")
try:
    last=sent[-1]
    last_2nd, last=last[:len(last)-1], last[-1]
    sent[-1]=last_2nd
    sent.append(last)
    print(viterbi_algo(sent))
except:
    print("Invalid sentence!")
    print("Sentence must contain atleast one word.")

Enter a sentence: This is my miniproject.

 ['This', 'is', 'my', 'miniproject', '.'] 

{'DET': [0.0011270933210653508, 'START']}
{'VERB': [1.9689793250510392e-06, 'DET']}
{'PRON': [5.438422175726463e-10, 'VERB']}
{'NOUN': [1.2036079674988705e-10, 'PRON']}
{'.': [9.673245264220395e-12, 'NOUN']}
[('This', 'DET'), ('is', 'VERB'), ('my', 'PRON'), ('miniproject', 'NOUN'), ('.', '.')]


In [19]:
"""TAG         MEANING
   ADJ         Adjective
   ADP         Adposition/Preposition
   ADV         Adverb
   CONJ        Conjunction
   DET         Determiner
   NOUN        Noun
   NUM         Number
   PRT         Particle
   PRON        Pronoun
   VERB        Verb
   .           Punctuation mark
   X           Other"""

'TAG         MEANING\n   ADJ         Adjective\n   ADP         Adposition/Preposition\n   ADV         Adverb\n   CONJ        Conjunction\n   DET         Determiner\n   NOUN        Noun\n   NUM         Number\n   PRT         Particle\n   PRON        Pronoun\n   VERB        Verb\n   .           Punctuation mark\n   X           Other'