In [1]:
import numpy as np
import pandas as pd

from nltk.util import ngrams
from nltk.corpus import brown


In [2]:
tagged_sents = brown.tagged_sents()

In [3]:
len(tagged_sents) #total sentences

57340

Separating tags (POS)

In [4]:
all_tok = []
for sent in tagged_sents:
    tokens = [item[1] for item in sent] #taken tag from the tuple
    tokens = [token.lower() for token in tokens if token.isalpha()]
    all_tok.append(tokens)

all unique tags

In [22]:
unique_state = np.unique([item for sublist in all_tok for item in sublist])

### Translation probabilities 

#### probability of a tag givena a certain tag occurs before it

In [47]:
trans_prob = pd.DataFrame(0, index = unique_state,
                         columns = unique_state)

trans_prob.head() #all unique POS stated in corpus

Unnamed: 0,abl,abn,abx,ap,at,be,bed,bedz,beg,bem,...,vb,vbd,vbg,vbn,vbz,wdt,wpo,wps,wql,wrb
abl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abx,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ap,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
at,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


in the dataframe row tags occur first followed by their corresponding column tags

i.e P( row[x] | col[y] )

In [48]:
for sent in all_tok:
    for i in range(len(sent)-1):  
        first = sent[i]
        second = sent[i+1]
        trans_prob[first][second] += 1
    
trans_prob.head() 

Unnamed: 0,abl,abn,abx,ap,at,be,bed,bedz,beg,bem,...,vb,vbd,vbg,vbn,vbz,wdt,wpo,wps,wql,wrb
abl,0,0,0,1,0,5,1,7,1,0,...,35,13,7,12,2,1,0,0,0,3
abn,0,1,0,12,52,4,33,41,0,0,...,188,82,60,57,43,16,2,3,0,20
abx,0,0,0,1,3,5,14,10,0,0,...,38,19,9,24,17,4,1,0,0,1
ap,0,36,0,135,3041,40,49,54,2,0,...,268,207,91,150,97,26,5,6,49,68
at,346,541,74,112,224,620,194,1644,74,23,...,6197,4014,2547,1484,1453,470,38,6,1,834


In [49]:
trans_prob = trans_prob.div( trans_prob.sum(axis = 1), axis = 0)
trans_prob.head()

Unnamed: 0,abl,abn,abx,ap,at,be,bed,bedz,beg,bem,...,vb,vbd,vbg,vbn,vbz,wdt,wpo,wps,wql,wrb
abl,0.0,0.0,0.0,0.003195,0.0,0.015974,0.003195,0.022364,0.003195,0.0,...,0.111821,0.041534,0.022364,0.038339,0.00639,0.003195,0.0,0.0,0.0,0.009585
abn,0.0,0.000361,0.0,0.004334,0.018779,0.001445,0.011918,0.014807,0.0,0.0,...,0.067895,0.029614,0.021668,0.020585,0.015529,0.005778,0.000722,0.001083,0.0,0.007223
abx,0.0,0.0,0.0,0.00156,0.00468,0.0078,0.021841,0.015601,0.0,0.0,...,0.059282,0.029641,0.014041,0.037441,0.026521,0.00624,0.00156,0.0,0.0,0.00156
ap,0.0,0.003995,0.0,0.01498,0.337439,0.004439,0.005437,0.005992,0.000222,0.0,...,0.029738,0.022969,0.010098,0.016644,0.010763,0.002885,0.000555,0.000666,0.005437,0.007545
at,0.003879,0.006065,0.00083,0.001256,0.002511,0.006951,0.002175,0.018431,0.00083,0.000258,...,0.069475,0.045002,0.028555,0.016637,0.01629,0.005269,0.000426,6.7e-05,1.1e-05,0.00935


Above are the<b> transational probabilities </b>for each token.<br>
For each row sum over all of its columns to get its total occurence.<br>
Then divide for each observtion's attribute by that total to get the probability

### Emission probabilities

probability of word given a certain tag

In [14]:
tag_tokens = []
for sent in tagged_sents:
    tokens = [ (token[0].lower(), token[1]) for token in sent if token[0].isalpha()]
    tag_tokens += tokens

[('the', 'AT')]

In [19]:
print(tag_tokens[:10])

[('the', 'AT'), ('fulton', 'NP-TL'), ('county', 'NN-TL'), ('grand', 'JJ-TL'), ('jury', 'NN-TL'), ('said', 'VBD'), ('friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN')]


In [20]:
#unique vocab
unique_toks = np.unique([item[0] for item in tag_tokens])

In [24]:
len(unique_toks)

40234

In [23]:
# vocab(observation) by tags(State)
em_probs = pd.DataFrame(0, index = unique_state,
                       columns = unique_toks)
em_probs.head()

Unnamed: 0,a,aa,aaa,aaawww,aah,aaron,ab,aback,abandon,abandoned,...,zoooop,zorrillas,zounds,zu,zubkovskaya,zur,zurcher,zurich,zwei,zworykin
abl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abx,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ap,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
at,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
for item in tag_tokens:
    row = str(item[1]).lower()
    col = str(item[0])
    try:
        em_probs[row][col] += 1
    except:
        continue

In [34]:
em_probs = em_probs.div(em_probs.sum(axis = 0), 1)
em_probs.fillna(0, inplace = True)

### Viterbi 

In [39]:
obs_seq_sent = tagged_sents[5]
print(obs_seq_sent)

[('It', 'PPS'), ('recommended', 'VBD'), ('that', 'CS'), ('Fulton', 'NP'), ('legislators', 'NNS'), ('act', 'VB'), ('``', '``'), ('to', 'TO'), ('have', 'HV'), ('these', 'DTS'), ('laws', 'NNS'), ('studied', 'VBN'), ('and', 'CC'), ('revised', 'VBN'), ('to', 'IN'), ('the', 'AT'), ('end', 'NN'), ('of', 'IN'), ('modernizing', 'VBG'), ('and', 'CC'), ('improving', 'VBG'), ('them', 'PPO'), ("''", "''"), ('.', '.')]


In [42]:
sent = [(token[0].lower(), token[1].lower()) for token in  obs_seq_sent if token[1].isalpha()]

In [44]:
obs_seq = [item[0] for item in sent]
print(obs_seq)

['it', 'recommended', 'that', 'fulton', 'legislators', 'act', 'to', 'have', 'these', 'laws', 'studied', 'and', 'revised', 'to', 'the', 'end', 'of', 'modernizing', 'and', 'improving', 'them']


In [50]:
df = pd.DataFrame(0, index = trans_prob.columns, columns = ['init'] + obs_seq)

In [52]:
df['init'] = 1 / len(df)

In [53]:
for k, item in enumerate(obs_seq):
    for state in list(df.index):
        try:
            prev = obs_seq[k-1]
            probs = trans_prob[state] * em_probs.loc[state][item]
            final_probs = (probs * np.max(df.loc[state][prev]) )
            df.loc[state][item] = final_probs
        except:
            continue

In [54]:
df

Unnamed: 0,init,it,recommended,that,fulton,legislators,act,to,have,these,...,and,revised,to.1,the,end,of,modernizing,and.1,improving,them
abl,0.014706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abn,0.014706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abx,0.014706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ap,0.014706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
at,0.014706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wdt,0.014706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
wpo,0.014706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
wps,0.014706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
wql,0.014706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
