In [1]:
import pandas as pd
from collections import Counter as ctr
from operator import itemgetter

In [2]:
train = pd.read_csv('train.txt', delimiter=' ', names=['word', 'pos', 'other'])
train['word'] = train.word.str.lower()

In [3]:
train[:3]

Unnamed: 0,word,pos,other
0,confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP


In [4]:
train['ppos'] = train.pos.shift(1)
train['ppos'] = train.ppos.fillna('O')

train[:3]

Unnamed: 0,word,pos,other,ppos
0,confidence,NN,B-NP,O
1,in,IN,B-PP,NN
2,the,DT,B-NP,IN


### Write the functions

$$P(T_i|T_{i-1})$$

$$P(W_i|T_i)$$

In [5]:
pos = list(set(train.pos))
smooth = 0.00001

pos

['PRP$',
 '#',
 '$',
 'PDT',
 'VB',
 'EX',
 'JJ',
 'VBZ',
 'RP',
 'IN',
 'NNPS',
 'TO',
 'RBR',
 'WP$',
 'SYM',
 '``',
 ',',
 'MD',
 ')',
 'VBG',
 'VBD',
 'JJS',
 'PRP',
 'NNP',
 'DT',
 "''",
 'VBP',
 'NN',
 'UH',
 'NNS',
 'JJR',
 'RBS',
 'RB',
 'WP',
 'CD',
 'CC',
 '(',
 ':',
 'VBN',
 'WRB',
 'FW',
 '.',
 'POS',
 'WDT']

In [6]:
pos_ppos = {}
pos_ppos_denom = {}
for col_name in set(train.ppos):
    sub_frame = train[train.ppos == col_name]
    counted = ctr(sub_frame.pos)
    pos_ppos[col_name] = counted
    pos_ppos_denom[col_name] = len(sub_frame)
    

# conditional 
def Ptt(Ti,Tprev):
    if Ti not in pos_ppos[Tprev]: return smooth
    return pos_ppos[Tprev][Ti] / pos_ppos_denom[Tprev]

word_pos = {}
word_pos_denom = {}
for col_name in set(train.pos):
    sub_frame = train[train.pos == col_name]
    counted = ctr(sub_frame.word)
    word_pos[col_name] = counted
    word_pos_denom[col_name] = len(sub_frame)
    

# conditional 
def Pwt(W, T):
    if W not in word_pos[T]: return smooth
    return word_pos[T][W] / word_pos_denom[T]

In [7]:
Pwt('the','DT')

0.583419689119171

### Evaluate: build the trellis

- example trellis with an example sequence

In [8]:
sequence = ['the', 'government']

In [9]:
emissions = []
transitions = []
for word in sequence:
    emission = [(t,Pwt(word, t)) for t in pos]
    transition = [(tprev,Ptt(ti,tprev)) for ti in pos for tprev,prob in emission]
    emissions.append(emission)
    transitions.append(transition)

### Evaluate: decode

In [10]:
hyp = []

for e,t in zip(emissions, transitions):
    # argmax of e
    e = sorted(e, key = itemgetter(1))[-1]
    hyp.append(e)

### Evaluate: check accuracy

- load test data
- pass all word sequences through your hmm
- decode
- find the most probable sequence
- compare to the "gold"

In [11]:
hyp

[('DT', 0.583419689119171), ('NN', 0.005738547782532259)]

In [12]:
test = pd.read_csv('test.txt', delimiter=' ', names=['word', 'pos', 'other'])
test['word'] = test.word.str.lower()

In [13]:
test.describe()

Unnamed: 0,word,pos,other
count,47377,47377,47377
unique,7495,43,19
top,the,NN,I-NP
freq,2407,6642,14376


In [14]:
test[:3]

Unnamed: 0,word,pos,other
0,rockwell,NNP,B-NP
1,international,NNP,I-NP
2,corp.,NNP,I-NP


In [23]:
test['hyp'] = test.word.apply(lambda x: [(t,Pwt(x, t)) for t in pos])

In [24]:
test['hyp'] = test.hyp.apply(lambda x: sorted(x, key = itemgetter(1))[-1])

In [25]:
test['hyp'] = test.hyp.apply(lambda x: x[0])

In [26]:
test[:3]

Unnamed: 0,word,pos,other,hyp
0,rockwell,NNP,B-NP,WDT
1,international,NNP,I-NP,NNP
2,corp.,NNP,I-NP,NNP


In [27]:
from sklearn.metrics import accuracy_score

accuracy_score(test.pos, test.hyp)

0.8567237267028305