In [2]:
import pandas as pd
from collections import Counter as ctr
from operator import itemgetter

In [3]:
train = pd.read_csv('train.txt', delimiter=' ', names=['word', 'pos', 'other'])
train['word'] = train.word.str.lower()

In [4]:
train[:3]

Unnamed: 0,word,pos,other
0,confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP


In [5]:
train['ppos'] = train.pos.shift(1)
train['ppos'] = train.ppos.fillna('O')

train[:3]

Unnamed: 0,word,pos,other,ppos
0,confidence,NN,B-NP,O
1,in,IN,B-PP,NN
2,the,DT,B-NP,IN


### Write the functions

$$P(T_i|T_{i-1})$$

$$P(W_i|T_i)$$

In [6]:
pos = list(set(train.pos))
smooth = 0.00001

pos

['WP$',
 ')',
 'VB',
 'EX',
 'VBD',
 'VBN',
 ':',
 'WP',
 '$',
 'CD',
 'RBR',
 '#',
 'RBS',
 'WRB',
 'RB',
 'DT',
 'PRP$',
 'NNP',
 'MD',
 'WDT',
 'FW',
 '.',
 'NNPS',
 'NNS',
 '``',
 ',',
 "''",
 'VBG',
 'JJR',
 'UH',
 'RP',
 'IN',
 'NN',
 '(',
 'PRP',
 'PDT',
 'JJS',
 'TO',
 'JJ',
 'SYM',
 'POS',
 'VBP',
 'CC',
 'VBZ']

In [7]:
pos_ppos = {}
pos_ppos_denom = {}
for col_name in set(train.ppos):
    sub_frame = train[train.ppos == col_name]
    counted = ctr(sub_frame.pos)
    pos_ppos[col_name] = counted
    pos_ppos_denom[col_name] = len(sub_frame)
    

# conditional 
def Ptt(Ti,Tprev):
    if Ti not in pos_ppos[Tprev]: return smooth
    return pos_ppos[Tprev][Ti] / pos_ppos_denom[Tprev]

word_pos = {}
word_pos_denom = {}
for col_name in set(train.pos):
    sub_frame = train[train.pos == col_name]
    counted = ctr(sub_frame.word)
    word_pos[col_name] = counted
    word_pos_denom[col_name] = len(sub_frame)
    

# conditional 
def Pwt(W, T):
    if W not in word_pos[T]: return smooth
    return word_pos[T][W] / word_pos_denom[T]

In [9]:
Pwt('the','DT')

0.583419689119171

### Evaluate: build the trellis

- example trellis with an example sequence

In [11]:
sequence = ['the', 'government']

In [12]:
emissions = []
transitions = []
for word in sequence:
    emission = [(t,Pwt(word, t)) for t in pos]
    transition = [(tprev,Ptt(ti,tprev)) for ti in pos for tprev,prob in emission]
    emissions.append(emission)
    transitions.append(transition)

### Evaluate: decode

In [14]:
hyp = []

for e,t in zip(emissions, transitions):
    # argmax of e
    e = sorted(e, key = itemgetter(1))[-1]
    hyp.append(e)

### Evaluate: check accuracy

- load test data
- pass all word sequences through your hmm
- decode
- find the most probable sequence
- compare to the "gold"

In [11]:
hyp

[('DT', 0.583419689119171), ('NN', 0.005738547782532259)]

### Methods to help with determining accuracy

In [48]:
def emiss(word = ''):
    return [(t,Pwt(word,t)) for t in pos]

def trans(c):
    return [[t, Ptt(t,c)] for t in pos]

#New Trellis
def evaluate(word, rest):
    list = emiss(word)
    max(list)
    sTags = []
    tag = max(list, key = itemgetter(1))[0]
    sTags.append(tag)
    i = 1
    while len(rest) > i:
        tList = trans(tag)
        nWord = rest[i]
        i = i + 1
        eList = emiss(nWord)
        cList = [(t[0], t[1] * e[1]) for t, e in zip(tList, eList)]
        tag = max(cList, key = itemgetter(1))[0]
        sTags.append(tag)
    return sTags

### Creating the test set

In [49]:
test = pd.read_csv('test.txt', delimiter=' ', names=['word', 'pos', 'other'])
test['word'] = test.word.str.lower()

In [50]:
test.describe()
test

Unnamed: 0,word,pos,other
0,rockwell,NNP,B-NP
1,international,NNP,I-NP
2,corp.,NNP,I-NP
3,'s,POS,B-NP
4,tulsa,NNP,I-NP
...,...,...,...
47372,according,VBG,B-PP
47373,to,TO,B-PP
47374,mr.,NNP,B-NP
47375,harlow,NNP,I-NP


### Evaluating accuracy of test set

In [51]:
test['hyp'] = evaluate(test.word[0], test.word,)
test['hyp']

0        WP$
1        NNP
2        NNP
3        POS
4        NNP
        ... 
47372    VBG
47373     TO
47374    NNP
47375    NNP
47376      .
Name: hyp, Length: 47377, dtype: object

In [56]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

accuracy_score(test.pos, test.hyp)

0.9055659919370158

### Making the confusion matrix

In [58]:
act = test.word
hyp = test.hyp
confusion = pd.crosstab(test.pos, hyp, rownames = ['actual'], colnames = ['predicted'], margins = True)
confusion

predicted,#,$,'',(,),",",.,:,CC,CD,...,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11
$,0,384,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,384
'',0,0,314,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,316
(,0,0,0,76,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,77
),0,0,0,0,77,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,77
",",0,0,0,0,0,2389,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2390
.,0,0,0,0,0,0,1975,0,0,0,...,0,0,0,0,0,0,0,0,0,1975
:,0,0,0,0,0,0,0,237,0,0,...,0,0,0,0,0,0,0,0,0,238
CC,0,0,0,0,0,0,0,0,1210,0,...,0,0,0,0,0,0,0,0,0,1214
CD,0,0,0,0,0,0,0,0,0,1750,...,0,0,0,1,0,0,0,0,0,1918


### Based off the confusion matrix, what are the common errors that your tagger is making?

I believe the common error my tagger is making, based off the confusion matrix, is that of properly evaluating where common words/tags are supposed to go. It has a fairly good accuracy for most, however it seems to have problems with NN, NNP, IN and a handful of other tags. It will generally assume that the actual tag for the word/pos is one of the later tags based off its position. This is likely due to the fact that the 'other' tag isn't being leveraged to help with accuracy and that this is a fairly greedy approach.