## Basic Trigram HMM tagger

In [1]:
from collections import defaultdict

In [2]:
# Preview the first six records of the counts file to check its layout:
# each record is a count, a record type, and then the type-specific fields.
with open('gene.counts') as f:
    for line_no, line in enumerate(f):
        # A parenthesised single-argument print works on Python 2 and 3 alike.
        print(line.strip().split())
        if line_no >= 5:
            break

['1', 'WORDTAG', 'O', 'mind']
['20', 'WORDTAG', 'O', 'resting']
['1', 'WORDTAG', 'I-GENE', 'SOX']
['2', 'WORDTAG', 'I-GENE', 'holoenzyme']
['2', 'WORDTAG', 'I-GENE', 'hydrolase']
['2', 'WORDTAG', 'I-GENE', 'barley']


In [3]:
class HMM:
    """Count tables for a trigram HMM tagger, loaded from a counts file.

    Every non-blank input line has the form ``<count> <type> <fields...>``
    separated by single spaces, where ``<type>`` is one of:

    * ``1-GRAM`` / ``2-GRAM`` / ``3-GRAM`` -- fields are the tags of the n-gram;
    * ``WORDTAG`` -- fields are the tag followed by the word.

    Words whose total count (summed over all tags) is below
    ``rare_threshold`` are folded into the pseudo-word ``'_RARE_'``.

    Attributes:
        ngrams: ``{1: {tag: count}, 2: {(t1, t2): count},
            3: {(t1, t2, t3): count}}``.
        words: ``{(tag, word): count}`` emission counts, rare words replaced
            by ``'_RARE_'``.
        word_counts: ``{word: count}`` for non-rare words, plus the pooled
            count of all rare words under ``'_RARE_'``.
        rare_threshold: counts strictly below this value mark a word as rare.
    """

    RARE_TOKEN = '_RARE_'

    def __init__(self, handle, rare_threshold=5):
        """Build the tables from ``handle``, an iterable of count lines.

        Args:
            handle: open file (or any iterable of strings) in the format
                described on the class.
            rare_threshold: words seen fewer than this many times in total
                are treated as rare. Defaults to 5.

        Raises:
            ValueError: if the first field of a non-blank line is not an
                integer.
        """
        self.rare_threshold = rare_threshold
        self.words = defaultdict(float)
        self.ngrams = {1: {}, 2: {}, 3: {}}
        self.word_counts = defaultdict(float)

        # WORDTAG records are buffered: whether a word is rare depends on its
        # total count over *all* tags, which is only known after a full pass.
        word_tag_records = []
        totals = defaultdict(float)

        for line in handle:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines instead of failing on int('')
            fields = stripped.split(" ")
            count = int(fields[0])
            record_type = fields[1]
            keys = tuple(fields[2:])

            if record_type == '1-GRAM':
                self.ngrams[1][keys[0]] = count
            elif record_type == '2-GRAM':
                self.ngrams[2][keys] = count
            elif record_type == '3-GRAM':
                self.ngrams[3][keys] = count
            elif record_type == 'WORDTAG':
                tag, word = keys[0], keys[1]
                word_tag_records.append((tag, word, count))
                totals[word] += count

        for tag, word, count in word_tag_records:
            if totals[word] < rare_threshold:
                word = self.RARE_TOKEN
            self.words[(tag, word)] += count
            self.word_counts[word] += count

    def tags(self):
        """Return the list of tags that have a 1-GRAM count."""
        return list(self.ngrams[1])

    def word_count(self, word):
        """Return the stored count of ``word``, or 0.0 if it is not stored.

        Rare words are pooled under ``'_RARE_'`` and therefore report 0.0
        under their own spelling.
        """
        return self.word_counts.get(word, 0.0)

    def trigram_prob(self, trigram):
        """Return count(trigram) / count(its leading bigram).

        Args:
            trigram: sequence of three tags ``(w, u, v)``.

        Returns:
            The ratio as a float; 0.0 when the trigram or its bigram context
            has no recorded count (instead of raising on a missing bigram).
        """
        trigram = tuple(trigram)
        bigram_count = self.ngrams[2].get(trigram[:-1], 0)
        if not bigram_count:
            return 0.0
        return self.ngrams[3].get(trigram, 0.0) / float(bigram_count)

    def emmisoin_prob(self, word, tag):
        """Return count(tag, word) / count(tag), mapping rare words to '_RARE_'.

        The boundary tags ``'*'`` and ``'STOP'`` emit nothing, and a tag with
        no 1-GRAM count yields 0.0 rather than a KeyError.
        """
        if tag in ('*', 'STOP'):
            return 0.0
        tag_count = self.ngrams[1].get(tag, 0)
        if not tag_count:
            return 0.0
        new_word = self.replace_word(word)
        return self.words.get((tag, new_word), 0.0) / float(tag_count)

    # Correctly spelled alias; the original name is kept for existing callers.
    emission_prob = emmisoin_prob

    def replace_word(self, word):
        """Return ``word`` if it is a stored non-rare word, else ``'_RARE_'``."""
        if self.word_count(word) < self.rare_threshold:
            return self.RARE_TOKEN
        return word

In [4]:
def argmax(ls):
    """Return the ``(item, score)`` pair from ``ls`` with the largest score.

    ``ls`` is an iterable of pairs whose second element is the score. When
    several pairs share the top score, the first one encountered is returned.
    """
    def score_of(pair):
        return pair[1]

    return max(ls, key=score_of)

In [14]:
# Open the counts file for reading and keep the handle in `f`.
f = open('gene.counts')

In [9]:
# Release the file handle currently bound to `f`.
f.close()

In [10]:
# Read the test file into `sen`: one list of tokens per sentence, where a
# blank line marks the end of a sentence.
with open('gene.test') as ft:
    sen=[]
    tmp=[]
    for l in ft:
        token=l.strip()
        if token:
            tmp.append(token)
        else:
            # Blank (or whitespace-only) line: close the current sentence.
            sen.append(tmp)
            tmp=[]
    # Keep the last sentence when the file does not end with a blank line.
    if tmp:
        sen.append(tmp)

In [6]:
def unigram(hmm,sentence):
    """Return the tag with the highest emission score for ``sentence``.

    ``sentence`` is passed unchanged as the word argument of
    ``hmm.emmisoin_prob``; every tag from ``hmm.tags()`` is scored and the
    best-scoring tag is returned (first one wins on ties).
    """
    scored_tags = []
    for tag in hmm.tags():
        scored_tags.append((tag, hmm.emmisoin_prob(sentence, tag)))
    best_tag, _ = argmax(scored_tags)
    return best_tag

In [7]:
def argmax(ls):
    """Pick the pair in ``ls`` whose second element (the score) is largest.

    Ties are resolved in favour of the earliest pair.
    """
    best_pair = max(ls, key=lambda pair: pair[1])
    return best_pair

In [12]:
def viterbi(hmm,sentence):
    """Tag ``sentence`` with the Viterbi algorithm for a trigram HMM.

    Args:
        hmm: object providing ``tags()``, ``trigram_prob((w, u, v))`` and
            ``emmisoin_prob(word, tag)``.
        sentence: sequence of word tokens.

    Returns:
        ``(tags, scores)`` where ``tags`` is the best tag sequence (one tag
        per word) and ``scores`` holds, for each position, the table value
        ``pi`` along that sequence; the last entry is the final score
        including the transition to ``'STOP'``.

    Raises:
        ValueError: if ``hmm.tags()`` is empty (nothing to maximise over).
    """
    start_tags = ['*']
    # Fetch the tag set once; it is reused at every position.
    all_tags = list(hmm.tags())

    def K(k):
        # Positions -1 and 0 are the padding before the sentence.
        return start_tags if k in (-1, 0) else all_tags

    def best(pairs):
        # First (item, score) pair with the maximal score.
        return max(pairs, key=lambda pair: pair[1])

    n = len(sentence)
    x = [""] + list(sentence)   # 1-based word indexing
    y = [""] * (n + 1)

    # pi[k, u, v]: best score of any tag sequence ending in (u, v) at k.
    # bp[k, u, v]: the tag at position k-2 that achieves it.
    pi = {(0, '*', '*'): 1.0}
    bp = {}

    for k in range(1, n + 1):
        # The emission term depends only on (word, v): compute it once per
        # position instead of once per (u, v, w) combination.
        emission = {v: hmm.emmisoin_prob(x[k], v) for v in K(k)}
        for u in K(k - 1):
            for v in K(k):
                e = emission[v]
                bp[k, u, v], pi[k, u, v] = best(
                    [(w, pi[k - 1, w, u] * hmm.trigram_prob((w, u, v)) * e)
                     for w in K(k - 2)])

    # Choose the final tag pair, accounting for the transition to STOP.
    (y[n - 1], y[n]), score = best(
        [((u, v), pi[n, u, v] * hmm.trigram_prob((u, v, 'STOP')))
         for u in K(n - 1) for v in K(n)])

    # Follow the back-pointers to recover the remaining tags.
    for k in range(n - 2, 0, -1):
        y[k] = bp[k + 2, y[k + 1], y[k + 2]]
    y[0] = '*'

    scores = [pi[i, y[i - 1], y[i]] for i in range(1, n)]
    return y[1:n + 1], scores + [score]

In [15]:
# Open the counts file here instead of reusing the module-level `f`, which an
# earlier cell closes; the context manager also releases the handle afterwards.
with open('gene.counts') as f:
    h=HMM(f)

In [21]:
# Tag the sentence at index 1 of `sen` using the model `h`.
viterbi(h,sen[1])