In [63]:
from numpy import array, ones, zeros

class HMM:
    def __init__(self, state_list, observation_list, trans_start_list,
                transition_proba_1=None, transition_proba_2=None,
                observation_proba=None,
                initial_state_proba=None,
                smoothing_obs=0.01):
        print "HMM creating with:"
        self.N = len(state_list)
        self.M = len(observation_list)
        self.T = len(trans_start_list)
        #self.I = len(inits_list)
        print str(self.N)+" states."
        print str(self.M)+" observations."
        self.omega_Y = state_list
        self.omega_X = observation_list
        self.omega_T = trans_start_list
        #self.omega_I = []
        """for i in range(26):
            pair = ('#',chr(i+97))
            self.omega_I.append(pair)
        """     
        if transition_proba_1 is None:
            self.transition_proba_1 = zeros((self.N,self.N),float)
        else:
            self.transition_proba_1 = transition_proba_1
        if transition_proba_2 is None:
            self.transition_proba_2 = zeros((self.N,self.T),float)
        else:
            self.transition_proba_2 = transition_proba_2
        if observation_proba is None:
            self.observation_proba = zeros((self.M, self.N), float)
        else:
            self.observation_proba = observation_proba
        if initial_state_proba is None:
            self.initial_state_proba = zeros((self.N,),float)
        self.make_indexs()
        self.smoothing_obs = smoothing_obs
    
    def make_indexs(self):
        self.Y_index = {}
        self.X_index = {}
        self.T_index = {}
        #self.I_index = {}
        for i in range(self.N):
            self.Y_index[self.omega_Y[i]] = i
        for i in range(self.M):
            self.X_index[self.omega_X[i]] = i
        for i in range(self.T):
            self.T_index[self.omega_T[i]] = i
        #for i in range(len(self.omega_I)):
         #   self.I_index[self.omega_I[i]] = i


    def observation_estimation(self, pair_counts):
        for pair in pair_counts:
            letter = pair[0]
            tag = pair[1]
            cpt = pair_counts[pair]
            k = 0
            if letter in self.X_index:
                k = self.X_index[letter]
            i = self.Y_index[tag]
            self.observation_proba[k,i] = cpt
        self.observation_proba = self.observation_proba + self.smoothing_obs
        self.observation_proba = self.observation_proba/self.observation_proba.sum(axis=0).reshape(1,self.N)
    
    def transition_estimation(self, trans_counts_1, trans_counts_2):
        for pair in trans_counts_2:
            i = self.Y_index[pair[1]]
            j = self.T_index[pair[0]]
            self.transition_proba_2[i,j] = trans_counts_2[pair]
        self.transition_proba_2 = self.transition_proba_2/self.transition_proba_2.sum(axis=0).reshape(1,self.T)
        for pair in trans_counts_1:
            i = self.Y_index[pair[1]]
            j = self.Y_index[pair[0]]
            self.transition_proba_1[i,j] = trans_counts_1[pair]
        self.transition_proba_1 = self.transition_proba_1/self.transition_proba_1.sum(axis=0).reshape(1,self.N)
    
    def init_estimation(self, init_counts):
        for tag in init_counts:
            i = self.Y_index[tag]
            self.initial_state_proba[i] = init_counts[tag]
        self.initial_state_proba = self.initial_state_proba/sum(self.initial_state_proba)
    
    def supervised_training(self, pair_counts, trans_counts_1, trans_counts_2, init_counts):
        self.observation_estimation(pair_counts)
        self.transition_estimation(trans_counts_1, trans_counts_2)
        self.init_estimation(init_counts)

    def viterbi(self,obs):
        V = [{}]
        path = {}
        for y in range(self.N):
            V[0][y] = self.initial_state_proba[y]* self.observation_proba[self.X_index[obs[0]],y]
            path[y]=[y]
       
        for t in range(1,len(obs)):
            V.append({})
            newpath = {}
            if t==1:
                 for y in range(self.N):#
                    (prob, state) = max([(V[t-1][y0] * self.transition_proba_1[y,y0] * self.observation_proba[self.X_index[obs[t]],y], y0) for y0 in range(self.N)])
                    V[t][y] = prob
                    newpath[y]=path[state]+[y]
                path = newpath
            else:
                for y in range(self.N):
                    (prob, state) = max([(V[t-2][self.Y_index[self.omega_T[y0][0]]]*self.transition_proba_2[self.Y_index[self.omega_T[y0][1]],self.Y_index[self.omega_T[y0][0]]]*self.transition_proba_1[y,self.Y_index[self.omega_T[y0][1]]]*self.observation_proba[self.X_index[obs[t]],y],self.Y_index[self.omega_T[y0][1]]) for y0 in range(self.T)])
                    V[t][y] = prob
                    newpath[y]=path[state]+[y]
                path = newpath
        (prob, state) = max([(V[len(obs)-1][state],state) for state in range(self.N)])
        return path[state]

IndentationError: unindent does not match any outer indentation level (<ipython-input-63-59aa54be3c65>, line 107)

In [None]:
import pickle

with open('train10.pkl','rb') as f:
    data = pickle.load(f)

In [None]:
def make_counts(corpus):
    c_tags = dict()
    c_obs = dict()
    c_trans_1 = dict()
    c_trans_2 = dict()
    c_pairs = dict()
    c_inits = dict()
    c_biobs = dict()# used to record the last two states before the state now
    for word in corpus:
        for i in range(len(word)):
            pair = word[i]
            tag = pair[1]
            obs = pair[0]
            if tag in c_tags:
                c_tags[tag]+=1
            else:
                c_tags[tag]=1
            if obs in c_obs:
                c_obs[obs]+=1
            else:
                c_obs[obs]=1
            if pair in c_pairs:
                c_pairs[pair]+=1
            else:
                c_pairs[pair]=1
            if i>0:
                trans_1 = (word[i-1][1],tag)
                if trans_1 in c_trans_1:
                    c_trans_1[trans_1]+=1
                else:
                    c_trans_1[trans_1]=1
            else:
                init = tag
                if init in c_inits:
                     c_inits[init] += 1
                else:
                     c_inits[init] = 1
            if i>=2:
                bi_obs = (word[i-2][1],word[i-1][1])
                if bi_obs in c_biobs:
                    c_biobs[bi_obs]+=1
                else:
                    c_biobs[bi_obs]=1
                trans = (bi_obs,tag)
                if trans in c_trans_2:
                    c_trans_2[trans]+=1
                else:
                    c_trans_2[trans]=1
               
    return c_obs, c_tags, c_pairs, c_trans_1,c_trans_2, c_inits, c_biobs


In [None]:
c_obs, c_tags, c_pairs, c_trans_1,c_trans_2, c_inits, c_trans_start_list = make_counts(data)

In [None]:
hmm = HMM(c_tags.keys(),c_obs.keys(),c_trans_start_list.keys(),transition_proba_1=None,transition_proba_2=None,
         observation_proba=None,
         initial_state_proba=None,smoothing_obs=0.001)
hmm.supervised_training(c_pairs, c_trans_1, c_trans_2, c_inits)

In [None]:
with open('test10.pkl','rb') as f:
    data_test = pickle.load(f)

In [None]:
hmm.Y_index[hmm.omega_T[0][1]]

In [None]:
errors = 0
total = 0
errors_letters = 0

for word in data_test:
    obs = []
    true = []
    for pair in word:
        obs.append(pair[0])
        true.append(hmm.Y_index[pair[1]])
    ls_states = hmm.viterbi(obs)
    print ls_states, true
    for i in  range(len(ls_states)):
        total+=1
        if ls_states[i]!=true[i]:
            errors_letters+=1
print "Error rate =", errors_letters*1.0/total