In [45]:
import pickle
from numpy import array, ones,zeros
import sys

class HMM:
    """First-order hidden Markov model fitted from supervised counts.

    Matrix conventions (each column sums to 1 after training):
      * transition_proba[i, j]   -- P(next state = i | previous state = j)
      * observation_proba[k, i]  -- P(observation = k | state = i)
      * initial_state_proba[i]   -- P(first state = i)

    Indices are the positions of the symbols in the state / observation
    lists given to the constructor (see Y_index and X_index).
    """

    def __init__(self,
                 state_list, observation_list,
                 transition_proba=None,
                 observation_proba=None,
                 initial_state_proba=None,
                 smoothing_obs=0.01):
        """Create a model over the given state and observation alphabets.

        Parameters
        ----------
        state_list : iterable of hashable
            Hidden states (tags); position in the iterable is the state index.
        observation_list : iterable of hashable
            Observable symbols; position in the iterable is the symbol index.
        transition_proba : array of shape (N, N), optional
            Used as-is when given, otherwise initialised to zeros.
        observation_proba : array of shape (M, N), optional
            Used as-is when given, otherwise initialised to zeros.
        initial_state_proba : array of shape (N,), optional
            Used as-is when given, otherwise initialised to zeros.
        smoothing_obs : float
            Constant added to every observation count before normalising.
        """
        print("HMM creating with: ")
        # Materialise the alphabets so that dict views and other
        # non-indexable iterables are accepted as well as lists.
        self.omega_Y = list(state_list)
        self.omega_X = list(observation_list)
        self.N = len(self.omega_Y)  # number of states
        self.M = len(self.omega_X)  # number of observations
        print(str(self.N)+" states")
        print(str(self.M)+" observations.")
        if transition_proba is None:
            self.transition_proba = zeros((self.N, self.N), float)
        else:
            self.transition_proba = transition_proba
        if observation_proba is None:
            self.observation_proba = zeros((self.M, self.N), float)
        else:
            self.observation_proba = observation_proba
        if initial_state_proba is None:
            self.initial_state_proba = zeros((self.N,), float)
        else:
            # Previously a caller-supplied vector was silently dropped,
            # leaving the attribute undefined.
            self.initial_state_proba = initial_state_proba
        self.make_indexes()
        self.smoothing_obs = smoothing_obs

    def make_indexes(self):
        """Build the symbol -> index lookup tables Y_index and X_index."""
        self.Y_index = {}
        for i, state in enumerate(self.omega_Y):
            self.Y_index[state] = i
        self.X_index = {}
        for i, obs in enumerate(self.omega_X):
            self.X_index[obs] = i

    def observation_estimation(self, pair_counts):
        """Estimate observation_proba from (observation, state) counts.

        pair_counts maps (observation, state) tuples to counts.  Counts are
        written into the matrix, smoothing_obs is added to every cell and
        each column is normalised.  Raises KeyError for an unknown state.
        """
        for pair in pair_counts:
            wrd = pair[0]
            tag = pair[1]
            cpt = pair_counts[pair]
            # NOTE(review): an observation missing from X_index falls back to
            # row 0 and overwrites whatever count is stored there -- confirm
            # that row 0 is meant to act as the "unknown" symbol.
            k = 0
            if wrd in self.X_index:
                k = self.X_index[wrd]
            i = self.Y_index[tag]
            self.observation_proba[k, i] = cpt

        self.observation_proba = self.observation_proba + self.smoothing_obs
        self.observation_proba = self.observation_proba/self.observation_proba.sum(axis=0).reshape(1, self.N)

    def transition_estimation(self, trans_counts):
        """Estimate transition_proba from (previous state, next state) counts.

        trans_counts maps (previous, next) tuples to counts.  The count is
        stored at [next, previous] and every column is normalised.  No
        smoothing is applied, so a state that never occurs as a predecessor
        leaves an all-zero column and the division yields NaN for it.
        """
        for pair in trans_counts:
            i = self.Y_index[pair[1]]  # row: next state
            j = self.Y_index[pair[0]]  # column: previous state
            self.transition_proba[i, j] = trans_counts[pair]
        self.transition_proba = self.transition_proba/self.transition_proba.sum(axis=0).reshape(1, self.N)

    def init_estimation(self, init_counts):
        """Estimate initial_state_proba from per-state first-position counts."""
        for tag in init_counts:
            i = self.Y_index[tag]
            self.initial_state_proba[i] = init_counts[tag]
        self.initial_state_proba = self.initial_state_proba/self.initial_state_proba.sum()

    def supervised_training(self, pair_counts, trans_counts, init_counts):
        """Fit observation, transition and initial distributions from counts."""
        self.observation_estimation(pair_counts)
        self.transition_estimation(trans_counts)
        self.init_estimation(init_counts)

# Load the training corpus from a pickle file located relative to the
# notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only run this on a file from a trusted source.
with open('train10.pkl','rb') as f:
    data = pickle.load(f)

In [46]:
# Peek at the first three corpus entries.
data[0], data[1], data[2]

([('b', 'b'), ('y', 'y')],
 [('t', 't'), ('h', 'h'), ('e', 'e'), ('i', 'i'), ('r', 'r')],
 [('o', 'o'), ('w', 'w'), ('n', 'n')])

In [47]:
# Number of entries in the corpus.
len(data)

29057

In [48]:
def error_counts(corpus=None):
    """Print word- and letter-level mismatch rates for a corpus.

    A letter counts as an error when the two elements of its pair differ;
    a word counts as an error when it contains at least one such letter.

    Parameters
    ----------
    corpus : iterable of words, optional
        Each word is an iterable of pairs.  Defaults to the notebook-level
        ``data`` variable, so existing ``error_counts()`` calls still work.

    Returns
    -------
    None.  Four summary lines are printed.  An empty corpus reports rates
    of 0.0 instead of raising ZeroDivisionError.
    """
    if corpus is None:
        corpus = data  # notebook-level corpus loaded in an earlier cell

    total_words = 0
    total_letters = 0
    error_words = 0
    error_letters = 0
    for word in corpus:
        total_words += 1
        word_has_error = False
        for pair in word:
            total_letters += 1
            if pair[0] != pair[1]:
                error_letters += 1
                word_has_error = True
        if word_has_error:
            error_words += 1

    word_rate = error_words*1.0/total_words if total_words else 0.0
    letter_rate = error_letters*1.0/total_letters if total_letters else 0.0

    # Single-argument print with %s formatting emits the same text on
    # Python 2 (as a print statement) and on Python 3 (as a function).
    print("total words : %s" % total_words)
    print("total letters:  %s" % total_letters)
    print("Error words rate:  %s" % word_rate)
    print("Error letter rate:  %s" % letter_rate)

In [49]:
# Report the word- and letter-level error rates.
error_counts()

total words : 29057
total letters:  143168
Error words rate:  0.379942870909
Error letter rate:  0.0996102481001


In [50]:
def make_counts(corpus):
    """Tally the count tables used for supervised HMM training.

    Every word in ``corpus`` is a sequence of pairs whose first element is
    the observation and whose second element is the tag.

    Returns a 5-tuple of plain dicts, in this order:
      observation counts, tag counts, pair counts,
      (previous tag, tag) transition counts, first-position tag counts.
    """
    obs_counts = {}
    tag_counts = {}
    pair_counts = {}
    transition_counts = {}
    initial_counts = {}

    def bump(table, key):
        # Add one to table[key], starting from zero for unseen keys.
        table[key] = table.get(key, 0) + 1

    for word in corpus:
        previous_tag = None
        for position, pair in enumerate(word):
            obs, tag = pair[0], pair[1]
            bump(tag_counts, tag)
            bump(obs_counts, obs)
            bump(pair_counts, pair)
            if position == 0:
                # The first letter of a word feeds the initial distribution.
                bump(initial_counts, tag)
            else:
                bump(transition_counts, (previous_tag, tag))
            previous_tag = tag

    return obs_counts, tag_counts, pair_counts, transition_counts, initial_counts
            

In [51]:
# Build the count tables; the unpacking order must match the order in which
# make_counts returns them.
c_obs, c_tags, c_pairs, c_trans, c_inits = make_counts(data)

In [52]:
# Dump the tag, observation and transition count tables for inspection.
print c_tags
print c_obs
print c_trans

{'a': 10560, 'c': 4808, 'b': 2070, 'e': 18091, 'd': 4541, 'g': 2736, 'f': 3379, 'i': 10976, 'h': 6683, 'k': 590, 'j': 108, 'm': 3773, 'l': 6417, 'o': 11935, 'n': 9778, 'q': 150, 'p': 3217, 's': 9762, 'r': 8247, 'u': 3931, 't': 13877, 'w': 2229, 'v': 1927, 'y': 2985, 'x': 274, 'z': 124}
{'a': 9765, 'c': 4530, 'b': 2289, 'e': 16638, 'd': 5256, 'g': 3161, 'f': 3879, 'i': 10274, 'h': 6558, 'k': 1463, 'j': 1001, 'm': 3677, 'l': 6266, 'o': 11342, 'n': 9106, 'q': 534, 'p': 3432, 's': 9328, 'r': 8443, 'u': 3886, 't': 12941, 'w': 2782, 'v': 2051, 'y': 3266, 'x': 645, 'z': 655}
{('c', 'u'): 143, ('i', 'h'): 2, ('r', 'w'): 16, ('h', 's'): 21, ('n', 'o'): 757, ('o', 'x'): 5, ('s', 'o'): 748, ('r', 'p'): 37, ('k', 'y'): 1, ('a', 'q'): 2, ('f', 'c'): 5, ('o', 'w'): 456, ('w', 'd'): 10, ('f', 'r'): 228, ('m', 'i'): 285, ('r', 'i'): 767, ('c', 'q'): 11, ('h', 'w'): 2, ('a', 's'): 693, ('o', 'u'): 871, ('s', 'k'): 29, ('e', 'g'): 104, ('n', 't'): 1137, ('t', 'i'): 1754, ('n', 'u'): 43, ('r', 'b'): 12, 

In [54]:
# Build the model over the tags and observations seen in the corpus, then fit
# it from the counts. State / observation indices follow the order in which
# keys() yields the symbols. smoothing_obs=0.001 overrides the constructor
# default of 0.01.
hmm = HMM(c_tags.keys(),c_obs.keys(),transition_proba=None,
         observation_proba=None,
         initial_state_proba=None,smoothing_obs=0.001)
hmm.supervised_training(c_pairs, c_trans, c_inits)

HMM creating with: 
26 states
26 observations.


In [55]:
# Sanity check: every column of the transition matrix should sum to 1.
print hmm.transition_proba.sum(axis=0)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.]


In [56]:
# Sanity check: the initial state distribution should sum to 1.
print sum(hmm.initial_state_proba)

1.0
