In [1]:
import pickle
import numpy as np
from numpy import array, ones,zeros
import sys

class HMM:
    """First-order hidden Markov model estimated from counts, decoded with Viterbi.

    Conventions shared by all methods:

    * ``transition_proba[i, j]``: probability of moving to state ``i`` when
      the previous state is ``j`` (columns are normalised by training).
    * ``observation_proba[k, i]``: probability of emitting observation ``k``
      while in state ``i`` (columns are normalised by training).
    * ``initial_state_proba[i]``: probability that a sequence starts in
      state ``i``.

    States and observations are addressed by their position in
    ``state_list`` / ``observation_list``; ``Y_index`` and ``X_index`` map
    each label to that position.
    """

    def __init__(self,
                 state_list, observation_list,
                 transition_proba=None,
                 observation_proba=None,
                 initial_state_proba=None,
                 smoothing_obs=0.01):
        """Create the model and print its size.

        Parameters:
            state_list: iterable of state labels (hashable).
            observation_list: iterable of observation labels (hashable).
            transition_proba: optional (N, N) table; zeros when ``None``.
            observation_proba: optional (M, N) table; zeros when ``None``.
            initial_state_proba: optional length-N vector; zeros when ``None``.
            smoothing_obs: constant added to every emission count before
                normalisation in ``observation_estimation``.
        """
        # list() lets any iterable through, including dict key views, which
        # cannot be indexed by position.
        self.omega_Y = list(state_list)
        self.omega_X = list(observation_list)
        self.N = len(self.omega_Y)  # number of states
        self.M = len(self.omega_X)  # number of observations
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("HMM creating with: ")
        print(str(self.N) + " states")
        print(str(self.M) + " observations.")
        self.transition_proba = self._float_table(transition_proba, (self.N, self.N))
        self.observation_proba = self._float_table(observation_proba, (self.M, self.N))
        # A caller-supplied initial distribution must be stored as well;
        # otherwise the attribute would not exist at all.
        self.initial_state_proba = self._float_table(initial_state_proba, (self.N,))
        self.make_indexes()
        self.smoothing_obs = smoothing_obs

    @staticmethod
    def _float_table(values, shape):
        """Return a private float array: zeros of ``shape`` or a copy of ``values``."""
        if values is None:
            return zeros(shape, float)
        # Copy so that training never writes into the caller's array.
        return np.array(values, dtype=float)

    @staticmethod
    def _normalize_columns(table):
        """Scale every column of ``table`` to sum to 1; all-zero columns stay zero."""
        totals = table.sum(axis=0).reshape(1, -1)
        # Dividing an all-zero column by 1 keeps it at zero instead of
        # producing NaN from 0/0.
        totals[totals == 0] = 1.0
        return table / totals

    def make_indexes(self):
        """Build the label -> position lookups ``Y_index`` and ``X_index``."""
        self.Y_index = {label: i for i, label in enumerate(self.omega_Y)}
        self.X_index = {label: k for k, label in enumerate(self.omega_X)}

    def observation_estimation(self, pair_counts):
        """Estimate ``observation_proba`` from ``{(observation, state): count}``.

        Raises:
            KeyError: if a state label is not in ``Y_index``.
        """
        for pair, count in pair_counts.items():
            # NOTE(review): an observation missing from X_index is written to
            # row 0, i.e. onto the first observation. This looks like a
            # leftover "unknown symbol" slot; confirm it is intended.
            k = self.X_index.get(pair[0], 0)
            i = self.Y_index[pair[1]]
            self.observation_proba[k, i] = count

        self.observation_proba = self._normalize_columns(
            self.observation_proba + self.smoothing_obs)

    def transition_estimation(self, trans_counts):
        """Estimate ``transition_proba`` from ``{(previous, next): count}``.

        A state that never appears as ``previous`` keeps an all-zero column.

        Raises:
            KeyError: if a state label is not in ``Y_index``.
        """
        for pair, count in trans_counts.items():
            i = self.Y_index[pair[1]]  # row: destination state
            j = self.Y_index[pair[0]]  # column: source state
            self.transition_proba[i, j] = count
        self.transition_proba = self._normalize_columns(self.transition_proba)

    def init_estimation(self, init_counts):
        """Estimate ``initial_state_proba`` from ``{state: count}``.

        The vector is left unnormalised when it sums to zero.

        Raises:
            KeyError: if a state label is not in ``Y_index``.
        """
        for tag, count in init_counts.items():
            self.initial_state_proba[self.Y_index[tag]] = count
        total = sum(self.initial_state_proba)
        if total != 0:
            self.initial_state_proba = self.initial_state_proba / total

    def supervised_training(self, pair_counts, trans_counts, init_counts):
        """Estimate the emission, transition and initial tables from counts."""
        self.observation_estimation(pair_counts)
        self.transition_estimation(trans_counts)
        self.init_estimation(init_counts)

    def viterbi(self, obs):
        """Return the most probable state sequence for ``obs``.

        Parameters:
            obs: sequence of observation labels.

        Returns:
            A list of state positions (integers, not labels), one per
            observation; an empty list for an empty ``obs``. When several
            predecessors or final states tie, the highest position wins.

        Raises:
            KeyError: if an observation is not in ``X_index``.
        """
        if len(obs) == 0:
            return []

        states = range(self.N)

        # Base case: the first symbol is an observation, so it is looked up
        # in X_index (the observation index), not in the state index.
        k = self.X_index[obs[0]]
        scores = [self.initial_state_proba[y] * self.observation_proba[k, y]
                  for y in states]

        # back_pointers[t - 1][y]: best predecessor of state y at time t.
        back_pointers = []
        for t in range(1, len(obs)):
            k = self.X_index[obs[t]]
            step_scores = []
            step_pointers = []
            for y in states:
                prob, previous = max(
                    (scores[y0] * self.transition_proba[y, y0] * self.observation_proba[k, y], y0)
                    for y0 in states)
                step_scores.append(prob)
                step_pointers.append(previous)
            scores = step_scores
            back_pointers.append(step_pointers)

        # Start from the best final state and follow the pointers backwards.
        _, state = max((scores[y], y) for y in states)
        decoded = [state]
        for step_pointers in reversed(back_pointers):
            state = step_pointers[state]
            decoded.append(state)
        decoded.reverse()
        return decoded
    
# Load the training corpus. The sample shown in the next cell is a list of
# words, each word a list of 2-tuples; make_counts reads element 0 as the
# observation and element 1 as the tag.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
with open('train10.pkl','rb') as f:
    data = pickle.load(f)

In [2]:
data[0], data[1], data[2]

([('b', 'b'), ('y', 'y')],
 [('t', 't'), ('h', 'h'), ('e', 'e'), ('i', 'i'), ('r', 'r')],
 [('o', 'o'), ('w', 'w'), ('n', 'n')])

In [3]:
len(data)

29057

In [4]:
def error_counts(data):
    """Print word-level and letter-level error rates of a corpus.

    Parameters:
        data: iterable of words, each word an iterable of pairs. A pair whose
            two elements differ counts as one erroneous letter; a word with
            at least one such pair counts as one erroneous word.

    Prints four lines (total words, total letters, word error rate, letter
    error rate) and returns ``None``. A rate whose denominator is zero is
    reported as 0.0 instead of raising ZeroDivisionError.
    """
    total_words = 0
    total_letters = 0
    error_words = 0
    error_letters = 0

    for word in data:
        total_words += 1
        mismatches = 0
        for pair in word:
            total_letters += 1
            if pair[0] != pair[1]:
                mismatches += 1
        error_letters += mismatches
        if mismatches:
            error_words += 1

    word_rate = error_words * 1.0 / total_words if total_words else 0.0
    letter_rate = error_letters * 1.0 / total_letters if total_letters else 0.0

    # One pre-formatted string per print(...) gives the same text on
    # Python 2 and Python 3 (the double spaces match the original output).
    print("total words : %s" % total_words)
    print("total letters:  %s" % total_letters)
    print("Error words rate:  %s" % word_rate)
    print("Error letter rate:  %s" % letter_rate)

In [5]:
error_counts(data)

total words : 29057
total letters:  143168
Error words rate:  0.379942870909
Error letter rate:  0.0996102481001


In [6]:
def make_counts(corpus):
    """Collect the counts needed to train the HMM from a tagged corpus.

    ``corpus`` is an iterable of words; every word is a sequence of pairs
    whose element 0 is the observation and element 1 is the tag.

    Returns five dicts, in this order:
        observation -> count, tag -> count, pair -> count,
        (previous tag, tag) -> count, and word-initial tag -> count.
    """
    def bump(table, key):
        # Add one occurrence of `key`, creating the entry on first sight.
        table[key] = table.get(key, 0) + 1

    obs_counts = {}
    tag_counts = {}
    pair_counts = {}
    trans_counts = {}
    init_counts = {}

    for word in corpus:
        previous_tag = None
        for position, pair in enumerate(word):
            observed, label = pair[0], pair[1]
            bump(tag_counts, label)
            bump(obs_counts, observed)
            bump(pair_counts, pair)
            if position == 0:
                # The first pair of a word contributes an initial-state count.
                bump(init_counts, label)
            else:
                bump(trans_counts, (previous_tag, label))
            previous_tag = label

    return obs_counts, tag_counts, pair_counts, trans_counts, init_counts
            

In [7]:
# Count observations, tags, (observation, tag) pairs, tag-to-tag transitions
# and word-initial tags over the training corpus.
c_obs, c_tags, c_pairs, c_trans, c_inits = make_counts(data)

In [8]:
# Word-initial tag counts; print(...) with one argument gives the same text
# on Python 2 and Python 3.
print(c_inits)

{'a': 3093, 'c': 1210, 'b': 1429, 'e': 880, 'd': 853, 'g': 490, 'f': 1035, 'i': 2352, 'h': 1048, 'k': 79, 'j': 54, 'm': 1171, 'l': 675, 'o': 2291, 'n': 757, 'q': 34, 'p': 1535, 's': 2172, 'r': 758, 'u': 288, 't': 5058, 'w': 1582, 'v': 136, 'y': 74, 'x': 3}


In [9]:
# States are the distinct tags and observations the distinct observed symbols
# of the training corpus. The probability tables are left as None so the
# model starts from zeros and is filled in by supervised_training.
# smoothing_obs is added to every emission count before normalisation.
hmm = HMM(c_tags.keys(),c_obs.keys(),transition_proba=None,
         observation_proba=None,
         initial_state_proba=None,smoothing_obs=0.001)
hmm.supervised_training(c_pairs, c_trans, c_inits)

HMM creating with: 
26 states
26 observations.


In [10]:
# Sanity check: the initial-state distribution should sum to 1.
# print(...) with one argument gives the same text on Python 2 and Python 3.
print(sum(hmm.initial_state_proba))

1.0


In [11]:
# Load the test corpus.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
with open('test20.pkl','rb') as f:
    data_test = pickle.load(f)

In [12]:
# Letter-level error rate of the Viterbi decoding on the test corpus.
errors = 0  # not updated in this cell
total = 0
errors_letters = 0

for word in data_test:
    if not word:
        # Nothing to decode, and decoding an empty sequence may fail.
        continue
    obs = [pair[0] for pair in word]
    # Viterbi returns state positions, so the reference tags are converted
    # to positions as well.
    true = [hmm.Y_index[pair[1]] for pair in word]
    ls_states = hmm.viterbi(obs)

    total += len(ls_states)
    for predicted, expected in zip(ls_states, true):
        if predicted != expected:
            errors_letters += 1

# Report 0.0 instead of dividing by zero when nothing was decoded.
error_rate = errors_letters * 1.0 / total if total else 0.0
# A single pre-formatted string prints the same on Python 2 and Python 3.
print("Error rate = %s" % error_rate)

Error rate = 0.13234677371


In [13]:
error_counts(data_test)

total words : 3374
total letters:  16691
Error words rate:  0.593360995851
Error letter rate:  0.194056677251


In [14]:
# Scratch check of dict usage: a list stored under a key is read back by key.
path = {}
path[1] = [1]
# print(...) with one argument gives the same text on Python 2 and Python 3.
print(path[1])

[1]
