# Apprentissage non-supervisé

### Correction de fautes de frappe

In [1]:
import numpy as np
import pandas as pd
import pickle
import time

In [2]:
class HMM:

    def __init__(self, A, B, pi):
        self.A = A
        self.B = B
        self.pi = pi
        

    def forward(self, Y):
        N = self.A.shape[0]
        T = len(Y)

        alpha = np.zeros([N,T])
        alpha[:,0] = self.pi * self.B[:, Y[0]]

        for t in range(1,T):
            for i in range(N):
                alpha[i,t] = self.B[i, Y[t]] * np.sum(alpha[:,t-1] * self.A[:,i])

        return alpha

    def backward(self, Y):
        N = self.A.shape[0]
        T = len(Y)

        beta = np.zeros([N,T])
        beta[:,-1] = 1

        for t in range(T-2,-1,-1):
            for i in range(N):
                beta[i,t] = np.sum(beta[:,t+1] * self.A[i,:] * self.B[:, Y[t+1]])

        return beta

    def baum_welch_train(self, Y):
        
        # La mise a jour des parametres ne s'effectue pas correctement.
        tps = time.clock()
        N = self.A.shape[0]
        T = len(Y)

        # Effectuons les etapes de forward et backward
        alpha = self.forward(Y)
        beta = self.backward(Y)

        # Stockons P( Y | theta )
        Y_proba = np.sum(alpha[:,-1])

        # Calculons gamma
        gamma = alpha * beta / Y_proba
        
        # Calculons xi
        xi = np.zeros([T-1, N, N])
        for t in range(T-1):
            xi[t,:,:] = (alpha[:,t] * self.A.T).T * beta[:, t+1] * self.B[:,Y[t+1]] / Y_proba
 
        self.pi = gamma[:,0]
        self.A = np.sum(xi,axis = 0)/np.sum(gamma[:,:-1],axis = 1)
        
        for i in range(N):
            for t1 in range(T):
                self.B[i,t1] = 0
                ind = np.argwhere(Y == t1)
                self.B[i,t1] += np.sum(gamma[i,ind])
                self.B[i,t1]/= np.sum(gamma[i,:])
                
        print("Temps d'entrainement %.2fs" %(time.clock()-tps))
        
        return self.A, self.B, self.pi
        
    def viterbi(self, Y):
        N = self.A.shape[0]
        T = len(Y)
        old = np.zeros([T-1, N])

        V = np.zeros((N, T))
        V[:,0] = self.pi * self.B[:,Y[0]]

        for t in range(1, T):
            for n in range(N):
                probas = V[:,t-1] * self.A[:,n] * self.B[n, Y[t]]
                old[t-1,n] = np.argmax(probas)
                V[n,t] = np.max(probas)

        return V, old

In [3]:
path = 'typos-data/'

# Données avec 10% de typos
train10 = pickle.load(open(path+'train10.pkl', 'rb'))
test10 = pickle.load(open(path+'test10.pkl', 'rb'))

# Données avec 20% de typos
train20 = pickle.load(open(path+'train20.pkl', 'rb'))
test20 = pickle.load(open(path+'test20.pkl', 'rb'))

train = train10
test = test10

tot = len(train + test)
print ("Nombre de phrases totales = " + str(tot))
print ("Nombre de phrases de train = " + str(len(train)))
print ("Nombre de phrases de test  = " + str(len(test)))

Nombre de phrases totales = 30558
Nombre de phrases de train = 29057
Nombre de phrases de test  = 1501


In [4]:
temp = [[letter[0] for letter in word] for word in train10]
texte = [letter for word in temp for letter in word]
texte[:17]

['b',
 'y',
 't',
 'h',
 'e',
 'i',
 'r',
 'o',
 'w',
 'n',
 'a',
 'c',
 'v',
 'o',
 'u',
 'n',
 't']

In [6]:
alphabet = 'abcdefghijklmnopqrstuvwxyz'
alphabet.index('a')

0

In [7]:
dic_len = len(np.unique(texte))
texte_len = len(texte)
print(dic_len)
print(texte_len)

26
143168


In [8]:
A_init = np.ones([dic_len,dic_len])/dic_len
B_init = np.ones([dic_len,texte_len])/dic_len
pi_init = np.ones(dic_len)/dic_len

In [9]:
observations = [alphabet.index(letter) for letter in texte]
observations[:10]

[1, 24, 19, 7, 4, 8, 17, 14, 22, 13]

In [10]:
hmm = HMM(A=A_init, B=B_init, pi = pi_init)

A,B,pi = hmm.baum_welch_train(observations)



Temps d'entrainement 347.31s


In [11]:
#On voit ici que l'apprentissage ne s'est pas bien effectue, donc l'algorithme de viterbi ne peut pas fonctionner
hmm.viterbi(observations)

(array([[ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        ..., 
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]]),
 array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]))