# Début d'implémentation du modèle

## Traitement des données
A réfléchir par la suite.

### Etape 0 : on importe ce qui est nécessaire

In [2]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import math

### Étape 1 : Créer le vocabulaire à partir du corpus de phrases

In [3]:
corpus = [
    'il est un roi',
    'elle est une reine',
    'il est un homme',
    'elle est une femme',
    'londres est la capitale du royaume uni',
    "berlin est la capitale de l allemagne",
    'paris est la capitale de la france',
]

In [4]:
def tokenize(corpus):
    tokens = [phrase.split() for phrase in corpus]
    return tokens

t_corpus = tokenize(corpus)
t_corpus

[['il', 'est', 'un', 'roi'],
 ['elle', 'est', 'une', 'reine'],
 ['il', 'est', 'un', 'homme'],
 ['elle', 'est', 'une', 'femme'],
 ['londres', 'est', 'la', 'capitale', 'du', 'royaume', 'uni'],
 ['berlin', 'est', 'la', 'capitale', 'de', 'l', 'allemagne'],
 ['paris', 'est', 'la', 'capitale', 'de', 'la', 'france']]

In [5]:
voc = []
for phrase in t_corpus:
    for mot in phrase:
        if mot not in voc:
            voc.append(mot)
voc_size = len(voc)
voc

['il',
 'est',
 'un',
 'roi',
 'elle',
 'une',
 'reine',
 'homme',
 'femme',
 'londres',
 'la',
 'capitale',
 'du',
 'royaume',
 'uni',
 'berlin',
 'de',
 'l',
 'allemagne',
 'paris',
 'france']

### Étape 2 : on s'intéresse aux mots centraux et aux contextes suivant taille de fenêtre

In [6]:
mot_index = {w: index for (index, w) in enumerate(voc)}
index_mot = {index: w for (index, w) in enumerate(voc)}

taille_fenetre = 3
index_pairs = []
# On traite chaque phrase.
for phrase in t_corpus:
    indices = [mot_index[mot] for mot in phrase]
    # On traite chaque mot comme un mot central
    for center_word in range(len(indices)):
        # Pour chaque fenetre possible
        for w in range(-taille_fenetre, taille_fenetre + 1):
            context_word = center_word + w
            # On fait attention à ne pas sauter de phrases
            if context_word < 0 or context_word >= len(indices) or center_word == context_word:
                continue
            context_word_ind = indices[context_word]
            index_pairs.append((indices[center_word], context_word_ind))


In [7]:
index_pairs_np = np.array(index_pairs)
index_pairs_np

array([[ 0,  1],
       [ 0,  2],
       [ 0,  3],
       [ 1,  0],
       [ 1,  2],
       [ 1,  3],
       [ 2,  0],
       [ 2,  1],
       [ 2,  3],
       [ 3,  0],
       [ 3,  1],
       [ 3,  2],
       [ 4,  1],
       [ 4,  5],
       [ 4,  6],
       [ 1,  4],
       [ 1,  5],
       [ 1,  6],
       [ 5,  4],
       [ 5,  1],
       [ 5,  6],
       [ 6,  4],
       [ 6,  1],
       [ 6,  5],
       [ 0,  1],
       [ 0,  2],
       [ 0,  7],
       [ 1,  0],
       [ 1,  2],
       [ 1,  7],
       [ 2,  0],
       [ 2,  1],
       [ 2,  7],
       [ 7,  0],
       [ 7,  1],
       [ 7,  2],
       [ 4,  1],
       [ 4,  5],
       [ 4,  8],
       [ 1,  4],
       [ 1,  5],
       [ 1,  8],
       [ 5,  4],
       [ 5,  1],
       [ 5,  8],
       [ 8,  4],
       [ 8,  1],
       [ 8,  5],
       [ 9,  1],
       [ 9, 10],
       [ 9, 11],
       [ 1,  9],
       [ 1, 10],
       [ 1, 11],
       [ 1, 12],
       [10,  9],
       [10,  1],
       [10, 11],
       [10, 12

### Étape 3 : création du modèle

In [8]:
#Couche d'entrée
def get_input_layer(word_idx):
    x = torch.zeros(voc_size).float()
    x[word_idx] = 1.0
    return x

# Choix de dimension
embedding_dims = 10
# Initialisation
W1 = Variable(torch.randn(embedding_dims, voc_size).float(), requires_grad=True)
W2 = Variable(torch.randn(voc_size, embedding_dims).float(), requires_grad=True)
num_epochs = 100
learning_rate = 0.001

# Différentes étapes
for epo in range(num_epochs):
    loss_val = 0
    for data, target in index_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.data
        
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()

In [9]:
W1

tensor([[-1.9555e-01,  7.2886e-01,  1.0322e+00, -8.6814e-01,  5.1501e-01,
          1.5084e+00, -3.1089e-01,  1.4521e+00, -9.7177e-01, -1.0397e-01,
          1.8555e+00,  2.0235e+00, -2.1156e+00,  3.9244e-01, -7.6099e-01,
          2.1998e+00,  5.8356e-01, -1.2200e-01, -3.3776e-01,  8.0611e-01,
         -6.6319e-01],
        [ 9.4960e-02, -1.4288e-01, -2.6139e-01, -3.7451e-01,  7.3089e-01,
          4.3774e-01, -7.6348e-02,  4.7126e-01,  1.2195e-01, -3.0937e-01,
         -2.7749e-02,  9.0792e-02,  1.1726e+00, -1.2627e+00, -2.6967e-01,
          1.0601e-01, -1.6745e+00, -1.0016e+00, -3.8822e-01, -3.4536e-01,
          8.5108e-01],
        [ 2.2097e-01, -1.6801e-01, -2.6975e-01,  6.1842e-03,  1.0199e+00,
         -1.2153e-01, -2.6017e-01,  3.3633e-01,  2.1459e+00,  8.5230e-01,
          3.6541e-02,  5.2933e-01,  1.8022e+00, -1.3127e-01, -2.2334e-01,
          1.6428e+00, -1.9520e+00,  1.1803e+00,  1.6481e+00,  1.8366e+00,
         -1.3019e+00],
        [-1.0421e+00,  5.5622e-01, -3.9836e

In [10]:
W2

tensor([[ 1.0272e+00,  1.5142e+00, -1.8979e+00,  1.9843e+00,  1.2192e+00,
          8.4625e-01, -7.1607e-01, -7.0266e-01, -6.3734e-01, -4.5258e-01],
        [ 5.8777e-03, -3.1362e-01,  1.0872e+00, -1.1838e+00, -7.8668e-02,
          3.7533e-01, -2.0305e-01,  2.7986e-01, -1.0218e-01,  2.1370e-01],
        [-4.2832e-01,  2.4936e-02, -4.8093e-01, -4.2972e-01,  5.7291e-01,
         -3.0096e-01, -4.2489e-01,  1.0053e-01,  1.3057e+00,  6.0753e-01],
        [-7.2527e-01, -1.8543e-01,  5.3229e-01, -2.4526e-01, -6.7638e-01,
          1.8366e+00,  5.5246e-02,  4.6854e-01, -1.7854e-01,  6.7894e-01],
        [-7.8903e-01,  9.2313e-01, -1.1991e+00, -8.0325e-01,  1.4768e+00,
         -9.4519e-01,  1.8388e-01,  1.4917e-01,  1.2319e+00, -5.1932e-01],
        [-1.2087e-01,  6.4334e-01,  9.7421e-01,  3.4288e-01, -4.2727e-01,
          1.1688e+00, -8.7069e-01,  7.4865e-01, -6.1737e-01, -3.7798e-02],
        [ 4.9395e-01,  3.4851e-01,  5.3250e-01,  6.4358e-01, -1.2064e+00,
          1.0689e+00, -9.2787e-0

In [26]:
# Distance/similarité cosinus
def cos_distance(u, v):
    return (np.dot(u, v)  / (math.sqrt(np.dot(u, u)) *  (math.sqrt(np.dot(v, v)))))

In [27]:
# Dictionnaire des poids
mot_poids = {index_mot[index]: poids.detach().numpy() for (index, poids) in enumerate(W2)}
mot_poids

{'il': array([ 1.0271939 ,  1.5142411 , -1.8979274 ,  1.9842857 ,  1.2191842 ,
         0.8462504 , -0.7160699 , -0.7026647 , -0.63734436, -0.4525826 ],
       dtype=float32),
 'est': array([ 0.00587774, -0.31361765,  1.087241  , -1.1837964 , -0.07866777,
         0.3753333 , -0.2030453 ,  0.27986485, -0.10218013,  0.21370181],
       dtype=float32),
 'un': array([-0.4283153 ,  0.02493649, -0.48093316, -0.42972043,  0.5729108 ,
        -0.3009559 , -0.42488915,  0.10052869,  1.3056543 ,  0.6075299 ],
       dtype=float32),
 'roi': array([-0.7252708 , -0.1854279 ,  0.53229386, -0.24526359, -0.67638224,
         1.8366107 ,  0.05524555,  0.4685403 , -0.17853668,  0.6789356 ],
       dtype=float32),
 'elle': array([-0.78903353,  0.92313313, -1.1990632 , -0.8032484 ,  1.4767618 ,
        -0.94518656,  0.18388386,  0.14917396,  1.2318693 , -0.51932156],
       dtype=float32),
 'une': array([-0.12087134,  0.643345  ,  0.97421396,  0.34287968, -0.4272664 ,
         1.1687648 , -0.870695  ,  0

In [28]:
# Quelques résultats
cos_distance(mot_poids["homme"],mot_poids["femme"])

0.38157896279506315

In [29]:
cos_distance(mot_poids["homme"],mot_poids["il"])

-0.2913856975124925

In [32]:
cos_distance(mot_poids["homme"] + mot_poids["roi"] - mot_poids["femme"],mot_poids["reine"])

0.43000641219376956