# Début d'implémentation du modèle

### Étape 0 : Importations

In [278]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import math
import pandas as pd
import random

### Étape 1 : Récupération des données

In [279]:
# Load the raw corpus. The file is parsed with "\n" as the separator and
# without a header row.
# NOTE(review): recent pandas releases reportedly reject sep='\n' -- confirm
# against the installed version before relying on this call.
chemin_corpus = "C:/Users/torna/Documents/StatApp/StatApp/data/sample1.txt"
df = pd.read_csv(chemin_corpus, sep='\n', header=None)

### Étape 2 : Créer le vocabulaire à partir du corpus de phrases

In [280]:
# Work on the first ten rows of the data frame only.
df2 = df[0:10]

# Flatten the selected rows into one flat list of raw sentences, row by row.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0;
# items() yields the same (label, value) pairs.
corpus = [cellule for _, ligne in df2.iterrows() for _, cellule in ligne.items()]

In [281]:
# Normalise every sentence: delete the punctuation marks ? . ! ; , : in a
# single translate() pass, then lower-case the result.
ponctuation = str.maketrans("", "", "?.!;,:")
corpus_corr = [phrase.translate(ponctuation).lower() for phrase in corpus]

In [282]:
def tokenize(corpus):
    """Split every sentence of ``corpus`` on whitespace.

    Returns a list holding one list of tokens per input sentence, in the
    same order as the input.
    """
    return [sentence.split() for sentence in corpus]

# Tokenised version of the cleaned corpus: one list of words per sentence.
t_corpus = tokenize(corpus_corr)

In [283]:
# Remove the @nickname mentions from every sentence.
# Each sentence is rebuilt rather than calling remove() while iterating over
# it: removing during iteration makes the loop skip the token that follows
# each removal, so consecutive mentions were left in place. Slice assignment
# keeps the same list objects inside t_corpus.
for phrase in t_corpus:
    phrase[:] = [mot for mot in phrase if not mot.startswith('@')]

In [284]:
# Build the vocabulary (words in order of first appearance) together with the
# raw occurrence count of every word. Membership is tested on the dict rather
# than on the list, so each lookup is a hash lookup instead of a scan of the
# whole vocabulary; the resulting voc and freqs are the same.
voc = []
freqs = {}
for phrase in t_corpus:
    for mot in phrase:
        if mot in freqs:
            freqs[mot] += 1
        else:
            voc.append(mot)
            freqs[mot] = 1
voc_size = len(voc)

###### Sub-sampling

In [285]:
# Total number of tokens in the tokenised corpus, all sentences combined.
total_mots = sum(len(phrase) for phrase in t_corpus)

In [286]:
# Turn the raw counts into relative frequencies (count / total token count).
# Only existing keys are reassigned, so updating the dict while looping over
# it is safe.
for mot in freqs:
    freqs[mot] /= total_mots

In [287]:
# Sub-sampling weight of every word, capped at 1:
#     (sqrt(f / 0.001) + 1) * (0.001 / f)   with f the relative frequency.
# Despite its name, this value is compared against a uniform draw to decide
# whether a word is KEPT, so it acts as a keep probability.
p_drop = {
    word: min((math.sqrt(freq / 0.001) + 1) * (0.001 / freq), 1)
    for word, freq in freqs.items()
}

In [288]:
# Keep a vocabulary word when a uniform draw in [0, 1) falls below
# p_drop[word]; exactly one draw is made per word, in vocabulary order.
train_words = []
for word in voc:
    if np.random.random() < p_drop[word]:
        train_words.append(word)
# From here on the vocabulary size counts the retained words only.
voc_size = len(train_words)

In [289]:
# Filter every sentence down to the words that survived sub-sampling, keeping
# sentence order and word order. Membership is tested against a set built
# once, instead of scanning the train_words list for every token.
mots_gardes = set(train_words)
corpus_samp = [
    [mot for mot in phrase if mot in mots_gardes]
    for phrase in t_corpus
]

### Étape 3 : Création des paires mot central / mot de contexte

In [290]:
# Two-way lookup tables between the retained words and their integer ids.
index_mot = dict(enumerate(train_words))
mot_index = {mot: rang for rang, mot in index_mot.items()}

taille_fenetre = 4
index_pairs = []
# Each sentence is handled on its own, so a window never crosses a sentence
# boundary.
for phrase in corpus_samp:
    indices = [mot_index[mot] for mot in phrase]
    longueur = len(indices)
    # Every position in turn plays the role of the centre word.
    for position, centre in enumerate(indices):
        # Clamp the window to the sentence limits.
        debut = max(0, position - taille_fenetre)
        fin = min(longueur, position + taille_fenetre + 1)
        for voisin in range(debut, fin):
            # The centre word is not its own context.
            if voisin != position:
                index_pairs.append((centre, indices[voisin]))

In [291]:
# NumPy array built from the list of (centre word, context word) index pairs.
index_pairs_np = np.array(index_pairs)

### Étape 4 : Création du modèle

In [292]:
# Input layer
def get_input_layer(word_idx, size=None):
    """Return a one-hot float32 vector encoding ``word_idx``.

    Args:
        word_idx: Position that is set to 1.0.
        size: Length of the vector. When omitted, the module-level
            ``voc_size`` is used, so existing one-argument calls behave as
            before.

    Returns:
        A 1-D ``torch.float32`` tensor of zeros with 1.0 written at
        ``word_idx``.
    """
    if size is None:
        size = voc_size
    x = torch.zeros(size, dtype=torch.float32)
    x[word_idx] = 1.0
    return x

# Embedding dimension
embedding_dims = 10
# Initialisation of the two weight matrices.
# Plain tensors flagged with requires_grad replace the deprecated
# torch.autograd.Variable wrapper; autograd tracks them in the same way.
W1 = torch.randn(embedding_dims, voc_size).float().requires_grad_()
W2 = torch.randn(voc_size, embedding_dims).float().requires_grad_()
num_epochs = 5  # number of passes over all (centre, context) pairs
learning_rate = 0.01

# Training: plain stochastic gradient descent, one pair at a time.
for epo in range(num_epochs):
    # NOTE(review): loss_val accumulates the loss of the current epoch but is
    # never printed or stored -- confirm whether it should be reported.
    loss_val = 0
    for word, context in index_pairs:
        x = get_input_layer(word)
        y_true = torch.tensor([context], dtype=torch.long)

        # Forward pass: two matrix-vector products.
        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        # Log-softmax over the scores in z2.
        log_softmax = F.log_softmax(z2, dim=0)

        # Negative log-likelihood of the true context word.
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        loss_val += loss.detach()

        # Backward pass, then a manual gradient step. The update is done under
        # no_grad() so that it is not recorded by autograd.
        loss.backward()
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad

            # Gradients accumulate across backward() calls; reset them.
            W1.grad.zero_()
            W2.grad.zero_()

In [293]:
# Display rows 0 to 9 of W2 (notebook cell output).
W2[0:10]

tensor([[ 0.0504,  0.6502,  0.4829, -0.3945,  0.0564,  0.4515,  2.2502,  0.5078,
         -0.0333,  0.4068],
        [-0.4991, -0.0359, -0.0813,  0.6033, -0.2522,  1.1190,  0.6002,  0.7018,
          0.1371, -0.9672],
        [-0.2307, -1.7345, -0.4785, -0.5673,  1.2067,  0.5514,  1.3583, -0.5170,
          0.4886, -0.0573],
        [ 0.8444, -0.8946, -1.5188, -0.1301,  1.5705,  0.2099,  0.0330, -1.9745,
         -0.4497, -1.2978],
        [ 1.6875,  1.2736, -0.5157,  1.1318,  0.8934,  0.4470,  0.5132,  0.4536,
          0.3562,  0.1779],
        [ 0.4951, -1.1750, -0.8675, -0.0324, -0.8883, -1.5926, -1.4009, -1.2481,
         -1.3336,  0.2735],
        [-1.0025, -1.3701,  1.0445, -0.0246,  0.7833, -0.9320, -0.6022, -0.1857,
          1.7320,  0.6987],
        [ 1.6818, -1.3460, -1.1980, -0.2698, -0.8087,  1.0260, -0.0188,  0.3060,
          0.7526,  0.5768],
        [ 0.6953,  0.0922,  0.3825,  0.9420,  0.3644,  0.6338,  0.2019, -0.7270,
          1.6056, -0.9292],
        [ 0.2215,  

In [294]:
# Cosine similarity
def cos_distance(u, v):
    """Return the cosine of the angle between the vectors ``u`` and ``v``.

    Despite the name, the result is a similarity: the dot product of the two
    vectors divided by the product of their Euclidean norms.
    """
    norme_u = math.sqrt(np.dot(u, u))
    norme_v = math.sqrt(np.dot(v, v))
    return np.dot(u, v) / (norme_u * norme_v)

In [295]:
# Weight dictionary: maps each retained word to the matching row of W2,
# detached from autograd and converted to a NumPy array.
mot_poids = {
    index_mot[rang]: ligne.detach().numpy()
    for rang, ligne in enumerate(W2)
}

### Étape 5 : Résultats du modèle

In [296]:
def mot_plus_proche(word, n=5):
    """Return the ``n`` words whose embeddings are closest to ``word``.

    Closeness is the value returned by ``cos_distance`` between the vectors
    stored in ``mot_poids``; ``word`` itself is excluded.

    Args:
        word: Query word; it must be a key of ``mot_poids``.
        n: Maximum number of neighbours to return.

    Returns:
        A list of ``(word, similarity)`` tuples sorted by decreasing
        similarity, holding at most ``n`` entries.

    Raises:
        KeyError: If ``word`` is not a key of ``mot_poids``.
    """
    reference = mot_poids[word]
    similarites = {
        mot: cos_distance(poids, reference)
        for mot, poids in mot_poids.items()
        if mot != word
    }
    classement = sorted(similarites.items(), key=lambda t: t[1], reverse=True)
    # Honour ``n``: the slice used to be hard-coded to the first ten entries,
    # which silently ignored the parameter.
    return classement[:n]