# Début d'implémentation du modèle

### Étape 0 : Importations

In [158]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import math
import pandas as pd
import random
from random import *
from numpy.random import multinomial

### Étape 1 : Récupération des données

In [2]:
# Load the raw text file into a single-column DataFrame, one row per line
# (no header row; the newline character is used as the field separator).
# NOTE(review): recent pandas releases appear to reject "\n" as a separator;
# confirm against the installed pandas version.
df = pd.read_csv(
    "C:/Users/torna/Documents/StatApp/StatApp/data/sample1.txt",
    sep='\n',
    header=None,
)

### Étape 2 : Créer le vocabulaire à partir du corpus de phrases

In [134]:
# Work on the first 100 rows only, to keep the experiment small.
df2 = df[0:100]

# Flatten the selected rows into a flat list of cell values, row by row.
# `Series.items()` is used instead of `Series.iteritems()`, which was
# removed in pandas 2.0.
corpus = [cell for _, row in df2.iterrows() for _, cell in row.items()]

In [135]:
# Translation table that deletes the punctuation characters ? . ! ; , : #
_strip_table = str.maketrans("", "", "?.!;,:#")

# Cleaned corpus: punctuation removed, then everything lower-cased.
corpus_corr = [phrase.translate(_strip_table).lower() for phrase in corpus]

In [136]:
def tokenize(corpus):
    """Split every sentence of `corpus` on whitespace.

    Returns a list with one list of tokens per input sentence, in order.
    """
    tokens = []
    for sentence in corpus:
        tokens.append(sentence.split())
    return tokens

# Tokenised corpus: one list of whitespace-separated tokens per cleaned sentence.
t_corpus = tokenize(corpus_corr)

In [137]:
# Replace every @mention with the generic token "nickname", and keep only
# the sentences that contain at least 5 tokens.
corpus_ok = []
for phrase in t_corpus:
    phrase_bis = ["nickname" if mot[0] == '@' else mot for mot in phrase]
    if len(phrase) < 5:
        continue
    corpus_ok.append(phrase_bis)
t_corpus = corpus_ok

In [138]:
# Build the vocabulary (words in order of first appearance) and the raw
# occurrence count of every word.
# Membership is tested against the `freqs` dict (O(1)) rather than the `voc`
# list (O(len(voc))), which made the original build quadratic in practice.
voc = []
freqs = {}
for phrase in t_corpus:
    for mot in phrase:
        if mot in freqs:
            freqs[mot] += 1
        else:
            voc.append(mot)
            freqs[mot] = 1
voc_size = len(voc)
print(voc_size)

505


###### Calcul des probas pour le subsampling et le negative sampling

In [139]:
# Total number of tokens in the corpus (sum of the sentence lengths).
total_mots = sum(map(len, t_corpus))

In [140]:
# Turn the raw counts into relative frequencies, in place.
# NOTE: `freqs` is overwritten, so running this cell twice divides twice.
for key in freqs:
    freqs[key] = freqs[key] / total_mots

In [141]:
# Probability of keeping each word during subsampling:
# (sqrt(f / 0.001) + 1) * (0.001 / f), capped at 1, where f is the word's
# relative frequency.
p_sub = {}
for word, freq in freqs.items():
    keep = (math.sqrt(freq/0.001)+1)*(0.001/freq)
    p_sub[word] = min(keep, 1)

In [142]:
# Negative-sampling distribution: each word is weighted by its relative
# frequency raised to the power 3/4, then the weights are normalised to sum to 1.
p_neg_1 = {}
total_neg = 0
for word, freq in freqs.items():
    weight = freq**(3/4)
    p_neg_1[word] = weight
    total_neg += weight
p_neg = {word: weight/total_neg for word, weight in p_neg_1.items()}

In [143]:
def subSampling(phrase, probs=None):
    """Randomly drop words from a sentence according to keep-probabilities.

    Each word is kept when a fresh draw from `np.random.random()` is
    strictly below its keep-probability; the relative order of the kept
    words is preserved.

    Args:
        phrase: iterable of words (tokens).
        probs: optional mapping word -> keep-probability. Defaults to the
            module-level `p_sub` table when omitted.

    Returns:
        A new list containing the words that were kept.

    Raises:
        KeyError: if a word has no entry in the probability table.
    """
    if probs is None:
        probs = p_sub
    return [mot for mot in phrase if np.random.random() < probs[mot]]

In [144]:
# Two-way lookup tables between a word and its position in `voc`.
index_mot = dict(enumerate(voc))
mot_index = {w: index for index, w in index_mot.items()}

### Étape 3 : Création des paires mot central / mot de contexte

In [38]:

#taille_fenetre = 4
#index_pairs = []
# On traite chaque phrase.
#for phrase in t_corpus:
 #   indices = [mot_index[mot] for mot in phrase]
    # On traite chaque mot comme un mot central
   # for center_word in range(len(indices)):
       # Pour chaque fenetre possible
       # for w in range(-taille_fenetre, taille_fenetre + 1):
      #      context_word = center_word + w
            # On fait attention à ne pas sauter de phrases
     #       if context_word < 0 or context_word >= len(indices) or center_word == context_word:
    #            continue
   #         context_word_ind = indices[context_word]
  #          index_pairs.append((indices[center_word], context_word_ind))
            
#index_pairs_np = np.array(index_pairs)
#index_pairs_np[0:150]            


### Étape 4 : Création du modèle

In [186]:
# Input layer
def get_input_layer(word_idx):
    """Return a float32 vector of length `voc_size` that is 0 everywhere
    except at position `word_idx`, where it is 1.0."""
    one_hot = torch.zeros(voc_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot

# Hyper-parameters
embedding_dims = 10    # size of the embedding vectors
num_epochs = 100       # number of passes over the corpus
learning_rate = 0.01   # step size of the manual gradient-descent update
taille_fenetre = 5     # context window radius (words on each side)

# Weight matrices, drawn from a standard normal distribution and tracked by
# autograd: W1 is (embedding_dims x voc_size), W2 is (voc_size x embedding_dims).
W1 = torch.randn(embedding_dims, voc_size).float().requires_grad_(True)
W2 = torch.randn(voc_size, embedding_dims).float().requires_grad_(True)


# Training loop.
# For every epoch and every sentence:
#   1. subsample the sentence,
#   2. enumerate all (centre, context) index pairs inside the window,
#   3. pick one pair with `choice` and take one gradient step on the
#      full-softmax negative log-likelihood of the context word.

# Loop-invariant: negative-sampling probabilities as a list (same order as
# the keys of `p_neg`), built once instead of once per sentence.
neg_probs = list(p_neg.values())

for epo in range(num_epochs):
    loss_val = 0.0
    for phrase in t_corpus:
        # Subsampling: randomly drop words of the sentence.
        phrase_samp = subSampling(phrase)

        # Build every (centre word, context word) index pair of the sentence;
        # the window is clipped to the sentence boundaries and the centre
        # word is never paired with itself.
        indices = [mot_index[mot] for mot in phrase_samp]
        index_pairs = [
            (center_idx, indices[context_pos])
            for center_pos, center_idx in enumerate(indices)
            for context_pos in range(
                max(0, center_pos - taille_fenetre),
                min(len(indices), center_pos + taille_fenetre + 1),
            )
            if context_pos != center_pos
        ]

        # After subsampling a sentence may hold fewer than two words, in
        # which case there is no pair and `choice` would raise IndexError.
        if not index_pairs:
            continue

        # Pick one pair for this sentence.
        focus, context = choice(index_pairs)

        # Negative samples: draw 4 vocabulary indices from `p_neg`.
        # NOTE: `word_list` is not used by the loss below yet (the loss is a
        # full softmax); the draw is kept for the planned negative sampling.
        sampled_index = np.array(multinomial(4, neg_probs))
        word_list = [
            index
            for index, count in enumerate(sampled_index)
            for _ in range(count)
        ]

        # Forward pass: one-hot input -> embedding -> scores over the vocabulary.
        x = get_input_layer(focus)
        y = torch.tensor([context], dtype=torch.long)
        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        log_softmax = F.log_softmax(z2, dim=0)

        # Negative log-likelihood of the true context word.
        loss = F.nll_loss(log_softmax.view(1, -1), y)
        # `.item()` accumulates a plain Python float instead of a tensor.
        loss_val += loss.item()

        # Backward pass and manual gradient-descent step; the update and the
        # gradient reset are done outside of autograd tracking.
        loss.backward()
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad
            W1.grad.zero_()
            W2.grad.zero_()

    if epo % 10 == 0:
        print(f"Loss à l'époque {epo}: {loss_val}")

Loss à l'époque 0: 924.6293334960938
Loss à l'époque 10: 965.7247924804688
Loss à l'époque 20: 838.9000854492188
Loss à l'époque 30: 815.5311279296875
Loss à l'époque 40: 772.3552856445312
Loss à l'époque 50: 801.9276733398438
Loss à l'époque 60: 785.9378051757812
Loss à l'époque 70: 780.1234130859375
Loss à l'époque 80: 721.62939453125
Loss à l'époque 90: 742.2842407226562


In [187]:
# Différents problèmes :
# - la loss ne diminue pas toujours (mais la tendance est OK)
# - revoir la partie sélection du couple, parfois bug ??

In [154]:
# Display the first 10 rows of W2 (one row of `embedding_dims` values per vocabulary word).
W2[0:10]

tensor([[ 0.3431,  1.1478, -0.3646,  0.9585,  1.2924,  1.7197,  1.7613, -0.4461,
          0.1445, -1.4674],
        [ 0.1458,  0.0233,  0.5180, -1.9255,  0.5275,  0.6784, -1.3668,  1.1555,
          1.4334, -1.1032],
        [-1.2393,  1.1643,  1.1756,  0.1937, -0.8903, -0.4984, -0.0095, -1.3632,
          1.6272,  0.1972],
        [ 1.3565,  0.7157,  0.9915, -0.1102, -0.4371, -0.7880,  0.3525, -0.4517,
          0.5484, -0.7496],
        [ 1.1653, -0.2500,  1.4164,  1.5448,  1.7698, -1.3863, -0.8301, -0.3855,
          0.3140, -1.2120],
        [-0.7621,  0.5651,  1.0514, -0.4918,  1.3155, -1.9175, -0.3335, -1.6094,
          0.3318, -1.0806],
        [ 0.6262,  0.4406,  1.9083,  0.1963,  0.8050,  0.2576, -1.3520,  0.6384,
         -0.7337,  1.0205],
        [-0.1517, -0.1985,  0.1403, -1.0160, -0.2920, -1.9189, -0.7851, -2.0339,
          2.3004, -0.4903],
        [-0.5881, -0.7291,  0.9962,  0.6894, -0.6274,  0.4354,  0.0166, -1.6822,
          0.3377,  0.2643],
        [ 0.2961, -

In [28]:
# Cosine similarity
def cos_distance(u, v):
    """Return the cosine similarity between vectors `u` and `v`.

    Despite its name, this is a similarity, not a distance: the value is
    dot(u, v) / (||u|| * ||v||), so larger means more similar.

    Returns 0.0 when either vector has zero norm, instead of dividing by
    zero and producing nan.
    """
    norm_product = math.sqrt(np.dot(u, u)) * math.sqrt(np.dot(v, v))
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product

In [29]:
# Weight dictionary: maps every vocabulary word to its row of W2, detached
# from autograd and exposed as a NumPy array.
mot_poids = {
    mot: W2[index].detach().numpy()
    for index, mot in index_mot.items()
}

### Étape 5 : Résultats du modèle

In [30]:
def mot_plus_proche(word, n=10):
    """Return the `n` words whose weight vectors are most similar to `word`.

    Similarity is measured with `cos_distance` between the entries of
    `mot_poids`; `word` itself is excluded from the candidates.

    The `n` parameter used to be ignored (10 results were always returned).
    It is now honoured, and its default is 10 so that calls which omit `n`
    return the same result as before.

    Args:
        word: the reference word; must be a key of `mot_poids`.
        n: maximum number of neighbours to return.

    Returns:
        A list of (word, similarity) tuples sorted by decreasing similarity.

    Raises:
        KeyError: if `word` is not a key of `mot_poids`.
    """
    reference = mot_poids[word]
    scores = [
        (mot, cos_distance(poids, reference))
        for mot, poids in mot_poids.items()
        if mot != word
    ]
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores[:n]

In [31]:
# Bare reference to the weight dictionary (evaluates the name, nothing else).
mot_poids
# Nearest neighbours of the word "mort" according to the weight vectors.
mot_plus_proche("mort")

[('rathalos', 0.7310312707276541),
 ('y', 0.7117773667606464),
 ('mtue', 0.6972553839008429),
 ('réveille', 0.6645014134124577),
 ('madre', 0.6638478961311802),
 ('😭', 0.6551260058122947),
 ('dodo', 0.6544916175622695),
 ('pense', 0.6393783723895966),
 ('glisse', 0.6249456152564727),
 ('✅', 0.6135785278345168)]