# Début d'implémentation du modèle

### Étape 0 : Importations

In [1]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import math
import pandas as pd
import random

### Étape 1 : Récupération des données

In [2]:
df = pd.read_csv("C:/Users/torna/Documents/StatApp/StatApp/data/sample1.txt",sep='\n',header=None)

### Étape 2 : Créer le vocabulaire à partir du corpus de phrases

In [4]:
df2 = df[0:100]

corpus = []
for index, row in df2.iterrows():
    for j, column in row.iteritems():
        corpus.append(column)

In [5]:
corpus_corr = []

for phrase in corpus:
    # Suppression de la ponctuation
    phrase = phrase.replace("?","")
    phrase = phrase.replace(".","")
    phrase = phrase.replace("!","")
    phrase = phrase.replace(";","")
    phrase = phrase.replace(",","")
    phrase = phrase.replace(":","")
    phrase = phrase.replace("#","")
    # On met tout en minuscule
    phrase = phrase.lower()
    # On ajoute la phrase
    corpus_corr.append(phrase)

In [6]:
def tokenize(corpus):
    tokens = [phrase.split() for phrase in corpus]
    return tokens

t_corpus = tokenize(corpus_corr)

In [7]:
# On supprime les mentions @nicknames
for phrase in t_corpus:
    for mot in phrase:
        if mot[0] == '@':
            mot = "nickname"
            #phrase.remove(mot)

In [8]:
voc = []
freqs = {}
for phrase in t_corpus:
    for mot in phrase:
        if mot not in voc:
            voc.append(mot)
            freqs[mot] = 1
        else:
            freqs[mot] +=1
voc_size = len(voc)
print(voc_size)

534


###### Calcul des probas pour le subsampling et le negative sampling

In [9]:
total_mots = 0
for phrase in t_corpus:
    total_mots += len(phrase)

In [10]:
for key, value in freqs.items():
    freqs[key] = value / total_mots

In [16]:
# Probabilité d'être gardé dans le subsampling
p_sub = {word: min((math.sqrt(freqs[word]/0.001)+1)*(0.001/freqs[word]),1) for word in freqs}
p_sub

{'il': 0.3085152475485722,
 'mérite': 1,
 'd’': 0.9354507244218565,
 'être': 0.7699019513592786,
 'bloquer': 1,
 'la': 0.3810586955442616,
 'lettre': 1,
 'de': 0.19885620600864837,
 'l’': 0.7699019513592786,
 'alphabet': 1,
 'et': 0.45549018979507455,
 'fière': 1,
 'je': 0.33264556877508666,
 "t'": 0.7699019513592786,
 "en'": 1,
 'voi': 1,
 'att': 1,
 "j'": 0.5340210732352012,
 'avais': 1,
 'oublié': 1,
 'est': 0.22523489955511697,
 '1': 0.9354507244218565,
 'heure': 1,
 'eeeeh': 1,
 'jfais': 1,
 'go': 0.7699019513592786,
 'qui': 0.6640701700396551,
 'a': 0.426490309931942,
 'les': 0.3468397718334131,
 'programmes': 1,
 'mais': 0.6640701700396551,
 'j’': 0.426490309931942,
 'ai': 0.9354507244218565,
 'même': 0.7699019513592786,
 'pas': 0.2720632656294843,
 'navigo': 1,
 'ptdddddr': 1,
 '😭': 0.6640701700396551,
 'en': 0.33264556877508666,
 'tout': 1,
 'cas': 1,
 'demoiselle': 1,
 'bien': 1,
 'raison': 1,
 'le': 0.2578612045328258,
 'rathalos': 1,
 'un': 0.45549018979507455,
 'gros': 0.9

In [13]:
p_neg_1 = {word: freqs[word]**(3/4) for word in freqs}
total_neg = 0
for word in p_neg_1:
    total_neg+=p_neg_1[word]
p_neg = {word: p_neg_1[word]/total_neg for word in p_neg_1}
p_neg

{'il': 0.010442550307439529,
 'mérite': 0.0012472970211424876,
 'd’': 0.0028432223618129735,
 'être': 0.0035278887272145345,
 'bloquer': 0.0012472970211424876,
 'la': 0.00804184724983674,
 'lettre': 0.0012472970211424876,
 'de': 0.01833144755695522,
 'l’': 0.0035278887272145345,
 'alphabet': 0.0012472970211424876,
 'et': 0.006481145438244303,
 'fière': 0.0012472970211424876,
 'je': 0.009506886946772953,
 "t'": 0.0035278887272145345,
 "en'": 0.0012472970211424876,
 'voi': 0.0012472970211424876,
 'att': 0.0012472970211424876,
 "j'": 0.00536776402266863,
 'avais': 0.0012472970211424876,
 'oublié': 0.0012472970211424876,
 'est': 0.015587209894162663,
 '1': 0.0028432223618129735,
 'heure': 0.002097695187670709,
 'eeeeh': 0.0012472970211424876,
 'jfais': 0.0012472970211424876,
 'go': 0.0035278887272145345,
 'qui': 0.00417058895157505,
 'a': 0.007014066597752414,
 'les': 0.009027467049179817,
 'programmes': 0.0012472970211424876,
 'mais': 0.00417058895157505,
 'j’': 0.007014066597752414,
 'ai

### Étape 3 : Créations pairs mots centraux / contexte

In [21]:
mot_index = {w: index for (index, w) in enumerate(voc)}
index_mot = {index: w for (index, w) in enumerate(voc)}

taille_fenetre = 4
index_pairs = []
# On traite chaque phrase.
for phrase in t_corpus:
    indices = [mot_index[mot] for mot in phrase]
    # On traite chaque mot comme un mot central
    for center_word in range(len(indices)):
        # Pour chaque fenetre possible
        for w in range(-taille_fenetre, taille_fenetre + 1):
            context_word = center_word + w
            # On fait attention à ne pas sauter de phrases
            if context_word < 0 or context_word >= len(indices) or center_word == context_word:
                continue
            context_word_ind = indices[context_word]
            index_pairs.append((indices[center_word], context_word_ind))

In [22]:
index_pairs_np = np.array(index_pairs)
index_pairs_np[0:150]

array([[ 0,  1],
       [ 0,  2],
       [ 0,  3],
       [ 0,  4],
       [ 1,  0],
       [ 1,  2],
       [ 1,  3],
       [ 1,  4],
       [ 1,  5],
       [ 2,  0],
       [ 2,  1],
       [ 2,  3],
       [ 2,  4],
       [ 2,  5],
       [ 2,  6],
       [ 3,  0],
       [ 3,  1],
       [ 3,  2],
       [ 3,  4],
       [ 3,  5],
       [ 3,  6],
       [ 3,  7],
       [ 4,  0],
       [ 4,  1],
       [ 4,  2],
       [ 4,  3],
       [ 4,  5],
       [ 4,  6],
       [ 4,  7],
       [ 4,  8],
       [ 5,  1],
       [ 5,  2],
       [ 5,  3],
       [ 5,  4],
       [ 5,  6],
       [ 5,  7],
       [ 5,  8],
       [ 5,  9],
       [ 6,  2],
       [ 6,  3],
       [ 6,  4],
       [ 6,  5],
       [ 6,  7],
       [ 6,  8],
       [ 6,  9],
       [ 7,  3],
       [ 7,  4],
       [ 7,  5],
       [ 7,  6],
       [ 7,  8],
       [ 7,  9],
       [ 8,  4],
       [ 8,  5],
       [ 8,  6],
       [ 8,  7],
       [ 8,  9],
       [ 9,  5],
       [ 9,  6],
       [ 9,  7

### Étape 4 : Création du modèle

In [26]:
#Couche d'entrée
def get_input_layer(word_idx):
    x = torch.zeros(voc_size).float()
    x[word_idx] = 1.0
    return x

# Choix de dimension
embedding_dims = 10
# Initialisation
# Variable : comme Tensor mais avec les valeurs qui changent pendant le traitement
W1 = Variable(torch.randn(embedding_dims, voc_size).float(), requires_grad=True)
W2 = Variable(torch.randn(voc_size, embedding_dims).float(), requires_grad=True)
num_epochs = 10 # "époques"
learning_rate = 0.01
taille_fenetre = 4


# Différentes étapes
for epo in range(num_epochs):
    loss_val = 0
# On traite chaque phrase.
    for phrase in t_corpus:
        # Sub-sampling : pour chaque phrase, on réalise le subsampling éventuel.
        for mot in phrase:
            indice_mot = mot_index[mot]
            if np.random.random() < (p_sub[indice_mot]):
                




                
#### OLD

    for word, context in index_pairs:
        # Sub-sampling : garde-t-on le mot contexte sur lequel on est ? Cela dépend de la proba calculée précédemment
        word_context = index_mot[context]
        # On tire un nombre selon une loi uniforme, si on est inf à la proba, on continue
        if np.random.random() < (p_sub[word_context]):
            # Prévoir aussi le negative sampling, l'idée est d'aller prendre un mot qui n'est pas dans le contexte (ou, plutôt un
            # mot au hasard dans le voc et la proba qu'il soit dans le contexte est faible !)
            x = Variable(get_input_layer(word)).float()
            y_true = Variable(torch.from_numpy(np.array([context])).long())

            # Matmul = produits matriciels de deux tensors
            z1 = torch.matmul(W1, x)
            z2 = torch.matmul(W2, z1)

            # Calcul softmax
            log_softmax = F.log_softmax(z2, dim=0)

            # nll_loss(pred/target) - negative log likehood
            loss = F.nll_loss(log_softmax.view(1,-1), y_true)
            loss_val += loss.data

            # Propagation - revoir Pytorch.optimization
            loss.backward()
            W1.data -= learning_rate * W1.grad.data
            W2.data -= learning_rate * W2.grad.data

            W1.grad.data.zero_()
            W2.grad.data.zero_()
            
    print(f'Loss at epo {epo}: {loss_val/len(index_pairs)}')

Loss at epo 0: 7.0737624168396
Loss at epo 1: 6.267773151397705
Loss at epo 2: 5.776594161987305
Loss at epo 3: 5.562662601470947
Loss at epo 4: 5.346123695373535
Loss at epo 5: 5.246977806091309
Loss at epo 6: 5.110166072845459
Loss at epo 7: 4.987285614013672
Loss at epo 8: 4.922267436981201
Loss at epo 9: 4.838533878326416


In [27]:
W2[0:10]

tensor([[ 0.7824,  0.8372,  0.2454, -0.7312,  0.2949, -1.4171,  0.2990, -2.1360,
          0.3035,  0.0654],
        [-0.6371, -0.3506, -0.1499,  0.7204, -1.6352, -0.2240,  1.4447, -0.0897,
          0.2691, -0.6611],
        [-0.6357, -0.7362,  2.0941,  0.6366, -0.5323,  0.6220, -1.4391,  0.7243,
          0.6533,  0.1675],
        [-0.2818,  0.1482,  1.5661,  0.6969, -0.4618,  0.4569, -1.5302,  0.4331,
          1.3121,  0.1289],
        [ 0.2498,  0.6748,  1.0356, -0.5042,  0.3089, -0.5584,  0.5458, -1.4254,
          1.1541, -0.6753],
        [-0.3368, -0.2806, -0.8249, -0.7820, -0.8063, -0.6073,  0.5271, -0.8514,
         -0.3191,  0.0962],
        [-0.7488, -0.5720,  0.4821, -0.5200, -1.4534,  0.2015, -1.1262,  0.2537,
         -0.3297,  0.0439],
        [ 0.4570,  1.2932, -0.1451, -0.7181, -0.5081, -0.3333, -1.0181,  0.9577,
         -0.2211, -1.2171],
        [ 0.6413,  0.2936,  0.8162, -0.3947,  1.7993, -1.4037,  0.8278,  0.8026,
          0.0972, -0.1462],
        [ 0.6748,  

In [28]:
# Distance/similarité cosinus
def cos_distance(u, v):
    return (np.dot(u, v)  / (math.sqrt(np.dot(u, u)) *  (math.sqrt(np.dot(v, v)))))

In [29]:
# Dictionnaire des poids
mot_poids = {index_mot[index]: poids.detach().numpy() for (index, poids) in enumerate(W2)}

### Étape 5 : Résultats du modèle

In [30]:
def mot_plus_proche(word, n=5):
    word_distance = {}
    for mot in mot_poids:
        if mot != word:
            word_distance[mot] = (cos_distance(mot_poids[mot],(mot_poids[word])))
    word_distance = sorted(word_distance.items(), key=lambda t: t[1],reverse=True)
    return word_distance[0:10]

In [31]:
mot_poids
mot_plus_proche("mort")

[('rathalos', 0.7310312707276541),
 ('y', 0.7117773667606464),
 ('mtue', 0.6972553839008429),
 ('réveille', 0.6645014134124577),
 ('madre', 0.6638478961311802),
 ('😭', 0.6551260058122947),
 ('dodo', 0.6544916175622695),
 ('pense', 0.6393783723895966),
 ('glisse', 0.6249456152564727),
 ('✅', 0.6135785278345168)]