# Début d'implémentation du modèle

### Étape 0 : Importations

In [1]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import math
import pandas as pd
import random

### Étape 1 : Récupération des données

In [2]:
df = pd.read_csv("C:/Users/torna/Documents/StatApp/StatApp/data/sample1.txt",sep='\n',header=None)

### Étape 2 : Créer le vocabulaire à partir du corpus de phrases

In [3]:
df2 = df[0:100]

corpus = []
for index, row in df2.iterrows():
    for j, column in row.iteritems():
        corpus.append(column)

In [4]:
corpus_corr = []

for phrase in corpus:
    # Suppression de la ponctuation
    phrase = phrase.replace("?","")
    phrase = phrase.replace(".","")
    phrase = phrase.replace("!","")
    phrase = phrase.replace(";","")
    phrase = phrase.replace(",","")
    phrase = phrase.replace(":","")
    phrase = phrase.replace("#","")
    # On met tout en minuscule
    phrase = phrase.lower()
    # On ajoute la phrase
    corpus_corr.append(phrase)

In [5]:
def tokenize(corpus):
    tokens = [phrase.split() for phrase in corpus]
    return tokens

t_corpus = tokenize(corpus_corr)

In [31]:
# On supprime les mentions @nicknames
corpus_ok = []
for phrase in t_corpus:
    phrase_bis = []
    for mot in phrase:
        if mot[0] == '@':
            mot = "nickname"
        phrase_bis.append(mot)
    corpus_ok.append(phrase_bis)
t_corpus = corpus_ok

In [32]:
voc = []
freqs = {}
for phrase in t_corpus:
    for mot in phrase:
        if mot not in voc:
            voc.append(mot)
            freqs[mot] = 1
        else:
            freqs[mot] +=1
voc_size = len(voc)
print(voc_size)

524


###### Calcul des probas pour le subsampling et le negative sampling

In [8]:
total_mots = 0
for phrase in t_corpus:
    total_mots += len(phrase)

In [9]:
for key, value in freqs.items():
    freqs[key] = value / total_mots

In [34]:
# Probabilité d'être gardé dans le subsampling
p_sub = {word: min((math.sqrt(freqs[word]/0.001)+1)*(0.001/freqs[word]),1) for word in freqs}
p_sub[0:10]

TypeError: unhashable type: 'slice'

In [11]:
p_neg_1 = {word: freqs[word]**(3/4) for word in freqs}
total_neg = 0
for word in p_neg_1:
    total_neg+=p_neg_1[word]
p_neg = {word: p_neg_1[word]/total_neg for word in p_neg_1}
p_neg

{'il': 0.009753021158773238,
 'mérite': 0.0011649370968134988,
 'd’': 0.002655482333094781,
 'être': 0.003294939683250378,
 'bloquer': 0.0011649370968134988,
 'la': 0.007510838260209577,
 'lettre': 0.0011649370968134988,
 'de': 0.017121008817793677,
 'l’': 0.003294939683250378,
 'alphabet': 0.0011649370968134988,
 '@reinedonna': 0.0011649370968134988,
 'et': 0.006053190717908292,
 'fière': 0.0011649370968134988,
 'je': 0.008879140326466525,
 "t'": 0.003294939683250378,
 "en'": 0.0011649370968134988,
 'voi': 0.0011649370968134988,
 'att': 0.0011649370968134988,
 "j'": 0.005013326682380654,
 'avais': 0.0011649370968134988,
 'oublié': 0.0011649370968134988,
 'est': 0.014557975152459027,
 '1': 0.002655482333094781,
 'heure': 0.001959182857413081,
 'eeeeh': 0.0011649370968134988,
 'jfais': 0.0011649370968134988,
 'go': 0.003294939683250378,
 'qui': 0.003895201947007034,
 'a': 0.006550922707855008,
 'les': 0.008431376871419378,
 'programmes': 0.0011649370968134988,
 'mais': 0.003895201947007

### Étape 3 : Créations pairs mots centraux / contexte

In [12]:
mot_index = {w: index for (index, w) in enumerate(voc)}
index_mot = {index: w for (index, w) in enumerate(voc)}

taille_fenetre = 4
index_pairs = []
# On traite chaque phrase.
for phrase in t_corpus:
    indices = [mot_index[mot] for mot in phrase]
    # On traite chaque mot comme un mot central
    for center_word in range(len(indices)):
        # Pour chaque fenetre possible
        for w in range(-taille_fenetre, taille_fenetre + 1):
            context_word = center_word + w
            # On fait attention à ne pas sauter de phrases
            if context_word < 0 or context_word >= len(indices) or center_word == context_word:
                continue
            context_word_ind = indices[context_word]
            index_pairs.append((indices[center_word], context_word_ind))

In [13]:
index_pairs_np = np.array(index_pairs)
index_pairs_np[0:150]

array([[ 0,  1],
       [ 0,  2],
       [ 0,  3],
       [ 0,  4],
       [ 1,  0],
       [ 1,  2],
       [ 1,  3],
       [ 1,  4],
       [ 1,  5],
       [ 2,  0],
       [ 2,  1],
       [ 2,  3],
       [ 2,  4],
       [ 2,  5],
       [ 2,  6],
       [ 3,  0],
       [ 3,  1],
       [ 3,  2],
       [ 3,  4],
       [ 3,  5],
       [ 3,  6],
       [ 3,  7],
       [ 4,  0],
       [ 4,  1],
       [ 4,  2],
       [ 4,  3],
       [ 4,  5],
       [ 4,  6],
       [ 4,  7],
       [ 4,  8],
       [ 5,  1],
       [ 5,  2],
       [ 5,  3],
       [ 5,  4],
       [ 5,  6],
       [ 5,  7],
       [ 5,  8],
       [ 5,  9],
       [ 6,  2],
       [ 6,  3],
       [ 6,  4],
       [ 6,  5],
       [ 6,  7],
       [ 6,  8],
       [ 6,  9],
       [ 7,  3],
       [ 7,  4],
       [ 7,  5],
       [ 7,  6],
       [ 7,  8],
       [ 7,  9],
       [ 8,  4],
       [ 8,  5],
       [ 8,  6],
       [ 8,  7],
       [ 8,  9],
       [ 9,  5],
       [ 9,  6],
       [ 9,  7

### Étape 4 : Création du modèle

In [19]:
#Couche d'entrée
def get_input_layer(word_idx):
    x = torch.zeros(voc_size).float()
    x[word_idx] = 1.0
    return x

# Choix de dimension
embedding_dims = 10
# Initialisation
# Variable : comme Tensor mais avec les valeurs qui changent pendant le traitement
W1 = Variable(torch.randn(embedding_dims, voc_size).float(), requires_grad=True)
W2 = Variable(torch.randn(voc_size, embedding_dims).float(), requires_grad=True)
num_epochs = 10 # "époques"
learning_rate = 0.01
taille_fenetre = 4


# Différentes étapes
for epo in range(num_epochs):
    loss_val = 0
# On traite chaque phrase.
    for phrase in t_corpus:
        # Sub-sampling : pour chaque phrase, on réalise le subsampling éventuel.
        for mot in phrase:
            indice_mot = mot_index[mot]
            if np.random.random() < (p_sub[indice_mot]):
                x = Variable(get_input_layer(word)).float()
                y_true = Variable(torch.from_numpy(np.array([context])).long())
                z1 = torch.matmul(W1, x)
                z2 = torch.matmul(W2, z1)
                
                
                
                log_softmax = F.log_softmax(z2, dim=0)

                # nll_loss(pred/target) - negative log likehood
                loss = F.nll_loss(log_softmax.view(1,-1), y_true)
                loss_val += loss.data

                # Propagation - revoir Pytorch.optimization
                loss.backward()
                W1.data -= learning_rate * W1.grad.data
                W2.data -= learning_rate * W2.grad.data

                W1.grad.data.zero_()
                W2.grad.data.zero_()

    print(f'Loss at epo {epo}: {loss_val/len(index_pairs)}')


                
#### OLD

    for word, context in index_pairs:
        # Sub-sampling : garde-t-on le mot contexte sur lequel on est ? Cela dépend de la proba calculée précédemment
        word_context = index_mot[context]
        # On tire un nombre selon une loi uniforme, si on est inf à la proba, on continue
        if np.random.random() < (p_sub[word_context]):
            # Prévoir aussi le negative sampling, l'idée est d'aller prendre un mot qui n'est pas dans le contexte (ou, plutôt un
            # mot au hasard dans le voc et la proba qu'il soit dans le contexte est faible !)
            x = Variable(get_input_layer(word)).float()
            y_true = Variable(torch.from_numpy(np.array([context])).long())
            print(y_true)
            # Matmul = produits matriciels de deux tensors
            z1 = torch.matmul(W1, x)
            z2 = torch.matmul(W2, z1)

            # Calcul softmax
            log_softmax = F.log_softmax(z2, dim=0)

            # nll_loss(pred/target) - negative log likehood
            loss = F.nll_loss(log_softmax.view(1,-1), y_true)
            loss_val += loss.data

            # Propagation - revoir Pytorch.optimization
            loss.backward()
            W1.data -= learning_rate * W1.grad.data
            W2.data -= learning_rate * W2.grad.data

            W1.grad.data.zero_()
            W2.grad.data.zero_()
            
    print(f'Loss at epo {epo}: {loss_val/len(index_pairs)}')

IndentationError: expected an indented block (<ipython-input-19-e0c5168699ce>, line 35)

In [22]:
def get_input_layer(word_idx):
    x = torch.zeros(voc_size).float()
    x[word_idx] = 1.0
    return x   
x = Variable(get_input_layer(1)).float()
x    

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [16]:
W2[0:10]

NameError: name 'W2' is not defined

In [28]:
# Distance/similarité cosinus
def cos_distance(u, v):
    return (np.dot(u, v)  / (math.sqrt(np.dot(u, u)) *  (math.sqrt(np.dot(v, v)))))

In [29]:
# Dictionnaire des poids
mot_poids = {index_mot[index]: poids.detach().numpy() for (index, poids) in enumerate(W2)}

### Étape 5 : Résultats du modèle

In [30]:
def mot_plus_proche(word, n=5):
    word_distance = {}
    for mot in mot_poids:
        if mot != word:
            word_distance[mot] = (cos_distance(mot_poids[mot],(mot_poids[word])))
    word_distance = sorted(word_distance.items(), key=lambda t: t[1],reverse=True)
    return word_distance[0:10]

In [31]:
mot_poids
mot_plus_proche("mort")

[('rathalos', 0.7310312707276541),
 ('y', 0.7117773667606464),
 ('mtue', 0.6972553839008429),
 ('réveille', 0.6645014134124577),
 ('madre', 0.6638478961311802),
 ('😭', 0.6551260058122947),
 ('dodo', 0.6544916175622695),
 ('pense', 0.6393783723895966),
 ('glisse', 0.6249456152564727),
 ('✅', 0.6135785278345168)]