# Exercice 3

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim.downloader as api
from gensim.models import Word2Vec

def softmax(x):
    """Return the softmax of ``x`` along its last axis.

    The maximum of each row is subtracted before exponentiating. This does
    not change the mathematical result, but it keeps ``np.exp`` from
    overflowing to ``inf`` (which would turn the division into ``nan``) when
    the scores are large.

    Args:
        x: Array-like of scores; normalisation runs over the last axis.

    Returns:
        Array with the shape of ``x`` whose entries along the last axis are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty axis.
        return np.exp(x)
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)  # computed once and reused for the normaliser
    return exps / np.sum(exps, axis=-1, keepdims=True)


print("\nWord2Vec")
# Tiny tokenised training corpus: each inner list is one sentence.
phrase_entrainement = [
    ["I", "prefer", "eating", "burgers", "at", "the", "restaurant"],
    ["The", "cat", "sleeps", "on", "floor"],
    ["We", "like", "fast", "food", "and", "good", "drinks"],
    ["Would", "you", "like", "some", "coffee", "or", "tea"],
]

# Train 8-dimensional embeddings; min_count=1 keeps every token of the corpus
# in the vocabulary. workers=1 goes with seed=42: gensim documents that a
# seeded run is only reproducible when training uses a single worker thread
# (and, on Python 3, a fixed PYTHONHASHSEED).
model = Word2Vec(
    sentences=phrase_entrainement,
    vector_size=8,
    window=2,
    min_count=1,
    epochs=10,
    seed=42,
    workers=1,
)

# Sentence to analyse. Every token also appears in the training corpus, so
# each one has an embedding for the lookup below.
phrase = ["I", "prefer", "eating", "some", "good", "burgers", "at", "the", "restaurant"]
print(f"\nPhrase: {' '.join(phrase)}")

# Stack one embedding per token: one row per word, vector_size columns.
inputs = np.array([model.wv[word] for word in phrase])
max_len = len(phrase)


# Xavier (Glorot) uniform initialisation
def xavier_initialization(shape):
    """Sample an array of the given shape from U(-limit, limit).

    ``limit`` is ``sqrt(6 / (shape[0] + shape[1]))``, so ``shape`` must have
    at least two entries. Values are drawn from NumPy's global random state.

    Args:
        shape: Output shape; its first two entries set the sampling bound.

    Returns:
        Array of the requested shape.
    """
    fan_sum = shape[0] + shape[1]
    bound = np.sqrt(6 / fan_sum)
    return np.random.uniform(low=-bound, high=bound, size=shape)


# Embedding width, reused as the side of the square projection matrices.
dimension = model.vector_size

# Seed the global NumPy generator once. The draws below keep the original
# order (normal Wq, Wk, Wv, then Xavier Wq, Wk, Wv) so the values match.
np.random.seed(42)

# Unscaled standard-normal projections.
Wq_norm, Wk_norm, Wv_norm = (
    np.random.randn(dimension, dimension) for _ in range(3)
)

# Xavier-uniform projections.
Wq_xav, Wk_xav, Wv_xav = (
    xavier_initialization((dimension, dimension)) for _ in range(3)
)

# Sinusoidal positional encoding
def positional_encoding(max_len, d, n=10000):
    """Build the ``(max_len, d)`` sinusoidal position table.

    For each position ``k`` and pair index ``i < int(d / 2)``, column ``2*i``
    holds ``sin(k / n**(2*i/d))`` and column ``2*i + 1`` holds the matching
    cosine. When ``d`` is odd the last column is left at zero.

    Args:
        max_len: Number of positions (rows).
        d: Encoding width (columns).
        n: Base of the geometric progression of wavelengths.

    Returns:
        Array of shape ``(max_len, d)``.
    """
    table = np.zeros((max_len, d))
    for pair in np.arange(int(d / 2)):
        # The divisor depends only on the column pair, not on the position.
        divisor = np.power(n, 2 * pair / d)
        sin_col = 2 * pair
        cos_col = sin_col + 1
        for pos in range(max_len):
            angle = pos / divisor
            table[pos, sin_col] = np.sin(angle)
            table[pos, cos_col] = np.cos(angle)
    return table


# One encoding row per token of the sentence, added element-wise to the
# embeddings so each row carries both word and position information.
pos_enc = positional_encoding(max_len=max_len, d=dimension)
inputs_pos = np.add(inputs, pos_enc)


def calcul_Q_K_V(X, Wq, Wk, Wv):
    """Compute scaled dot-product self-attention for ``X``.

    Args:
        X: Input matrix, one row per token.
        Wq: Projection applied to ``X`` to get the queries.
        Wk: Projection applied to ``X`` to get the keys.
        Wv: Projection applied to ``X`` to get the values.

    Returns:
        Tuple ``(scores, weights, output)``: the query-key products divided
        by the square root of the key width, their softmax, and the
        weight-averaged values.
    """
    queries, keys, values = (np.dot(X, W) for W in (Wq, Wk, Wv))
    # Scale by sqrt of the key width before normalising.
    scale = np.sqrt(keys.shape[-1])
    scores = np.dot(queries, keys.T) / scale
    weights = softmax(scores)
    return scores, weights, np.dot(weights, values)

# Run 1: standard-normal projections on the raw embeddings.
scores1, weight1, output1 = calcul_Q_K_V(
    X=inputs, Wq=Wq_norm, Wk=Wk_norm, Wv=Wv_norm
)

# Run 2: Xavier projections on the raw embeddings.
scores2, weight2, output2 = calcul_Q_K_V(
    X=inputs, Wq=Wq_xav, Wk=Wk_xav, Wv=Wv_xav
)

# Run 3: Xavier projections on embeddings with positional encoding added.
scores3, weight3, output3 = calcul_Q_K_V(
    X=inputs_pos, Wq=Wq_xav, Wk=Wk_xav, Wv=Wv_xav
)



def _draw_heatmap(position, data, title, cmap="viridis", **tick_labels):
    """Draw one annotated heatmap in cell ``position`` of the 2x3 grid."""
    plt.subplot(2, 3, position)
    sns.heatmap(data, annot=True, cmap=cmap, **tick_labels)
    plt.title(title)


plt.figure(figsize=(18, 10))

# Top row: both initialisations and their difference.
_draw_heatmap(1, weight1, "Attention Weights Initialisation Normale",
              xticklabels=phrase, yticklabels=phrase)
_draw_heatmap(2, weight2, "Attention Weights Initialisation Xavier",
              xticklabels=phrase, yticklabels=phrase)
_draw_heatmap(3, weight1 - weight2, "Difference Xavier vs Normal",
              cmap="hot", xticklabels=phrase, yticklabels=phrase)

# Bottom row: the encoding itself (columns are encoding dimensions, so only
# the rows are labelled with words), attention with positions, and the change.
_draw_heatmap(4, pos_enc, "Encodage Positionnel ", yticklabels=phrase)
_draw_heatmap(5, weight3, "Attention Weights - Xavier + Position",
              xticklabels=phrase, yticklabels=phrase)
_draw_heatmap(6, weight3 - weight2, "Différence avec/sans Position",
              cmap="hot", xticklabels=phrase, yticklabels=phrase)

plt.tight_layout()
plt.show()

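1- L'initialisation des poids agit sur la stabilité et la qualité de l'attention. Si les poids sont tirés aléatoirement sans mise à l'échelle (loi normale standard), ils peuvent donner une attention de moins bonne qualité ; l'initialisation de Xavier, qui adapte l'amplitude des poids à la dimension des matrices, permet de contourner ça.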



2- L'encodage positionnel ajoute une information sur la position de chaque mot, ce qui permet au modèle de différencier des phrases comme 'le chat dort' et 'dort le chat'.