# Exercice 5

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def softmax(x):
    """Return the softmax of *x* computed along the last axis.

    The maximum of each row is subtracted before exponentiating. This does
    not change the mathematical result (softmax is shift-invariant) but
    prevents ``np.exp`` from overflowing to ``inf`` on large scores, which
    would otherwise produce ``nan`` outputs.

    Args:
        x: Array-like of scores; normalisation is done over ``axis=-1``.

    Returns:
        ndarray of the same shape as *x* whose last axis sums to 1.
    """
    x = np.asarray(x)
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

def layer_norm(x, eps=1e-5):
    """Normalise *x* to zero mean and unit variance along the last axis.

    Args:
        x: Array-like; statistics are computed over ``axis=-1``.
        eps: Small constant added to the variance before the square root so
            that a constant row (zero variance) yields zeros instead of
            ``nan`` from a division by zero.

    Returns:
        ndarray of the same shape as *x*.
    """
    x = np.asarray(x)
    mean = np.mean(x, axis=-1, keepdims=True)
    var = np.var(x, axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(var + eps)



# Toy corpus: each token of the sentence is its own vocabulary entry.
sentence = "I am very happy to work at Paris and love Marseille university".split()
vocab_size = len(sentence)

# One-hot embeddings: row i of the identity matrix represents token i.
word_to_index = dict(zip(sentence, range(vocab_size)))
embeddings = np.identity(vocab_size)
inputs = embeddings[[word_to_index[token] for token in sentence]]

# Random projection matrices, drawn in a fixed order after seeding so the
# run is reproducible: query, key, value, then the two feed-forward layers.
np.random.seed(0)
Wq, Wk, Wv, W1, W2 = (
    np.random.rand(vocab_size, vocab_size) for _ in range(5)
)


# Self-attention WITHOUT layer normalisation: scaled dot-product attention
# computed directly on the raw inputs.
Q, K, V = inputs.dot(Wq), inputs.dot(Wk), inputs.dot(Wv)
scores = Q.dot(K.T) / np.sqrt(K.shape[1])
attention_weights_sans_norm = softmax(scores)
attention_output_sans_norm = attention_weights_sans_norm.dot(V)

# Layer normalisation BEFORE attention: the inputs are normalised first and
# the same projections are applied to the normalised inputs.
inputs_norm = layer_norm(inputs)
Q_norm_Avant, K_norm_Avant, V_norm_Avant = (
    inputs_norm.dot(Wq),
    inputs_norm.dot(Wk),
    inputs_norm.dot(Wv),
)
scores_norm_Avant = Q_norm_Avant.dot(K_norm_Avant.T) / np.sqrt(K_norm_Avant.shape[1])
attention_weights_norm_Avant = softmax(scores_norm_Avant)
attention_output_norm_Avant = attention_weights_norm_Avant.dot(V_norm_Avant)

# Layer normalisation AFTER attention: normalise the un-normalised attention
# output plus the residual connection to the inputs.
attention_output_norm_Apres = layer_norm(inputs + attention_output_sans_norm)



# Feed-forward layer with ReLU: project with W1, clip negatives to zero,
# project with W2, then add the residual and apply layer normalisation.
ffn_relu = np.maximum(0, attention_output_sans_norm.dot(W1)).dot(W2)
encoder_output_relu = layer_norm(attention_output_sans_norm + ffn_relu)


def leaky_relu(x, alpha=0.01):
    """Apply the Leaky ReLU activation element-wise.

    Args:
        x: Array-like input.
        alpha: Slope applied to non-positive values (default 0.01, the value
            that was previously hard-coded).

    Returns:
        ndarray where positive entries are kept and the others are
        multiplied by *alpha*.
    """
    x = np.asarray(x)
    return np.where(x > 0, x, x * alpha)

def elu(x, alpha=1.0):
    """Apply the ELU activation element-wise.

    Args:
        x: Array-like input.
        alpha: Scale of the negative branch (default 1.0, the value that was
            previously hard-coded).

    Returns:
        ndarray equal to ``x`` where ``x > 0`` and to
        ``alpha * (exp(x) - 1)`` elsewhere.
    """
    x = np.asarray(x)
    # np.where evaluates both branches for every element, so the exponent is
    # clamped to <= 0: this keeps exp() from overflowing on large positive
    # inputs whose exponential would be discarded anyway.
    negative_branch = alpha * np.expm1(np.minimum(x, 0))
    return np.where(x > 0, x, negative_branch)



# Feed-forward layer with Leaky ReLU: same pipeline as the ReLU variant
# (W1 projection, activation, W2 projection, residual + layer norm).
ffn_leaky = leaky_relu(attention_output_sans_norm.dot(W1)).dot(W2)
encoder_output_leaky = layer_norm(attention_output_sans_norm + ffn_leaky)

# Feed-forward layer with ELU: identical pipeline, only the activation differs.
ffn_elu = elu(attention_output_sans_norm.dot(W1)).dot(W2)
encoder_output_elu = layer_norm(attention_output_sans_norm + ffn_elu)

# Masking experiment: compare attention weights with and without a padding
# mask on sentences of different lengths.
phrases = [
    ["I", "love", "Marseille"],
    ["I", "am", "very", "happy", "to", "work", "at", "Paris"],
]

max_len = max(len(s) for s in phrases)

# Padding: every sentence is extended to max_len with all-zero vectors of
# size vocab_size so the batch can be stacked into a single array.
liste_inputs = []
for ligne in phrases:
    emb = [embeddings[word_to_index[word]] for word in ligne]
    emb += [np.zeros(vocab_size)] * (max_len - len(emb))
    liste_inputs.append(np.array(emb))

inputs_padded = np.array(liste_inputs)

# Projection matrices for the masking experiment (re-seeded for reproducibility).
np.random.seed(0)
Wq_mask = np.random.rand(vocab_size, vocab_size)
Wk_mask = np.random.rand(vocab_size, vocab_size)
Wv_mask = np.random.rand(vocab_size, vocab_size)

# Large negative score whose exponential underflows to 0 in the softmax.
valeur_masque = -1e12

# One result dict per sentence, consumed by the plotting code.
resulats_ensemble_mask = []

for i, phrase_inputs in enumerate(inputs_padded):

    vrai_longueur = len(phrases[i])

    Q = np.dot(phrase_inputs, Wq_mask)
    K = np.dot(phrase_inputs, Wk_mask)
    V = np.dot(phrase_inputs, Wv_mask)

    scores = np.dot(Q, K.T) / np.sqrt(K.shape[1])

    # Mask only the key (column) axis so that no token attends to padding.
    # Padded query rows are deliberately NOT filled with the mask value:
    # a row made entirely of masked scores turns the softmax into 0/0,
    # i.e. NaN plus a RuntimeWarning.
    scores_masquer = scores.copy()
    scores_masquer[:, vrai_longueur:] = valeur_masque

    attention_weights_sans_mask = softmax(scores)
    attention_weights_mask = softmax(scores_masquer)

    # Padded query positions carry no information: zero their weights.
    attention_weights_mask[vrai_longueur:, :] = 0.0

    output_sans_mask = np.dot(attention_weights_sans_mask, V)
    output_mask = np.dot(attention_weights_mask, V)

    # Axis labels for the heatmaps: real words followed by 'padding' markers.
    phrase_label = phrases[i] + ['padding'] * (max_len - vrai_longueur)

    resulats_ensemble_mask.append({
        'phrase': phrases[i],
        'phrase_label': phrase_label,
        'vrai_longueur': vrai_longueur,
        'attention_weights_sans_mask': attention_weights_sans_mask,
        'attention_weights_mask': attention_weights_mask,
        'output_sans_mask': output_sans_mask,
        'output_mask': output_mask
    })



# Layer-normalisation figure: five heatmaps laid out on a 2x3 grid (the
# sixth slot stays empty). Top row: attention weights without / with
# pre-normalisation and their difference. Bottom row: attention outputs
# without normalisation and with post-normalisation.
plt.figure(figsize=(20, 8))

panneaux_norm = [
    (attention_weights_sans_norm, 'viridis', {'xticklabels': sentence},
     'Sans Layer Normalisation'),
    (attention_weights_norm_Avant, 'viridis', {'xticklabels': sentence},
     'Layer Normalisation AVANT'),
    (attention_weights_norm_Avant - attention_weights_sans_norm, 'hot',
     {'xticklabels': sentence}, 'Différence Normalisation AVANT vs Sans'),
    (attention_output_sans_norm, 'viridis', {}, 'Output Attention Sans Normalisation'),
    (attention_output_norm_Apres, 'viridis', {}, 'Output Attention Norm APRÈS'),
]

for position, (donnees, palette, options, titre) in enumerate(panneaux_norm, start=1):
    plt.subplot(2, 3, position)
    sns.heatmap(donnees, annot=True, cmap=palette, yticklabels=sentence, fmt='.2f', **options)
    plt.title(titre)

plt.suptitle('Layer Normalization', fontsize=16)
plt.tight_layout()
plt.show()





# Activation figure: encoder outputs for ReLU, Leaky ReLU and ELU on a 2x2
# grid, plus the absolute difference between the Leaky ReLU and ReLU outputs.
plt.figure(figsize=(18, 6))

diff_activations = np.abs(encoder_output_leaky - encoder_output_relu)

panneaux_activation = [
    (encoder_output_relu, 'viridis', 'FFN Output ReLU (avec entrées positives)'),
    (encoder_output_leaky, 'viridis', 'FFN Output Leaky ReLU (identique car x > 0)'),
    (encoder_output_elu, 'viridis', 'FFN Output ELU (identique car x > 0)'),
    (diff_activations, 'hot', 'Différence Leaky ReLU vs ReLU ( 0 car x positifs)'),
]

for position, (donnees, palette, titre) in enumerate(panneaux_activation, start=1):
    plt.subplot(2, 2, position)
    sns.heatmap(donnees, annot=True, cmap=palette, yticklabels=sentence)
    plt.title(titre)

plt.suptitle('Fonctions activation entrees positives donne résultats similaires', fontsize=16)
plt.tight_layout()
plt.show()


# Mask figure: one grid row per sentence, showing the attention weights
# without the padding mask (left) and with it (right).
plt.figure(figsize=(18, 10))

# Derive the number of grid rows from the results instead of hard-coding 2,
# so the figure still works if sentences are added to the experiment.
nb_phrases = len(resulats_ensemble_mask)

for i, result in enumerate(resulats_ensemble_mask):
    etiquettes = result['phrase_label']

    # Without mask
    plt.subplot(nb_phrases, 2, i*2 + 1)
    sns.heatmap(result['attention_weights_sans_mask'], annot=True, cmap='viridis', xticklabels=etiquettes, yticklabels=etiquettes)
    # Title fixed: the original had a stray closing parenthesis.
    plt.title(f'Phrase {i+1} SANS Masque')

    # With mask
    plt.subplot(nb_phrases, 2, i*2 + 2)
    sns.heatmap(result['attention_weights_mask'], annot=True, cmap='viridis', xticklabels=etiquettes, yticklabels=etiquettes)
    plt.title(f'Phrase {i+1} AVEC Masque')

plt.suptitle('Mask', fontsize=16)
plt.tight_layout()
plt.show()


1- La normalisation stabilise l'attention : pour cela, elle ramène les données à une échelle commune.
Appliquée avant l'attention, elle équilibre la distribution des poids.
Appliquée après, elle standardise les sorties pour les couches suivantes.
La position de la normalisation change les résultats, mais dans les deux cas elle améliore la stabilité.


2- Avec masquage, l'attention ignore le padding.
Sans masquage, elle gaspille du poids sur les tokens vides.
Le masquage préserve le sens en concentrant l'attention uniquement sur les mots réels ;
c'est essentiel pour traiter des phrases de longueurs différentes.