<a href="https://colab.research.google.com/github/BatoolAyman/Deep_Learning/blob/main/Transformer_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# Step 1: Input Embedding
# The sentence "I love Tennis" as one-hot rows (one row per token, 4 columns).
X = np.eye(3, 4, dtype=int)  # row 0 = I, row 1 = love, row 2 = Tennis

# Step 2: Multi-Head Attention
# Two heads share the model width equally: d_k = d_model / h = 4 / 2 = 2.
d_model = 4
h = 2
d_k = d_model // h

# Projection matrices, each of shape (d_model, d_k) = (4, 2).
# Head 1: Q and V read embedding dims 0-1, K reads dims 2-3.
W_Q1 = np.array([[1, 0], [0, 1], [0, 0], [0, 0]])
W_K1 = np.array([[0, 0], [0, 0], [1, 0], [0, 1]])
W_V1 = np.array([[1, 0], [0, 1], [0, 0], [0, 0]])

# Head 2: same layout as head 1 with the two output columns swapped.
W_Q2 = np.array([[0, 1], [1, 0], [0, 0], [0, 0]])
W_K2 = np.array([[0, 0], [0, 0], [0, 1], [1, 0]])
W_V2 = np.array([[0, 1], [1, 0], [0, 0], [0, 0]])

# Project the embeddings into queries, keys and values for each head.
Q1, K1, V1 = [X @ W for W in (W_Q1, W_K1, W_V1)]
Q2, K2, V2 = [X @ W for W in (W_Q2, W_K2, W_V2)]

In [2]:
# Step 3: Compute Attention Scores
def softmax(x, axis=-1):
    """Numerically stable softmax along one axis.

    Parameters
    ----------
    x : array_like
        Input scores.
    axis : int, optional
        Axis along which the result sums to 1. Defaults to the last axis,
        which is the behaviour this function always had.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose entries are non-negative and
        sum to 1 along ``axis``.
    """
    # Subtracting the per-slice maximum does not change the result
    # mathematically but keeps np.exp from overflowing on large scores.
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

def attention(Q, K, V):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Parameters
    ----------
    Q : numpy.ndarray
        Queries, shape ``(..., n_queries, d_k)``.
    K : numpy.ndarray
        Keys, shape ``(..., n_keys, d_k)``.
    V : numpy.ndarray
        Values, shape ``(..., n_keys, d_v)``.

    Any leading dimensions (batch, heads) are broadcast by ``@``.

    Returns
    -------
    numpy.ndarray
        Attention output, shape ``(..., n_queries, d_v)``.
    """
    d_k = Q.shape[-1]
    # `K.T` reverses *every* axis, which is a key transpose only for 2-D input.
    # Swapping just the last two axes gives the same result in 2-D and also
    # handles leading batch/head dimensions. Inputs below 2-D keep `.T`,
    # exactly as before.
    K_t = np.swapaxes(K, -1, -2) if np.ndim(K) >= 2 else K.T
    scores = (Q @ K_t) / np.sqrt(d_k)  # Scaled dot-product scores
    attention_weights = softmax(scores)  # Normalise over the keys (last axis)
    return attention_weights @ V  # Weighted sum of values

# Run scaled dot-product attention once per head, each with its own Q/K/V.
attention_output_head1, attention_output_head2 = [
    attention(q, k, v) for q, k, v in ((Q1, K1, V1), (Q2, K2, V2))
]

In [3]:
# Step 4: Concatenate Heads
# Join the per-head outputs side by side along the feature (last) axis.
head_outputs = (attention_output_head1, attention_output_head2)
multihead_output = np.concatenate(head_outputs, axis=-1)

# Step 5: Print Results
# One print call; sep="\n" puts every heading and array on its own line.
print(
    "Input Embedding (X):",
    X,
    "\nHead 1 - Attention Output:",
    attention_output_head1,
    "\nHead 2 - Attention Output:",
    attention_output_head2,
    "\nMulti-Head Attention Output (Concatenated):",
    multihead_output,
    sep="\n",
)

Input Embedding (X):
[[1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]]

Head 1 - Attention Output:
[[0.24825508 0.24825508]
 [0.33333333 0.33333333]
 [0.33333333 0.33333333]]

Head 2 - Attention Output:
[[0.24825508 0.24825508]
 [0.33333333 0.33333333]
 [0.33333333 0.33333333]]

Multi-Head Attention Output (Concatenated):
[[0.24825508 0.24825508 0.24825508 0.24825508]
 [0.33333333 0.33333333 0.33333333 0.33333333]
 [0.33333333 0.33333333 0.33333333 0.33333333]]
