<a href="https://colab.research.google.com/github/Avinashreddy-07/Homework-5_ML/blob/main/Homework-5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Question 1: Scaled dot-product attention (NumPy)

In [1]:
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Q: array-like of shape (..., seq_q, d_k)
    K: array-like of shape (..., seq_k, d_k)
    V: array-like of shape (..., seq_k, d_v)
    The leading "..." dimensions (batch, heads, ...) are optional; when
    present they follow NumPy matmul broadcasting rules. Plain 2-D
    (seq_len, d_k) inputs behave exactly as before.

    Returns: attention_weights, context_vector
        attention_weights: (..., seq_q, seq_k), each row sums to 1
        context_vector:    (..., seq_q, d_v)

    Raises: ValueError if any input has fewer than 2 dimensions or the
    shapes are not compatible for the matrix products.
    """
    Q, K, V = np.asarray(Q), np.asarray(K), np.asarray(V)
    if Q.ndim < 2 or K.ndim < 2 or V.ndim < 2:
        raise ValueError(
            "Q, K and V must each have at least 2 dimensions (seq_len, d); "
            f"got shapes {Q.shape}, {K.shape}, {V.shape}"
        )

    # 1. Compute raw attention scores: QK^T
    # Transpose only the last two axes so leading batch/head axes are kept;
    # K.T would reverse *all* axes and break inputs with more than 2 dims.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2))

    # 2. Scale by sqrt(d_k)
    d_k = Q.shape[-1]
    scaled_scores = scores / np.sqrt(d_k)

    # 3. Apply softmax row-wise (over the key axis)
    # subtract the row max for numerical stability (avoids overflow in exp)
    shifted = scaled_scores - np.max(scaled_scores, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    # 4. Multiply weights with V
    context_vector = np.matmul(attention_weights, V)

    return attention_weights, context_vector


# Example usage
# A seeded local generator makes the printed values reproducible on
# "Restart Kernel -> Run All" (the unseeded global RNG changed them each run).
rng = np.random.default_rng(42)
Q = rng.random((3, 4))  # seq=3, d_k=4
K = rng.random((3, 4))
V = rng.random((3, 4))

attn_wts, ctx = scaled_dot_product_attention(Q, K, V)

# Sanity check: every row of the attention matrix is a probability distribution.
assert np.allclose(attn_wts.sum(axis=-1), 1.0)

print("Attention Weights:\n", attn_wts)
print("Context Vector:\n", ctx)


Attention Weights:
 [[0.29848396 0.34158447 0.35993157]
 [0.32124848 0.28982088 0.38893064]
 [0.27423777 0.3807327  0.34502952]]
Context Vector:
 [[0.45414656 0.62930578 0.88446539 0.55907972]
 [0.42634493 0.61489103 0.88839965 0.55909714]
 [0.47503994 0.63816909 0.88103249 0.56035685]]


## Question 2: Simple Transformer encoder block (PyTorch)

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleTransformerEncoder(nn.Module):
    """
    One post-norm Transformer encoder block:
    multi-head self-attention -> Add & Norm -> feed-forward -> Add & Norm.

    Args:
        d_model: size of the feature dimension of the input and output.
        num_heads: number of attention heads (nn.MultiheadAttention requires
            d_model to be divisible by num_heads).
        ff_dim: hidden width of the position-wise feed-forward network.
        dropout: dropout probability used inside attention and on each
            sub-layer output before the residual add. The default 0.0
            disables dropout, matching the block's previous behavior.
    """

    def __init__(self, d_model=128, num_heads=8, ff_dim=512, dropout=0.0):
        super().__init__()

        # (a) Multi-head self-attention; batch_first=True -> (batch, seq, d_model)
        self.attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True,
        )

        # LayerNorm for residuals
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # (b) Feed-forward network
        self.ff = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )

        # Parameter-free, so state_dict keys are unchanged by adding it.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """
        Args:
            x: tensor of shape (batch, seq_len, d_model).
            attn_mask: optional mask forwarded to nn.MultiheadAttention
                (e.g. a causal mask of shape (seq_len, seq_len)).
            key_padding_mask: optional (batch, seq_len) mask forwarded to
                nn.MultiheadAttention; True marks padded positions that
                must not be attended to.

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        # 1) Self-attention (Q=K=V=x). The attention weights are never used,
        #    so skip computing them with need_weights=False.
        attn_output, _ = self.attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )

        # 2) Add & Norm
        x = self.norm1(x + self.dropout(attn_output))

        # 3) Feed-forward network
        ff_output = self.ff(x)

        # 4) Add & Norm
        x = self.norm2(x + self.dropout(ff_output))

        return x


In [3]:
# Smoke test: push one random batch through the encoder block and confirm
# that the output keeps the input's (batch, seq_len, d_model) shape.
input_shape = (32, 10, 128)     # (batch, seq_len, d_model)

model = SimpleTransformerEncoder(d_model=input_shape[-1], num_heads=8)

x = torch.randn(*input_shape)

out = model(x)
print("Output shape:", out.shape)


Output shape: torch.Size([32, 10, 128])
