### Attention Mechanism in PyTorch

In [1]:
import torch

# Toy input: one 3-token sentence ("I Love NLP"); every token is represented
# by a hand-written 4-dimensional embedding vector (one row per token).
embeddings = torch.tensor((
    (1.0, 0.0, 1.0, 0.0),  # "I"
    (0.0, 2.0, 0.0, 2.0),  # "Love"
    (1.0, 1.0, 1.0, 1.0),  # "NLP"
))

# Sanity check: expect (number of tokens, embedding size) == (3, 4).
print(embeddings.shape)

torch.Size([3, 4])


In [2]:
# Width of each token embedding; the projections below keep the same width.
d_model = 4

# Random (untrained) square projection matrices for queries, keys and values.
# NOTE(review): no RNG seed is set in this cell, so these matrices - and every
# tensor printed by the following cells - change from run to run.
W_Q, W_K, W_V = (torch.randn(d_model, d_model) for _ in range(3))

# Project every token embedding into query, key and value space.
Q = torch.matmul(embeddings, W_Q)
K = torch.matmul(embeddings, W_K)
V = torch.matmul(embeddings, W_V)


In [3]:
# Raw attention scores: entry (i, j) is the dot product of query i with key j.
scores = torch.matmul(Q, K.transpose(0, 1))
print(scores)


tensor([[ -0.4913,  -0.5518,  -0.7672],
        [  2.5580, -12.2371,  -3.5605],
        [  0.7877,  -6.6704,  -2.5475]])


In [4]:
import math

# Scale the raw scores by 1/sqrt(d_model) before normalising them.
scaled_scores = scores.div(math.sqrt(d_model))
# Softmax across the key axis so that every query row sums to 1.
attention_weights = scaled_scores.softmax(dim=-1)

print(attention_weights)


tensor([[3.5195e-01, 3.4146e-01, 3.0659e-01],
        [9.5462e-01, 5.8494e-04, 4.4793e-02],
        [8.2460e-01, 1.9803e-02, 1.5560e-01]])


In [5]:
# Each output row is the attention-weighted average of the value vectors.
output = torch.matmul(attention_weights, V)
print(output)


tensor([[-1.5858,  0.7537, -1.2524,  1.0643],
        [-2.4474,  0.4640, -0.2237,  2.6898],
        [-2.3961,  0.5252, -0.3923,  2.5272]])


### Self-Attention That Learns Language Patterns with Positional Encoding

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

In [2]:
# Toy corpus: three short, lowercase sentences separated by single spaces.
sentences = ["i love nlp", "nlp loves me", "i love learning"]

### Convert words to integer indices

In [3]:
# Vocabulary: every distinct whitespace-separated token, in sorted order.
words = sorted({token for sentence in sentences for token in sentence.split()})

# Two-way lookup tables between a token and its position in `words`.
word_to_idx = dict(zip(words, range(len(words))))
idx_to_word = dict(enumerate(words))

vocab_size = len(words)

### Create input sequences and next-word labels

In [4]:
# Number of context tokens fed to the model for each training example.
seq_length = 2
x, y = [], []

for sentence in sentences:
    tokens = sentence.split()
    # Slide a window of `seq_length` tokens over the sentence; the token that
    # immediately follows the window is the prediction target. Slicing on
    # `seq_length` (instead of hard-coding offsets 0, 1, 2) keeps the window
    # and the target consistent if `seq_length` is ever changed.
    for start in range(len(tokens) - seq_length):
        context = tokens[start:start + seq_length]
        target = tokens[start + seq_length]
        x.append([word_to_idx[token] for token in context])
        y.append(word_to_idx[target])

# One row of `seq_length` token indices per example / one target index per example.
x = torch.tensor(x)
y = torch.tensor(y)

print(x)
print(y)


tensor([[0, 2],
        [5, 3],
        [0, 2]])
tensor([5, 4, 1])


### Self Attention Layer

In [5]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        embed_dim: Size of the last dimension of the input. The query, key and
            value projections each map ``embed_dim -> embed_dim``.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.Wq = nn.Linear(embed_dim, embed_dim)
        self.Wk = nn.Linear(embed_dim, embed_dim)
        self.Wv = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Attend every position of ``x`` to every position of ``x``.

        Args:
            x: Tensor whose last two dimensions are ``(seq_len, embed_dim)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)

        # Divide the logits by sqrt(d_k) so their magnitude does not grow with
        # the embedding width (which would saturate the softmax). This matches
        # the scaling applied in the other attention code in this notebook.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)
        weights = torch.softmax(scores, dim=-1)
        return torch.matmul(weights, V)

### Model Class

In [8]:
class TinyTransformer(nn.Module):
    """Next-token classifier: embedding -> positional encoding -> self-attention -> linear.

    NOTE(review): this class instantiates ``PositionalEncoding``, which this
    notebook defines in a later cell; that definition must already have been
    executed before a ``TinyTransformer`` is constructed.

    Args:
        vocab_size: Number of distinct token indices (also the number of output logits).
        embed_dim: Width of the token embeddings and of the attention layer.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim)
        self.attention = SelfAttention(embed_dim)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return one row of ``vocab_size`` logits per input sequence.

        Args:
            x: Integer index tensor of shape ``(batch, seq_len)``.
        """
        embedded = self.pos_enc(self.embedding(x))
        attended = self.attention(embedded)
        # Only the representation at the last position feeds the classifier.
        last_position = attended[:, -1, :]
        return self.fc(last_position)

### Training Loop

In [10]:
# Full-batch training of the single-head model on the toy examples.
# NOTE: rows 0 and 2 of `x` hold the same context ("i love") but have different
# targets, so the lowest achievable mean cross-entropy is 2*ln(2)/3 ~= 0.462.
# The plateau in the printed losses matches that floor.
model = TinyTransformer(vocab_size, embed_dim=16)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(300):
    # Forward pass and loss over the whole dataset.
    out = model(x)
    loss = loss_fn(out, y)

    # Backward pass: clear stale gradients, back-propagate, then update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report every 50th epoch (0, 50, ..., 250).
    if not epoch % 50:
        print(f"Epoch {epoch}, Loss:{loss.item():.4f}")
    

Epoch 0, Loss:1.9859
Epoch 50, Loss:0.4626
Epoch 100, Loss:0.4623
Epoch 150, Loss:0.4622
Epoch 200, Loss:0.4622
Epoch 250, Loss:0.4622


### Multi Head Attention

### Imports

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

### Toy data used

In [15]:
# Toy corpus: three short, lowercase sentences separated by single spaces.
sentences = ["i love nlp", "nlp loves me", "i love learning"]

# Vocabulary: every distinct whitespace-separated token, in sorted order,
# plus lookup tables in both directions.
words = sorted({token for sentence in sentences for token in sentence.split()})
word_to_idx = dict(zip(words, range(len(words))))
idx_to_word = dict(enumerate(words))
vocab_size = len(words)

print("Vocabulary:", word_to_idx)

Vocabulary: {'i': 0, 'learning': 1, 'love': 2, 'loves': 3, 'me': 4, 'nlp': 5}


### Create sequences

In [16]:
# Number of context tokens fed to the model for each training example.
seq_length = 2
x, y = [], []

for sentence in sentences:
    tokens = sentence.split()
    # Slide a window of `seq_length` tokens over the sentence; the token that
    # immediately follows the window is the prediction target. Slicing on
    # `seq_length` (instead of hard-coding offsets 0, 1, 2) keeps the window
    # and the target consistent if `seq_length` is ever changed.
    for start in range(len(tokens) - seq_length):
        context = tokens[start:start + seq_length]
        target = tokens[start + seq_length]
        x.append([word_to_idx[token] for token in context])
        y.append(word_to_idx[target])


# Integer (long) tensors: token indices for the embedding layer and class
# indices for the cross-entropy loss.
x = torch.tensor(x, dtype=torch.long)
y = torch.tensor(y, dtype=torch.long)

print("Input Sequences", x)
print("Labels", y)

Input Sequences tensor([[0, 2],
        [5, 3],
        [0, 2]])
Labels tensor([5, 4, 1])


In [17]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    Even columns of the table hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is registered as a buffer named ``pe``.

    Args:
        embed_dim: Width of the embeddings (odd values are supported).
        max_len: Number of positions pre-computed in the table.
    """

    def __init__(self, embed_dim, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim)
        )
        # One angle per (position, frequency) pair: (max_len, ceil(embed_dim / 2)).
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd embed_dim there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 indexes the sequence position and whose
                last dimension is ``embed_dim``. Sequences longer than
                ``max_len`` are not covered by the pre-computed table.
        """
        return x + self.pe[:x.size(1)]

### Multi-Head Self-Attention Module

In [18]:
class MultiHeadSelfAttention(nn.Module):
    """Causal multi-head scaled dot-product self-attention.

    The embedding is split evenly across ``num_heads`` heads; each position may
    attend only to itself and to earlier positions.

    Args:
        embed_dim: Width of the input and output embeddings.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.Wq = nn.Linear(embed_dim, embed_dim)
        self.Wk = nn.Linear(embed_dim, embed_dim)
        self.Wv = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Apply causal multi-head self-attention.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_dim)``.
        """
        B, S, E = x.size()

        def split_heads(t):
            # (B, S, E) -> (B, num_heads, S, head_dim)
            return t.view(B, S, self.num_heads, self.head_dim).transpose(1, 2)

        Q = split_heads(self.Wq(x))
        K = split_heads(self.Wk(x))
        V = split_heads(self.Wv(x))

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Causal mask, built as a boolean tensor directly on the input's device
        # (no CPU allocation followed by a host-to-device copy on every call).
        # True marks the positions a query is allowed to attend to.
        allowed = torch.tril(torch.ones(S, S, dtype=torch.bool, device=x.device))
        scores = scores.masked_fill(~allowed, float("-inf"))

        weights = torch.softmax(scores, dim=-1)
        out = torch.matmul(weights, V)

        # (B, num_heads, S, head_dim) -> (B, S, E): concatenate the heads again.
        out = out.transpose(1, 2).contiguous().view(B, S, E)
        return self.out_proj(out)


In [19]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to ``4 * embed_dim``, apply ReLU, project back.

    Args:
        embed_dim: Width of the input and output features.
    """

    def __init__(self, embed_dim):
        super().__init__()
        hidden_dim = 4 * embed_dim
        layers = [
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)

In [20]:
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer's output is added to its input (residual connection) and the
    sum is then layer-normalised.

    Args:
        embed_dim: Width of the token representations.
        num_heads: Number of heads passed to ``MultiHeadSelfAttention``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = FeedForward(embed_dim)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Run both sub-layers and return a tensor shaped like ``x``."""
        # Residual + norm around attention.
        attended = self.ln1(x + self.attn(x))
        # Residual + norm around the feed-forward network.
        return self.ln2(attended + self.ffn(attended))

### Training

In [21]:
class TinyGPT(nn.Module):
    """Next-token classifier built from a single transformer block.

    Pipeline: token embedding -> positional encoding -> transformer block ->
    linear layer applied to the last position.

    Args:
        vocab_size: Number of distinct token indices (also the number of output logits).
        embed_dim: Width of the token representations.
        num_heads: Number of attention heads inside the block.
    """

    def __init__(self, vocab_size, embed_dim=16, num_heads=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional = PositionalEncoding(embed_dim)
        self.block = TransformerBlock(embed_dim, num_heads)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return one row of ``vocab_size`` logits per input sequence.

        Args:
            x: Integer index tensor of shape ``(batch, seq_len)``.
        """
        hidden = self.block(self.positional(self.embedding(x)))
        # Only the representation at the last position feeds the classifier.
        return self.fc(hidden[:, -1, :])

In [22]:
# Full-batch training of the multi-head model on the toy examples.
# NOTE: rows 0 and 2 of `x` hold the same context ("i love") but have different
# targets, so the lowest achievable mean cross-entropy is 2*ln(2)/3 ~= 0.462.
# The printed losses level off just above that floor.
model = TinyGPT(vocab_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(300):
    # Forward pass and loss over the whole dataset.
    out = model(x)
    loss = loss_fn(out, y)

    # Backward pass: clear stale gradients, back-propagate, then update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report every 50th epoch (0, 50, ..., 250).
    if not epoch % 50:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 1.5980
Epoch 50, Loss: 0.4644
Epoch 100, Loss: 0.4632
Epoch 150, Loss: 0.4628
Epoch 200, Loss: 0.4626
Epoch 250, Loss: 0.4625
