 ## 1Ô∏è‚É£ Imports and Environment Check

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

from transformer_blocks import Block


In [2]:
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")


Torch version: 2.5.1
CUDA available: True
GPU name: NVIDIA GeForce GTX 1650


 ## 2Ô∏è‚É£ Create a Small Text Corpus (New Example)

In [3]:
corpus = [
    "i love south indian food",
    "we traveled by train to chennai",
    "the dosa was crispy and tasty",
    "tea tastes better in the evening",
    "we visited the beach at sunrise",
    "spicy food makes me happy",
    "the journey was long but fun",
    "coffee and snacks are perfect",
]

# Add end-of-sentence token
corpus = [s + " <END>" for s in corpus]

text = " ".join(corpus)
print(text)


i love south indian food <END> we traveled by train to chennai <END> the dosa was crispy and tasty <END> tea tastes better in the evening <END> we visited the beach at sunrise <END> spicy food makes me happy <END> the journey was long but fun <END> coffee and snacks are perfect <END>


 ## 3Ô∏è‚É£ Vocabulary Construction

In [4]:
words = list(set(text.split()))
vocab_size = len(words)

print("Vocabulary:", words)
print("Vocab size:", vocab_size)


Vocabulary: ['coffee', 'in', 'chennai', 'makes', 'south', 'we', 'train', 'tasty', 'crispy', 'i', 'the', 'long', 'perfect', 'better', 'food', 'indian', 'beach', 'snacks', 'are', 'was', 'me', 'to', 'visited', 'traveled', '<END>', 'and', 'tastes', 'love', 'by', 'evening', 'tea', 'at', 'fun', 'but', 'journey', 'happy', 'spicy', 'sunrise', 'dosa']
Vocab size: 39


In [5]:
word2idx = {w: i for i, w in enumerate(words)}
idx2word = {i: w for w, i in word2idx.items()}

print("word2idx:", word2idx)


word2idx: {'coffee': 0, 'in': 1, 'chennai': 2, 'makes': 3, 'south': 4, 'we': 5, 'train': 6, 'tasty': 7, 'crispy': 8, 'i': 9, 'the': 10, 'long': 11, 'perfect': 12, 'better': 13, 'food': 14, 'indian': 15, 'beach': 16, 'snacks': 17, 'are': 18, 'was': 19, 'me': 20, 'to': 21, 'visited': 22, 'traveled': 23, '<END>': 24, 'and': 25, 'tastes': 26, 'love': 27, 'by': 28, 'evening': 29, 'tea': 30, 'at': 31, 'fun': 32, 'but': 33, 'journey': 34, 'happy': 35, 'spicy': 36, 'sunrise': 37, 'dosa': 38}


 ## 4Ô∏è‚É£ Encode Text as Token IDs

In [6]:
data = torch.tensor([word2idx[w] for w in text.split()], dtype=torch.long)

print("Encoded data:", data)
print("Total tokens:", len(data))


Encoded data: tensor([ 9, 27,  4, 15, 14, 24,  5, 23, 28,  6, 21,  2, 24, 10, 38, 19,  8, 25,
         7, 24, 30, 26, 13,  1, 10, 29, 24,  5, 22, 10, 16, 31, 37, 24, 36, 14,
         3, 20, 35, 24, 10, 34, 19, 11, 33, 32, 24,  0, 25, 17, 18, 12, 24])
Total tokens: 53


 ## 5Ô∏è‚É£ Hyperparameters

In [7]:
block_size = 6
embedding_dim = 32
n_heads = 2
n_layers = 2
learning_rate = 1e-3
epochs = 1500
batch_size = 16


 ## 6Ô∏è‚É£ Mini-batch Sampling Function

In [8]:
def get_batch():
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x, y


 ## 7Ô∏è‚É£ TinyGPT Model Definition

In [9]:
class TinyGPT(nn.Module):
    def __init__(self):
        super().__init__()

        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.position_embedding = nn.Embedding(block_size, embedding_dim)

        self.blocks = nn.Sequential(
            *[Block(embedding_dim, block_size, n_heads) for _ in range(n_layers)]
        )

        self.ln_f = nn.LayerNorm(embedding_dim)
        self.head = nn.Linear(embedding_dim, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))

        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            loss = F.cross_entropy(
                logits.view(B * T, C),
                targets.view(B * T)
            )

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_idx = torch.multinomial(probs, 1)
            idx = torch.cat((idx, next_idx), dim=1)
        return idx


 ## 8Ô∏è‚É£ Model Initialization and Optimizer

In [10]:
model = TinyGPT()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

print("Total parameters:", sum(p.numel() for p in model.parameters()))


Total parameters: 28007


 ## 9Ô∏è‚É£ Training Loop

In [11]:
for step in range(epochs):
    xb, yb = get_batch()
    logits, loss = model(xb, yb)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 300 == 0:
        print(f"Step {step}, Loss = {loss.item():.4f}")


Step 0, Loss = 3.7808
Step 300, Loss = 0.1283
Step 600, Loss = 0.1144
Step 900, Loss = 0.0849
Step 1200, Loss = 0.1054


 ## üîü Text Generation

In [14]:
start_word = "i"
context = torch.tensor([[word2idx[start_word]]], dtype=torch.long)

generated = model.generate(context, max_new_tokens=30)

print("\nGenerated text:\n")
print(" ".join(idx2word[int(i)] for i in generated[0]))



Generated text:

i love south indian food <END> we traveled by train to chennai <END> the dosa was crispy and tasty <END> tea tastes better in the evening <END> we visited the beach
