In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np

In [None]:
# --- Batching ----------------------------------------------------------------
batch_size, block_size = 32, 128   # sequences per batch; maximum context length (T)

# --- Model shape -------------------------------------------------------------
n_embd, n_head, n_layer = 64, 4, 4  # embedding dim (C); attention heads; transformer blocks
dropout = 0.1                       # dropout probability used by every Dropout layer

# --- Optimisation / evaluation -----------------------------------------------
learning_rate = 3e-4   # AdamW learning rate
max_iters = 2000       # number of training iterations
eval_interval = 100    # report losses every N iterations
eval_iters = 20        # batches averaged per loss estimate

# --- Runtime -----------------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fix PyTorch's RNG so weight init and batch sampling are repeatable.
torch.manual_seed(1337)

<torch._C.Generator at 0x7c0cc0f284b0>

In [None]:
# Tiny in-notebook training corpus: short Kannada sentences, one per line.
# The literal starts and ends with a line break, so '\n' is part of the text
# that the tokenizer vocabulary and the encoded dataset are built from.
kannada_text = """
ಹಸಿರು ಎಲೆಗಳ ನಡುವೆ ಹೂವು ಅರಳುತ್ತದೆ.
ನದಿಯ ಹರಿವು ಎಂದಿಗೂ ನಿಲ್ಲುವುದಿಲ್ಲ.
ಜ್ಞಾನವೇ ಶ್ರೇಷ್ಠ ಸಂಪತ್ತು.
ಸತ್ಯವೇ ದೇವರು, ಧರ್ಮವೇ ಜೀವನ.
ಮರದ ನೆರಳಿನಲ್ಲಿ ಪಕ್ಷಿಗಳು ಹಾಡುತ್ತವೆ.
ಬೆಳಗಿನ ಸೂರ್ಯ ಎಲ್ಲರಿಗೂ ಸಮಾನ.
ಪ್ರೀತಿ ಎಲ್ಲ ಬಾಧೆಗಳನ್ನು ದಾಟುತ್ತದೆ.
ಕಲಿಕೆಯಲ್ಲಿ ವಯಸ್ಸಿಲ್ಲ.
ಶಾಂತಿ ಮನಸ್ಸಿನಲ್ಲಿ ಹುಟ್ಟುತ್ತದೆ.
ನಮ್ರತೆಯೇ ಶ್ರೇಷ್ಠ ಗುಣ.
ಹೂವಿನ ಸುಗಂಧ ದೂರ ಹರಡುತ್ತದೆ.
ಕಷ್ಟವೇ ಯಶಸ್ಸಿಗೆ ದಾರಿ.
ಸಹಾನುಭೂತಿ ಮಾನವೀಯತೆಯ ಮೂಲ.
ಕನಸುಗಳು ನಿಜವಾಗಲು ಶ್ರಮ ಬೇಕು.
ಪ್ರಕೃತಿಯೇ ನಮ್ಮ ಗುರು.
ಮಳೆಯ ಹನಿಗಳು ಭೂಮಿಯನ್ನು ತಣಿಸುತ್ತವೆ.
ನಗು ಅತ್ಯುತ್ತಮ ಔಷಧ.
ಸಮಯವೇ ಅಮೂಲ್ಯ ಧನ.
ಸ್ನೇಹವೇ ಜೀವನದ ಆಧಾರ.
ತಾಳ್ಮೆಯಿಂದ ಎಲ್ಲವೂ ಸಾಧ್ಯ.
ಪುಸ್ತಕಗಳು ಜ್ಞಾನದ ಭಂಡಾರ.
ಆಕಾಶವು ಮಿತಿಯಿಲ್ಲದ ವಿಸ್ತಾರ.
ಸಂಗೀತ ಮನಸ್ಸಿಗೆ ಆಹಾರ.
ಚಂದ್ರನ ಬೆಳಕು ರಾತ್ರಿಯನ್ನು ಬೆಳಗಿಸುತ್ತದೆ.
ಕನ್ನಡವೇ ನಮ್ಮ ಹೆಮ್ಮೆಯ ಭಾಷೆ.
ಎಲೆಗಳ ಸಪ್ಪಳ ಗಾಳಿಯ ಹಾಡು.
ಸಾಗರದ ಅಲೆಗಳು ನಿರಂತರ ಚಲನೆ.
ಬೆಳಗಾವಲು ಹೊಸ ಭರವಸೆಯ ಸಂಕೇತ.
ತಾರೆಗಳು ಆಕಾಶದ ಆಭರಣ.
ಮಾತು ಮನಸ್ಸಿನ ಕನ್ನಡಿ.
"""


In [None]:
class CharTokenizer:
    """Character-level tokenizer whose vocabulary is the set of characters in a text.

    Every distinct character of the text passed to the constructor gets one
    integer id; ids follow the sorted order of the characters.

    Attributes:
        vocab_size: Number of distinct characters.
        char_to_idx: Mapping from character to integer id.
        idx_to_char: Inverse mapping from integer id to character.
    """

    def __init__(self, text):
        """Build the vocabulary from ``text`` and print a short summary.

        Args:
            text: String whose distinct characters become the vocabulary.
        """
        chars = sorted(set(text))
        self.vocab_size = len(chars)
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = dict(enumerate(chars))
        print(f"Tokenizer initialized: {self.vocab_size} unique characters")
        print(f"Sample chars: {chars[:10]}")

    def encode(self, text):
        """Convert a string to a list of integer ids.

        Raises:
            KeyError: If ``text`` contains a character that is not in the vocabulary.
        """
        return [self.char_to_idx[ch] for ch in text]

    def decode(self, indices):
        """Convert an iterable of integer ids back to a string.

        ``indices`` may be a list of Python ints or a 1-D integer tensor: each
        element goes through ``int()`` before the lookup, because a 0-d tensor
        hashes by identity and would otherwise never match a dictionary key.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        return ''.join(self.idx_to_char[int(i)] for i in indices)

# Build the character vocabulary from the corpus itself.
tokenizer = CharTokenizer(kannada_text)
vocab_size = tokenizer.vocab_size

# Encode the whole corpus once into a single 1-D tensor of token ids.
token_ids = tokenizer.encode(kannada_text)
data = torch.tensor(token_ids, dtype=torch.long)
print(f"Dataset size: {len(data)} characters")

# The first 90% of the ids are used for training; the remaining tail is validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]



Tokenizer initialized: 45 unique characters
Sample chars: ['\n', ' ', ',', '.', 'ಂ', 'ಅ', 'ಆ', 'ಎ', 'ಔ', 'ಕ']
Dataset size: 786 characters


In [None]:
# Iterating a dict yields its keys, i.e. the vocabulary characters in id order.
all_chars = list(tokenizer.char_to_idx)
print("Full Vocabulary:")
print(''.join(all_chars))

# List every character next to the integer id the tokenizer assigned to it.
for idx, char in tokenizer.idx_to_char.items():
    print(f"Char: '{char}' | ID: {idx}")

Full Vocabulary:

 ,.ಂಅಆಎಔಕಗಚಜಞಟಠಡಣತದಧನಪಬಭಮಯರಲಳವಶಷಸಹಾಿೀುೂೃೆೇೊ್
Char: '
' | ID: 0
Char: ' ' | ID: 1
Char: ',' | ID: 2
Char: '.' | ID: 3
Char: 'ಂ' | ID: 4
Char: 'ಅ' | ID: 5
Char: 'ಆ' | ID: 6
Char: 'ಎ' | ID: 7
Char: 'ಔ' | ID: 8
Char: 'ಕ' | ID: 9
Char: 'ಗ' | ID: 10
Char: 'ಚ' | ID: 11
Char: 'ಜ' | ID: 12
Char: 'ಞ' | ID: 13
Char: 'ಟ' | ID: 14
Char: 'ಠ' | ID: 15
Char: 'ಡ' | ID: 16
Char: 'ಣ' | ID: 17
Char: 'ತ' | ID: 18
Char: 'ದ' | ID: 19
Char: 'ಧ' | ID: 20
Char: 'ನ' | ID: 21
Char: 'ಪ' | ID: 22
Char: 'ಬ' | ID: 23
Char: 'ಭ' | ID: 24
Char: 'ಮ' | ID: 25
Char: 'ಯ' | ID: 26
Char: 'ರ' | ID: 27
Char: 'ಲ' | ID: 28
Char: 'ಳ' | ID: 29
Char: 'ವ' | ID: 30
Char: 'ಶ' | ID: 31
Char: 'ಷ' | ID: 32
Char: 'ಸ' | ID: 33
Char: 'ಹ' | ID: 34
Char: 'ಾ' | ID: 35
Char: 'ಿ' | ID: 36
Char: 'ೀ' | ID: 37
Char: 'ು' | ID: 38
Char: 'ೂ' | ID: 39
Char: 'ೃ' | ID: 40
Char: 'ೆ' | ID: 41
Char: 'ೇ' | ID: 42
Char: 'ೊ' | ID: 43
Char: '್' | ID: 44


In [None]:
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors moved to ``device``, both of shape
        ``(batch_size, T)``. ``y`` is ``x`` shifted one position to the right
        in the source, so ``y[:, t]`` is the token that follows ``x[:, t]``.
        ``T`` is ``block_size``, or ``len(split) - 1`` when the split is too
        short to hold a full window plus its shifted target.

    Raises:
        ValueError: If the selected split has fewer than 2 tokens, so no
            input/target pair exists.
    """
    data_source = train_data if split == 'train' else val_data

    # A window of T inputs needs T + 1 consecutive tokens (the last one is only
    # a target). Clamping T keeps x and y the same length even when the split
    # is shorter than block_size + 1; without the clamp, slicing past the end
    # silently returned a y that was one token shorter than x.
    seq_len = min(block_size, len(data_source) - 1)
    if seq_len < 1:
        raise ValueError(
            f"split {split!r} has {len(data_source)} token(s); at least 2 are required"
        )

    # Valid start offsets are 0 .. len - seq_len - 1 (randint's bound is exclusive).
    upper_bound = len(data_source) - seq_len
    ix = torch.randint(upper_bound, (batch_size,))
    x = torch.stack([data_source[i:i + seq_len] for i in ix])
    y = torch.stack([data_source[i + 1:i + seq_len + 1] for i in ix])
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
class Head(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention.

    Args:
        head_size: Output width of the query/key/value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size

        # Bias-free projections from the embedding width into this head's width.
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower-triangular ones: row t marks the positions (<= t) it may attend to.
        # Stored as a buffer so it follows the module across devices but is not trained.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over the sequence dimension of ``x``.

        Args:
            x: Tensor of shape (B, T, C), where C matches ``n_embd``.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        q, k, v = self.query(x), self.key(x), self.value(x)

        # Pairwise affinities, scaled by sqrt(head_size): (B, T, T).
        scores = (q @ k.transpose(-2, -1)) / (self.head_size ** 0.5)

        # Positions above the diagonal are in the future; -inf makes softmax give them 0.
        is_future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(is_future, float('-inf'))

        attn = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values: (B, T, T) @ (B, T, head_size) -> (B, T, head_size).
        return attn @ v


In [None]:
class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules run in parallel, concatenated and projected.

    Args:
        n_head: Number of attention heads.
        head_size: Output width of each head. The concatenated width
            ``n_head * head_size`` must equal ``n_embd``, the input width of
            the output projection.
    """

    def __init__(self, n_head, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)])

        self.proj = nn.Linear(n_embd, n_embd)
        # Was `nn.Droput`, which raised AttributeError as soon as the class was instantiated.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x`` and mix the results.

        Args:
            x: Tensor whose last dimension is ``n_embd``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and last dimension
            ``n_embd``.
        """
        # Run each head on the same input and join the results feature-wise.
        # The previous body read an undefined `out` and never ran the heads.
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x the width, ReLU, project back, then dropout.

    Args:
        n_embd: Input and output width of the MLP.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the output has the same shape as ``x``."""
        return self.net(x)

In [None]:
class Block(nn.Module):
    """Pre-norm transformer block: self-attention, then feed-forward, each residual.

    Args:
        n_embd: Embedding width flowing through the block.
        n_head: Number of attention heads; each head gets ``n_embd // n_head`` features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding is split evenly across the heads (integer division).
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # LayerNorm is applied before each sub-layer; the sub-layer output is added back.
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))


In [None]:
class MultiHeadAttention(nn.Module):
    """Parallel attention heads whose outputs are concatenated and projected.

    Args:
        n_head: Number of ``Head`` modules to run side by side.
        head_size: Output width of each head.
    """

    def __init__(self, n_head, head_size):
        super().__init__()
        head_modules = [Head(head_size) for _ in range(n_head)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, join the results feature-wise, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(merged))

class Block(nn.Module):
    """One transformer layer: LayerNorm -> attention and LayerNorm -> MLP, both residual.

    Args:
        n_embd: Width of the activations entering and leaving the block.
        n_head: Number of attention heads sharing that width.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        per_head_width = n_embd // n_head
        # The name is looked up when the block is constructed, so the most
        # recently defined MultiHeadAttention class is the one that gets used.
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply the attention update and then the feed-forward update to ``x``."""
        residual = x
        x = residual + self.sa(self.ln1(residual))
        residual = x
        return residual + self.ffwd(self.ln2(residual))

class GPTModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` ``Block`` modules and a final LayerNorm, and projected to
    ``vocab_size`` logits per position. All sizes are read from the
    module-level hyperparameters when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # One learned vector per position, so inputs may be at most block_size long.
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: Integer tensor of token ids, shape (B, T), with T no larger
                than the number of rows in the position embedding table.
            targets: Optional integer tensor of shape (B, T_targets). When its
                length differs from T, only the first ``min(T, T_targets)``
                positions of both logits and targets enter the loss.

        Returns:
            Tuple ``(logits, loss)``. ``logits`` has shape (B, T, vocab_size)
            and is never cropped. ``loss`` is a 0-d tensor, or ``None`` when
            ``targets`` is ``None``.
        """
        _, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Build position ids on the input's own device rather than a global
        # default, so the model works wherever it and its inputs actually live.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        # Tolerate a length mismatch between logits and targets by scoring only
        # the positions they have in common.
        n_common = min(logits.shape[1], targets.shape[1])
        C = logits.shape[-1]
        # cross_entropy expects (N, C) logits and (N,) class indices.
        loss = F.cross_entropy(
            logits[:, :n_common, :].reshape(-1, C),
            targets[:, :n_common].reshape(-1),
        )
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """
        Generate new tokens autoregressively.

        Sampling runs with gradients disabled: the result is a tensor of
        integer ids, so no autograd graph is needed for the forward passes.
        The module's train/eval mode is not changed here.

        Args:
            idx: Starting context, shape (B, T)
            max_new_tokens: Number of tokens to generate

        Returns:
            idx: Extended sequence, shape (B, T + max_new_tokens)
        """
        for _ in range(max_new_tokens):
            # Only the most recent block_size tokens fit the position table.
            idx_cond = idx[:, -block_size:]

            logits, _ = self(idx_cond)

            # The prediction for the next token lives at the last time step.
            probs = F.softmax(logits[:, -1, :], dim=-1)  # (B, vocab_size)

            # Sample one id per sequence and append it.
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat([idx, idx_next], dim=1)  # (B, T+1)

        return idx

# ============================================================================
# TRAINING LOOP
# ============================================================================
def train():
    """Build a fresh GPTModel, train it for ``max_iters`` AdamW steps, and return it.

    Train and validation losses are printed every ``eval_interval`` steps and
    once more on the final step.
    """
    model = GPTModel().to(device)

    # Report the model size and where it runs.
    n_params = sum(p.numel() for p in model.parameters())
    print(f"\nModel initialized with {n_params:,} parameters")
    print(f"Training on {device}\n")

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    print("Starting training...\n")
    for step in range(max_iters):
        # Periodic evaluation, also forced on the very last step.
        if step % eval_interval == 0 or step == max_iters - 1:
            losses = estimate_loss(model)
            print(f"Step {step:4d} | Train loss: {losses['train']:.4f} | Val loss: {losses['val']:.4f}")

        # One optimisation step on a freshly sampled training batch.
        xb, yb = get_batch('train')
        _, loss = model(xb, yb)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    banner = "=" * 60
    print("\n" + banner)
    print("Training complete!")
    print(banner + "\n")

    return model

# ============================================================================
# TEXT GENERATION
# ============================================================================
def generate_text(model, prompt="", max_tokens=200):
    """Sample text from ``model`` and return it as a string.

    Args:
        model: Module exposing ``generate(context, max_new_tokens=...)``. It is
            switched to eval mode and left in that mode.
        prompt: Text to continue. Every character must be known to the
            tokenizer. When empty, generation starts from the single token id 0,
            the character that sorts first in the tokenizer's vocabulary ('\\n'
            for the corpus used in this notebook).
        max_tokens: Number of new tokens to sample.

    Returns:
        The decoded text: the starting context followed by the sampled tokens.
    """
    model.eval()

    # Context of shape (1, T): a batch with a single sequence.
    if prompt:
        context = torch.tensor([tokenizer.encode(prompt)], dtype=torch.long, device=device)
    else:
        context = torch.zeros((1, 1), dtype=torch.long, device=device)

    # Sampling needs no gradients; disabling autograd avoids building a graph
    # for each of the max_tokens forward passes.
    with torch.no_grad():
        generated_ids = model.generate(context, max_new_tokens=max_tokens)[0].tolist()

    return tokenizer.decode(generated_ids)

# ============================================================================
# MAIN EXECUTION
# ============================================================================
if __name__ == "__main__":
    # Separator lines reused throughout the report.
    rule = "="*60
    thin_rule = "-" * 60

    # --- Header and hyperparameter summary ---
    print(rule)
    print("MICRO-KANNADAGPT: Transformer from Scratch")
    print(rule)
    print(f"\nHyperparameters:")
    print(f"  - Embedding dim: {n_embd}")
    print(f"  - Attention heads: {n_head}")
    print(f"  - Transformer layers: {n_layer}")
    print(f"  - Context length: {block_size}")
    print(f"  - Batch size: {batch_size}")
    print(f"  - Learning rate: {learning_rate}")
    print(f"  - Vocabulary size: {vocab_size}")
    print(f"  - Device: {device}")

    # --- Training ---
    model = train()

    # --- Sampling from the trained model ---
    print("\n" + rule)
    print("SAMPLE GENERATION")
    print(rule)

    # Unconditional sample: an empty prompt makes generate_text pick the start token.
    print("\nGeneration 1 (from scratch):")
    print(thin_rule)
    output1 = generate_text(model, prompt="", max_tokens=150)
    print(output1)

    # Conditional sample: continue a short Kannada prompt.
    print("\n\nGeneration 2 (with prompt):")
    print(thin_rule)
    output2 = generate_text(model, prompt="ಜ್ಞಾನ", max_tokens=100)
    print(output2)

    print("\n" + rule)
    print("\u2713 Project complete! Model successfully trained and generated text.")
    print(rule)

MICRO-KANNADAGPT: Transformer from Scratch

Hyperparameters:
  - Embedding dim: 64
  - Attention heads: 4
  - Transformer layers: 4
  - Context length: 128
  - Batch size: 32
  - Learning rate: 0.0003
  - Vocabulary size: 45
  - Device: cuda

Model initialized with 213,293 parameters
Training on cuda

Starting training...

Step    0 | Train loss: 3.9803 | Val loss: 4.0110
Step  100 | Train loss: 2.5936 | Val loss: 3.1121
Step  200 | Train loss: 2.1880 | Val loss: 3.1283
Step  300 | Train loss: 1.8508 | Val loss: 3.3132
Step  400 | Train loss: 1.4498 | Val loss: 3.6921
Step  500 | Train loss: 1.0559 | Val loss: 4.1341
Step  600 | Train loss: 0.6863 | Val loss: 4.5825
Step  700 | Train loss: 0.4299 | Val loss: 4.9145
Step  800 | Train loss: 0.2782 | Val loss: 5.1897
Step  900 | Train loss: 0.1947 | Val loss: 5.5532
Step 1000 | Train loss: 0.1411 | Val loss: 5.7161
Step 1100 | Train loss: 0.1043 | Val loss: 5.9256
Step 1200 | Train loss: 0.0858 | Val loss: 5.9579
Step 1300 | Train loss: 0