In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import tiktoken

In [3]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, _ = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec

class GPTBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.ln1 = nn.LayerNorm(cfg["emb_dim"])
        self.attn = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"]  # Match your original config

        )
        self.ln2 = nn.LayerNorm(cfg["emb_dim"])
        self.ffn = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            nn.GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
            nn.Dropout(cfg["drop_rate"])
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))  # Residual connection
        x = x + self.ffn(self.ln2(x))   # Residual connection
        return x

class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop = nn.Dropout(cfg["drop_rate"])
        self.blocks = nn.Sequential(*[GPTBlock(cfg) for _ in range(cfg["n_layers"])])
        self.ln_f = nn.LayerNorm(cfg["emb_dim"])
        self.head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

        # Weight tying and initialization
        self.head.weight = self.tok_emb.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx):
        device = idx.device
        b, t = idx.size()

        # Token and position embeddings
        tok_emb = self.tok_emb(idx)
        pos = torch.arange(0, t, dtype=torch.long, device=device)
        pos_emb = self.pos_emb(pos)

        x = self.drop(tok_emb + pos_emb)
        x = self.blocks(x)
        x = self.ln_f(x)
        return self.head(x)

In [49]:
GPT_CONFIG = {
    "vocab_size": 50257,
    "context_length": 128,
    "emb_dim": 256,  # Changed for smaller GPT
    "n_heads": 4,     # Reduced heads
    "n_layers": 4,    # Fewer layers for faster training
    "drop_rate": 0.1, # Lower dropout
    "qkv_bias": True
}

In [51]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [58]:
# 1. Load tiktoken tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

# 2. Dataset for tokenized lines
class CustomDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size=512):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Tokenize whole text at once
        tokens = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
        self.block_size = block_size
        self.examples = []

        for i in range(0, len(tokens) - block_size, block_size):
            chunk = tokens[i:i+block_size]
            self.examples.append(torch.tensor(chunk, dtype=torch.long))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        x = self.examples[idx]
        return {
            'input_ids': x,
            'labels': x.clone(),  # Labels are same as inputs for language modeling
        }

# 3. Load your model
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Optional but useful

state_dict = torch.load("/content/mini_gpt2_model.pth", map_location="cpu")

model = GPTModel(GPT_CONFIG)
model.load_state_dict(state_dict)
device = "cpu"
model.train()

# 4. Dataset and DataLoader
dataset = CustomDataset("/content/fine_tune_custom_qna.txt", tokenizer, block_size=512)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# 5. Optimizer and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# 6. Manual training loop
epochs = 3
vocab_size = model.tok_emb.num_embeddings

for epoch in range(epochs):
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        # Clamp both input and label IDs
        input_ids = input_ids.clamp(max=vocab_size - 1)
        labels = labels.clamp(max=vocab_size - 1)

        outputs = model(input_ids)  # (batch, seq_len, vocab_size)
        loss = criterion(outputs.view(-1, vocab_size), labels.view(-1))

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loop.set_postfix(loss=loss.item())

# 7. Save fine-tuned model
torch.save(model.state_dict(), "fine_tuned_gpt2_model.pth")

Epoch 1/3:   0%|          | 0/5 [00:00<?, ?it/s]


IndexError: index out of range in self

In [45]:
print(f"Input IDs max: {input_ids.max()}, shape: {input_ids.shape}")
print(f"Labels max: {labels.max()}, min: {labels.min()}, shape: {labels.shape}")
print(f"Model vocab size: {model.tok_emb.num_embeddings}")
print(f"Model device: {next(model.parameters()).device}")
print(f"Input device: {input_ids.device}")

Input IDs max: 50256, shape: torch.Size([2, 512])
Labels max: 50256, min: 11, shape: torch.Size([2, 512])
Model vocab size: 50257
Model device: cpu
Input device: cpu


In [47]:
checkpoint = torch.load("/content/mini_gpt2_model.pth", map_location="cpu")
print(type(checkpoint))

<class 'collections.OrderedDict'>


In [53]:
state_dict = torch.load("/content/mini_gpt2_model.pth", map_location="cpu")
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print("Missing keys:", missing)
print("Unexpected keys:", unexpected)

Missing keys: []
Unexpected keys: []


In [59]:
print("Max input ID:", input_ids.max().item(), "Vocab size:", vocab_size)

Max input ID: 50256 Vocab size: 50257


In [60]:
print("Model vocab size:", model.tok_emb.num_embeddings)

Model vocab size: 50257


In [61]:
print("Input shape:", input_ids.shape)
print("Input max ID:", input_ids.max().item())
print("Input dtype:", input_ids.dtype)
print("Model vocab size:", model.tok_emb.num_embeddings)
print("Model device:", next(model.parameters()).device)
print("Input device:", input_ids.device)

Input shape: torch.Size([2, 512])
Input max ID: 50256
Input dtype: torch.int64
Model vocab size: 50257
Model device: cpu
Input device: cpu


In [62]:
print("Labels max ID:", labels.max().item())
print("Labels min ID:", labels.min().item())

Labels max ID: 50256
Labels min ID: 0
