## Load vocab

In [85]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file under the read-only input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gpt2mini/pytorch/default/1/decoder.py
/kaggle/input/gpt2mini/pytorch/default/1/gpt2.py
/kaggle/input/model-components/merges.txt
/kaggle/input/model-components/vocab.json
/kaggle/input/model-components/utils.py


In [242]:
from datasets import load_dataset

# Load and tokenize
# Download (or load from cache) the TinyStories training split and collect
# the raw story strings into a plain Python list.
dataset = load_dataset("roneneldan/TinyStories", split="train")
texts = []
for sample in dataset:
    texts.append(sample["text"])

In [217]:
# Show the top-level entries attached under /kaggle/input.
print(os.listdir("/kaggle/input"))

['gpt2mini', 'model-components']


In [218]:
import json

# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default text encoding.
with open("/kaggle/input/model-components/vocab.json", encoding="utf-8") as f:
    word2idx = json.load(f)  # token string -> integer id

# Quick look at the container type and the first few entries.
print(type(word2idx))
print(list(word2idx.items())[:10])

# Reverse mapping: integer id -> token string.
idx2word = {int(v): k for k, v in word2idx.items()}
# pad_id = word2idx["<pad>"]

<class 'dict'>
[('<s>', 0), ('<pad>', 1), ('</s>', 2), ('<unk>', 3), ('<mask>', 4), ('!', 5), ('"', 6), ('#', 7), ('$', 8), ('%', 9)]


In [219]:
# Spot check: id assigned to the token "cat" in the loaded vocabulary.
print(word2idx["cat"])

9661


## Testing Tokenization

In [220]:
import sys
sys.dont_write_bytecode = True # disabling __pycache__
# Make utils.py from the model-components input importable; this has to run
# before the `from utils import ...` line below.
sys.path.insert(0, '/kaggle/input/model-components')
from utils import Tokenizer
# from utils import clean_text

# Build the project tokenizer and give it the vocabulary loaded from vocab.json.
tokenizer = Tokenizer()
tokenizer.upload_vocab(word2idx)
# Smoke test: the cell displays the ids produced for a single word.
tokenizer.encode("dog")

[11902]

## Testing the Model

In [221]:
# Model hyperparameters for the smoke test below.
# embedding_dim == hidden_size == (D)
# embedding_dim % num_heads == 0
embedding_dim = 64

ff_embedding_dim = 128 # NOTE: this is 2 × embedding_dim; a 4 × ratio would be 256
max_seq_len = 10
dropout = 0.1
num_heads = 2
vocab_size = tokenizer.get_vocab_size()
num_layers = 2

In [222]:
# Vocabulary size reported by the tokenizer (used as the model's output dimension).
print(vocab_size)

19716


In [223]:
# Make gpt2.py from the gpt2mini input importable before importing from it.
sys.path.insert(0, '/kaggle/input/gpt2mini/pytorch/default/1')
from gpt2 import GPT2Model
import torch
import torch.nn as nn

# Smoke test input: one encoded word with a leading batch dimension.
encoded = tokenizer.encode("dog")
input_tensor = torch.tensor(encoded).unsqueeze(0)

# Pass the configured `dropout` rather than repeating the literal, so the
# hyperparameter cell stays the single source of truth.
model = GPT2Model(vocab_size, embedding_dim, ff_embedding_dim, max_seq_len, num_heads, num_layers, dropout=dropout)

# for each position in the sequence, you get a distribution over all vocab tokens.
logits = model(input_tensor)  # (B, T, V)

# Shift targets for next-token prediction
# shift_logits = logits[:, :-1, :].contiguous()
# shift_labels = input_tensor[:, 1:].contiguous()

# Flatten for CrossEntropyLoss
# loss_fn = nn.CrossEntropyLoss()
# loss = loss_fn(
#     shift_logits.view(-1, vocab_size),
#     shift_labels.view(-1)
# )

In [224]:
# Encode a short phrase. If the encoder returns an object exposing `.ids`
# instead of a plain list, unwrap it before printing.
tokenized_text = tokenizer.encode("once day a time a cat")
tokenized_text = getattr(tokenized_text, "ids", tokenized_text)
print(tokenized_text)

[15976, 1131, 69, 3325, 69, 9661]


In [225]:
# Check how the tokenizer encodes the literal "<pad>" string (the cell displays the ids).
tokenizer.encode("<pad>")

[1]

In [226]:
class CustomTextDataset(torch.utils.data.Dataset):
    """Fixed-length next-token-prediction samples built from raw text strings.

    Each text is tokenized on access, cut to its first ``seq_len`` token ids or
    right-padded with ``pad_id`` up to ``seq_len``, and then split into an input
    and a target shifted by one position. Both returned tensors therefore have
    length ``seq_len - 1``.

    Args:
        texts: Indexable collection of strings.
        tokenizer: Object with an ``encode(text)`` method returning a list of
            ids, or an object exposing that list as ``.ids``.
        seq_len: Number of token ids kept per sample before the shift (>= 2).
        pad_id: Id used for padding. When omitted, falls back to
            ``word2idx["<pad>"]`` from the notebook namespace, which is what
            the class originally always used.

    Raises:
        ValueError: If ``seq_len`` is smaller than 2, because no
            (input, target) pair can be formed.
    """

    def __init__(self, texts, tokenizer, seq_len, pad_id=None):
        if seq_len < 2:
            raise ValueError("seq_len must be at least 2 to form an (input, target) pair")
        self.texts = texts
        self.seq_len = seq_len
        self.tokenizer = tokenizer
        # An explicit pad_id removes the hidden dependency on the global vocabulary.
        self.pad_id = word2idx["<pad>"] if pad_id is None else pad_id

    def __len__(self):
        """Number of texts, i.e. number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(x, y)`` long tensors for the text at ``idx``; ``y`` is ``x`` shifted by one."""
        token_ids = self.tokenizer.encode(self.texts[idx])
        if hasattr(token_ids, "ids"):
            token_ids = token_ids.ids

        # Truncate to seq_len, then right-pad whatever is still missing
        # (the padding list is empty when the text was long enough).
        token_ids = list(token_ids)[:self.seq_len]
        token_ids += [self.pad_id] * (self.seq_len - len(token_ids))

        x = torch.tensor(token_ids[:-1], dtype=torch.long)  # input sequence
        y = torch.tensor(token_ids[1:], dtype=torch.long)   # next-token targets
        return x, y

## Loading Data

In [227]:
from torch.utils.data import DataLoader
# from data.dataset import TextDataset

# Each sample is cut or padded to `max_seq_len` token ids, so the sample length
# follows the hyperparameter cell instead of a second hard-coded literal.
dataset = CustomTextDataset(texts, tokenizer, seq_len=max_seq_len)
dataloader = DataLoader(
    dataset,
    batch_size=16,  # Reduced from 32
    shuffle=True,
    # Pinned host memory only speeds up host-to-GPU copies; enable it only
    # when a GPU is actually present.
    pin_memory=torch.cuda.is_available(),
)

## Training the Model

In [228]:
# Training configuration. The model hyperparameters repeat the values already
# set in the "Testing the Model" section.
# embedding_dim == hidden_size == (D)
# embedding_dim % num_heads == 0
embedding_dim = 64

ff_embedding_dim = 128 # NOTE: this is 2 × embedding_dim; a 4 × ratio would be 256
max_seq_len = 10
dropout = 0.1
num_heads = 2
vocab_size = tokenizer.get_vocab_size()
num_layers = 2

epochs = 50
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): the DataLoader cell passes a literal batch_size=16 and does not read this variable.
batch_size = 16

In [244]:
# Sanity check: run one batch through the model and inspect the logits shape.
x, y = next(iter(dataloader))
# The model and the batch must live on the same device. Moving both here
# avoids the device-mismatch error that occurs when only the batch is moved.
model = model.to(device)
x, y = x.to(device), y.to(device)
logits = model(x)
print("Logits shape:", logits.shape)

Logits shape: torch.Size([16, 9, 19716])


In [None]:
# Adam over all model parameters; this module-level `optimizer` is the one handed to the training call below.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
def train_model_with_checkpoints(
    model, 
    dataloader, 
    criterion, 
    optimizer, 
    epochs, 
    device, 
    vocab_size, 
    pad_id, 
    checkpoint_dir="model-components/checkpoints",
    checkpoint_freq=1  # Save every N epochs
):
    """Train ``model`` for ``epochs`` epochs, tracking metrics and saving checkpoints.

    Args:
        model: Module mapping a ``(B, T)`` batch of token ids to ``(B, T, vocab_size)`` logits.
        dataloader: Sized iterable of ``(x, y)`` batches.
        criterion: Loss called as ``criterion(flat_logits, flat_targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``dataloader``.
        device: Device the model and every batch are moved to.
        vocab_size: Size of the last logits dimension, used to flatten the logits.
        pad_id: Target id excluded from the accuracy computation.
        checkpoint_dir: Directory for checkpoint files (created if missing).
        checkpoint_freq: Save a checkpoint every N epochs; ``0`` or ``None`` disables saving.

    Returns:
        ``(model, history)`` where ``history`` holds one entry per epoch under
        the keys ``'train_loss'``, ``'train_acc'`` and ``'lr_history'``.
    """
    # Imported locally because no cell of the notebook imports tqdm.
    from tqdm.auto import tqdm

    # Create checkpoint directory
    os.makedirs(checkpoint_dir, exist_ok=True)

    model = model.to(device)
    history = {
        'train_loss': [],
        'train_acc': [],
        'lr_history': []
    }
    num_batches = len(dataloader)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        epoch_correct = 0
        epoch_total = 0

        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for x, y in progress_bar:
            x, y = x.to(device), y.to(device)

            # Forward pass; reshape (not view) also accepts non-contiguous logits.
            logits = model(x)
            loss = criterion(logits.reshape(-1, vocab_size), y.reshape(-1))

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Token accuracy over non-pad targets, kept as plain Python numbers
            # so a batch without any non-pad target cannot break the formatting.
            with torch.no_grad():
                mask = y != pad_id
                correct = int((logits.argmax(dim=-1)[mask] == y[mask]).sum().item())
                total = int(mask.sum().item())
            batch_acc = correct / total if total > 0 else 0.0

            # Update metrics
            batch_loss = loss.item()
            epoch_loss += batch_loss
            epoch_correct += correct
            epoch_total += total

            progress_bar.set_postfix({
                'loss': f"{batch_loss:.4f}",
                'acc': f"{batch_acc:.2%}"
            })

        # Save epoch metrics (guarded so an empty dataloader does not divide by zero)
        avg_loss = epoch_loss / num_batches if num_batches > 0 else 0.0
        avg_acc = epoch_correct / epoch_total if epoch_total > 0 else 0.0
        history['train_loss'].append(avg_loss)
        history['train_acc'].append(avg_acc)
        history['lr_history'].append(optimizer.param_groups[0]['lr'])

        # Save checkpoint
        if checkpoint_freq and (epoch + 1) % checkpoint_freq == 0:
            checkpoint_path = os.path.join(
                checkpoint_dir, 
                f"checkpoint_epoch_{epoch+1}.pt"
            )
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                'accuracy': avg_acc,
                'history': history
            }, checkpoint_path)
            print(f"Saved checkpoint to {checkpoint_path}")

        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Acc: {avg_acc:.2%}")

    return model, history


# Example usage:
# Look the padding id up in the vocabulary instead of passing a magic number.
pad_id = word2idx["<pad>"]
# Padded target positions are excluded from the loss, matching the accuracy
# mask used inside the training function.
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

trained_model, training_history = train_model_with_checkpoints(
    model, dataloader, criterion, optimizer, epochs, device, vocab_size, pad_id
)

Epoch 1/50:   0%|          | 0/3125 [00:00<?, ?it/s]

## Generate Text

In [None]:
# Load model
# GPT2Config.vocab_size = len(word2idx)
# model = GPT2Model(GPT2Config())
# model.load_state_dict(torch.load("gpt2_tiny.pth", map_location="cpu"))
# model.eval()

In [None]:
# Greedy text generation using the `tokenizer`, `model` and `device` defined in earlier cells

def generate_text(prompt, max_new_tokens=50, context_len=None):
    """Greedily extend ``prompt`` by ``max_new_tokens`` tokens and decode the result.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text to start from; it must encode to at least one token.
        max_new_tokens: Number of tokens to append.
        context_len: Maximum number of most-recent tokens fed to the model at
            each step. Defaults to the notebook's ``max_seq_len``, the value the
            model was constructed with (presumably its positional limit —
            confirm against GPT2Model).

    Returns:
        The decoded string for the prompt ids followed by the generated ids.

    Raises:
        ValueError: If the prompt encodes to no tokens.
    """
    limit = max_seq_len if context_len is None else context_len

    input_ids = tokenizer.encode(prompt)
    # Same unwrapping as elsewhere in the notebook for encoders returning an object with `.ids`.
    if hasattr(input_ids, "ids"):
        input_ids = input_ids.ids
    if len(input_ids) == 0:
        raise ValueError("prompt produced no tokens; cannot start generation")

    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)  # [1, T]

    # Disable dropout while decoding, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Only the most recent `limit` tokens are fed to the model, so the
                # input never grows beyond the configured context length.
                context = input_tensor[:, -limit:]
                logits = model(context)  # [1, T', vocab]
                next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # [1, 1]
                input_tensor = torch.cat([input_tensor, next_token], dim=1)  # grow the sequence
    finally:
        model.train(was_training)

    return tokenizer.decode(input_tensor[0].tolist())


In [None]:
# Generate a short continuation for a story-style opening and print it.
prompt = "Once upon a time"
generated = generate_text(prompt, max_new_tokens=16)
print(generated)