## Load vocab

In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/utilities/merges.txt
/kaggle/input/utilities/vocab.json
/kaggle/input/utilities/place_holder
/kaggle/input/utilities/tiny_stories_vocab.json
/kaggle/input/utils/pytorch/default/1/utils.py
/kaggle/input/gpt2minimodel/pytorch/default/1/gpt2MiniModel.pt
/kaggle/input/gpt2mini/pytorch/default/1/decoder.py
/kaggle/input/gpt2mini/pytorch/default/1/gpt2.py


In [None]:
from datasets import load_dataset

# Load and tokenize
dataset = load_dataset("roneneldan/TinyStories", split="train")
# texts = [sample["text"] for sample in dataset]
texts = [sample["text"] for sample in dataset.select(range(50000))]


In [27]:
print(os.listdir("/kaggle/input"))

['utilities', 'utils', 'gpt2minimodel', 'gpt2mini']


In [28]:
import json
with open("/kaggle/input/utilities/tiny_stories_vocab.json") as f:
    word2idx = json.load(f)
    print(type(word2idx))
    print(list(word2idx.items())[:10])

idx2word = {int(v): k for k, v in word2idx.items()}
pad_id = word2idx["<pad>"]

<class 'dict'>
[('<pad>', 0), ('<unk>', 1), ('<sos>', 2), ('<eos>', 3), ('"', 4), ('""I', 5), ('""No', 6), ('"\'Hello,', 7), ('"\'Let\'s', 8), ('"\'Why', 9)]


In [29]:
print(word2idx["Little"])

8011


## Testing Tokenization

In [30]:
import sys
sys.dont_write_bytecode = True # disabling __pycache__
sys.path.insert(0, '/kaggle/input/utils/pytorch/default/1/')
from utils import Tokenizer
# from utils import clean_text

tokenizer = Tokenizer()
tokenizer.upload_vocab(word2idx)
tokenizer.encode("little")

[33448]

## Testing the Model

In [31]:
# embedding_dim == hidden_size == (D)
# embedding_dim % num_heads == 0
embedding_dim = 64

ff_embedding_dim = 128 # ff_embedding_dim = 4 × embedding_dim
max_seq_len = 10
dropout = 0.1
num_heads = 2
vocab_size = tokenizer.get_vocab_size()
num_layers = 2

In [32]:
print(vocab_size)

57374


In [33]:
sys.path.insert(0, '/kaggle/input/gpt2mini/pytorch/default/1')
from gpt2 import GPT2Model
import torch
import torch.nn as nn

encoded = tokenizer.encode("dog")
input_tensor = torch.tensor(encoded).unsqueeze(0)

model = GPT2Model(vocab_size,embedding_dim,ff_embedding_dim,max_seq_len,num_heads,num_layers,dropout = 0.1)

# for each position in the sequence, you get a distribution over all vocab tokens.
logits = model(input_tensor)  # (B, T, V)

# Shift targets for next-token prediction
# shift_logits = logits[:, :-1, :].contiguous()
# shift_labels = input_tensor[:, 1:].contiguous()

# Flatten for CrossEntropyLoss
# loss_fn = nn.CrossEntropyLoss()
# loss = loss_fn(
#     shift_logits.view(-1, vocab_size),
#     shift_labels.view(-1)
# )

In [34]:
tokenized_text = tokenizer.encode("one day a little cat")
if hasattr(tokenized_text, "ids"):
    tokenized_text = tokenized_text.ids
print(tokenized_text)

[37567, 21502, 12199, 33448, 17846]


In [35]:
tokenizer.encode("<pad>")

[0]

In [36]:
class CustomTextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer, seq_len):
        self.texts = texts
        self.seq_len = seq_len
        self.tokenizer = tokenizer
        self.pad_id = word2idx["<pad>"]  # Make sure you have a padding token

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        
        # Tokenize and convert to IDs
        token_ids = self.tokenizer.encode(text)
        if hasattr(token_ids, "ids"):
            token_ids = token_ids.ids
        
        # Handle sequences that are too short or too long
        if len(token_ids) >= self.seq_len:
            # Take the first seq_len tokens if too long
            token_ids = token_ids[:self.seq_len]
        else:
            # Pad with <pad> tokens if too short
            padding = [self.pad_id] * (self.seq_len - len(token_ids))
            token_ids = token_ids + padding
        
        # Create input and target sequences
        x = torch.tensor(token_ids[:-1], dtype=torch.long)  # Input sequence
        y = torch.tensor(token_ids[1:], dtype=torch.long)   # Target sequence
        
        return x, y

## Loading Data

In [37]:
from torch.utils.data import DataLoader
# from data.dataset import TextDataset

# Use longer sequences for testing

dataset = CustomTextDataset(texts,tokenizer, seq_len=10)
dataloader = DataLoader(
    dataset, 
    batch_size=32,
    shuffle=True, 
    pin_memory=True
)

## Training the Model

In [16]:
# embedding_dim == hidden_size == (D)
# embedding_dim % num_heads == 0
embedding_dim = 64

ff_embedding_dim = 128 # ff_embedding_dim = 4 × embedding_dim
max_seq_len = 10
dropout = 0.1
num_heads = 2
vocab_size = tokenizer.get_vocab_size()
num_layers = 2


device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 32

In [38]:
x, y = next(iter(dataloader))
# #-----Important-------#
# # if this cell causes device error comment or remove the comment for the line bellow
# x, y = x.to(device), y.to(device)
logits = model(x)
print("Logits shape:", logits.shape)

Logits shape: torch.Size([32, 9, 57374])


In [39]:
from tqdm import tqdm
import os
import torch

In [19]:


optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
criterion = nn.CrossEntropyLoss(
    ignore_index = 1, 
    label_smoothing = 0.1
)
epochs = 5
def train_model(
    model,
    dataloader,
    criterion,
    optimizer,
    epochs,
    device,
    vocab_size,
    pad_id,
    checkpoint_dir="check_point",
    checkpoint_freq=5,
    start_epoch=0,
    history=None
):
    os.makedirs(checkpoint_dir, exist_ok=True)
    model = model.to(device)

    if history is None:
        history = {
            'train_loss': [],
            'train_acc': [],
            'lr_history': []
        }

    for epoch in range(start_epoch, epochs):
        model.train()
        epoch_loss = 0
        epoch_correct = 0
        epoch_total = 0

        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for batch_idx, (x, y) in enumerate(progress_bar):
            x, y = x.to(device), y.to(device)

            logits = model(x)
            loss = criterion(logits.view(-1, vocab_size), y.view(-1))

            preds = logits.argmax(dim=-1)
            mask = y != pad_id
            correct = (preds[mask] == y[mask]).float().sum()
            total = mask.float().sum()
            acc = correct / total if total > 0 else 0

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_correct += correct.item()
            epoch_total += total.item()

            progress_bar.set_postfix({
                'loss': f"{loss.item():.4f}",
                'acc': f"{acc.item():.2%}"
            })

        avg_loss = epoch_loss / len(dataloader)
        avg_acc = epoch_correct / epoch_total if epoch_total > 0 else 0
        history['train_loss'].append(avg_loss)
        history['train_acc'].append(avg_acc)
        history['lr_history'].append(optimizer.param_groups[0]['lr'])

        if (epoch + 1) % checkpoint_freq == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch+1}.pt")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                'accuracy': avg_acc,
                'history': history
            }, checkpoint_path)
            print(f"Saved checkpoint to {checkpoint_path}")

        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Acc: {avg_acc:.2%}")

    return model, history


# Example usage:
trained_model, training_history = train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=epochs,
    device=device,
    vocab_size=vocab_size,
    pad_id=0
)



Epoch 1/5: 100%|██████████| 1563/1563 [00:22<00:00, 69.16it/s, loss=6.1627, acc=13.89%]


Epoch 1 | Loss: 8.0956 | Acc: 15.20%


Epoch 2/5: 100%|██████████| 1563/1563 [00:21<00:00, 71.64it/s, loss=4.4958, acc=40.97%]


Epoch 2 | Loss: 5.1377 | Acc: 26.70%


Epoch 3/5: 100%|██████████| 1563/1563 [00:21<00:00, 71.95it/s, loss=3.5558, acc=62.50%]


Epoch 3 | Loss: 4.2017 | Acc: 48.32%


Epoch 4/5: 100%|██████████| 1563/1563 [00:21<00:00, 71.46it/s, loss=3.5620, acc=56.94%]


Epoch 4 | Loss: 3.8499 | Acc: 53.34%


Epoch 5/5: 100%|██████████| 1563/1563 [00:21<00:00, 71.44it/s, loss=4.1160, acc=49.31%]


Saved checkpoint to check_point/checkpoint_epoch_5.pt
Epoch 5 | Loss: 3.6822 | Acc: 55.16%


# after check point

In [None]:

optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
criterion = nn.CrossEntropyLoss(
    ignore_index = 1, 
    label_smoothing = 0.1
)
checkpoint_path = "check_point/checkpoint_epoch_50.pt"  # Adjust path if needed
checkpoint = torch.load(checkpoint_path, map_location=device)

# === Load model weights ===
model.load_state_dict(checkpoint['model_state_dict'])

# === Re-create the optimizer and load its state ===
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# === Extract training state ===
start_epoch = checkpoint['epoch']  # This will be 10
history = checkpoint['history']

trained_model, training_history = train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=100,  # final epoch number you want to reach
    device=device,
    vocab_size=vocab_size,
    pad_id=0,
    checkpoint_dir="check_point",
    checkpoint_freq=5,
    start_epoch=start_epoch,   # resume from epoch 60
    history=history            # continue the same history
)

Epoch 51/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.14it/s, loss=2.2946, acc=81.94%]


Epoch 51 | Loss: 2.4576 | Acc: 79.29%


Epoch 52/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.11it/s, loss=2.4376, acc=78.47%]


Epoch 52 | Loss: 2.4499 | Acc: 79.43%


Epoch 53/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.41it/s, loss=3.0378, acc=71.53%]


Epoch 53 | Loss: 2.4401 | Acc: 79.64%


Epoch 54/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.90it/s, loss=2.2186, acc=79.17%]


Epoch 54 | Loss: 2.4322 | Acc: 79.76%


Epoch 55/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.07it/s, loss=2.2761, acc=84.72%]


Saved checkpoint to check_point/checkpoint_epoch_55.pt
Epoch 55 | Loss: 2.4224 | Acc: 79.97%


Epoch 56/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.52it/s, loss=2.0368, acc=85.42%]


Epoch 56 | Loss: 2.4158 | Acc: 80.10%


Epoch 57/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.75it/s, loss=1.7790, acc=93.75%]


Epoch 57 | Loss: 2.4048 | Acc: 80.26%


Epoch 58/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.20it/s, loss=2.2738, acc=82.64%]


Epoch 58 | Loss: 2.3987 | Acc: 80.42%


Epoch 59/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.08it/s, loss=2.2739, acc=84.03%]


Epoch 59 | Loss: 2.3907 | Acc: 80.50%


Epoch 60/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.06it/s, loss=2.3555, acc=79.86%]


Saved checkpoint to check_point/checkpoint_epoch_60.pt
Epoch 60 | Loss: 2.3811 | Acc: 80.70%


Epoch 61/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.13it/s, loss=2.4667, acc=80.56%]


Epoch 61 | Loss: 2.3726 | Acc: 80.87%


Epoch 62/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.86it/s, loss=2.5729, acc=77.08%]


Epoch 62 | Loss: 2.3652 | Acc: 81.01%


Epoch 63/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.87it/s, loss=1.9260, acc=86.81%]


Epoch 63 | Loss: 2.3587 | Acc: 81.14%


Epoch 64/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.21it/s, loss=2.3778, acc=83.33%]


Epoch 64 | Loss: 2.3496 | Acc: 81.32%


Epoch 65/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.27it/s, loss=2.4049, acc=81.94%]


Saved checkpoint to check_point/checkpoint_epoch_65.pt
Epoch 65 | Loss: 2.3395 | Acc: 81.53%


Epoch 66/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.97it/s, loss=2.7567, acc=75.69%]


Epoch 66 | Loss: 2.3327 | Acc: 81.66%


Epoch 67/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.96it/s, loss=2.3259, acc=81.94%]


Epoch 67 | Loss: 2.3246 | Acc: 81.82%


Epoch 68/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.10it/s, loss=2.0924, acc=86.11%]


Epoch 68 | Loss: 2.3166 | Acc: 81.97%


Epoch 69/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.04it/s, loss=2.4571, acc=82.64%]


Epoch 69 | Loss: 2.3104 | Acc: 82.06%


Epoch 70/100: 100%|██████████| 1563/1563 [00:23<00:00, 65.97it/s, loss=2.0437, acc=85.42%]


Saved checkpoint to check_point/checkpoint_epoch_70.pt
Epoch 70 | Loss: 2.3024 | Acc: 82.23%


Epoch 71/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.93it/s, loss=2.4928, acc=78.47%]


Epoch 71 | Loss: 2.2937 | Acc: 82.40%


Epoch 72/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.36it/s, loss=2.3723, acc=84.03%]


Epoch 72 | Loss: 2.2861 | Acc: 82.57%


Epoch 73/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.88it/s, loss=2.2206, acc=82.64%]


Epoch 73 | Loss: 2.2785 | Acc: 82.69%


Epoch 74/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.97it/s, loss=1.9068, acc=90.28%]


Epoch 74 | Loss: 2.2703 | Acc: 82.87%


Epoch 75/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.96it/s, loss=2.4729, acc=79.86%]


Saved checkpoint to check_point/checkpoint_epoch_75.pt
Epoch 75 | Loss: 2.2603 | Acc: 83.10%


Epoch 76/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.25it/s, loss=2.2777, acc=78.47%]


Epoch 76 | Loss: 2.2522 | Acc: 83.27%


Epoch 77/100: 100%|██████████| 1563/1563 [00:23<00:00, 66.99it/s, loss=1.9663, acc=90.97%]


Epoch 77 | Loss: 2.2442 | Acc: 83.42%


Epoch 78/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.04it/s, loss=2.3313, acc=83.33%]


Epoch 78 | Loss: 2.2342 | Acc: 83.60%


Epoch 79/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.01it/s, loss=2.1276, acc=87.50%]


Epoch 79 | Loss: 2.2265 | Acc: 83.79%


Epoch 80/100: 100%|██████████| 1563/1563 [00:23<00:00, 67.20it/s, loss=1.8614, acc=90.28%]


Saved checkpoint to check_point/checkpoint_epoch_80.pt
Epoch 80 | Loss: 2.2160 | Acc: 83.98%


Epoch 81/100:  68%|██████▊   | 1057/1563 [00:15<00:07, 67.62it/s, loss=1.9676, acc=89.58%]

# Test the model

In [17]:

test_dataset = load_dataset("roneneldan/TinyStories", split="validation")
test_texts = [sample["text"] for sample in test_dataset]

test_dataset = CustomTextDataset(test_texts,tokenizer, seq_len=10)
test_dataloader = DataLoader(
    test_dataset, 
    batch_size=32,
    shuffle=True, 
    pin_memory=True
)

In [18]:
from tqdm import tqdm

def test_loop(model, dataloader, criterion, device, vocab_size, pad_id):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0

    with torch.no_grad():
        for x, y in tqdm(dataloader, desc="Evaluating"):
            x, y = x.to(device), y.to(device)

            # 1) Forward
            logits = model(x)                   # [batch, seq_len, vocab_size]
            batch_size, seq_len, vocab_size = logits.shape

            # 2) Flatten
            logits_flat = logits.view(-1, vocab_size)  # [batch*seq_len, vocab_size]
            y_flat      = y.view(-1)                   # [batch*seq_len]

            # 3) Loss
            loss = criterion(logits_flat, y_flat)

            # 4) Metrics
            mask = (y_flat != pad_id)                  # ignore padding
            preds = logits_flat.argmax(dim=-1)         # [batch*seq_len]
            correct = (preds[mask] == y_flat[mask]).sum().item()
            total = mask.sum().item()

            # 5) Accumulate
            total_loss    += loss.item()
            total_correct += correct
            total_tokens  += total

    # 6) Averages
    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0

    return avg_loss, accuracy


In [19]:
model_path = "/kaggle/input/gpt2minimodel/pytorch/default/1/gpt2MiniModel.pt"

check_point = torch.load(model_path, map_location=device)

model.load_state_dict(check_point['model_state_dict'])

model.to(device)
model.eval()


GPT2Model(
  (embeddings): EmbeddingLayer(
    (token_embedding): Embedding(19716, 64)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layers): ModuleList(
    (0-1): 2 x DecoderLayer(
      (attn): ResidualBlock(
        (sub_layer): MultiHeadSelfAttention(
          (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
          (out_proj): Linear(in_features=64, out_features=64, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (ffn): ResidualBlock(
        (sub_layer): FeedForward(
          (ff): Sequential(
            (0): Linear(in_features=64, out_features=128, bias=True)
            (1): GELU(approximate='none')
            (2): Linear(in_features=128, out_features=64, bias=True)
            (3): Dropout(p=0.1, inplace=False)
          )
        )
        (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (ln_f): LayerNorm((64,),

In [44]:
test_loss, test_accuracy = test_loop(
    model, 
    test_dataloader,
    nn.CrossEntropyLoss(ignore_index=1, label_smoothing=0.1), 
    device=device,
    vocab_size=vocab_size,
    pad_id=1   
)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.2%}")

Evaluating: 100%|██████████| 688/688 [00:03<00:00, 192.86it/s]

Test Loss: 1.7535
Test Accuracy: 88.82%





## Generate Text

In [27]:
# from tokenizers import  Tokenizer, models, trainers, pre_tokenizers

# tokenizer = Tokenizer(models.BPE())
# tokenizer.pre_tokenizers = pre_tokenizers.Whitespace()
# trainer = trainers.BpeTrainer(vocab_size=10000, special_tokens=["<pad>", "<unk>"])
# tokenizer.train(files=["tiny_stories.txt"], trainer=trainer)
# tokenizer.save("bpe_tokenizer.json")






In [30]:
# from tokenizers import Tokenizer
# tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

# ids = tokenizer.encode("one day a Little Girl went to school").ids
# decoded = tokenizer.decode(ids)
# print(decoded)

one da y a  Little  G ir l  went to  school


In [37]:
# ids = tokenizer.encode("one day a cat went to school").ids
# print(ids)

[1475, 3804, 1119, 1337, 3007]


In [None]:
# Load model
# GPT2Config.vocab_size = len(word2idx)
# model = GPT2Model(GPT2Config())
# model.load_state_dict(torch.load("gpt2_tiny.pth", map_location="cpu"))
# model.eval()

In [20]:

def generate_text(prompt, max_new_tokens=50):
    input_ids = tokenizer.encode(prompt)
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)  # [1, T]
    
    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(input_tensor)  # [1, T, vocab]
            next_token_logits = logits[:, -1, :]  # last position
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)  # [1, 1]
            input_tensor = torch.cat([input_tensor, next_token], dim=1)  # grow the sequence

    return tokenizer.decode(input_tensor[0].tolist())

In [21]:
print(tokenizer.encode("one day a cat went to school"))

[495, 1131, 69, 9661, 3, 2044, 17497]


In [None]:
prompt = "once upon time a little cat called"
print(generate_text(prompt, max_new_tokens=16))