## Load vocab

In [18]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gpt2mini/pytorch/default/1/decoder.py
/kaggle/input/gpt2mini/pytorch/default/1/gpt2.py
/kaggle/input/model-components/merges.txt
/kaggle/input/model-components/vocab.json
/kaggle/input/model-components/utils.py


In [19]:
from datasets import load_dataset

# Load and tokenize
dataset = load_dataset("roneneldan/TinyStories", split="train")
texts = [sample["text"] for sample in dataset.select(range(50000))]

In [20]:
print(os.listdir("/kaggle/input"))

['gpt2mini', 'model-components']


In [21]:
import json
with open("/kaggle/input/model-components/vocab.json") as f:
    word2idx = json.load(f)
    print(type(word2idx))
    print(list(word2idx.items())[:10])

idx2word = {int(v): k for k, v in word2idx.items()}
# pad_id = word2idx["<pad>"]

<class 'dict'>
[('<s>', 0), ('<pad>', 1), ('</s>', 2), ('<unk>', 3), ('<mask>', 4), ('!', 5), ('"', 6), ('#', 7), ('$', 8), ('%', 9)]


In [22]:
print(word2idx["cat"])

9661


## Testing Tokenization

In [23]:
import sys
sys.dont_write_bytecode = True # disabling __pycache__
sys.path.insert(0, '/kaggle/input/model-components')
from utils import Tokenizer
# from utils import clean_text

tokenizer = Tokenizer()
tokenizer.upload_vocab(word2idx)
tokenizer.encode("dog")

[11902]

## Testing the Model

In [24]:
# embedding_dim == hidden_size == (D)
# embedding_dim % num_heads == 0
embedding_dim = 128

ff_embedding_dim = 512 # ff_embedding_dim = 4 × embedding_dim
max_seq_len = 200
dropout = 0.1
num_heads = 4
vocab_size = tokenizer.get_vocab_size()
num_layers = 4

In [25]:
print(vocab_size)

19716


In [26]:
sys.path.insert(0, '/kaggle/input/gpt2mini/pytorch/default/1')
from gpt2 import GPT2Model
import torch
import torch.nn as nn

encoded = tokenizer.encode("dog")
input_tensor = torch.tensor(encoded).unsqueeze(0)

model = GPT2Model(vocab_size,embedding_dim,ff_embedding_dim,max_seq_len,num_heads,num_layers,dropout = 0.1)

# for each position in the sequence, you get a distribution over all vocab tokens.
logits = model(input_tensor)  # (B, T, V)

# Shift targets for next-token prediction
# shift_logits = logits[:, :-1, :].contiguous()
# shift_labels = input_tensor[:, 1:].contiguous()

# Flatten for CrossEntropyLoss
# loss_fn = nn.CrossEntropyLoss()
# loss = loss_fn(
#     shift_logits.view(-1, vocab_size),
#     shift_labels.view(-1)
# )

In [27]:
tokenized_text = tokenizer.encode("once day a time a cat")
if hasattr(tokenized_text, "ids"):
    tokenized_text = tokenized_text.ids
print(tokenized_text)

[15976, 1131, 69, 3325, 69, 9661]


In [28]:
class CustomTextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer, seq_len):
        self.data = []
        self.seq_len = seq_len
        self.tokenizer = tokenizer

        for text in texts:
            token_ids = self.tokenizer.encode(text)
            if hasattr(token_ids, "ids"):  # in case tokenizer.encode returns an object
                token_ids = token_ids.ids

            for i in range(len(token_ids) - seq_len):
                x = token_ids[i:i + seq_len]
                y = token_ids[i + 1:i + 1 + seq_len]
                self.data.append((x, y))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x, y = self.data[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

## Loading Data

In [29]:
from torch.utils.data import DataLoader
# from data.dataset import TextDataset

# Use longer sequences for testing

dataset = CustomTextDataset(texts,tokenizer, seq_len=10)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)

Epoch 1/3:   0%|          | 0/256946 [11:32<?, ?it/s, loss=3.36]


## Training the Model

In [30]:
# embedding_dim == hidden_size == (D)
# embedding_dim % num_heads == 0
embedding_dim = 128

ff_embedding_dim = 512 # ff_embedding_dim = 4 × embedding_dim
max_seq_len = 200
dropout = 0.1
num_heads = 4
vocab_size = tokenizer.get_vocab_size()
num_layers = 4

epochs = 3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 16

In [None]:
from tqdm import tqdm
model = GPT2Model(
        vocab_size,
        embedding_dim,
        ff_embedding_dim,max_seq_len
        ,num_heads,
        num_layers,
        dropout = 0.1
    ).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)

loss_history = []

def train_loop(model, dataloader, criterion, optimizer, epochs, device, vocab_size):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        for batch_idx, (x, y) in enumerate(dataloader):
            x, y = x.to(device), y.to(device)  # x: [B, T], y: [B, T]

            logits = model(x)  # [B, T, vocab_size]
            logits = logits.view(-1, vocab_size)   # [B*T, V]
            y = y.view(-1)                                # [B*T]

            loss = criterion(logits, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # scheduler.step()  # if used

            total_loss += loss.item()
            loss_history.append(loss.item())

            # if batch_idx % 100 == 0:
            #     print(f"Epoch {epoch+1}, Step {batch_idx}, Loss: {loss.item():.4f}")
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(dataloader)
        print(f"[{epoch+1:>3d}/{epochs:>3d}], Average loss:{avg_loss:>5f}")

    # Save model
    # torch.save(model.state_dict(), "gpt2_tiny.pth")

train_loop(model, dataloader, criterion, optimizer, epochs, device, vocab_size)


Epoch 1/3:   0%|          | 0/256946 [03:37<?, ?it/s, loss=3.38]

## Generate Text

In [None]:
# Load model
# GPT2Config.vocab_size = len(word2idx)
# model = GPT2Model(GPT2Config())
# model.load_state_dict(torch.load("gpt2_tiny.pth", map_location="cpu"))
# model.eval()

In [None]:
# initialize tokenizer with texts

def generate_text(prompt, max_new_tokens=50):
    input_ids = tokenizer.encode(prompt)
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)  # [1, T]
    
    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(input_tensor)  # [1, T, vocab]
            next_token_logits = logits[:, -1, :]  # last position
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)  # [1, 1]
            input_tensor = torch.cat([input_tensor, next_token], dim=1)  # grow the sequence

    return tokenizer.decode(input_tensor[0].tolist())


In [None]:
prompt = "Once upon a time"
print(generate_text(prompt, max_new_tokens=16))