In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tiny-shakespeare-dataset/tiny shakespeare.txt


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import PreTrainedTokenizerFast
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from transformers import PreTrainedTokenizerFast

In [3]:
# Load tiny_shakespeare.txt
with open("/kaggle/input/tiny-shakespeare-dataset/tiny shakespeare.txt", "r", encoding="utf-8") as f:
    shakespeare_text = f.read()

In [4]:
# Tokenizer (BPE)
def train_bpe_tokenizer(texts, vocab_size=5000):
    os.makedirs("bpe_tokenizer", exist_ok=True)

    tokenizer = Tokenizer(BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = ByteLevel()
    trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.train_from_iterator(texts, trainer)
    
    # Save as tokenizer.json (needed for PreTrainedTokenizerFast)
    tokenizer.save("bpe_tokenizer/tokenizer.json")

    return tokenizer

In [5]:
# Load or train tokenizer
if not os.path.exists("bpe_tokenizer/tokenizer.json"):
    print("Training new BPE tokenizer...")
    train_bpe_tokenizer([shakespeare_text])

Training new BPE tokenizer...


In [6]:
# Load tokenizer
bpe_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="/kaggle/working/bpe_tokenizer/tokenizer.json",
    unk_token="<unk>", pad_token="<pad>", cls_token="<s>", sep_token="</s>", mask_token="<mask>"
)

In [7]:
class ShakespeareDataset(Dataset):
    def __init__(self, text, context_length=256):
        global bpe_tokenizer  # Ensure global tokenizer is loaded

        tokens = bpe_tokenizer(text, return_tensors="pt")["input_ids"].squeeze()
        
        # Ensure data is long enough
        if tokens.numel() < context_length:
            raise ValueError(f"Tokenized text is too short! Only {tokens.numel()} tokens available.")

        self.data = tokens
        self.context_length = context_length
        print(f"Tokenized data size: {self.data.shape}")  # Check the size
        print(f"First few tokens: {self.data[:10]}")  # Inspect the first tokens


    def __len__(self):
        return max(0, len(self.data) - self.context_length)  # Prevent negative length

    def __getitem__(self, idx):
        return (self.data[idx:idx+self.context_length],
                self.data[idx+1:idx+self.context_length+1])


In [8]:
# GPT Model Configuration
class GPTConfig:
    def __init__(self, vocab_size, context_length=256, 
                 n_layers=12, n_heads=12, d_model=768, d_ff=3072, dropout=0.1):
        self.vocab_size = vocab_size
        self.context_length = context_length
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout = dropout

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.position_embedding = nn.Parameter(torch.zeros(1, config.context_length, config.d_model))
        self.layers = nn.ModuleList([
            TransformerBlock(config) for _ in range(config.n_layers)
        ])
        self.ln_f = nn.LayerNorm(config.d_model)
        self.head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
    
    def forward(self, x):
        B, T = x.shape
        tok_emb = self.token_embedding(x)
        pos_emb = self.position_embedding[:, :T, :]
        x = tok_emb + pos_emb
        for layer in self.layers:
            x = layer(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits

class TransformerBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.d_model)
        self.attn = MultiHeadAttention(config)
        self.ln2 = nn.LayerNorm(config.d_model)
        self.ffn = FeedForward(config)
    
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.d_model % config.n_heads == 0
        self.n_heads = config.n_heads
        self.d_head = config.d_model // config.n_heads
        self.W_qkv = nn.Linear(config.d_model, 3 * config.d_model, bias=False)
        self.W_o = nn.Linear(config.d_model, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout)
    
    def forward(self, x):
        B, T, C = x.shape
        qkv = self.W_qkv(x).reshape(B, T, 3, self.n_heads, self.d_head).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn_scores = (q @ k.transpose(-2, -1)) / (self.d_head ** 0.5)
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.dropout(attn_probs)
        attn_output = (attn_probs @ v).transpose(1, 2).reshape(B, T, C)
        return self.W_o(attn_output)

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(config.d_model, config.d_ff)
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(config.d_ff, config.d_model)
        self.dropout = nn.Dropout(config.dropout)
    
    def forward(self, x):
        return self.dropout(self.fc2(self.gelu(self.fc1(x))))

In [9]:
# Training Setup
config = GPTConfig(vocab_size=5000, context_length=256, n_layers=12, n_heads=12, d_model=768, d_ff=3072)
model = GPT(config)
optimizer = optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.1)

dataset = ShakespeareDataset(shakespeare_text)

Tokenized data size: torch.Size([333336])
First few tokens: tensor([  69,  485, 1010,   13,   68, 2156,  145, 2561,  616, 2116])


In [10]:
# Training Loop
def train_model(model, dataset, epochs=10, batch_size=32):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    loss_fn = nn.CrossEntropyLoss()
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for x, y in dataloader:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            logits = model(x)
            loss = loss_fn(logits.view(-1, config.vocab_size), y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")

In [11]:
print(f"Model Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

Model Parameters: 92.90M


In [12]:
# Load dataset
context_length = 256
dataset = ShakespeareDataset(shakespeare_text, context_length=context_length)
print(f"Dataset size: {len(dataset)}")  # Should be > 0
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


Tokenized data size: torch.Size([333336])
First few tokens: tensor([  69,  485, 1010,   13,   68, 2156,  145, 2561,  616, 2116])
Dataset size: 333080


In [13]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
# Initialize model
config = GPTConfig(vocab_size=5000, context_length=context_length, n_layers=12, n_heads=12, d_model=768, d_ff=3072)
model = GPT(config).to(device)

In [15]:
# Loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.1)

In [16]:
# Training loop
def train_model(model, dataloader, epochs=1):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = loss_fn(logits.view(-1, config.vocab_size), y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader):.4f}")

In [17]:
# Train the model
train_model(model, dataloader, epochs=1)

Epoch 1/1, Loss: 0.6818


In [18]:
# Save the model
torch.save(model.state_dict(), "nano_gpt_92.90M.pth")
print("Model saved as nano_gpt_92.90M.pth")

Model saved as nano_gpt_92.90M.pth
