In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/refined-bookcorpus-dataset/BookCorpus3.csv


In [3]:
import torch
import torch.nn as nn

class DecoderEmbeddings(nn.Module):
    """Token embeddings plus learned absolute position embeddings, with dropout.

    Args:
        vocab_size: number of rows in the token embedding table.
        embed_dim: width of every embedding vector.
        max_len: number of rows in the position table, i.e. the longest
            sequence this module can embed.
        dropout: dropout probability applied to the summed embeddings.
            Defaults to 0.1, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, embed_dim, max_len, dropout=0.1):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(max_len, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):
        """Embed a batch of token ids.

        Args:
            input_ids: integer tensor of shape [batch, seq_len].

        Returns:
            Tensor of shape [batch, seq_len, embed_dim].

        Raises:
            IndexError: if seq_len exceeds the size of the position table.
        """
        seq_len = input_ids.size(1)
        max_len = self.pos_embed.num_embeddings
        # Fail early with a readable message: an out-of-range position lookup
        # is opaque on CPU and surfaces as a device-side assert on CUDA.
        if seq_len > max_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds max_len {max_len} of the position embedding"
            )

        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0)  # [1, seq_len]
        token_embeddings = self.token_embed(input_ids)       # [batch, seq_len, dim]
        pos_embeddings = self.pos_embed(positions)           # [1, seq_len, dim], broadcast over batch
        return self.dropout(token_embeddings + pos_embeddings)

In [4]:
def generate_causal_mask(seq_len, device):
    """Build a [seq_len, seq_len] boolean causal mask.

    Entry (i, j) is True when position j lies in the future of position i
    (j > i) and must be masked out; False means attention is allowed.
    """
    idx = torch.arange(seq_len, device=device)
    # Row index i is the query position, column index j is the key position.
    return idx.unsqueeze(0) > idx.unsqueeze(1)

In [19]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a fused QKV projection."""

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0

        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # One linear layer yields queries, keys and values side by side.
        self.qkv_proj = nn.Linear(embed_dim, embed_dim * 3)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, attn_mask=None):
        """Attend over x of shape [B, T, D]; returns a tensor of the same shape.

        attn_mask, when given, is a [T, T] boolean tensor in which True marks
        key positions a query is not allowed to attend to.
        """
        batch_size, seq_len, embed_dim = x.size()

        def split_heads(t):
            # [B, T, D] -> [B, H, T, head_dim]
            return t.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # The projection output is laid out as [q | k | v] along the last axis.
        q, k, v = (split_heads(part) for part in self.qkv_proj(x).chunk(3, dim=-1))

        # Scaled dot-product scores: [B, H, T, T]
        scores = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if attn_mask is not None:
            # Broadcast the [T, T] mask over batch and head dimensions.
            scores = scores.masked_fill(attn_mask[None, None], float('-inf'))

        context = scores.softmax(dim=-1) @ v  # [B, H, T, head_dim]

        # Concatenate heads back into the model dimension: [B, T, D]
        merged = context.transpose(1, 2).reshape(batch_size, seq_len, embed_dim)
        return self.out_proj(merged)

In [20]:
class FeedForward(nn.Module):
    """Position-wise MLP: Linear(embed_dim -> ff_dim), GELU, Linear(ff_dim -> embed_dim)."""

    def __init__(self, embed_dim, ff_dim):
        super().__init__()
        expand = nn.Linear(embed_dim, ff_dim)
        activation = nn.GELU()
        contract = nn.Linear(ff_dim, embed_dim)
        # Kept as a Sequential named `net` so parameter names stay net.0.* / net.2.*.
        self.net = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Apply the MLP to the last dimension of x; the shape is preserved."""
        hidden = self.net(x)
        return hidden


In [21]:
class DecoderBlock(nn.Module):
    """Pre-LayerNorm transformer block: self-attention then feed-forward, each with a residual."""

    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.ff = FeedForward(embed_dim, ff_dim)

    def forward(self, x, attn_mask):
        """Run one block over x; attn_mask is forwarded unchanged to the attention layer."""
        # Attention sub-layer: normalise, attend, add back onto the residual stream.
        x = x + self.attn(self.ln1(x), attn_mask)
        # Feed-forward sub-layer, same normalise -> transform -> add pattern.
        return x + self.ff(self.ln2(x))


In [22]:
class DecoderOnlyTransformer(nn.Module):
    """Decoder-only language model: embeddings, a stack of decoder blocks, final norm, LM head."""

    def __init__(self, vocab_size, max_len, embed_dim, num_heads, depth, ff_dim):
        super().__init__()
        self.embedding = DecoderEmbeddings(vocab_size, embed_dim, max_len)

        layers = [DecoderBlock(embed_dim, num_heads, ff_dim) for _ in range(depth)]
        self.blocks = nn.ModuleList(layers)

        self.ln_final = nn.LayerNorm(embed_dim)
        # Projects hidden states to per-token vocabulary logits.
        self.head = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids):
        """
        Compute next-token logits.

        input_ids: [B, T]
        returns:   [B, T, vocab_size]
        """
        # Unpacking also enforces that input_ids is exactly two-dimensional.
        _, seq_len = input_ids.size()

        # True entries mark future positions that must not be attended to.
        causal_mask = generate_causal_mask(seq_len, input_ids.device)

        hidden = self.embedding(input_ids)  # [B, T, D]
        for layer in self.blocks:
            hidden = layer(hidden, attn_mask=causal_mask)

        return self.head(self.ln_final(hidden))  # [B, T, vocab_size]

In [9]:
import pandas as pd
from tokenizers import ByteLevelBPETokenizer
from torch.utils.data import Dataset

# Configuration
csv_path = "/kaggle/input/refined-bookcorpus-dataset/BookCorpus3.csv"
max_paragraphs = 200_000    # cap on rows taken from the CSV (time/memory trade-off)
min_char_len = 200          # paragraphs shorter than this (after stripping) are dropped
seq_len = 128               # token sequence length used for training

# 1. Read the CSV, drop rows with any missing value, keep the first column of the first rows
df = pd.read_csv(csv_path).dropna()
paragraphs = df.iloc[:max_paragraphs, 0].tolist()

# 2. Strip whitespace once per paragraph, then keep only the long-enough ones
stripped = (text.strip() for text in paragraphs)
filtered_paragraphs = [text for text in stripped if len(text) >= min_char_len]

print(f"Loaded {len(filtered_paragraphs)} paragraphs after filtering.")


Loaded 198159 paragraphs after filtering.


In [10]:
from pathlib import Path

# Dump the corpus to disk, one paragraph per line, as input for tokenizer training
with open("paragraphs.txt", "w", encoding="utf-8") as f:
    f.writelines(p + "\n" for p in filtered_paragraphs)

# Fit a byte-level BPE vocabulary on that file
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="paragraphs.txt",
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Persist vocab.json / merges.txt under ./tokenizer
Path("tokenizer").mkdir(parents=True, exist_ok=True)
tokenizer.save_model("tokenizer")






['tokenizer/vocab.json', 'tokenizer/merges.txt']

In [11]:
from tokenizers import Tokenizer
import torch

# Reload the trained tokenizer from its saved vocab + merges files
tokenizer = ByteLevelBPETokenizer(
    "tokenizer/vocab.json",
    "tokenizer/merges.txt"
)

# Tokenize every paragraph and concatenate the ids into one flat stream
all_ids = []
for paragraph in filtered_paragraphs:
    all_ids.extend(tokenizer.encode(paragraph).ids)

print("Total tokens:", len(all_ids))

# Cut the stream into non-overlapping chunks of exactly seq_len tokens.
# The "+ 1" on the stop value keeps the last full chunk when the token count
# is an exact multiple of seq_len; a trailing partial chunk is still discarded.
sequences = [
    torch.tensor(all_ids[start:start + seq_len], dtype=torch.long)
    for start in range(0, len(all_ids) - seq_len + 1, seq_len)
]

print("Total sequences:", len(sequences))


Total tokens: 18868576
Total sequences: 147410


In [12]:
class CausalLanguageModelingDataset(Dataset):
    """Wraps token sequences as next-token prediction examples.

    Each item is a dict whose 'input_ids' is the sequence without its last
    element and whose 'labels' is the same sequence shifted left by one.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        tokens = self.sequences[idx]
        # labels[t] is the token that follows input_ids[t]
        return {'input_ids': tokens[:-1], 'labels': tokens[1:]}

# Create the training dataset from the fixed-length token chunks in `sequences`;
# each item yields 'input_ids' and 'labels' that are one element shorter than its chunk.
dataset = CausalLanguageModelingDataset(sequences)


In [23]:
from torch.utils.data import DataLoader
import torch.nn.functional as F

# Hyperparameters
batch_size = 32
learning_rate = 3e-4
num_epochs = 10

# Dataloader (reshuffled every epoch)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Train on GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DecoderOnlyTransformer(
    # Take the size from the trained tokenizer rather than repeating a literal,
    # so the embedding table and LM head always match the ids it can emit.
    vocab_size=tokenizer.get_vocab_size(),
    max_len=seq_len,  # position table size; dataset inputs are one token shorter than seq_len
    embed_dim=512,
    num_heads=8,
    depth=6,
    ff_dim=2048
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [None]:
from tqdm import tqdm

# Enable training-mode behaviour (e.g. dropout) before optimising.
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass, then token-level cross-entropy over the flattened
        # [B*T, vocab] logits and [B*T] targets.
        logits = model(input_ids)
        loss = F.cross_entropy(logits.flatten(0, 1), labels.flatten())

        # Standard optimisation step: clear old grads, backprop, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the running sum and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix(loss=batch_loss)

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(dataloader)
    print(f"✅ Epoch {epoch + 1}/{num_epochs} | Avg Loss: {avg_loss:.4f}")


                                                                          

✅ Epoch 1/10 | Avg Loss: 5.2244


                                                                          

✅ Epoch 2/10 | Avg Loss: 4.5440


Epoch 3/10:  20%|██        | 933/4607 [02:32<10:03,  6.08it/s, loss=4.22]

In [None]:
def generate(model, tokenizer, prompt, max_new_tokens=50):
    """Greedily decode a continuation of `prompt`.

    Args:
        model: module mapping a [1, T] tensor of token ids to [1, T, vocab] logits.
        tokenizer: object whose `encode(text).ids` gives token ids and whose
            `decode(ids)` turns ids back into text.
        prompt: text to continue.
        max_new_tokens: number of tokens to append (default 50).

    Returns:
        The decoded prompt followed by the generated tokens.

    Raises:
        ValueError: if tokens are requested but the prompt encodes to nothing,
            leaving no position to predict from.
    """
    model.eval()

    prompt_ids = tokenizer.encode(prompt).ids
    if not prompt_ids and max_new_tokens > 0:
        raise ValueError("prompt must encode to at least one token")

    # Run on the model's own device instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # The learned position table bounds the usable context; when the model
    # exposes it, feed only the most recent tokens so long generations do not
    # index past the table.
    pos_embed = getattr(getattr(model, "embedding", None), "pos_embed", None)
    max_context = getattr(pos_embed, "num_embeddings", None)

    tokens = torch.tensor([prompt_ids], dtype=torch.long, device=model_device)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            context = tokens if max_context is None else tokens[:, -max_context:]
            logits = model(context)
            # Greedy choice: highest-scoring token at the last position.
            next_token = logits[0, -1, :].argmax().view(1, 1)
            tokens = torch.cat([tokens, next_token], dim=1)

    return tokenizer.decode(tokens[0].tolist())


In [None]:
# Smoke test: greedily continue the prompt "hello" using the default max_new_tokens (50).
generate(model, tokenizer, "hello")