In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import torch
import torch.nn as nn

class DecoderEmbeddings(nn.Module):
    """Token embeddings plus learned absolute position embeddings, then dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of every embedding vector.
        max_len: Number of rows in the position table, i.e. the longest
            sequence ``forward`` can embed.
        dropout: Dropout probability applied to the summed embeddings.
            Defaults to 0.1, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, embed_dim, max_len, dropout=0.1):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(max_len, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):
        """Embed a batch of token ids.

        Args:
            input_ids: Integer tensor of shape [batch, seq_len].

        Returns:
            Tensor of shape [batch, seq_len, embed_dim].

        Raises:
            IndexError: If seq_len exceeds the size of the position table.
        """
        seq_len = input_ids.size(1)
        max_len = self.pos_embed.num_embeddings
        # Fail early with a readable message; otherwise the position lookup
        # below fails with an opaque out-of-range embedding index.
        if seq_len > max_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds max_len {max_len} of the position table"
            )

        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0)  # [1, seq_len]
        token_embeddings = self.token_embed(input_ids)       # [batch, seq_len, dim]
        pos_embeddings = self.pos_embed(positions)           # [1, seq_len, dim], broadcast over batch
        return self.dropout(token_embeddings + pos_embeddings)

In [4]:
def generate_causal_mask(seq_len, device):
    """Build a [seq_len, seq_len] boolean mask that hides future positions.

    Entry (row, col) is True (masked) when col > row, so each query position
    may only attend to itself and earlier positions.
    """
    idx = torch.arange(seq_len, device=device)
    query_pos = idx.unsqueeze(1)  # [seq_len, 1]
    key_pos = idx.unsqueeze(0)    # [1, seq_len]
    return key_pos > query_pos    # True = mask, False = allow attend

In [6]:
class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product self-attention over several heads.

    A single linear layer produces queries, keys and values packed together;
    the per-head results are concatenated and passed through an output
    projection. ``embed_dim`` must be divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0

        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # One fused projection for q, k and v, plus the output projection.
        self.qkv_proj = nn.Linear(embed_dim, embed_dim * 3)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, attn_mask=None):
        """Attend over ``x`` of shape [B, T, embed_dim].

        ``attn_mask`` (optional) is a [T, T] boolean tensor where True marks
        query/key pairs that must not attend; it is broadcast over batch and
        heads. Returns a tensor of shape [B, T, embed_dim].
        """
        bsz, n_tokens, width = x.size()

        # Project once, then peel q / k / v off the packed axis.
        packed = self.qkv_proj(x).view(bsz, n_tokens, 3, self.num_heads, self.head_dim)
        q, k, v = (part.transpose(1, 2) for part in packed.unbind(dim=2))  # each [B, H, T, Dh]

        # Pairwise similarities, scaled by sqrt(head_dim).
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)  # [B, H, T, T]
        if attn_mask is not None:
            scores = scores.masked_fill(attn_mask[None, None], float('-inf'))

        weights = scores.softmax(dim=-1)     # [B, H, T, T]
        context = torch.matmul(weights, v)   # [B, H, T, Dh]

        # Put heads back next to each other along the feature axis.
        merged = context.transpose(1, 2).reshape(bsz, n_tokens, width)
        return self.out_proj(merged)

In [7]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to ``ff_dim``, apply GELU, project back to ``embed_dim``."""

    def __init__(self, embed_dim, ff_dim):
        super().__init__()
        expand = nn.Linear(embed_dim, ff_dim)
        contract = nn.Linear(ff_dim, embed_dim)
        # Kept as a Sequential so parameter names stay net.0.* / net.2.*
        self.net = nn.Sequential(expand, nn.GELU(), contract)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``; the shape is preserved."""
        return self.net(x)


In [8]:
class DecoderBlock(nn.Module):
    """One pre-norm transformer layer: self-attention, then feed-forward.

    Each sub-layer reads a LayerNorm-ed copy of its input and its output is
    added back onto the un-normalised input (residual connection).
    """

    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.ff = FeedForward(embed_dim, ff_dim)

    def forward(self, x, attn_mask):
        """Run the layer on ``x``; ``attn_mask`` is forwarded to the attention module."""
        # Attention sub-layer with residual connection
        x = x + self.attn(self.ln1(x), attn_mask)
        # Feed-forward sub-layer with residual connection
        return x + self.ff(self.ln2(x))


In [9]:
class DecoderOnlyTransformer(nn.Module):
    """Stack of causal decoder blocks with a language-modeling head.

    Pipeline: token + position embeddings -> ``depth`` decoder blocks ->
    final LayerNorm -> linear projection to vocabulary logits.
    """

    def __init__(self, vocab_size, max_len, embed_dim, num_heads, depth, ff_dim):
        super().__init__()
        self.embedding = DecoderEmbeddings(vocab_size, embed_dim, max_len)

        layers = [DecoderBlock(embed_dim, num_heads, ff_dim) for _ in range(depth)]
        self.blocks = nn.ModuleList(layers)

        self.ln_final = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)  # Language modeling head

    def forward(self, input_ids):
        """Return next-token logits of shape [B, T, vocab_size] for ``input_ids`` of shape [B, T]."""
        _, num_tokens = input_ids.size()

        # Same causal mask for every block: True marks positions that are hidden.
        causal_mask = generate_causal_mask(num_tokens, input_ids.device)

        hidden = self.embedding(input_ids)  # [B, T, D]
        for layer in self.blocks:
            hidden = layer(hidden, attn_mask=causal_mask)

        return self.head(self.ln_final(hidden))  # [B, T, vocab_size]

In [12]:
import pandas as pd
from tokenizers import ByteLevelBPETokenizer
from torch.utils.data import Dataset

# Parameters
csv_path = "/kaggle/input/refined-bookcorpus-dataset/BookCorpus3.csv"
max_paragraphs = 400_000    # Upper bound on rows read; adjust based on time/memory
min_char_len = 200          # Paragraphs shorter than this (after stripping) are discarded
seq_len = 128               # Sequence length for training

# 1. Read the CSV, drop rows containing missing values, keep the first column of the leading rows
df = pd.read_csv(csv_path).dropna()
paragraphs = df.iloc[:max_paragraphs, 0].tolist()

# 2. Strip surrounding whitespace once, then keep only sufficiently long paragraphs
stripped = (text.strip() for text in paragraphs)
filtered_paragraphs = [text for text in stripped if len(text) >= min_char_len]

print(f"Loaded {len(filtered_paragraphs)} paragraphs after filtering.")


Loaded 395901 paragraphs after filtering.


In [13]:
from pathlib import Path
import os
# Dump one paragraph per line into a scratch file that the tokenizer trainer reads from
corpus_file = "paragraphs.txt"
with open(corpus_file, "w", encoding="utf-8") as corpus:
    corpus.writelines(p + "\n" for p in filtered_paragraphs)

# Train a ByteLevel BPE tokenizer on that file
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=corpus_file,
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Write vocab.json / merges.txt into ./tokenizer (the returned paths are the cell output)
os.makedirs("tokenizer", exist_ok=True)
tokenizer.save_model("tokenizer")






['tokenizer/vocab.json', 'tokenizer/merges.txt']

In [14]:
from tokenizers import Tokenizer
import torch

# Load the tokenizer from vocab + merges
tokenizer = ByteLevelBPETokenizer(
    "tokenizer/vocab.json",
    "tokenizer/merges.txt"
)

# Tokenize the entire text into one flat list of token IDs
# (paragraphs are concatenated back to back, with no separator token in between)
all_ids = []
for paragraph in filtered_paragraphs:
    all_ids.extend(tokenizer.encode(paragraph).ids)

print("Total tokens:", len(all_ids))

# Split into non-overlapping chunks of exactly seq_len tokens.
# Counting full chunks with integer division keeps the final chunk when the
# token count is an exact multiple of seq_len; the previous bound
# `len(all_ids) - seq_len` silently dropped it. A trailing partial chunk is
# still discarded.
num_full_chunks = len(all_ids) // seq_len
sequences = [
    torch.tensor(all_ids[start:start + seq_len], dtype=torch.long)
    for start in range(0, num_full_chunks * seq_len, seq_len)
]

print("Total sequences:", len(sequences))


Total tokens: 36422547
Total sequences: 284551


In [15]:
class CausalLanguageModelingDataset(Dataset):
    """Next-token-prediction view over pre-chunked token sequences.

    Every item pairs a sequence without its last token (``input_ids``) with
    the same sequence without its first token (``labels``), so the label at
    position i is the token that follows input position i.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        tokens = self.sequences[idx]
        return {
            'input_ids': tokens[:-1],  # everything except the last token
            'labels': tokens[1:],      # everything except the first token
        }

# Wrap the fixed-length token chunks in `sequences` so each item yields shifted input/label tensors
dataset = CausalLanguageModelingDataset(sequences)


In [16]:
from torch.utils.data import DataLoader
import torch.nn.functional as F

# Hyperparameters
batch_size = 32
learning_rate = 3e-4
num_epochs = 6
seed = 42

# Seed PyTorch's RNG before anything random happens, so the weight
# initialisation and the DataLoader shuffle order are repeatable across runs.
torch.manual_seed(seed)

# Dataloader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: vocab_size is deliberately left at 35000 because the exported
# config.json files in this notebook use the same number; the tokenizer was
# trained with vocab_size=30_000, so the extra embedding rows are unused.
model = DecoderOnlyTransformer(
    vocab_size=35000,  # or tokenizer.get_vocab_size()
    max_len=seq_len,   # size of the position table: inputs may not exceed seq_len tokens
    embed_dim=512,
    num_heads=8,
    depth=6,
    ff_dim=2048
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [17]:
from tqdm import tqdm

# Put the model in training mode (enables dropout)
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)

    for batch in progress_bar:
        inputs = batch['input_ids'].to(device)
        targets = batch['labels'].to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass; flatten [B, T, V] logits and [B, T] labels for token-level cross-entropy
        logits = model(inputs)
        vocab_dim = logits.size(-1)
        loss = F.cross_entropy(logits.reshape(-1, vocab_dim), targets.reshape(-1))

        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running total and the progress bar
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix(loss=batch_loss)

    avg_loss = total_loss / len(dataloader)
    print(f"✅ Epoch {epoch + 1}/{num_epochs} | Avg Loss: {avg_loss:.4f}")

                                                                         

✅ Epoch 1/6 | Avg Loss: 5.0705


                                                                         

✅ Epoch 2/6 | Avg Loss: 4.4116


                                                                         

✅ Epoch 3/6 | Avg Loss: 4.1843


                                                                         

✅ Epoch 4/6 | Avg Loss: 4.0414


                                                                         

✅ Epoch 5/6 | Avg Loss: 3.9364


                                                                         

✅ Epoch 6/6 | Avg Loss: 3.8534




In [18]:
def generate(model, tokenizer, prompt, max_length=50, temperature=1, top_k=50,
             eos_token="</s>", max_context=None):
    """Sample a continuation of ``prompt`` from an autoregressive model.

    Args:
        model: Module mapping token ids of shape [1, T] to logits of shape
            [1, T, vocab_size]. It is switched to eval mode and left there.
        tokenizer: Object whose ``encode(text)`` returns something with an
            ``ids`` list and whose ``decode(ids)`` returns text. If it has
            ``token_to_id``, that is used to look up the EOS id.
        prompt: Text to continue; must encode to at least one token.
        max_length: Maximum number of new tokens to sample.
        temperature: Positive divisor applied to the logits before sampling.
        top_k: If not None, sample only among the ``top_k`` most likely
            tokens (clamped to the vocabulary size).
        eos_token: Token string that ends generation once sampled. Defaults
            to ``"</s>"``, the end-of-sequence token the tokenizer in this
            notebook is trained with. Pass None to disable early stopping.
        max_context: Largest number of trailing tokens fed to the model. When
            None it is read from ``model.embedding.pos_embed.num_embeddings``
            if that attribute chain exists; otherwise no cropping is done.

    Returns:
        The decoded prompt plus generated tokens (including the EOS token if
        one was sampled).

    Raises:
        ValueError: If ``temperature`` is not positive or the prompt encodes
            to zero tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")

    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Tokenize the prompt into a [1, T] batch
    prompt_ids = tokenizer.encode(prompt).ids
    if not prompt_ids:
        raise ValueError("prompt must encode to at least one token")
    generated = torch.tensor([prompt_ids], dtype=torch.long, device=device)

    # Resolve the EOS id once instead of on every step
    eos_id = None
    if eos_token is not None and hasattr(tokenizer, 'token_to_id'):
        eos_id = tokenizer.token_to_id(eos_token)

    # Learned position embeddings only cover a fixed number of positions;
    # feeding a longer context would index past the end of that table.
    if max_context is None:
        pos_table = getattr(getattr(model, "embedding", None), "pos_embed", None)
        max_context = getattr(pos_table, "num_embeddings", None)

    with torch.no_grad():  # inference only: skip autograd bookkeeping
        for _ in range(max_length):
            context = generated if max_context is None else generated[:, -max_context:]
            logits = model(context)  # [1, T, vocab_size]
            next_token_logits = logits[:, -1, :] / temperature  # [1, vocab_size]

            # Top-k filtering: everything outside the k best logits becomes -inf
            if top_k is not None:
                k = min(top_k, next_token_logits.size(-1))
                values, indices = torch.topk(next_token_logits, k)
                filtered = torch.full_like(next_token_logits, float('-inf'))
                filtered.scatter_(1, indices, values)
                next_token_logits = filtered

            probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)  # [1, 1]

            generated = torch.cat((generated, next_token), dim=1)

            # Stop once the end-of-sequence token has been produced
            if eos_id is not None and next_token.item() == eos_id:
                break

    # Decode back to text
    return tokenizer.decode(generated[0].tolist())


In [None]:
# Sample a continuation of the prompt with generate()'s default settings; the returned text is the cell output
generate(model, tokenizer, "I went to eat")

In [21]:
from transformers import PreTrainedTokenizerFast
import os

# Save tokenizer
os.makedirs("hf_model", exist_ok=True)
from transformers import GPT2TokenizerFast

# Bind the Hugging Face wrapper to its own name so the ByteLevelBPETokenizer
# held in `tokenizer` (whose `.encode(...).ids` generate() relies on) is not
# overwritten by this cell.
# Declare the special tokens the BPE tokenizer was trained with; the
# GPT2TokenizerFast defaults ("<|endoftext|>") are not in the trained
# vocabulary and would be appended as an extra added token.
hf_tokenizer = GPT2TokenizerFast(
    vocab_file="tokenizer/vocab.json",
    merges_file="tokenizer/merges.txt",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
)
hf_tokenizer.save_pretrained("hf_model")

# Save model weights (state dict only; the architecture is described by config.json)
torch.save(model.state_dict(), "hf_model/pytorch_model.bin")


In [22]:
from transformers import PreTrainedModel, PretrainedConfig
import torch.nn as nn
import torch
import torch.nn.functional as F

class DecoderOnlyTransformerConfig(PretrainedConfig):
    """Hyperparameter container for ``DecoderOnlyTransformer``.

    Stores the six constructor arguments of the model as attributes; any
    extra keyword arguments are handed to ``PretrainedConfig``.
    """

    model_type = "decoder-only-transformer"

    def __init__(self, vocab_size=35000, max_len=256, embed_dim=512,
                 num_heads=8, depth=6, ff_dim=2048, **kwargs):
        super().__init__(**kwargs)
        hyperparameters = {
            "vocab_size": vocab_size,
            "max_len": max_len,
            "embed_dim": embed_dim,
            "num_heads": num_heads,
            "depth": depth,
            "ff_dim": ff_dim,
        }
        # Expose every hyperparameter as an attribute of the config
        for attr_name, attr_value in hyperparameters.items():
            setattr(self, attr_name, attr_value)

class HFDecoderOnlyTransformer(PreTrainedModel):
    """``PreTrainedModel`` wrapper that builds a ``DecoderOnlyTransformer`` from a config."""

    config_class = DecoderOnlyTransformerConfig

    def __init__(self, config):
        super().__init__(config)
        # Arguments follow DecoderOnlyTransformer's positional order
        self.model = DecoderOnlyTransformer(
            config.vocab_size,
            config.max_len,
            config.embed_dim,
            config.num_heads,
            config.depth,
            config.ff_dim,
        )

    def forward(self, input_ids, labels=None):
        """Return ``{"loss", "logits"}``; loss is None unless ``labels`` is given.

        ``labels`` is compared position-by-position with the logits (no
        shifting happens here), so callers pass already-shifted targets.
        """
        logits = self.model(input_ids)
        if labels is None:
            return {"loss": None, "logits": logits}

        flat_logits = logits.view(-1, logits.size(-1))
        return {"loss": F.cross_entropy(flat_logits, labels.view(-1)), "logits": logits}


2025-06-01 09:50:51.281636: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748771451.470752      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748771451.534404      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [24]:
import json
# Create config.json describing the model whose weights were saved.
# max_len must equal the size of the trained position table: the model was
# built with max_len=seq_len, so writing any other value here would describe
# a position embedding whose shape does not match the saved state dict.
config = {
    "model_type": "decoder-only-transformer",
    "vocab_size": 35000,
    "max_len": seq_len,
    "embed_dim": 512,
    "num_heads": 8,
    "depth": 6,
    "ff_dim": 2048
}
with open(os.path.join("hf_model", "config.json"), "w") as f:
    json.dump(config, f, indent=4)

In [29]:
# Store a Hugging Face token via HfFolder.save_token in a child Python process.
# The token argument is an empty string here.
# NOTE(review): supply the real token from a secret store, never inline in the notebook.
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('')"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
# Switch the notebook's working directory to the export folder and create a Git repository there.
# NOTE: %cd stays in effect for later cells, so relative paths used afterwards resolve inside hf_model.
%cd /kaggle/working/hf_model
!git init

/kaggle/working/hf_model


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /kaggle/working/hf_model/.git/


In [33]:
# Set the committer identity for this repository only (no --global flag)
!git config user.email "designsdilip@gmail.com"
!git config user.name "coder-dilip"

# Register the Hub repository as `origin` and rename the current branch to main.
# `<hf token>` is a placeholder embedded in the URL.
# NOTE(review): a real token placed here ends up in .git/config.
!git remote add origin https://<hf token>@huggingface.co/dilip025/mini-gpt1
!git branch -M main


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [34]:
# Stage every file in the current directory and record it as the first commit
!git add .
!git commit -m "Initial commit"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[main (root-commit) 11330cc] Initial commit
 8 files changed, 178804 insertions(+)
 create mode 100644 added_tokens.json
 create mode 100644 config.json
 create mode 100644 merges.txt
 create mode 100644 pytorch_model.bin
 create mode 100644 special_tokens_map.json
 create mode 100644 tokenizer.json
 create mode 100644 tokenizer_config.json
 create mode 100644 vocab.json


In [41]:
from huggingface_hub import HfApi

# Read the access token from the environment instead of hard-coding it in
# the notebook, so it is never saved with the notebook or its outputs.
token = os.environ.get("HF_TOKEN")
if not token:
    raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face write token.")
repo_id = "dilip025/mini-gpt1"

# Create the model repository on the Hub; exist_ok=True makes this a no-op
# when it already exists. The returned RepoUrl is the cell output.
api = HfApi()
api.create_repo(repo_id=repo_id, token=token, exist_ok=True)

RepoUrl('https://huggingface.co/dilip025/mini-gpt1', endpoint='https://huggingface.co', repo_type='model', repo_id='dilip025/mini-gpt1')

In [43]:
# Delete the Git metadata created earlier so the directory walk that uploads hf_model does not pick up .git files
!rm -rf /kaggle/working/hf_model/.git


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [47]:
from huggingface_hub import upload_file
import os

folder_path = "/kaggle/working/hf_model"

# Upload every file below folder_path to the Hub repo, one upload_file call per file,
# keeping each file's path relative to folder_path as its path inside the repo.
for current_dir, _, names in os.walk(folder_path):
    for name in names:
        source = os.path.join(current_dir, name)
        upload_file(
            path_or_fileobj=source,
            path_in_repo=os.path.relpath(source, folder_path),
            repo_id=repo_id,
            token=token,
        )


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


In [45]:
import json
import os

# Absolute export directory. An earlier cell runs `%cd /kaggle/working/hf_model`,
# so a relative "hf_model/..." path would land in a nested hf_model/hf_model folder.
output_dir = "/kaggle/working/hf_model"
os.makedirs(output_dir, exist_ok=True)

# Define the config dictionary.
# max_len is taken from seq_len because the model was built with
# max_len=seq_len; any other value would not match the saved position table.
config = {
    "model_type": "decoder-only-transformer",
    "vocab_size": 35000,
    "max_len": seq_len,
    "embed_dim": 512,
    "num_heads": 8,
    "depth": 6,
    "ff_dim": 2048
}

# Write config.json
with open(os.path.join(output_dir, "config.json"), "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)

# Create tokenizer_config.json (needed for Hugging Face)
tokenizer_config = {
    "add_prefix_space": True,
    "model_max_length": seq_len,  # longest input the position table can represent
    "tokenizer_class": "PreTrainedTokenizerFast",
    "unk_token": "<unk>",
    "bos_token": "<s>",
    "eos_token": "</s>"
}

with open(os.path.join(output_dir, "tokenizer_config.json"), "w", encoding="utf-8") as f:
    json.dump(tokenizer_config, f, indent=4)

# Create README.md (the code fence is closed and License is a real heading)
readme = f"""# Mini GPT1 Clone

This is a decoder-only transformer model (GPT1-style) trained from scratch using PyTorch.

## Model Details

- **Architecture**: Decoder-only Transformer
- **Layers**: 6
- **Embedding Size**: 512
- **Heads**: 8
- **Feedforward Dim**: 2048
- **Sequence Length**: {seq_len}
- **Vocab Size**: 35,000

## Tokenizer

Trained using `ByteLevelBPETokenizer` from the `tokenizers` library.

## Inference Example

```python
from transformers import PreTrainedTokenizerFast, AutoModelForCausalLM
import torch

tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer/tokenizer.json")
model = AutoModelForCausalLM.from_pretrained("dilip025/mini-gpt1")

prompt = "Once upon a time,"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(input_ids, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## License

MIT
"""

with open(os.path.join(output_dir, "README.md"), "w", encoding="utf-8") as f:
    f.write(readme)

