<a href="https://colab.research.google.com/github/G0nkly/pytorch_sandbox/blob/main/bondoGPT/bondoGPT_alpha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#################
# LOAD RAW DATA #
#################

In [2]:
from datasets import load_dataset
import re

# 1. Load the raw dataset (wikitext-2-raw-v1 is a good small choice)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

cleaned_text = []
for item in dataset:
    text = item["text"].strip()

    # 2. Skip empty lines and section headers
    if not text or text.startswith("="):
        continue

    # 3. Simple Markup Removal (you may need more sophisticated parsing)
    # Remove internal links (e.g., [[text]]) and only keep the display text
    text = re.sub(r'\[\[[^|\]]+\|([^\]]+)\]\]', r'\1', text)
    text = re.sub(r'\[\[([^\]]+)\]\]', r'\1', text)

    # Remove bold/italics markers (''')
    text = text.replace("'''", "").replace("''", "")

    # Filter out lines that might still contain complex templates or tags
    if "{{" not in text and "}}" not in text:
        cleaned_text.append(text)

# 4. Join all the cleaned paragraphs into one large string
final_corpus = "\n\n".join(cleaned_text)

with open("tiny_wikitext.txt", "w", encoding="utf-8") as f:
    f.write(final_corpus)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
###################
# TRAIN TOKENIZER #
###################

In [4]:
import sentencepiece as spm

# Train a 2k-token tokenizer on your dataset
spm.SentencePieceTrainer.Train(
    '--input=/home/bonda/tiny_wikitext.txt --model_prefix=tiny_sp2 --vocab_size=2048 --model_type=bpe'
)

# Load it
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load("tiny_sp2.model")

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=/home/bonda/tiny_wikitext.txt --model_prefix=tiny_sp2 --vocab_size=2048 --model_type=bpe
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /home/bonda/tiny_wikitext.txt
  input_format: 
  model_prefix: tiny_sp2
  model_type: BPE
  vocab_size: 2048
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  

True

ce=▁has
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=3321 min_freq=706
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3136 size=320 all=23673 active=1822 piece=▁sc
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2944 size=340 all=24705 active=2854 piece=▁part
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2701 size=360 all=25491 active=3640 piece=▁im
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2520 size=380 all=26394 active=4543 piece=▁pre
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2342 size=400 all=27450 active=5599 piece=▁would
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=2341 min_freq=532
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2259 size=420 all=28157 active=2078 piece=ased
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2098 size=440 all=29284 active=3205 piece=ey
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1985 size=460 all=30253 active=4174 piece=▁most
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1904 siz

In [5]:
##################
# CREATE DATASET #
##################

eq=477 size=1520 all=53240 active=2910 piece=▁nov
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=468 size=1540 all=53481 active=3151 piece=▁Old
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=460 size=1560 all=53804 active=3474 piece=▁sk
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=451 size=1580 all=54225 active=3895 piece=omen
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=446 size=1600 all=54500 active=4170 piece=▁establ
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=446 min_freq=155
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=442 size=1620 all=54755 active=2979 piece=▁plan
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=433 size=1640 all=55059 active=3283 piece=▁announ
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=427 size=1660 all=55307 active=3531 piece=▁27
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=423 size=1680 all=55655 active=3879 piece=igade
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=417 size=1700 all=55923 active=4147 piece=▁good
b

In [6]:
import torch
from torch.utils.data import Dataset
import sentencepiece as spm

class CharDataset(Dataset):
    def __init__(self, sp_model_path, text_path, block_size=128):
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(sp_model_path)
        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read()
        ids = self.sp.encode(text, out_type=int)
        self.data = torch.tensor(ids, dtype=torch.long)
        self.block_size = block_size

    def __len__(self):
        return max(1, (len(self.data) - 1) // self.block_size)

    def __getitem__(self, idx):
        start = idx * self.block_size
        x = self.data[start:start+self.block_size]
        y = self.data[start+1:start+1+self.block_size]
        # pad if necessary
        if x.size(0) < self.block_size:
            pad = torch.full((self.block_size - x.size(0),), 0, dtype=torch.long)
            x = torch.cat([x, pad])
            y = torch.cat([y, pad])
        return x, y

In [7]:
################
# CREATE MODEL #
################

In [8]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class GPTConfig:
    def __init__(self, vocab_size=2048, block_size=128, n_layer=6, n_head=8, n_embd=256):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd

class CausalSelfAttention(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        assert cfg.n_embd % cfg.n_head == 0
        self.n_head = cfg.n_head
        self.head_dim = cfg.n_embd // cfg.n_head
        self.c_attn = nn.Linear(cfg.n_embd, 3 * cfg.n_embd)
        self.c_proj = nn.Linear(cfg.n_embd, cfg.n_embd)
        # causal mask will be applied using broadcast
        self.register_buffer("mask", torch.tril(torch.ones(cfg.block_size, cfg.block_size)).unsqueeze(0).unsqueeze(0))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)  # (B,T,3C)
        q, k, v = qkv.split(C, dim=2)
        # reshape heads
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1,2)  # (B, nh, T, hd)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1,2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1,2)
        att = (q @ k.transpose(-2,-1)) / math.sqrt(self.head_dim)
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        y = att @ v
        y = y.transpose(1,2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y

class Block(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.ln1 = nn.LayerNorm(cfg.n_embd)
        self.attn = CausalSelfAttention(cfg)
        self.ln2 = nn.LayerNorm(cfg.n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(cfg.n_embd, 4*cfg.n_embd),
            nn.GELU(),
            nn.Linear(4*cfg.n_embd, cfg.n_embd)
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class TinyGPT(nn.Module):
    def __init__(self, cfg: GPTConfig):
        super().__init__()
        self.cfg = cfg
        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.n_embd)
        self.pos_emb = nn.Embedding(cfg.block_size, cfg.n_embd)
        self.blocks = nn.Sequential(*[Block(cfg) for _ in range(cfg.n_layer)])
        self.ln_f = nn.LayerNorm(cfg.n_embd)
        self.head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.cfg.block_size
        pos = torch.arange(0, T, device=idx.device).unsqueeze(0)
        x = self.tok_emb(idx) + self.pos_emb(pos)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=50, temperature=1.0):
        self.eval()
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.cfg.block_size:]
            logits, _ = self(idx_cond)
            probs = F.softmax(logits[:, -1, :]/temperature, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_id), dim=1)
        return idx

In [9]:
###############
# TRAIN MODEL #
###############

In [11]:
import os
import math
import torch
from torch.utils.data import DataLoader
from tqdm import trange, tqdm

sp_model = "/home/bonda/tiny_sp.model"
data = "/home/bonda/tiny_wikitext.txt"
out_dir = "/home/bonda/checkpoints"
batch_size = 32
block_size = 768
lr = 2e-4
epochs = 10
device = "cuda"

os.makedirs(out_dir, exist_ok=True)

cfg = GPTConfig(vocab_size=2048, block_size=block_size, n_layer=6, n_head=8, n_embd=256)
model = TinyGPT(cfg).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

dataset = CharDataset(sp_model, data, block_size=block_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=2)

scaler = torch.cuda.amp.GradScaler()
global_step = 0

for epoch in range(epochs):
    model.train()
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    for x, y in pbar:
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits, loss = model(x, targets=y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        global_step += 1
        if global_step % 100 == 0:
            # save checkpoint
            ckpt = os.path.join(out_dir, f"ckpt_step{global_step}.pt")
            torch.save({
                "model_state": model.state_dict(),
                "optim_state": optimizer.state_dict(),
                "step": global_step,
                "cfg": vars(cfg)
            }, ckpt)
        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

# final save
torch.save({"model_state": model.state_dict(), "cfg": vars(cfg)}, os.path.join(out_dir, "final.pt"))


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Epoch 1: 100%|█████████████████████████████████████████████████████████████████████████| 177/177 [00:59<00:00,  2.97it/s, loss=4.6850]
Epoch 2: 100%|█████████████████████████████████████████████████████████████████████████| 177/177 [01:00<00:00,  2.91it/s, loss=4.2701]
Epoch 3: 100%|█████████████████████████████████████████████████████████████████████████| 177/177 [01:01<00:00,  2.86it/s, loss=4.1697]
Epoch 4: 100%|█████████████████████████████████████████████████████████████████████████| 177/177 [01:02<00:00,  2.84it/s, loss=4.0691]
Epoch 5: 100%|█████████████████████████████████████████████████████████████████████████| 177/177 [01:02<00:00,  2.84it/s, loss=3.9804]
Epoch 6: 100%|█████████████████████████████████████████████████████████████████████████| 177/177 [01:02<00:00,  2.82it/s, loss=3.9375]
Epoch 7: 100%|█████████████████████████████████████████████████████████████████████████| 177/177 [01:03<00:00,  2.81i

In [12]:
#############
# INFERENCE #
#############

In [13]:
import torch
import sentencepiece as spm

ckpt = "checkpoints/final.pt"
prompt = "A car is something that"
max_new = 1000

sp = spm.SentencePieceProcessor()
sp.load(sp_model)

ck = torch.load(ckpt, map_location="cpu")
cfg = GPTConfig(**ck["cfg"]) if "cfg" in ck else GPTConfig()
model = TinyGPT(cfg)
model.load_state_dict(ck["model_state"])
model = model.cuda().eval()

ids = sp.encode(prompt, out_type=int)
import torch
x = torch.tensor([ids], dtype=torch.long).cuda()
out = model.generate(x, max_new_tokens=max_new)
print(sp.decode(out[0].tolist()))

  ck = torch.load(ckpt, map_location="cpu")


A car is something that be on iachter simaksideddened with that with the Union at Stale conse zze and foundations max Blubeaneciccal of the Aveacimits weudy . Dowsonster propative , Mz endsca  ⁇  deep then Unityrica  ⁇   ⁇  is one of the music Res  ⁇  that the lated or in the black wanted with the also tarts of a  ⁇  it is perportray  ⁇ - ⁇  side of the books are tacagaz to Se  ⁇  whisticult er twenty . valuesmenies of the Video .RS has does not upon  ⁇  speak Estembergles  ⁇ - ⁇  to  ⁇ - ⁇  prown raidental unlovelos was succ swucklearet and epicactorp that the diredlish Par in the first ,  ⁇  form ganaissues and about battle star  ⁇  so  ⁇ - ⁇  bassinated A. Charles called , Sir press the In about . The gave the Uylauring the best needed as the most truck is damentatory that Mey  ⁇  Ma  ⁇ ly  ⁇  charababy region to decricrial Na Ber . King Greatyan paintedib, others have wart . Stone  ⁇  greena 's arey place that it does noted in starel of the locrigs  ⁇  each followed to but Darcript