In [32]:
import math
import os
import time
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
from asttokens.util import stmt_class_names
from jinja2.compiler import optimizeconst
from networkx.algorithms.mis import maximal_independent_set
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import AutoTokenizer


In [None]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
device


In [None]:
@dataclass
class GPT2Config:
    vocab_size: int = 50257
    block_size: int = 256
    n_layer: int = 6
    n_head: int = 6
    n_embd: int = 384
    dropout: float = 0.1
    bias: bool = True


In [None]:
class LayerNorm(nn.Module):
    def __init__(self, n_embd, biais=True, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(n_embd))
        self.bias = nn.Parameter(torch.zeros(n_embd)) if biais else None
        self.eps = eps

    def forward(self, x):
        return F.layer_norm(x, (x.size(-1),), self.weight, self.bias, self.eps)

In [12]:
class CausalSelfAttention(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.head_dim = config.n_embd // config.n_head
        self.dropout = config.dropout

        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        self.attn_drop = nn.Dropout(config.dropout)
        self.resid_drop = nn.Dropout(config.dropout)

        mask = torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size,
                                                                                 config.block_size)
        self.register_buffer("biais", mask)

    def forward(self, x):
        B, T, C = x.shape
        qkv = self.c_attn(x)
        q, k, v = qkv.split(C, dim=2)

        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        att = att.masked_fill(self.biais[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)

        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        y = self.c_proj(y)
        y = self.resid_drop(y)

        return y


In [21]:
class MLP(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.drop = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = F.gelu(x)
        x = self.c_proj(x)
        x = self.drop(x)
        return x

In [22]:
class Block(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, biais=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, biais=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

In [23]:
class GPT2(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        self.config = config

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.block_size, config.n_embd)
        self.drop = nn.Dropout(config.dropout)

        self.h = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.ln_f = LayerNorm(config.n_embd, biais=config.bias)

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.lm_head.weight = self.wte.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        assert T <= self.config.block_size

        pos = torch.arange(0, T, device=idx.device).unsqueeze(0)
        x = self.wte(idx) + self.wpe(pos)
        x = self.drop(x)

        for block in self.h:
            x = block(x)

        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

In [24]:
config = GPT2Config(block_size=128, n_layer=4, n_head=4, n_embd=256, dropout=0.1)
model = GPT2(config).to(device)

x = torch.randint(0, config.vocab_size, (2, 32), device=device)
logits, loss = model(x, x)
logits.shape, loss.item()

(torch.Size([2, 32, 50257]), 10.530092239379883)

In [26]:
ds = load_dataset("wikitext", "wikitext-2-raw-v1")
ds

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 398673.40 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 2040685.51 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 536542.14 examples/s]


DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [27]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # pratique pour batcher

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [28]:
def tokenize_function(examples):
    return tokenizer(examples["text"])


tokenized = ds.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized

Map: 100%|██████████| 4358/4358 [00:00<00:00, 18564.41 examples/s]
Map: 100%|██████████| 36718/36718 [00:00<00:00, 37471.55 examples/s]
Map: 100%|██████████| 3760/3760 [00:00<00:00, 37628.47 examples/s]


DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3760
    })
})

In [29]:
# concat tous les input_ids en une grande liste
def build_stream(split, key="input_ids"):
    ids = []
    for ex in split:
        ids.extend(ex[key])
        ids.append(tokenizer.eos_token_id)
    return torch.tensor(ids, dtype=torch.long)


train_stream = build_stream(tokenized["train"])
val_stream = build_stream(tokenized["validation"])

len(train_stream), len(val_stream)

(2428602, 251049)

In [35]:
class StreamDataset(torch.utils.data.Dataset):
    def __init__(self, stream_ids: torch.Tensor, block_size: int):
        self.data = stream_ids
        self.block_size = block_size

    def __len__(self):
        return (len(self.data) - 1) // self.block_size

    def __getitem__(self, i):
        start = i * self.block_size
        x = self.data[start: start + self.block_size]
        y = self.data[start + 1: start + 1 + self.block_size]
        return x, y


train_ds = StreamDataset(train_stream, config.block_size)
val_ds = StreamDataset(val_stream, config.block_size)

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=16, shuffle=False, drop_last=True)

next(iter(train_loader))[0].shape

torch.Size([16, 128])

In [36]:
def get_lr(step, warmup_steps, max_steps, max_lr):
    if step < warmup_steps:
        return max_lr * step / warmup_steps

    progress = (step - warmup_steps) / max(1, (max_steps - warmup_steps))
    return 0.5 * max_lr * (1.0 + math.cos(math.pi * progress))


optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.01)

In [37]:
@torch.no_grad()
def evaluate(model, loader, max_batches=50):
    model.eval()
    losses = []
    for i, (x, y) in enumerate(loader):
        if i >= max_batches:
            break

        x, y = x.to(device), y.to(device)
        _, loss = model(x, y)
        losses.append(loss.item())
    model.train()
    return sum(losses) / len(losses)

In [38]:
from torch.cuda.amp import autocast, GradScaler

use_amp = device == "cuda"
scaler = GradScaler(enabled=use_amp)

max_steps = 2000
warmup_steps = 200
max_lr = 3e-4
grad_clip = 1.0
log_every = 50
eval_every = 200

model.train()
t0 = time.time()
step = 0

for epoch in range(1000):
    for x, y in train_loader:
        if step >= max_steps:
            break

        lr = get_lr(step, warmup_steps, max_steps, max_lr)
        for pg in optimizer.param_groups:
            pg['lr'] = lr

        x, y = x.to(device), y.to(device)
        optimizer.zero_grad(set_to_none=True)

        with autocast(enabled=use_amp):
            _, loss = model(x, y)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        scaler.step(optimizer)
        scaler.update()

        if step % log_every == 0:
            dt = time.time() - t0
            print(f"step {step:5d} | loss {loss.item():.4f} | lr {lr:.2e} | dt {dt:.1f}")
            t0 = time.time()

        step += 1

    if step >= max_steps:
        break

  scaler = GradScaler(enabled=use_amp)
  with autocast(enabled=use_amp):


step     0 | loss 10.8745 | lr 0.00e+00 | dt 0.4
step    50 | loss 10.0340 | lr 7.50e-05 | dt 0.9
step   100 | loss 8.5851 | lr 1.50e-04 | dt 0.9
step   150 | loss 7.4156 | lr 2.25e-04 | dt 0.9
step   200 | loss 7.0035 | lr 3.00e-04 | dt 0.9
step   250 | loss 6.8241 | lr 2.99e-04 | dt 0.9
step   300 | loss 6.5002 | lr 2.98e-04 | dt 0.9
step   350 | loss 6.6637 | lr 2.95e-04 | dt 0.9
step   400 | loss 6.2998 | lr 2.91e-04 | dt 0.9
step   450 | loss 6.6390 | lr 2.86e-04 | dt 0.9
step   500 | loss 6.2773 | lr 2.80e-04 | dt 0.9
step   550 | loss 6.4866 | lr 2.73e-04 | dt 0.9
step   600 | loss 6.1682 | lr 2.65e-04 | dt 0.9
step   650 | loss 6.4175 | lr 2.56e-04 | dt 1.0
step   700 | loss 6.2013 | lr 2.46e-04 | dt 1.0
step   750 | loss 6.3738 | lr 2.36e-04 | dt 0.9
step   800 | loss 6.2861 | lr 2.25e-04 | dt 0.9
step   850 | loss 6.1527 | lr 2.13e-04 | dt 0.9
step   900 | loss 6.0703 | lr 2.01e-04 | dt 0.9
step   950 | loss 6.1933 | lr 1.89e-04 | dt 0.9
step  1000 | loss 6.0555 | lr 1.76e-04

In [39]:
@torch.no_grad()
def generate(model, prompt, max_new_tokens=80, temperature=1.0, top_k=0):
    model.eval()
    idx = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -model.config.block_size:]
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :] / max(1e-8, temperature)

        if top_k is not None:
            v, ix = torch.topk(logits, top_k)
            logits[logits < v[:, [-1]]] = float("-inf")

        probs = F.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_id], dim=1)

    out = tokenizer.decode(idx[0].tolist())
    model.train()
    return out


In [40]:
print(generate(model, "In the beginning", max_new_tokens=80, temperature=0.9, top_k=40))

In the beginning of the song 's death of the song was the song " . He was " for the play of a episode , " ( " ) " ) " , which became " the " " and " an " which was the song " . 
<|endoftext|> Other songs , " , he is about " " The first released the video " , " the song " for " , " . " and "


In [43]:
import os

save_dir = r"C:\workspace\GPT2\models"
os.makedirs(save_dir, exist_ok=True)

ckpt_path = os.path.join(save_dir, "gpt2_scratch.pt")
ckpt_path

torch.save(
    {
        "config": config.__dict__,
        "state_dict": model.state_dict(),
    },
    ckpt_path
)

os.path.exists(ckpt_path)
print(f"✅ Modèle sauvegardé dans : {ckpt_path}")

✅ Modèle sauvegardé dans : C:\workspace\GPT2\models\gpt2_scratch.pt


In [45]:
ckpt = torch.load(ckpt_path, map_location=device)

config = GPT2Config(**ckpt["config"])
model = GPT2(config).to(device)
model.load_state_dict(ckpt["state_dict"])
model.eval()

GPT2(
  (wte): Embedding(50257, 256)
  (wpe): Embedding(128, 256)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-3): 4 x Block(
      (ln_1): LayerNorm()
      (attn): CausalSelfAttention(
        (c_attn): Linear(in_features=256, out_features=768, bias=True)
        (c_proj): Linear(in_features=256, out_features=256, bias=True)
        (attn_drop): Dropout(p=0.1, inplace=False)
        (resid_drop): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm()
      (mlp): MLP(
        (c_fc): Linear(in_features=256, out_features=1024, bias=True)
        (c_proj): Linear(in_features=1024, out_features=256, bias=True)
        (drop): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm()
  (lm_head): Linear(in_features=256, out_features=50257, bias=False)
)