# Imports

In [1]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint, checkpoint_sequential
import transformers
import pandas as pd
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from dotenv import load_dotenv
load_dotenv()

True

Environment variables

In [2]:
PATH = os.getenv("PATH")
DATAPATH = os.getenv("DATAPATH")
PREPARED_DATA_DIR = os.getenv("PREPARED_DATA_DIR")
CACHE_DIR = os.getenv("CACHE_DIR")
#TOK_NAME = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
TOK_NAME = os.getenv("TOK_NAME")
PARQUET_DATA_DIR = os.getenv("PARQUET_DATA_DIR")

## Config

In [3]:
GPT_CONFIG = {
    'vocab_size': 50257, # in 151670 (if you use tokenizer.vocab_size then you get partial vocab_size without added tokens)
    'context_length': 1024,
    'emb_dim': 256, #768
    'n_heads': 4,#12,
    'n_layers': 4,#12,
    'drop_rate': 0.05, # 0l1
    'qkv_bias': False,
    'num_segments': 2
    }

In [4]:
device = 'cuda' if (torch.cuda.is_available()) else 'cpu'
device

'cuda'

# Dataset

## Load Tokenizer

In [5]:
tok = transformers.AutoTokenizer.from_pretrained(TOK_NAME, cache_dir=CACHE_DIR)

Check tokenizer

In [6]:
#tok.get_added_vocab

In [7]:
tok.vocab_size

50257

In [8]:
# If tokenizer dont have pad_token
tok.pad_token = tok.eos_token

In [9]:
tok('Привет, как дела mhjm', return_tensors='pt', padding='max_length', max_length=2048)['input_ids'].shape

torch.Size([1, 2048])

## Actual version of dataset

In [10]:
class CustomDatasetV3(Dataset):
    def __init__(self, dataframe: str, tokenizer: object, max_length: int):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        for i, curr_chunk in tqdm(dataframe.iterrows(), total=dataframe.shape[0]):
            token_ids = tokenizer(curr_chunk['Sample'], return_tensors='pt', padding='max_length', max_length=max_length+1)['input_ids']
            input_chunk = token_ids[:,:max_length].view(-1)
            target_chunk = token_ids[:,1:max_length+1].view(-1)
            #print(input_chunk.size(), target_chunk.size(),)
            self.input_ids.append(input_chunk)
            self.target_ids.append(target_chunk)
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):

        return self.input_ids[index], self.target_ids[index]

# Load actual data and dataloader

In [11]:
data_parquet = pd.read_parquet(PARQUET_DATA_DIR)

In [12]:
data_parquet.shape

(224662, 2)

In [13]:
#train_cd = CustomDatasetV3(dataframe=data_parquet.iloc[:100000], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])
train_cd = CustomDatasetV3(dataframe=data_parquet.iloc[:100], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])

  0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
#val_cd = CustomDatasetV3(dataframe=data_parquet.iloc[-10000:], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])
val_cd = CustomDatasetV3(dataframe=data_parquet.iloc[-100:], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#

  0%|          | 0/100 [00:00<?, ?it/s]

batch_size maybe 8 or 12 (or 16) check

In [None]:
train_data = DataLoader(dataset=train_cd, batch_size=4, shuffle=True, num_workers=0) # num_workers=2 don't work?
val_data = DataLoader(dataset=val_cd, batch_size=4, shuffle=True, num_workers=0) # num_workers=2 don't work?

In [16]:
next(iter(train_data))

[tensor([[12466,   251, 16142,  ..., 12466,   123, 25443],
         [12466,   240,   220,  ..., 16843, 20375, 21727],
         [12466,   248, 21169,  ...,   114, 18849,   140],
         [12466,   253, 15166,  ..., 20375, 45367, 12466]]),
 tensor([[  251, 16142, 12466,  ...,   123, 25443,   112],
         [  240,   220, 21727,  ..., 20375, 21727, 20375],
         [  248, 21169, 18849,  ..., 18849,   140,   115],
         [  253, 15166, 12466,  ..., 45367, 12466,   122]])]

# LLM Code

In [17]:
class MultiHeadAttentionDP_QKV(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0)

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_qkv = nn.Linear(d_in, 3 * d_out, bias=qkv_bias)

        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = dropout#nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.size()
        qkv = self.W_qkv(x) # b, num_tokens, 3 * self.d_out
        queries, keys, values = qkv.split(self.d_out, dim=2)

        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)

        queries = queries.transpose(1, 2) # b, self.num_heads, num_tokens, self.head_dim
        keys = keys.transpose(1, 2) # b, self.num_heads, num_tokens, self.head_dim
        values = values.transpose(1, 2) # b, self.num_heads, num_tokens, self.head_dim

        # All code below we replace with torch.nn.functional.scaled_dot_product_attention
        context_vec = torch.nn.functional.scaled_dot_product_attention(queries, keys, values, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)

        # att_scores = queries @ keys.transpose(2, 3) # shapes = (num_tokens, self.head_dim) @ (self.head_dim, num_tokens) -> (num_tokens, num_tokens)
        # mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        # att_scores.masked_fill_(mask_bool, -torch.inf)
        # att_weights = torch.softmax(att_scores / keys.shape[-1]**0.5, dim=-1)
        # att_weights = self.dropout(att_weights)
        # context_vec = (att_weights @ values).transpose(1, 2) # (num_tokens, num_tokens) @ (num_tokens, self.head_dim) -> (num_tokens, self.head_dim) -> transpose(1,2) of (b, self.num_heads, num_tokens, self.head_dim) ->
        # # -> (b, num_tokens, self.num_heads, self.head_dim) as view in previous code after inference of Linear layers
        
        # Reshape etc
        context_vec = context_vec.transpose(1, 2).contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)
        return context_vec

In [19]:
#mha_dp_qkv = MultiHeadAttentionDP_QKV(d_in=embed_dim, d_out=embed_dim, context_length=context_len, dropout=dropout, num_heads=num_heads, qkv_bias=qkv_bias)

## Additional classes

In [20]:
class FeedForward(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.Mish(), #GELU(),
            nn.Linear(4 * emb_dim, emb_dim)
        )
    def forward(self, x):
        return self.layers(x)

In [21]:
class ReversibleTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg['emb_dim']//2
        self.attn = MultiHeadAttentionDP_QKV(d_in=emb_dim, 
                                       d_out=emb_dim, 
                                       context_length=cfg['context_length'], 
                                       dropout=cfg['drop_rate'], 
                                       num_heads=cfg['n_heads'], 
                                       qkv_bias=cfg['qkv_bias'])
        self.ff = FeedForward(emb_dim)
        self.norm1 = nn.LayerNorm(emb_dim) #LayerNorm(cfg['emb_dim'])
        self.norm2 = nn.LayerNorm(emb_dim) #LayerNorm(cfg['emb_dim'])
        self.drop_resid = nn.Dropout(cfg['drop_rate'])

    def forward(self, x1, x2):
        # reversible update
        # y1 = x1 + f(x2)
        # y2 = x2 + g(y1)

        def f(u):
            u = self.norm1(u)
            attn_output = self.attn(u)
            attn_output = self.drop_resid(attn_output)
            return attn_output
        
        def g(v):
            return self.drop_resid(self.ff(self.norm2(v)))
        
        f_x2 = checkpoint(f, x2)
        y1 = x1 + f_x2
        g_y1 = checkpoint(g, y1)
        y2 = x2 + g_y1

        return y1, y2

In [22]:
class GPTModelRev(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_blocks = nn.Sequential(*[ReversibleTransformerBlock(cfg) for _ in range(cfg['n_layers'])])
        self.final_norm = nn.LayerNorm(cfg['emb_dim'])
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.size()
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)

        # initialize reversible pairs: split features
        # split last dim
        x1, x2 = torch.chunk(x, 2, dim=-1)  # each (batch_size, seq_len, emb_dim//2)

        # Now we change x = self.trf_blocks(x) to: 
        for layer in self.trf_blocks:
            x1, x2 = layer(x1, x2)
        # merge
        x = torch.cat([x1, x2], dim=-1)  # (b, s, dim)
        
        # Now as usual
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

## Generation

In [23]:
def generate(model, idx, max_new_tokens, context_size):
    model.eval()
    for _ in range(max_new_tokens):
        idx_cond = idx[:,-context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        probas = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

## Check

In [30]:
m = GPTModelRev(GPT_CONFIG)

In [31]:
for x, y in train_data:
    print(x.size())
    r = m(x)
    break

torch.Size([4, 1024])


  return fn(*args, **kwargs)


In [32]:
r.size()

torch.Size([4, 1024, 50257])

# Training

In [24]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [61]:
class Trainer():
    def __init__(self, optimizer, params, device):
        self.optimizer = optimizer
        self.params = params
        self.device = device
    
    def train_model(self, model, tokenizer, train_dataloader, val_dataloader, writer=None, grad_accum=1, max_norm=1.0, scheduler=None):
        train_epoch_loss = []
        val_loss = []
        cumulative_tokens_get = []
        tokens_get = 0
        scaler = torch.cuda.amp.GradScaler()

        for epoch in range(self.params['N_EPOCHS']):
            total_loss = 0.0
            self.optimizer.zero_grad()
            for step, (x, y) in enumerate(train_dataloader):
                if not (model.training):
                    model.train()
                x, y = x.to(self.device), y.to(self.device)
                tokens_get += len(x.flatten())
                cumulative_tokens_get.append(tokens_get)
                with torch.autocast(device_type=device, dtype=torch.bfloat16):
                    logits = model(x)
                    loss = nn.functional.cross_entropy(logits.flatten(0, 1), y.flatten())

                scaler.scale(loss).backward() #loss.backward()
                curr_train_loss = loss.item()
                total_loss += curr_train_loss
                writer.add_scalar("Tokens get", tokens_get, step)

                if (step % grad_accum == 0):
                    scaler.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(
                        model.parameters(), max_norm
                    )
                    scaler.step(self.optimizer)
                    scaler.update()
                    self.optimizer.zero_grad()
                    if scheduler:
                        scheduler.step()

                if (self.params['verbose'] is True) and (tokens_get % self.params['verbose_freq'] == 0):
                    sample = tokenizer.decode(generate(model=model, idx=torch.tensor(tokenizer('Я большая языковая модель и ')['input_ids'], device=self.device).unsqueeze(0), max_new_tokens=25, context_size=1024).squeeze(0).tolist())
                    print(f'Epoch {epoch}: Train loss = {loss}, sample: {sample}')
                    if (writer is not None):
                        writer.add_scalar("Loss/train in step", loss, epoch)
                        writer.add_text("Sample", str(sample), epoch)
                        if (self.params['gradients'] is True):
                            grads = []
                            for name, param in model.named_parameters():
                                if ('weight' in name):
                                    if (param.grad is not None):
                                        grads.append(param.grad.abs().flatten().mean().cpu().detach().numpy())
                            writer.add_scalar("train/gradients", np.array(grads).flatten().mean(), epoch)


                    model.eval()
                    with torch.no_grad():
                        for x, y in val_dataloader:
                            x, y = x.to(self.device), y.to(self.device)
                            logits = model(x)
                            loss = nn.functional.cross_entropy(logits.flatten(0, 1), y.flatten())
                            val_loss.append(loss)
                        if (writer is not None):
                            writer.add_scalar("Loss/train in check", torch.mean(torch.tensor(curr_train_loss, device='cpu')), epoch)
                            writer.add_scalar("Loss/val in check", torch.mean(torch.tensor(val_loss, device='cpu')), epoch)
            writer.close()

In [62]:
params = {'N_EPOCHS': 5, 
          'verbose': True, 
          'verbose_freq': 1,
          'gradients': True}

In [51]:
model = GPTModelRev(GPT_CONFIG)

In [52]:
model = model.to(device)

In [53]:
model

GPTModelRev(
  (tok_emb): Embedding(50257, 256)
  (pos_emb): Embedding(1024, 256)
  (drop_emb): Dropout(p=0.05, inplace=False)
  (trf_blocks): Sequential(
    (0): ReversibleTransformerBlock(
      (attn): MultiHeadAttentionDP_QKV(
        (W_qkv): Linear(in_features=128, out_features=384, bias=False)
        (out_proj): Linear(in_features=128, out_features=128, bias=True)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): Mish()
          (2): Linear(in_features=512, out_features=128, bias=True)
        )
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (drop_resid): Dropout(p=0.05, inplace=False)
    )
    (1): ReversibleTransformerBlock(
      (attn): MultiHeadAttentionDP_QKV(
        (W_qkv): Linear(in_features=128, out_features=384, bias=False)
        (out_proj): Linear(in_features=128

In [63]:
amount_of_parameters = sum([p.numel() for p in model.parameters()])
amount_of_parameters

26785792

In [64]:
len(train_data)

25

In [None]:
opt = torch.optim.AdamW(params=model.parameters(), lr=0.01)
grad_accum = 2
total_steps = (len(train_data) // grad_accum) * params["N_EPOCHS"]
#total_steps = params["N_EPOCHS"] * len(train_data)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=total_steps
trainer = Trainer(optimizer=opt, params=params, device=device)

In [None]:
trainer.train_model(model=model, 
                    tokenizer=tok, 
                    train_dataloader=train_data, 
                    val_dataloader=val_data, 
                    writer=writer, 
                    grad_accum=grad_accum, 
                    scheduler=scheduler)
writer.flush()

  scaler = torch.cuda.amp.GradScaler()
  return fn(*args, **kwargs)
  writer.add_scalar("train/gradients", np.array(grads).flatten().mean(), epoch)
  ret = ret.dtype.type(ret / rcount)


Epoch 0: Train loss = 5.437976837158203, sample: Я большая языковая модель и  �о � � �о �о � �о � � � �о �о � �о � �о �
Epoch 0: Train loss = 3.93634033203125, sample: Я большая языковая модель и  �о � � �о �о � �о � � � �о �о � �о � �о �
Epoch 0: Train loss = 3.9625244140625, sample: Я большая языковая модель и  �о � � �о �о � �о � � � �о �о � �о � �о �
Epoch 0: Train loss = 3.9942569732666016, sample: Я большая языковая модель и  �о � � �о �о � �о � � � �о �о � �о � �о �
Epoch 0: Train loss = 4.015851974487305, sample: Я большая языковая модель и ееееееееееееее�ееееееееее
Epoch 0: Train loss = 3.8237686157226562, sample: Я большая языковая модель и ееееееееееееее�ееееееееее
Epoch 0: Train loss = 3.8851795196533203, sample: Я большая языковая модель и ееееееееееееее�ееееееееее
Epoch 0: Train loss = 3.9865798950195312, sample: Я большая языковая модель и ееееееееееееее�ееееееееее
Epoch 0: Train loss = 3.8604507446289062, sample: Я большая языковая модель и со�о�о�о�о�о�о�о�о�о�о�о�о�о�

KeyboardInterrupt: 

How to use tensorboard?  
tensorboard --logdir=GPT_training or you name (instead of GPT_training) or tensorboard --logdir=runs  
http://localhost:6006  

### Saving weights

In [None]:
#torch.save(model.state_dict(), "model.pth") # without state of optimizer
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': opt.state_dict(),
    }, "model_and_optimizer.pth") # with state of optimizer

### Loading weights

In [None]:
checkpoint = torch.load("model_and_optimizer.pth")
model = GPTModel(GPT_CONFIG)
model.load_state_dict(checkpoint['model_state_dict'])
opt = torch.optim.AdamW(model.parameters(), lr=0.01)
opt.load_state_dict(checkpoint['optimizer_state_dict'])
model.train()

In [None]:
torch.randint(0, 100, size=(10, 1024)).size()

In [None]:
model(torch.randint(0, 100, size=(50, 1024))).size()