# Imports

In [3]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import transformers
import pandas as pd
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from dotenv import load_dotenv
load_dotenv()

True

Environment variables

In [4]:
# Paths and names are read from environment variables (load_dotenv() is called in the imports cell).
# os.getenv returns None for any variable that is not set.
# NOTE(review): "PATH" is normally the OS executable search path, not a project directory —
# confirm this is the variable that was intended here.
PATH = os.getenv("PATH")
DATAPATH = os.getenv("DATAPATH")  # raw text corpus file, opened with open() in the "Raw data" section
PREPARED_DATA_DIR = os.getenv("PREPARED_DATA_DIR")  # directory holding the chunk_*.txt files
CACHE_DIR = os.getenv("CACHE_DIR")  # passed as cache_dir when loading the tokenizer
#TOK_NAME = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
TOK_NAME = os.getenv("TOK_NAME")  # tokenizer name passed to AutoTokenizer.from_pretrained
PARQUET_DATA_DIR = os.getenv("PARQUET_DATA_DIR")  # path given to to_parquet / read_parquet

## Config

In [5]:
# Hyper-parameters consumed by GPTModel, TransformerBlock and FeedForward.
GPT_CONFIG = {
    'vocab_size': 50257, # rows of the token-embedding table and width of the output head. Earlier note mentions 151670 — presumably the full vocabulary of another tokenizer (tokenizer.vocab_size reports only the base vocab, without added tokens); TODO confirm
    'context_length': 1024, # number of positional embeddings and side length of the causal mask
    'emb_dim': 768, # embedding width
    'n_heads': 2, # attention heads per block (previous value: 12)
    'n_layers': 2, # number of transformer blocks (previous value: 12)
    'drop_rate': 0.05, # dropout probability (previous value presumably 0.1)
    'qkv_bias': False # whether the query/key/value projections have a bias term
    }

In [6]:
device = 'cuda' if (torch.cuda.is_available()) else 'cpu'
device

'cuda'

# Dataset

## Load Tokenizer

In [7]:
tok = transformers.AutoTokenizer.from_pretrained(TOK_NAME, cache_dir=CACHE_DIR)

Check tokenizer

In [None]:
tok.get_added_vocab

In [None]:
tok.vocab_size

In [8]:
# Fall back to the EOS token for padding only when the tokenizer does not
# define a pad token of its own; an existing pad token is left untouched.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

In [None]:
tok('Привет, как дела mhjm', return_tensors='pt', padding='max_length', max_length=2048)['input_ids'].shape

# Prepare data

## Raw data

In [None]:
with open(DATAPATH, encoding='utf8', mode='r') as file:
    d = file.read()

In [None]:
len(d)

In [None]:
1423181938//131072

In [None]:
# Split the raw corpus `d` into roughly `num_chunks` files of `stride` characters each.
# When len(d) is not divisible by num_chunks one extra, shorter file holds the remainder.
num_chunks=25
# max(1, ...) keeps range() from raising ValueError (step 0) on very short input.
stride = max(1, len(d)//num_chunks)

for i, chunk_idx in tqdm(enumerate(range(0, len(d), stride))):
    # Write with the same encoding the files are later read with; the platform
    # default encoding may not be UTF-8 and can fail on non-ASCII text.
    with open(os.path.join(PREPARED_DATA_DIR, f'chunk_{i}.txt'), mode='w', encoding='utf8') as file:
        file.write(d[chunk_idx:chunk_idx+stride])
    print(i, chunk_idx)

In [None]:
# Scratch experiment: walks the raw corpus `d` in fixed windows of `stride` characters
# and appends one PLACEHOLDER row per window (the text itself is not stored).
#num_chunks=25
stride = 131072#len(d)//num_chunks

data_parquet = pd.DataFrame([], columns=['Sample', 'Chunk'])
for i, chunk_idx in tqdm(enumerate(range(0, len(d), stride))):
    # Dummy values only; appending with .loc one row at a time gets slow for many rows.
    data_parquet.loc[len(data_parquet)] = ['sdgsgsg', 0]
    #with open(os.path.join(PREPARED_DATA_DIR, f'chunk_{i}.txt'), mode='w') as file:
    #    file.write(d[chunk_idx:chunk_idx+stride])
    print(i, chunk_idx)

In [None]:
# Re-chunk every prepared text file into samples shorter than `stride` characters,
# cutting only at sentence ('.') or line ('\n') boundaries.
#
# The chunking is lossless: concatenating the samples of one file gives back the
# file's text (plus a trailing newline if the file did not end with one).
# A single sentence that is itself >= `stride` characters becomes its own sample.
stride = 2048*3  # soft upper bound on the sample length, in characters
rows = []  # collected [sample, file_index] pairs; the DataFrame is built once at the end

# sorted() makes the file -> index mapping reproducible between runs.
filenames = sorted(os.listdir(PREPARED_DATA_DIR))
for i, filename in tqdm(enumerate(filenames), total=len(filenames)):
    with open(os.path.join(PREPARED_DATA_DIR, filename), encoding='utf8', mode='r') as file:
        current_file = file.read()

    mas = ''  # sample currently being accumulated for this file
    for article in current_file.split('\n'):
        pieces = article.split('.')
        for j, piece in enumerate(pieces):
            # split() removed the delimiter: put '.' back after every sentence
            # and '\n' after the last piece of the line.
            sentence = piece + ('.' if j < len(pieces) - 1 else '\n')
            if (len(mas)+len(sentence) < stride):
                mas += sentence
            else:
                if mas:
                    rows.append([mas, i])
                # Start the next sample WITH the sentence that did not fit,
                # instead of discarding it.
                mas = sentence
    # Flush whatever is left at the end of the file.
    if mas:
        rows.append([mas, i])

data_parquet = pd.DataFrame(rows, columns=['Sample', 'Chunk'])

In [None]:
2048*3

In [None]:
data_parquet.to_parquet(PARQUET_DATA_DIR)

In [None]:
d[200:250]

## Analysis

Небольшой анализ длины предложений

In [None]:
with open(os.path.join(PREPARED_DATA_DIR, os.listdir(PREPARED_DATA_DIR)[0]), encoding='utf8', mode='r') as file:
    d = file.read()

In [None]:
splt = d.split('.')

In [None]:
lens = [len(elem) for elem in splt]

In [None]:
max(lens)

In [None]:
plt.hist(lens, bins=20, range=(0, 1000))

Если взять длину абзацев

In [None]:
# Split on the newline character; the two-character string '/n' almost never
# occurs in text, so splitting on it returned the whole file as one element.
splt = d.split('\n')

In [None]:
splt[10]

In [None]:
len(splt)

In [None]:
lens = [len(elem) for elem in splt]

In [None]:
max(lens)

In [None]:
plt.hist(lens, bins=20)

Class for dataset

## Old versions of dataset

In [None]:
class CustomDatasetV1(Dataset):
    """Sliding-window next-token dataset built from one text string.

    The text is encoded once with ``tokenizer.encode``. A window of
    ``max_length`` token ids is then moved over the result in steps of
    ``stride``; each window is an input and the same window shifted right
    by one token is its target.
    """

    def __init__(self, txt: str, tokenizer: object, max_length: int, stride: int):
        self.tokenizer = tokenizer

        ids = tokenizer.encode(txt)
        # The last usable start leaves room for the one-token shift of the target.
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [None]:
class CustomDatasetV2(Dataset):
    """Sliding-window next-token dataset built from a DataFrame of text samples.

    Every value of the ``'Sample'`` column is encoded on its own with
    ``tokenizer.encode`` and cut into windows of ``max_length`` tokens taken
    every ``stride`` tokens. The target of a window is the same window shifted
    right by one token. Windows never span two rows, and a row with at most
    ``max_length`` tokens contributes no windows.

    Args:
        dataframe: frame with a ``'Sample'`` column holding the texts.
        tokenizer: object exposing ``encode(text) -> list[int]``.
        max_length: window length in tokens.
        stride: step between consecutive window starts.
    """

    def __init__(self, dataframe: pd.DataFrame, tokenizer: object, max_length: int, stride: int):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        for sample in dataframe['Sample']:
            token_ids = tokenizer.encode(sample)
            # A distinct name for the window start avoids shadowing any outer index.
            for start in range(0, len(token_ids) - max_length, stride):
                input_chunk = token_ids[start:start + max_length]
                target_chunk = token_ids[start + 1:start + max_length + 1]
                self.input_ids.append(torch.tensor(input_chunk))
                self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [None]:
cd = CustomDatasetV2(dataframe=data_parquet.iloc[:100], tokenizer=tok, max_length=1024, stride=1)

## Actual version of dataset

In [None]:
class CustomDatasetV3(Dataset):
    """One fixed-length next-token example per DataFrame row.

    Each ``'Sample'`` text is tokenized to exactly ``max_length + 1`` token ids
    (padded when shorter, truncated when longer). The first ``max_length`` ids
    form the input and the last ``max_length`` ids (shifted by one) form the
    target.

    Args:
        dataframe: frame with a ``'Sample'`` column holding the texts.
        tokenizer: callable tokenizer returning a mapping with ``'input_ids'``
            as a ``(1, n)`` tensor; it must have a pad token configured.
        max_length: number of tokens in each input / target sequence.
    """

    def __init__(self, dataframe: pd.DataFrame, tokenizer: object, max_length: int):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        for _, curr_chunk in tqdm(dataframe.iterrows(), total=dataframe.shape[0]):
            # truncation=True stops the tokenizer from returning (and warning about)
            # sequences longer than what is sliced below.
            token_ids = tokenizer(
                curr_chunk['Sample'],
                return_tensors='pt',
                padding='max_length',
                truncation=True,
                max_length=max_length + 1,
            )['input_ids']
            input_chunk = token_ids[:, :max_length].view(-1)
            target_chunk = token_ids[:, 1:max_length + 1].view(-1)
            self.input_ids.append(input_chunk)
            self.target_ids.append(target_chunk)

    def __len__(self):
        """Number of examples (one per DataFrame row)."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

# Load actual data and dataloader

In [None]:
data_parquet = pd.read_parquet(PARQUET_DATA_DIR)

In [None]:
data_parquet.shape

In [None]:
train_cd = CustomDatasetV3(dataframe=data_parquet.iloc[:100000], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])
#train_cd = CustomDatasetV3(dataframe=data_parquet.iloc[:100], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])

In [None]:
val_cd = CustomDatasetV3(dataframe=data_parquet.iloc[-10000:], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])
#val_cd = CustomDatasetV3(dataframe=data_parquet.iloc[-100:], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#

In [None]:
# Training batches are reshuffled every epoch. Validation is left unshuffled:
# the loss is averaged over the whole loader, so a fixed order changes nothing
# and keeps individual runs comparable.
train_data = DataLoader(dataset=train_cd, batch_size=4, shuffle=True, num_workers=0)
val_data = DataLoader(dataset=val_cd, batch_size=4, shuffle=False, num_workers=0)

In [None]:
next(iter(train_data))

# LLM Code

## Simple Attention

### With prints

In [9]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention that prints intermediate tensor shapes.

    Debug/teaching variant. A class with the same name is defined again later
    in this notebook and replaces this one once that cell has been executed.

    Args:
        d_in: size of the last dimension of the input.
        d_out: total output width; must be divisible by ``num_heads``.
        context_length: largest sequence length the causal mask supports.
        dropout: dropout probability applied to the attention weights.
        num_heads: number of attention heads.
        qkv_bias: whether the query/key/value projections use a bias.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0)
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # diagonal=1 masks only the positions strictly after the query. Without it
        # the diagonal is masked as well, the first row is entirely -inf and the
        # softmax returns NaN.
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        """Map ``x`` of shape ``(b, num_tokens, d_in)`` to ``(b, num_tokens, d_out)``."""
        b, num_tokens, d_in = x.size()
        keys = self.W_key(x) # b, num_tokens, self.d_out
        queries = self.W_query(x) # b, num_tokens, self.d_out
        values = self.W_value(x) # b, num_tokens, self.d_out
        print('values.shape is ' , values.shape)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        print('values.shape (after view) is ' , values.shape)

        keys = keys.transpose(1, 2) # b, self.num_heads, num_tokens, self.head_dim
        queries = queries.transpose(1, 2) # b, self.num_heads, num_tokens, self.head_dim
        values = values.transpose(1, 2) # b, self.num_heads, num_tokens, self.head_dim
        print('values.shape (after transpose) is ' , values.shape)

        # Per head: (num_tokens, head_dim) @ (head_dim, num_tokens) -> (num_tokens, num_tokens)
        att_scores = queries @ keys.transpose(2, 3)
        print('att_scores.shape is ' , att_scores.shape)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        att_scores.masked_fill_(mask_bool, -torch.inf)

        att_weights = torch.softmax(att_scores / keys.shape[-1]**.5, dim=-1)
        att_weights = self.dropout(att_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = (att_weights @ values).transpose(1, 2)
        print('context_vec.shape is ' , context_vec.shape)
        # Merge the heads back into one d_out-wide vector per token.
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        print('context_vec.shape is ' , context_vec.shape)
        context_vec = self.out_proj(context_vec)
        return context_vec

#### Check

In [None]:
mha = MultiHeadAttention(d_in=768, d_out=768, context_length=2048, dropout=0.1, num_heads=4)

In [None]:
torch.rand(10, 2048, 768).size()

In [None]:
a = mha(torch.rand(10, 2048, 768))

In [None]:
a.shape

### Without prints

In [10]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: size of the last dimension of the input.
        d_out: total output width; must be divisible by ``num_heads``.
        context_length: largest sequence length the causal mask supports.
        dropout: dropout probability applied to the attention weights.
        num_heads: number of attention heads.
        qkv_bias: whether the query/key/value projections use a bias.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0)

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions to hide.
        future = torch.ones(context_length, context_length).triu(diagonal=1)
        self.register_buffer('mask', future)

    def _split_heads(self, projected):
        """Reshape ``(b, tokens, d_out)`` into ``(b, num_heads, tokens, head_dim)``."""
        b, num_tokens, _ = projected.size()
        return projected.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Map ``x`` of shape ``(b, num_tokens, d_in)`` to ``(b, num_tokens, d_out)``."""
        b, num_tokens, _ = x.size()

        keys = self._split_heads(self.W_key(x))
        queries = self._split_heads(self.W_query(x))
        values = self._split_heads(self.W_value(x))

        # Per head: (num_tokens, head_dim) @ (head_dim, num_tokens) -> (num_tokens, num_tokens)
        scores = queries @ keys.transpose(2, 3)
        hide = self.mask.bool()[:num_tokens, :num_tokens]
        scores = scores.masked_fill(hide, -torch.inf)

        weights = self.dropout(torch.softmax(scores / self.head_dim**0.5, dim=-1))

        # Back to (b, num_tokens, num_heads, head_dim), then merge the heads.
        merged = (weights @ values).transpose(1, 2).contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)

## Other Attention

In [None]:
class CausalSelfAttention(nn.Module):
    """Causal multi-head self-attention built on PyTorch's fused attention kernel.

    Takes the same constructor arguments as ``MultiHeadAttention``. The causal
    mask is produced by ``scaled_dot_product_attention(is_causal=True)``, so
    ``context_length`` is accepted only for signature parity and is not used.

    Args:
        d_in: size of the last dimension of the input.
        d_out: total output width; must be divisible by ``num_heads``.
        context_length: unused (see above).
        dropout: attention dropout probability, applied only in training mode.
        num_heads: number of attention heads.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads,):
        super().__init__()
        assert d_out % num_heads == 0
        # key, query, value projections for all heads, computed in one matmul
        self.c_attn = nn.Linear(d_in, 3 * d_out)
        # output projection
        self.c_proj = nn.Linear(d_out, d_out)
        # regularization
        self.dropout = dropout
        self.n_head = num_heads
        self.n_embd = d_out

    def forward(self, x):
        """Map ``x`` of shape ``(B, T, d_in)`` to ``(B, T, d_out)``."""
        B, T, _ = x.size() # batch size, sequence length, input width
        head_size = self.n_embd // self.n_head

        # One projection, then split the last dimension into q, k, v of width n_embd each.
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)

        # Referenced through `nn.functional`, which is already in scope via `from torch import nn`.
        y = nn.functional.scaled_dot_product_attention(
            q, k, v,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=True,
        )
        y = y.transpose(1, 2).contiguous().view(B, T, self.n_embd) # re-assemble all head outputs side by side
        # output projection
        return self.c_proj(y)

## Additional classes

In [11]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance so the division is always defined
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` to zero mean / unit (biased) variance along the last axis."""
        mu = x.mean(dim=-1, keepdim=True)
        sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
        x_hat = (x - mu) / torch.sqrt(sigma2 + self.eps)
        return self.scale * x_hat + self.shift

In [None]:
# class LayerNorm(nn.Module):
#     def __init__(self, emb_dim):
#         super().__init__()
#         self.eps = 1e-5
#         self.scale = nn.Parameter(torch.ones(emb_dim))
#         self.shift = nn.Parameter(torch.zeros(emb_dim))

#     def forward(self, x):
#         mean = x.mean(dim=-1, keepdim=True)
#         var = x.var(dim=-1, keepdim=True, unbiased=False)
#         norm_x = (x - mean) / torch.sqrt(var + self.eps)
#         return self.scale * norm_x + self.shift

In [None]:
# class GELU(nn.Module):
#     def __init__(self):
#         super().__init__()

#     def forward(self, x):
#         return 0.5 * x * (1 + torch.tanh( torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3)) ))

In [12]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        """Apply ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`` element-wise."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [13]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, apply GELU, project back.

    Reads ``cfg['emb_dim']`` for the input/output width.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        hidden = 4 * width

        expand = nn.Linear(width, hidden)
        activation = GELU()
        project = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the three stacked layers."""
        return self.layers(x)

## Transformer Block

In [14]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each inside a residual connection.

    Reads ``emb_dim``, ``context_length``, ``drop_rate``, ``n_heads`` and
    ``qkv_bias`` from ``cfg``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        self.attn = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg['context_length'],
            dropout=cfg['drop_rate'],
            num_heads=cfg['n_heads'],
            qkv_bias=cfg['qkv_bias'],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        # One dropout module is shared by both residual branches.
        self.drop_resid = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        """Apply both sub-layers; the output has the same shape as ``x``."""
        # normalise -> attention -> dropout, added back onto the input
        x = self.drop_resid(self.attn(self.norm1(x))) + x
        # normalise -> feed-forward -> dropout, added back onto the input
        x = self.drop_resid(self.ff(self.norm2(x))) + x
        return x

In [None]:
# class TransformerBlock(nn.Module):
#     def __init__(self, cfg):
#         super().__init__()
#         self.att = MultiHeadAttention(
#             d_in=cfg["emb_dim"],
#             d_out=cfg["emb_dim"],
#             context_length=cfg["context_length"],
#             num_heads=cfg["n_heads"], 
#             dropout=cfg["drop_rate"],
#             qkv_bias=cfg["qkv_bias"])
#         self.ff = FeedForward(cfg)
#         self.norm1 = LayerNorm(cfg["emb_dim"])
#         self.norm2 = LayerNorm(cfg["emb_dim"])
#         self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

#     def forward(self, x):
#         # Shortcut connection for attention block
#         shortcut = x
#         x = self.norm1(x)
#         x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
#         x = self.drop_shortcut(x)
#         x = x + shortcut  # Add the original input back

#         # Shortcut connection for feed forward block
#         shortcut = x
#         x = self.norm2(x)
#         x = self.ff(x)
#         x = self.drop_shortcut(x)
#         x = x + shortcut  # Add the original input back

#         return x

### Check

In [None]:
tb = TransformerBlock(GPT_CONFIG)

In [None]:
GPT_CONFIG

In [None]:
init_size = 10, 1024, 768

In [None]:
tb(torch.rand(*init_size, device='cpu')).size()

In [None]:
tb(torch.rand(*init_size, device='cpu'))

In [None]:
s = nn.Sequential(*[TransformerBlock(GPT_CONFIG) for _ in range(4)])

In [None]:
s(torch.rand(*init_size))

In [None]:
s(torch.rand(*init_size)).size()

In [None]:
init_size == s(torch.rand(*init_size)).size()

In [None]:
init_size == tb(torch.rand(init_size)).size()

## GPT class

In [15]:
class GPTModel(nn.Module):
    """GPT-style decoder: token + position embeddings, a stack of transformer blocks, a linear head.

    Reads ``vocab_size``, ``emb_dim``, ``context_length``, ``drop_rate`` and
    ``n_layers`` from ``cfg`` (and passes ``cfg`` on to every block).
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        vocab = cfg['vocab_size']

        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg['context_length'], width)
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_blocks = nn.Sequential(*blocks)
        self.final_norm = nn.LayerNorm(width)
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, in_idx):
        """Return logits of shape ``(batch, seq_len, vocab_size)`` for token ids ``in_idx`` of shape ``(batch, seq_len)``."""
        _, seq_len = in_idx.size()
        # Positions 0..seq_len-1, created on the same device as the token ids.
        positions = torch.arange(seq_len, device=in_idx.device)

        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.final_norm(self.trf_blocks(hidden))
        return self.out_head(hidden)

### Check

In [None]:
m = GPTModel(GPT_CONFIG)

In [None]:
# Push a single training batch through the untrained model as a smoke test.
# (`data_load` is not defined anywhere in the notebook; the loader is `train_data`.)
x, y = next(iter(train_data))
print(x.size())
r = m(x)

In [None]:
r.size()

## Generation

In [16]:
def generate(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: module mapping token ids ``(batch, n)`` to logits ``(batch, n, vocab)``.
        idx: integer tensor of shape ``(batch, n)`` holding the prompt token ids.
        max_new_tokens: number of tokens to append.
        context_size: at most this many trailing tokens are fed to the model per step.

    Returns:
        Tensor of shape ``(batch, n + max_new_tokens)``.

    Note:
        The model is switched to eval mode and is left in that mode.
    """
    model.eval()
    produced = 0
    while produced < max_new_tokens:
        window = idx[:, -context_size:]
        with torch.no_grad():
            last_logits = model(window)[:, -1, :]
        # Highest-probability token for every sequence in the batch.
        next_token = torch.softmax(last_logits, dim=-1).argmax(dim=-1, keepdim=True)
        idx = torch.cat((idx, next_token), dim=1)
        produced += 1
    return idx

### Check

In [None]:
torch.tensor(tok('Привет, как дела ')['input_ids']).unsqueeze(0), #tok('Привет, как дела mhjm', return_tensors='pt', padding='max_length', max_length=2048)['input_ids'].shape
# tok('Привет, как дела mhjm', return_tensors='pt', padding='max_length', max_length=1024)['input_ids'].shape

In [None]:
tok.decode(generate(model=m, idx=torch.tensor(tok('Привет, как дела ')['input_ids']).unsqueeze(0), max_new_tokens=20, context_size=1024).squeeze(0).tolist())

In [None]:
torch.tensor([1,2,3]).unsqueeze(0)

In [None]:
torch.tensor([1,2,3]).unsqueeze(1)

In [None]:
torch.tensor([[1,2,3]]).squeeze(0)

## Training

In [None]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [None]:
class Trainer():
    """Minimal training loop for a next-token language model.

    Args:
        optimizer: optimizer already bound to the model's parameters.
        params: dict with the keys
            ``'N_EPOCHS'`` (int) – number of passes over the training loader;
            ``'verbose'`` (bool) – enable the periodic checks;
            ``'verbose_freq'`` (int) – a check runs each time the number of
            processed tokens crosses a multiple of this value (read only when
            ``'verbose'`` is True);
            ``'gradients'`` (bool, optional) – also log the mean absolute gradient;
            ``'context_size'`` (int, optional, default 1024) – context window
            used when generating the sample text.
        device: device every batch is moved to.
    """

    def __init__(self, optimizer, params, device):
        self.optimizer = optimizer
        self.params = params
        self.device = device

    def _log_gradients(self, model, writer, step):
        """Log the mean over weight tensors of their mean absolute gradient."""
        grads = [
            param.grad.abs().mean().item()
            for name, param in model.named_parameters()
            if 'weight' in name and param.grad is not None  # frozen/unused params have no grad
        ]
        if grads:
            writer.add_scalar("train/gradients", sum(grads) / len(grads), step)

    def _evaluate(self, model, val_dataloader):
        """Return the per-batch validation losses as floats (leaves the model in eval mode)."""
        model.eval()
        losses = []
        with torch.no_grad():
            for val_x, val_y in val_dataloader:
                val_x, val_y = val_x.to(self.device), val_y.to(self.device)
                val_logits = model(val_x)
                batch_loss = nn.functional.cross_entropy(val_logits.flatten(0, 1), val_y.flatten())
                losses.append(batch_loss.item())
        return losses

    def train_model(self, model, tokenizer, train_dataloader, val_dataloader, writer=None):
        """Train ``model`` and, when ``params['verbose']`` is True, run periodic checks.

        A check generates a sample text, prints the current loss, runs a full
        pass over ``val_dataloader`` and, if ``writer`` is given, logs
        everything with the optimisation step as the x-axis.

        Args:
            model: module mapping token ids ``(batch, n)`` to logits ``(batch, n, vocab)``.
            tokenizer: used only for the sample text (callable + ``decode``).
            train_dataloader: iterable of ``(inputs, targets)`` batches.
            val_dataloader: iterable of ``(inputs, targets)`` batches.
            writer: optional TensorBoard ``SummaryWriter``; closed when training ends.

        Returns:
            ``(train_loss, val_loss)``: per-step training losses and per-batch
            validation losses of every check, as lists of floats.
        """
        train_loss = []
        val_loss = []
        tokens_get = 0
        step = 0
        reported = 0  # index in train_loss of the first step not yet covered by a check
        verbose = self.params['verbose'] is True

        for epoch in range(self.params['N_EPOCHS']):
            for x, y in train_dataloader:
                # Sample generation and validation switch the model to eval mode.
                model.train()
                x, y = x.to(self.device), y.to(self.device)
                self.optimizer.zero_grad()
                logits = model(x)
                loss = nn.functional.cross_entropy(logits.flatten(0, 1), y.flatten())
                loss.backward()
                self.optimizer.step()

                # Keep a plain float: storing the tensor would keep its autograd
                # graph (and device memory) alive for every step.
                loss_value = loss.item()
                train_loss.append(loss_value)
                tokens_before = tokens_get
                tokens_get += x.numel()
                step += 1

                if not verbose:
                    continue
                freq = self.params['verbose_freq']
                # Fires whenever a multiple of `freq` tokens has been crossed, so
                # it also works when `freq` is not a multiple of the batch size.
                if tokens_get // freq == tokens_before // freq:
                    continue

                prompt = torch.tensor(tokenizer('Я большая языковая модель и ')['input_ids'], device=self.device).unsqueeze(0)
                generated = generate(model=model, idx=prompt, max_new_tokens=25,
                                     context_size=self.params.get('context_size', 1024))
                sample = tokenizer.decode(generated.squeeze(0).tolist())
                print(f'Epoch {epoch}: Train loss = {loss_value}, sample: {sample}')

                if (writer is not None):
                    # The global step is the x-axis; using the epoch would stack
                    # every point of an epoch on the same x value.
                    writer.add_scalar("Loss/train in step", loss_value, step)
                    writer.add_text("Sample", str(sample), step)
                    if (self.params.get('gradients', False) is True):
                        self._log_gradients(model, writer, step)

                check_losses = self._evaluate(model, val_dataloader)
                val_loss.extend(check_losses)
                if (writer is not None):
                    recent = train_loss[reported:]
                    writer.add_scalar("Loss/train in check", sum(recent) / len(recent), step)
                    if check_losses:
                        writer.add_scalar("Loss/val in check", sum(check_losses) / len(check_losses), step)
                reported = len(train_loss)

        # Close once, after the last epoch, and only when a writer was supplied.
        if (writer is not None):
            writer.close()
        return train_loss, val_loss
            

In [None]:
params = {'N_EPOCHS': 5, 
          'verbose': True, 
          'verbose_freq': 1,
          'gradients': True}

In [None]:
model = GPTModel(GPT_CONFIG)

In [None]:
model = model.to(device)

In [None]:
opt = torch.optim.AdamW(params=model.parameters(), lr=0.01)
trainer = Trainer(optimizer=opt, params=params, device=device)

In [None]:
trainer.train_model(model=model, tokenizer=tok, train_dataloader=train_data, val_dataloader=val_data, writer=writer)
writer.flush()

How to use TensorBoard?  
Run `tensorboard --logdir=GPT_training` (replace `GPT_training` with the name of your own log directory) or `tensorboard --logdir=runs`, then open  
http://localhost:6006  

### Saving weights

In [None]:
#torch.save(model.state_dict(), "model.pth") # without state of optimizer
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': opt.state_dict(),
    }, "model_and_optimizer.pth") # with state of optimizer

### Loading weights

In [None]:
# The model below is created on the CPU, so the checkpoint is mapped to the CPU
# as well. Without map_location, tensors saved from a GPU run are restored onto
# that GPU and loading fails on a machine without CUDA.
checkpoint = torch.load("model_and_optimizer.pth", map_location='cpu')
model = GPTModel(GPT_CONFIG)
model.load_state_dict(checkpoint['model_state_dict'])
# The optimizer must be rebuilt on the new model's parameters before its state is restored.
opt = torch.optim.AdamW(model.parameters(), lr=0.01)
opt.load_state_dict(checkpoint['optimizer_state_dict'])
model.train()

In [None]:
torch.randint(0, 100, size=(10, 1024)).size()

In [None]:
model(torch.randint(0, 100, size=(50, 1024))).size()

### Check