# Imports

In [None]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import transformers
import pandas as pd
import os
import json
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from dotenv import load_dotenv
load_dotenv()

In [None]:
#from tqdm import tqdm
#tqdm.pandas()

Environment variables

In [None]:
# Paths and names are taken from environment variables (os.getenv returns
# None for any variable that is not set).
# NOTE(review): "PATH" is normally the OS executable search path — confirm this
# is really the variable intended here.
PATH = os.getenv("PATH")
DATAPATH = os.getenv("DATAPATH")  # raw text corpus file opened in the "Raw data" section
PREPARED_DATA_DIR = os.getenv("PREPARED_DATA_DIR")  # directory of the chunk_<i>.txt files
CACHE_DIR = os.getenv("CACHE_DIR")  # passed as cache_dir when the tokenizer is loaded
#TOK_NAME = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
TOK_NAME = os.getenv("TOK_NAME")  # identifier handed to AutoTokenizer.from_pretrained
PARQUET_DATA_DIR = os.getenv("PARQUET_DATA_DIR")  # path used by to_parquet / read_parquet
# Pre-tokenized CSV exports; hard-coded, machine-specific absolute paths.
CSV1 = 'G:\\My_files\\Programming\\My_projects\\LLM\\GPT-like_trained\\Data\\Processed\\CSV\\1_Tokenized_2049_padded.csv'
CSV2 = 'G:\\My_files\\Programming\\My_projects\\LLM\\GPT-like_trained\\Data\\Processed\\CSV\\2_Tokenized_2049_padded.csv'

## Config

In [None]:
# Model hyper-parameters. Values in trailing comments look like alternative /
# previously used settings (presumably the full-size configuration — confirm).
GPT_CONFIG = {
    # NOTE(review): the original note mentions 151670 for another tokenizer and
    # warns that tokenizer.vocab_size omits added tokens — confirm 50257
    # matches the tokenizer selected by TOK_NAME.
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768, #768
    'n_heads': 2,#12,
    'n_layers': 2,#12,
    'drop_rate': 0.05, # original note read "0l1"; presumably 0.1 was used before — confirm
    'qkv_bias': False
    }

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

# Dataset

## Load Tokenizer

In [None]:
# Load the tokenizer identified by TOK_NAME; CACHE_DIR is passed as the cache directory.
tok = transformers.AutoTokenizer.from_pretrained(TOK_NAME, cache_dir=CACHE_DIR)

Check tokenizer

In [None]:
tok.get_added_vocab

In [None]:
tok.vocab_size

In [None]:
# If tokenizer dont have pad_token
tok.pad_token = tok.eos_token

In [None]:
tok('Привет, как дела mhjm', return_tensors='pt', padding='max_length', max_length=2048)['input_ids'].shape

# Prepare data

## Raw data

In [None]:
# Read the whole raw corpus into memory as a single string.
with open(DATAPATH, encoding='utf8', mode='r') as file:
    d = file.read()

In [None]:
# Corpus length in characters.
len(d)

In [None]:
# presumably: corpus length // 131072-character stride, i.e. the number of
# full fixed-size chunks — TODO confirm 1423181938 == len(d)
1423181938//131072

In [None]:
# Split the raw corpus `d` into consecutive character ranges and write each one
# to PREPARED_DATA_DIR/chunk_<i>.txt.
num_chunks=25
# max(1, ...) keeps range() valid when the corpus is shorter than num_chunks.
stride = max(1, len(d)//num_chunks)

# When len(d) is not a multiple of num_chunks, one extra shorter chunk holds
# the remainder.
chunk_starts = range(0, len(d), stride)
for i, chunk_idx in tqdm(enumerate(chunk_starts), total=len(chunk_starts)):
    # Write explicitly as UTF-8: the chunks are read back with encoding='utf8',
    # while the platform default encoding may differ and can fail on
    # non-ASCII text.
    with open(os.path.join(PREPARED_DATA_DIR, f'chunk_{i}.txt'), encoding='utf8', mode='w') as file:
        file.write(d[chunk_idx:chunk_idx+stride])
    print(i, chunk_idx)

In [None]:
# Scratch experiment: walk the corpus with a fixed 131072-character stride and
# append one placeholder row per position (the chunk text itself is not used).
#num_chunks=25
stride = 131072#len(d)//num_chunks

data_parquet = pd.DataFrame([], columns=['Sample', 'Chunk'])
for i, chunk_idx in tqdm(enumerate(range(0, len(d), stride))):
    # Dummy row: fixed string as 'Sample', 0 as 'Chunk'.
    data_parquet.loc[len(data_parquet)] = ['sdgsgsg', 0]
    #with open(os.path.join(PREPARED_DATA_DIR, f'chunk_{i}.txt'), mode='w') as file:
    #    file.write(d[chunk_idx:chunk_idx+stride])
    print(i, chunk_idx)

In [None]:
def _iter_sentences(text):
    """Yield sentence-sized pieces of *text* with their delimiters kept.

    The text is split into paragraphs on newlines and into sentences on '.';
    every piece keeps its trailing '.' or newline, so joining all pieces
    reproduces *text* exactly.
    """
    paragraphs = text.split('\n')
    last_paragraph = len(paragraphs) - 1
    for p_idx, paragraph in enumerate(paragraphs):
        pieces = paragraph.split('.')
        last_piece = len(pieces) - 1
        for s_idx, piece in enumerate(pieces):
            if s_idx < last_piece:
                piece += '.'
            elif p_idx < last_paragraph:
                piece += '\n'
            if piece:
                yield piece


def _pack_samples(text, max_chars):
    """Yield samples made of whole sentences, each shorter than *max_chars*.

    A sentence that does not fit starts the next sample instead of being
    dropped, and the final partially filled sample is emitted too. A single
    sentence of max_chars characters or more becomes a sample on its own.
    """
    buffer = []
    size = 0
    for piece in _iter_sentences(text):
        if buffer and size + len(piece) >= max_chars:
            yield ''.join(buffer)
            buffer, size = [], 0
        buffer.append(piece)
        size += len(piece)
    if buffer:
        yield ''.join(buffer)


# Build the (Sample, Chunk) table: every prepared file is packed into samples
# of fewer than `stride` characters; 'Chunk' is the position of the file in
# the os.listdir() result.
stride = 2048*3
filenames = os.listdir(PREPARED_DATA_DIR)
rows = []
for i, filename in tqdm(enumerate(filenames), total=len(filenames)):
    with open(os.path.join(PREPARED_DATA_DIR, filename), encoding='utf8', mode='r') as file:
        current_file = file.read()
    rows.extend((sample, i) for sample in _pack_samples(current_file, stride))

# Create the frame once from the collected rows; growing it row by row with
# .loc makes every append cost time proportional to the current frame size.
data_parquet = pd.DataFrame(rows, columns=['Sample', 'Chunk'])

In [None]:
# Same expression as the per-sample character budget used when packing (6144).
2048*3

In [None]:
# Persist the sample table.
# NOTE(review): PARQUET_DATA_DIR is named like a directory but is passed as
# the output path — confirm it points to a file.
data_parquet.to_parquet(PARQUET_DATA_DIR)

In [None]:
# Peek at 50 characters of the text currently held in `d`.
d[200:250]

## Analysis

Небольшой анализ длины предложений

In [None]:
# Load the first file listed in PREPARED_DATA_DIR; this rebinds `d`.
with open(os.path.join(PREPARED_DATA_DIR, os.listdir(PREPARED_DATA_DIR)[0]), encoding='utf8', mode='r') as file:
    d = file.read()

In [None]:
# Naive sentence split on '.'.
splt = d.split('.')

In [None]:
# Length in characters of every piece.
lens = [len(elem) for elem in splt]

In [None]:
# Longest piece.
max(lens)

In [None]:
# Distribution of piece lengths, restricted to the 0..1000 character range.
plt.hist(lens, bins=20, range=(0, 1000))

Если взять длину абзацев

In [None]:
splt = d.split('/n')

In [None]:
splt[10]

In [None]:
len(splt)

In [None]:
lens = [len(elem) for elem in splt]

In [None]:
max(lens)

In [None]:
plt.hist(lens, bins=20)

Class for dataset

## Old versions of dataset

In [None]:
class CustomDatasetV1(Dataset):
    """Sliding-window next-token dataset built from one text string.

    The text is encoded once with ``tokenizer.encode``. A window of
    ``max_length`` ids is taken every ``stride`` positions; the target of a
    window is the same window shifted one position to the right.
    """

    def __init__(self, txt: str, tokenizer: object, max_length: int, stride: int):
        self.tokenizer = tokenizer

        encoded = tokenizer.encode(txt)
        # Stop early enough that every target window is complete.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair at *index*."""
        inputs = self.input_ids[index]
        targets = self.target_ids[index]
        return inputs, targets

In [None]:
class CustomDatasetV2(Dataset):
    """Sliding-window next-token dataset built from a table of text samples.

    Every value of the ``'Sample'`` column is encoded with
    ``tokenizer.encode``. Inside each sample a window of ``max_length`` ids is
    taken every ``stride`` positions; the target is the window shifted by one.
    Samples with ``max_length`` tokens or fewer produce no windows.

    Args:
        dataframe: table with a ``'Sample'`` column holding the texts.
        tokenizer: object exposing ``encode(text)`` that returns a list of ids.
        max_length: number of ids in each input/target window.
        stride: distance between the starts of consecutive windows.
    """

    def __init__(self, dataframe: pd.DataFrame, tokenizer: object, max_length: int, stride: int):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Only the 'Sample' column is needed, so iterate it directly instead
        # of building a Series per row with iterrows().
        for text in dataframe['Sample']:
            token_ids = tokenizer.encode(text)
            for start in range(0, len(token_ids) - max_length, stride):
                input_chunk = token_ids[start:start + max_length]
                target_chunk = token_ids[start + 1:start + max_length + 1]
                self.input_ids.append(torch.tensor(input_chunk))
                self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the total number of windows over all samples."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair at *index*."""
        return self.input_ids[index], self.target_ids[index]

In [None]:
# Smoke test on the first 100 samples; stride=1 yields one window per token
# position, so consecutive windows overlap almost completely.
cd = CustomDatasetV2(dataframe=data_parquet.iloc[:100], tokenizer=tok, max_length=1024, stride=1)

## Let's tokenize separately

Let's tokenize separately

In [None]:
data_parquet = pd.read_parquet(PARQUET_DATA_DIR)

In [None]:
data_parquet.shape[0] // 2

In [None]:
data_1 = data_parquet['Sample'].iloc[:100000].progress_apply(lambda curr_chunk: tok(curr_chunk, padding='max_length', max_length=2048+1)['input_ids'])

In [None]:
data_2 = data_parquet['Sample'].iloc[100000:].progress_apply(lambda curr_chunk: tok(curr_chunk, padding='max_length', max_length=2048+1)['input_ids'])

In [None]:
# Wrap the tokenized Series in a DataFrame (its column is accessed as 'Sample' below).
data_1 = pd.DataFrame(data_1)

In [None]:
data_2 = pd.DataFrame(data_2)

In [None]:
# Re-attach the source chunk index, using the same row ranges as the tokenization.
data_1['Chunk'] = data_parquet['Chunk'].iloc[:100000]

In [None]:
data_2['Chunk'] = data_parquet['Chunk'].iloc[100000:]

In [None]:
# Number of token ids per row.
data_1['Sample'].apply(len)

In [None]:
data_1.to_csv()

In [None]:
data_2.to_csv(CSV2)

In [None]:
# Alternative: tokenize in parallel with pandarallel.
from pandarallel import pandarallel
pandarallel.initialize()

In [None]:
def tokenize_tokens(curr_chunk):
    """Return the input ids of one text sample, padded to 1024+1 ids.

    No truncation argument is passed to the tokenizer.
    """
    return tok(curr_chunk, padding='max_length', max_length=1024+1)['input_ids']

In [None]:
# Apply the tokenizer to every sample using pandarallel's parallel_apply.
data = data_parquet['Sample'].parallel_apply(tokenize_tokens)

## Version of dataset with tokenizer

In [None]:
class CustomDatasetV3(Dataset):
    """Next-token dataset that tokenizes one text sample per item.

    Every value of the ``'Sample'`` column is tokenized with padding to
    ``max_length + 1`` ids. The first ``max_length`` ids form the input and
    the ids at positions ``1..max_length`` form the target.

    Args:
        dataframe: table with a ``'Sample'`` column holding the texts.
        tokenizer: callable tokenizer whose result has an ``'input_ids'``
            entry (called with ``return_tensors='pt'``).
        max_length: number of ids in each input/target sequence.
    """

    def __init__(self, dataframe: pd.DataFrame, tokenizer: object, max_length: int):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Only the 'Sample' column is needed, so iterate it directly instead
        # of building a Series per row with iterrows().
        for text in tqdm(dataframe['Sample'], total=dataframe.shape[0]):
            token_ids = tokenizer(text, return_tensors='pt', padding='max_length', max_length=max_length+1)['input_ids']
            # The slices cut over-long samples to size; view(-1) drops the
            # leading batch dimension.
            input_chunk = token_ids[:,:max_length].view(-1)
            target_chunk = token_ids[:,1:max_length+1].view(-1)
            self.input_ids.append(input_chunk)
            self.target_ids.append(target_chunk)

    def __len__(self):
        """Return the number of samples."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair at *index*."""
        return self.input_ids[index], self.target_ids[index]

## Version of dataset without tokenizer

In [None]:
class CustomDatasetV4(Dataset):
    """Next-token dataset built from already tokenized samples.

    Every value of the ``'Sample'`` column is a JSON-encoded list of token
    ids. The first ``max_length`` ids form the input and the ids at positions
    ``1..max_length`` form the target.

    Args:
        dataframe: table with a ``'Sample'`` column of JSON strings.
        max_length: number of ids in each input/target sequence.
    """

    def __init__(self, dataframe: pd.DataFrame, max_length: int):
        self.input_ids = []
        self.target_ids = []

        # NOTE (from the original author): a vectorised Series.apply was not
        # used because pandas cannot recognise the torch.Tensor type, hence
        # the explicit loop.
        for text in tqdm(dataframe['Sample'], total=dataframe.shape[0]):
            token_ids = torch.tensor(json.loads(text))
            input_chunk = token_ids[:max_length].view(-1)
            target_chunk = token_ids[1:max_length+1].view(-1)
            self.input_ids.append(input_chunk)
            self.target_ids.append(target_chunk)

    def __len__(self):
        """Return the number of samples."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair at *index*."""
        return self.input_ids[index], self.target_ids[index]

# Load data with tokenizer and dataloader

In [None]:
# Load the text samples; they are tokenized while the datasets are built.
data_parquet = pd.read_parquet(PARQUET_DATA_DIR)

In [None]:
data_parquet.shape

In [None]:
# Training set: first 100000 samples.
train_cd = CustomDatasetV3(dataframe=data_parquet.iloc[:100000], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])
#train_cd = CustomDatasetV3(dataframe=data_parquet.iloc[:100], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])

In [None]:
# Validation set: last 10000 samples.
# NOTE(review): if the table has fewer than 110000 rows this slice overlaps
# the training slice — verify against data_parquet.shape.
val_cd = CustomDatasetV3(dataframe=data_parquet.iloc[-10000:], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])
#val_cd = CustomDatasetV3(dataframe=data_parquet.iloc[-100:], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#

In [None]:
# Batches of 4, shuffled, loaded in the main process (num_workers=0).
train_data = DataLoader(dataset=train_cd, batch_size=4, shuffle=True, num_workers=0)
val_data = DataLoader(dataset=val_cd, batch_size=4, shuffle=True, num_workers=0)

In [None]:
# Fetch one (inputs, targets) batch as a smoke test.
next(iter(train_data))

# Load data without tokenizer and dataloader

In [None]:
# Load pre-tokenized samples; the first CSV column is used as the index.
data = pd.read_csv(CSV1, index_col=0)

In [None]:
# Training set: first 90000 rows.
train_cd = CustomDatasetV4(dataframe=data.iloc[:90000], max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])
#train_cd = CustomDatasetV3(dataframe=data_parquet.iloc[:100], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])

In [None]:
# Validation set: last 10000 rows.
# NOTE(review): overlaps the training rows if the CSV has fewer than 100000 rows — verify.
val_cd = CustomDatasetV4(dataframe=data.iloc[-10000:], max_length=GPT_CONFIG['context_length'])#MY_GPT_CONFIG['context_length'])
#val_cd = CustomDatasetV3(dataframe=data_parquet.iloc[-100:], tokenizer=tok, max_length=GPT_CONFIG['context_length'])#

In [None]:
# Batches of 8, shuffled, loaded in the main process (num_workers=0).
train_data = DataLoader(dataset=train_cd, batch_size=8, shuffle=True, num_workers=0)
val_data = DataLoader(dataset=val_cd, batch_size=8, shuffle=True, num_workers=0)

In [None]:
# First input sequence of one batch.
next(iter(train_data))[0][0]

In [None]:
# Decode one batch of inputs back to text to check the round trip.
tok.batch_decode(next(iter(train_data))[0]) # Works properly