# Notatnik demonstracyjny: trening end-to-end

In [None]:
# instalacja
!python -m venv .venv
!source .venv/bin/activate            # Windows: .\.venv\Scripts\Activate.ps1
!pip install --upgrade pip
!pip install -r requirements.txt

In [1]:
# NOTE(review): wildcard import — later cells use WordPieceTokenizerWrapper, Path and torch
# without importing them, so they presumably come from here; confirm and import explicitly.
from src.textclf_transformer import *
import pandas as pd
from datasets import load_dataset, concatenate_datasets
from sklearn.model_selection import train_test_split 
import os
# Set in the first cell, i.e. before any tokenization runs; presumably silences the
# Hugging Face tokenizers parallelism/fork warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


## Tokenizacja danych

Korzystamy ze zbioru danych IMDb.

In [2]:
# Pool both published IMDb splits into one frame so we can draw our own partition.
ds = load_dataset("imdb")
merged = concatenate_datasets([ds["test"], ds["train"]])
df = pd.DataFrame(merged)

# Both splits share the same fixed seed so the partition is reproducible.
split_kwargs = dict(random_state=42)

# Step 1: keep 80% for training, hold out 20% (stratified by label).
train_df, holdout_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], **split_kwargs
)

# Step 2: divide the held-out part evenly into test and validation (stratified by label).
test_df, val_df = train_test_split(
    holdout_df, test_size=0.5, stratify=holdout_df['label'], **split_kwargs
)

n_train, n_val, n_test = (len(part) for part in (train_df, val_df, test_df))
print(f"Train: {n_train} | Val: {n_val} | Test: {n_test}")

Train: 40000 | Val: 5000 | Test: 5000


Tokenizujemy dane z wykorzystaniem oryginalnego słownika BERT

In [3]:
# Initialise the tokenizer and load its files from the BERT_original directory
tokenizer = WordPieceTokenizerWrapper()
tokenizer.load("src/textclf_transformer/tokenizer/BERT_original")

# Demo settings: only the first DEMO_ROWS rows of every split are tokenized.
DEMO_ROWS = 320
MAX_LENGTH = 512


def encode_split(frame):
    """Tokenize the first DEMO_ROWS rows of `frame`.

    Text is read from the 'text' column and labels from the 'label' column;
    `max_length=MAX_LENGTH` is forwarded to `tokenizer.encode_pandas`.
    Returns whatever `encode_pandas` returns.
    """
    return tokenizer.encode_pandas(frame[:DEMO_ROWS], 'text',
                                   max_length=MAX_LENGTH,
                                   label_col='label')


tokens_train = encode_split(train_df)
tokens_val = encode_split(val_df)
tokens_test = encode_split(test_df)

# Save the tokenized splits. The target directory is created first because
# torch.save does not create missing parent directories.
save_dir = Path("./data/tokenized")
save_dir.mkdir(parents=True, exist_ok=True)
torch.save(tokens_train, save_dir / "imdb_train.pt")
torch.save(tokens_val, save_dir / "imdb_val.pt")
torch.save(tokens_test, save_dir / "imdb_test.pt")

## Pretrening


### Generowanie eksperymentu - pretrening

Generowanie eksperymentu na podstawie [szablonu konfiguracji pretreningu](experiments/config_templates/pretraining.yaml).

Wykorzystujemy skrypt `generate_pretraining_experiment.py`; flaga `-p` to nazwa eksperymentu pretreningu.

In [4]:
# Generate the pretraining experiment named by -p (imdb_demo_pre) with the generator script.
!python experiments/generate_pretraining_experiment.py -p imdb_demo_pre

[OK] Utworzono pretraining: C:\Users\bartekb\Desktop\inzynierka\fromscratch-transformer-classifier\experiments\pretraining\imdb_demo_pre
     config: C:\Users\bartekb\Desktop\inzynierka\fromscratch-transformer-classifier\experiments\pretraining\imdb_demo_pre\config.yaml


W wyniku powstaje katalog eksperymentu pretreningu wraz z [plikiem konfiguracyjnym](experiments/pretraining/imdb_demo_pre/config.yaml).

### Uruchomienie pretreningu
Uruchamiamy skrypt `train.py` z odpowiednimi flagami:
- `-n` nazwa eksperymentu
- `-m` tryb (`pretraining`|`finetuning`)

Wszystkie metryki można na bieżąco śledzić w [pliku z metrykami](experiments/pretraining/imdb_demo_pre/metrics/train/metrics.csv) w katalogu treningu (trening trwa 60 kroków – około 1–2 min).
> Na potrzeby dema nie korzystamy z wandb

In [20]:
# Run train.py for experiment imdb_demo_pre (-n) in pretraining mode (-m).
!python train.py -n imdb_demo_pre -m pretraining

Epoch: 0
Epoch: 1
Epoch: 2
[OK] Zapisano checkpoint: /Users/michaliwaniuk/fromscratch-transformer-classifier/experiments/pretraining/imdb_demo_pre/checkpoints/model.ckpt


## Finetuning

### Generowanie eksperymentu - finetuning

Generowanie eksperymentu na podstawie [szablonu konfiguracji finetuningu](experiments/config_templates/finetuning.yaml) 
oraz szablonu konfiguracji pretreningu.

Wykorzystujemy skrypt `generate_finetuning_experiment.py` z flagami:
- `-f` nazwa eksperymentu finetuning
- `-p` nazwa eksperymentu pretreningowego, z którego kontynuujemy trening



In [21]:
# Generate finetuning experiment imdb_demo_ft (-f) on top of pretraining experiment imdb_demo_pre (-p).
!python experiments/generate_finetuning_experiment.py -f imdb_demo_ft -p imdb_demo_pre

[OK] Utworzono finetuning: /Users/michaliwaniuk/fromscratch-transformer-classifier/experiments/finetuning/imdb_demo_ft
     config: /Users/michaliwaniuk/fromscratch-transformer-classifier/experiments/finetuning/imdb_demo_ft/config.yaml


W wyniku powstaje katalog eksperymentu finetuningu wraz z [plikiem konfiguracyjnym](experiments/finetuning/imdb_demo_ft/config.yaml).

### Uruchomienie finetuningu

Wszystkie metryki można na bieżąco śledzić w [pliku z metrykami](experiments/finetuning/imdb_demo_ft/metrics/train/metrics.csv) w katalogu treningu (trening trwa 60 kroków – kilkanaście sek.).
> Na potrzeby dema nie korzystamy z wandb

In [22]:
# Run train.py for experiment imdb_demo_ft (-n) in finetuning mode (-m).
!python train.py -n imdb_demo_ft -m finetuning

[WARN] Brakujące klucze: ['classifier.pooler.0.weight', 'classifier.pooler.0.bias', 'classifier.classifier.weight', 'classifier.classifier.bias']
[WARN] Nieoczekiwane klucze: ['mlm.transform.0.weight', 'mlm.transform.0.bias', 'mlm.transform.2.weight', 'mlm.transform.2.bias', 'mlm.decoder.weight']
Epoch: 0
Epoch: 1
[OK] Zapisano checkpoint: /Users/michaliwaniuk/fromscratch-transformer-classifier/experiments/finetuning/imdb_demo_ft/checkpoints/model.ckpt


## Wyniki

In [29]:
import pandas as pd

# Metric CSV files of both demo experiments, keyed by (stage, split).
metrics_csv = {
    ("pretraining", "train"): 'experiments/pretraining/imdb_demo_pre/metrics/train/metrics.csv',
    ("pretraining", "eval"): 'experiments/pretraining/imdb_demo_pre/metrics/eval/metrics.csv',
    ("finetuning", "train"): 'experiments/finetuning/imdb_demo_ft/metrics/train/metrics.csv',
    ("finetuning", "eval"): 'experiments/finetuning/imdb_demo_ft/metrics/eval/metrics.csv',
    ("finetuning", "test"): 'experiments/finetuning/imdb_demo_ft/metrics/test/metrics.csv',
}

# Read every file once, in the order listed above.
metrics = {key: pd.read_csv(csv_path) for key, csv_path in metrics_csv.items()}

# Names used by the display cells below.
df_pretraining_train_dataset = metrics[("pretraining", "train")]
df_pretraining_eval_dataset = metrics[("pretraining", "eval")]

df_finetuning_train_dataset = metrics[("finetuning", "train")]
df_finetuning_eval_dataset = metrics[("finetuning", "eval")]
df_finetuning_test_dataset = metrics[("finetuning", "test")]

### Pretrening

In [24]:
# Show the pretraining metrics read from metrics/train/metrics.csv.
df_pretraining_train_dataset


Unnamed: 0,step,train/loss,train/lr,train/grad_norm,train/is_update_step,train/accum_step,train/update_skipped,train/avg_epoch_loss,train/epoch
0,1,10.330363,0.000003,1.577415,1.0,1.0,0.0,,
1,2,10.326643,0.000007,1.294622,1.0,1.0,0.0,,
2,3,10.326793,0.000010,1.329112,1.0,1.0,0.0,,
3,4,10.330911,0.000013,1.415542,1.0,1.0,0.0,,
4,5,10.325354,0.000017,1.399057,1.0,1.0,0.0,,
...,...,...,...,...,...,...,...,...,...
58,57,10.247204,0.000004,1.258345,1.0,1.0,0.0,,
59,58,10.242476,0.000004,1.223702,1.0,1.0,0.0,,
60,59,10.233184,0.000004,1.353890,1.0,1.0,0.0,,
61,60,10.243557,0.000004,1.216970,1.0,1.0,0.0,,


In [30]:
# Show the pretraining metrics read from metrics/eval/metrics.csv.
df_pretraining_eval_dataset

Unnamed: 0,step,eval/loss,eval/perplexity,eval/num_tokens,eval/epoch
0,20,10.284577,29277.576435,13902.0,1
1,40,10.248557,28241.763482,13925.0,2
2,60,10.235097,27864.165419,13838.0,3


### Finetuning

In [27]:
# Show the finetuning metrics read from metrics/train/metrics.csv.
df_finetuning_train_dataset

Unnamed: 0,step,train/loss,train/lr,train/grad_norm,train/is_update_step,train/accum_step,train/update_skipped,train/avg_epoch_loss,train/epoch
0,1,1.291785,2.5e-05,51.605637,1.0,1.0,0.0,,
1,2,1.418729,5e-05,55.30024,1.0,1.0,0.0,,
2,3,1.203397,7.5e-05,50.923752,1.0,1.0,0.0,,
3,4,0.497355,0.0001,13.321792,1.0,1.0,0.0,,
4,5,1.076532,0.0001,61.456745,1.0,1.0,0.0,,
5,6,0.728189,9.9e-05,21.586802,1.0,1.0,0.0,,
6,7,0.676637,9.9e-05,14.168283,1.0,1.0,0.0,,
7,8,0.720814,9.8e-05,16.572878,1.0,1.0,0.0,,
8,9,0.778738,9.6e-05,28.768276,1.0,1.0,0.0,,
9,10,0.843969,9.5e-05,24.075056,1.0,1.0,0.0,,


In [25]:
# Show the finetuning metrics read from metrics/eval/metrics.csv.
df_finetuning_eval_dataset

Unnamed: 0,step,eval/loss,eval/num_examples,eval/accuracy,eval/balanced_accuracy,eval/precision_macro,eval/recall_macro,eval/f1_macro,eval/precision_micro,eval/recall_micro,...,eval/class_0_precision,eval/class_0_recall,eval/class_0_f1,eval/class_0_support,eval/class_1_precision,eval/class_1_recall,eval/class_1_f1,eval/class_1_support,eval/inference_time_s,eval/epoch
0,20,0.675381,320.0,0.590625,0.579839,0.620406,0.579839,0.547295,0.590625,0.590625,...,0.671642,0.292208,0.40724,154.0,0.56917,0.86747,0.687351,166.0,0.943746,1
1,40,0.669008,320.0,0.653125,0.647825,0.661558,0.647825,0.643341,0.653125,0.653125,...,0.690265,0.506494,0.58427,154.0,0.63285,0.789157,0.702413,166.0,0.903316,2


In [28]:
# Show the finetuning metrics read from metrics/test/metrics.csv.
df_finetuning_test_dataset

Unnamed: 0,step,test/loss,test/num_examples,test/accuracy,test/balanced_accuracy,test/precision_macro,test/recall_macro,test/f1_macro,test/precision_micro,test/recall_micro,...,test/class_0_precision,test/class_0_recall,test/class_0_f1,test/class_0_support,test/class_1_precision,test/class_1_recall,test/class_1_f1,test/class_1_support,test/inference_time_s,test/epoch
0,20,0.682144,320.0,0.5625,0.566573,0.61526,0.566573,0.513022,0.5625,0.5625,...,0.696429,0.240741,0.357798,162.0,0.534091,0.892405,0.668246,158.0,0.859217,1
1,40,0.675067,320.0,0.634375,0.635998,0.645533,0.635998,0.628862,0.634375,0.634375,...,0.689076,0.506173,0.58363,162.0,0.60199,0.765823,0.674095,158.0,0.885237,2
