In [1]:
#Torchtext není k dispozici pro poslední verzi pytorch, budeme tedy využuívat něco jiného ...


In [1]:
from transformers import Trainer, BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
import base
import os 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
base.reset_seed()

In [3]:
DATASET = "trec"

In [4]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA H100 PCIe


In [5]:
train = load_from_disk(f"~/data/{DATASET}/train-logits_fine")
eval = load_from_disk(f"~/data/{DATASET}/eval-logits_fine")
test = load_from_disk(f"~/data/{DATASET}/test-logits_fine")

train_aug = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_fine")

In [6]:
tokenizer = BertTokenizer.from_pretrained("ndavid/autotrain-trec-fine-bert-739422530")

In [7]:
train = train.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the train dataset")
eval = eval.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the eval dataset")
test = test.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the test dataset")

train_aug = train_aug.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the augmented dataset")

Tokenizing the train dataset:   0%|          | 0/4361 [00:00<?, ? examples/s]

Tokenizing the eval dataset:   0%|          | 0/1091 [00:00<?, ? examples/s]

Tokenizing the test dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing the augmented dataset:   0%|          | 0/66864 [00:00<?, ? examples/s]

In [8]:
base.reset_seed()

In [9]:
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base_fine", logging_dir=f"~/logs/{DATASET}/bert-base_fine", lr=0.0005, weight_decay=.01, warmup_steps=4, batch_size=128, epochs=20)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.387,2.840134,0.432631,0.064656,0.10383,0.076528
2,2.4907,2.123035,0.571952,0.225552,0.207389,0.188044
3,1.8443,1.671466,0.667278,0.320821,0.306982,0.289364
4,1.4066,1.416176,0.72594,0.39065,0.378789,0.359673
5,1.1059,1.262823,0.742438,0.385176,0.39092,0.370337
6,0.8747,1.166682,0.747021,0.418634,0.405066,0.389456
7,0.717,1.122182,0.740605,0.436371,0.409665,0.404332
8,0.6079,1.078695,0.753437,0.460445,0.443123,0.433565
9,0.5009,1.034985,0.764436,0.517015,0.464773,0.466777
10,0.4216,1.028733,0.767186,0.480723,0.481258,0.472726


TrainOutput(global_step=700, training_loss=0.7814531803131104, metrics={'train_runtime': 105.6302, 'train_samples_per_second': 825.71, 'train_steps_per_second': 6.627, 'total_flos': 65900954952000.0, 'train_loss': 0.7814531803131104, 'epoch': 20.0})

In [13]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [14]:
trainer.evaluate(test)

{'eval_loss': 1.0739054679870605,
 'eval_accuracy': 0.75,
 'eval_precision': 0.5956436960292529,
 'eval_recall': 0.6108920151191847,
 'eval_f1': 0.5633155299167988,
 'eval_runtime': 25.5307,
 'eval_samples_per_second': 19.584,
 'eval_steps_per_second': 0.157,
 'epoch': 20.0}

In [15]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert_fine.pth")

In [37]:
base.reset_seed()

In [38]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distill_fine", logging_dir=f"~/logs/{DATASET}/bert-distill_fine", remove_unused_columns=False, lr=0.0005, weight_decay=.003, warmup_steps=4, batch_size=128, epochs=20, temp=6, lambda_param=.4)

In [40]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.388,2.022393,0.439963,0.082278,0.111877,0.082839
2,1.7746,1.518177,0.581118,0.242029,0.216172,0.195616
3,1.3262,1.214127,0.678277,0.284868,0.298381,0.276712
4,1.0407,1.040531,0.72594,0.319425,0.341143,0.320662
5,0.8512,0.942741,0.734189,0.321513,0.349247,0.326995
6,0.6995,0.880714,0.749771,0.375358,0.381842,0.363885
7,0.5915,0.842096,0.745188,0.350238,0.368303,0.351581
8,0.5177,0.80935,0.76352,0.437652,0.415115,0.409491
9,0.4504,0.794824,0.764436,0.480924,0.425622,0.430564
10,0.395,0.786408,0.76077,0.491696,0.44703,0.446788


TrainOutput(global_step=700, training_loss=0.6327556821278163, metrics={'train_runtime': 99.3637, 'train_samples_per_second': 877.785, 'train_steps_per_second': 7.045, 'total_flos': 65900954952000.0, 'train_loss': 0.6327556821278163, 'epoch': 20.0})

In [42]:
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [43]:
trainer.evaluate(test)

{'eval_loss': 0.7448195219039917,
 'eval_accuracy': 0.772,
 'eval_precision': 0.5266354326636652,
 'eval_recall': 0.5743078647261858,
 'eval_f1': 0.5201327909952986,
 'eval_runtime': 3.6785,
 'eval_samples_per_second': 135.924,
 'eval_steps_per_second': 1.087,
 'epoch': 20.0}

In [44]:
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil_fine.pth")

In [45]:
base.reset_seed()

In [46]:
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base-aug_fine", logging_dir=f"~/logs/{DATASET}/bert-base-aug_fine", lr=0.0002, warmup_steps=20, batch_size=128, epochs=20)

In [48]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [49]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.657,1.124339,0.752521,0.440498,0.447612,0.431779
2,0.3922,0.991754,0.780935,0.59515,0.552515,0.55257
3,0.1502,1.024132,0.793767,0.709125,0.667845,0.667761
4,0.0746,1.06783,0.806599,0.816312,0.725097,0.749853
5,0.0438,1.141872,0.799267,0.796053,0.72697,0.740722
6,0.0298,1.184395,0.799267,0.803258,0.734878,0.751566
7,0.0227,1.246228,0.802016,0.799595,0.738313,0.749569
8,0.017,1.296766,0.797434,0.789797,0.737691,0.746359
9,0.0136,1.344361,0.797434,0.790948,0.74219,0.748242


TrainOutput(global_step=4707, training_loss=0.2667730106323774, metrics={'train_runtime': 117.5222, 'train_samples_per_second': 11378.952, 'train_steps_per_second': 89.004, 'total_flos': 454684855161600.0, 'train_loss': 0.2667730106323774, 'epoch': 9.0})

In [50]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [51]:
trainer.evaluate(test)

{'eval_loss': 1.112660527229309,
 'eval_accuracy': 0.802,
 'eval_precision': 0.7059030232013304,
 'eval_recall': 0.6940250900545852,
 'eval_f1': 0.6775414956968477,
 'eval_runtime': 3.1838,
 'eval_samples_per_second': 157.046,
 'eval_steps_per_second': 1.256,
 'epoch': 9.0}

In [52]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-base-aug_fine.pth")

In [73]:
base.reset_seed()

In [74]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distill-aug_fine", logging_dir=f"~/logs/{DATASET}/bert-distill-aug_fine", remove_unused_columns=False, lr=0.0005, batch_size=128, weight_decay=.008, warmup_steps=6, epochs=20, temp=4, lambda_param=.7)

In [76]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [77]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5798,0.544874,0.776352,0.562526,0.506004,0.510333
2,0.1289,0.543075,0.783685,0.676358,0.59018,0.616199
3,0.0897,0.529881,0.794684,0.72619,0.655666,0.671212
4,0.0786,0.522756,0.797434,0.773533,0.681308,0.709115
5,0.0732,0.552171,0.793767,0.805312,0.702308,0.735302
6,0.0693,0.561877,0.790101,0.827488,0.723885,0.755998
7,0.0676,0.537938,0.797434,0.820343,0.725125,0.752319
8,0.0657,0.521953,0.80385,0.824642,0.730822,0.757816
9,0.064,0.552106,0.791017,0.835892,0.721802,0.754901
10,0.0632,0.53256,0.804766,0.842718,0.722607,0.759278


TrainOutput(global_step=8368, training_loss=0.1028169580445007, metrics={'train_runtime': 234.3124, 'train_samples_per_second': 5707.251, 'train_steps_per_second': 44.641, 'total_flos': 808328631398400.0, 'train_loss': 0.1028169580445007, 'epoch': 16.0})

In [78]:
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [79]:
trainer.evaluate(test)

{'eval_loss': 0.44254034757614136,
 'eval_accuracy': 0.812,
 'eval_precision': 0.7194187710473471,
 'eval_recall': 0.6992282075148261,
 'eval_f1': 0.6826634614101706,
 'eval_runtime': 3.7661,
 'eval_samples_per_second': 132.763,
 'eval_steps_per_second': 1.062,
 'epoch': 16.0}

In [65]:
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil-aug_fine.pth")