In [None]:
# Torchtext is not available for the latest PyTorch version, so we will use something else ...


In [2]:
from transformers import Trainer, BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
# Reset the random seed(s) through the project helper before any stochastic work.
# NOTE(review): base.reset_seed is defined outside this notebook; which RNGs it seeds is not visible here.
base.reset_seed()

In [4]:
# Dataset identifier; interpolated into every data, results, logs and models path used below.
DATASET = "trec"

In [5]:
# Pick the compute device once: CUDA when a GPU is visible, CPU otherwise,
# and report the choice so it is recorded in the notebook output.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA H100 PCIe


In [6]:
# Load the pre-built splits from disk in a fixed order: original train / eval / test,
# followed by the augmented training set.
# NOTE(review): `eval` shadows the Python builtin; the name is kept because later cells use it.
train, eval, test, train_aug = [
    load_from_disk(f"~/data/{DATASET}/{split_dir}")
    for split_dir in (
        "train-logits_coarse",
        "eval-logits_coarse",
        "test-logits_coarse",
        "train-logits-augmented_coarse",
    )
]

In [7]:
# Tokenizer taken from the TREC question-classification BERT checkpoint.
# NOTE(review): the classifiers below are initialised from google/bert_uncased_L-2_H-128_A-2;
# presumably both checkpoints share the same vocabulary — confirm.
tokenizer = BertTokenizer.from_pretrained("carrassi-ni/bert-base-trec-question-classification")

In [8]:
def _tokenize_batch(batch):
    """Tokenize the "sentence" column of a batch, truncating/padding every example to 300 tokens."""
    return tokenizer(
        batch["sentence"],
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        max_length=300,
    )


# Apply the same tokenization to every split (batched map, one progress bar per split).
train = train.map(_tokenize_batch, batched=True, desc="Tokenizing the train dataset")
eval = eval.map(_tokenize_batch, batched=True, desc="Tokenizing the eval dataset")
test = test.map(_tokenize_batch, batched=True, desc="Tokenizing the test dataset")

train_aug = train_aug.map(_tokenize_batch, batched=True, desc="Tokenizing the augmented dataset")

Tokenizing the train dataset:   0%|          | 0/4361 [00:00<?, ? examples/s]

Tokenizing the eval dataset:   0%|          | 0/1091 [00:00<?, ? examples/s]

Tokenizing the test dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing the augmented dataset:   0%|          | 0/38920 [00:00<?, ? examples/s]

In [9]:
# Re-seed right before creating the baseline model.
# NOTE(review): presumably this makes the classifier-head initialisation repeatable — confirm in base.reset_seed.
base.reset_seed()

In [10]:
# Baseline classifier: small pretrained BERT checkpoint with a 6-label classification head.
model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=6,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Training configuration for the baseline run on the original training split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-base_coarse",
    batch_size=128,
    epochs=20,
    lr=4.5e-4,
    weight_decay=3e-3,
    warmup_steps=3,
)

In [12]:
# Baseline trainer: train on `train`, validate on `eval`, and stop early after
# 4 evaluations without improvement (a fresh callback instance for this run).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [13]:
# Run baseline training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5363,1.22247,0.570119,0.549572,0.473997,0.483419
2,1.0084,0.848894,0.698442,0.611361,0.595233,0.600865
3,0.6388,0.71965,0.740605,0.631639,0.636751,0.632269
4,0.4019,0.651967,0.773602,0.829526,0.67306,0.679879
5,0.2697,0.688891,0.783685,0.80708,0.736588,0.755685
6,0.175,0.689048,0.789184,0.818088,0.749549,0.772399
7,0.1249,0.726433,0.793767,0.823404,0.762404,0.783551
8,0.094,0.738296,0.808433,0.820707,0.773406,0.790942
9,0.0698,0.772733,0.797434,0.818997,0.76387,0.782488
10,0.0581,0.890894,0.778185,0.7913,0.751188,0.763892


TrainOutput(global_step=420, training_loss=0.37106678372337704, metrics={'train_runtime': 105.1644, 'train_samples_per_second': 829.368, 'train_steps_per_second': 6.656, 'total_flos': 39005907393600.0, 'train_loss': 0.37106678372337704, 'epoch': 12.0})

In [14]:
# Switch the baseline model to evaluation mode; as the last expression of the cell,
# the returned module is displayed (the module structure printed below).
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [15]:
# Evaluate the baseline model on the test split; the metrics dict is displayed as the cell result.
trainer.evaluate(test)

{'eval_loss': 0.5802797675132751,
 'eval_accuracy': 0.856,
 'eval_precision': 0.8348930891921884,
 'eval_recall': 0.8464387582299363,
 'eval_f1': 0.8394458598059482,
 'eval_runtime': 3.2883,
 'eval_samples_per_second': 152.055,
 'eval_steps_per_second': 1.216,
 'epoch': 12.0}

In [16]:
# Persist the baseline model's weights (state_dict only) under ~/models/<DATASET>/.
# The directory is created first so the save does not fail when ~/models/<DATASET>
# does not exist yet on a fresh machine.
model_dir = os.path.join(os.path.expanduser("~"), "models", DATASET)
os.makedirs(model_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_dir, "bert_coarse.pth"))

In [17]:
# Re-seed before the distillation run so it starts from the same seed state as the baseline run.
# NOTE(review): exact seeding behaviour lives in base.reset_seed, which is not visible here.
base.reset_seed()

In [18]:
# Student for distillation: same small BERT checkpoint and 6-label head as the baseline.
student_model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=6,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Training configuration for the distillation run on the original training split.
# remove_unused_columns=False keeps the extra dataset columns in each batch.
# NOTE(review): presumably the stored teacher logits are among those columns, and temp / lambda_param
# are the distillation temperature and loss-mixing weight — confirm in base.DistilTrainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-distill_coarse",
    remove_unused_columns=False,
    batch_size=128,
    epochs=20,
    lr=4e-4,
    weight_decay=6e-3,
    warmup_steps=3,
    temp=2,
    lambda_param=0.7,
)

In [20]:
# Distillation trainer for the student on the original training split; early stopping
# after 4 evaluations without improvement (a fresh callback instance for this run).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [21]:
# Run distillation training of the student; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6551,3.07998,0.566453,0.554482,0.468039,0.477394
2,2.6501,2.199739,0.673694,0.589172,0.574548,0.578777
3,1.7741,1.752752,0.724106,0.61593,0.623426,0.618525
4,1.1574,1.483421,0.76352,0.654551,0.656711,0.652628
5,0.8132,1.411108,0.771769,0.651235,0.663692,0.656045
6,0.523,1.39017,0.787351,0.806785,0.721162,0.74085
7,0.3662,1.496239,0.780935,0.80223,0.735453,0.753162
8,0.2742,1.403582,0.791017,0.814816,0.742884,0.764084
9,0.2088,1.460895,0.792851,0.81972,0.752794,0.774937
10,0.1801,1.530166,0.787351,0.8176,0.747963,0.770322


TrainOutput(global_step=455, training_loss=0.9188931323669769, metrics={'train_runtime': 63.6999, 'train_samples_per_second': 1369.232, 'train_steps_per_second': 10.989, 'total_flos': 42256399676400.0, 'train_loss': 0.9188931323669769, 'epoch': 13.0})

In [22]:
# Switch the distilled student to evaluation mode; the returned module is displayed as the cell result.
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [23]:
# Evaluate the distilled student on the test split; the metrics dict is displayed as the cell result.
# NOTE(review): eval_loss here comes from base.DistilTrainer and may not be comparable to the baseline's loss — confirm.
trainer.evaluate(test)

{'eval_loss': 1.2811310291290283,
 'eval_accuracy': 0.834,
 'eval_precision': 0.8569683099432779,
 'eval_recall': 0.8300146654932593,
 'eval_f1': 0.8394165455730587,
 'eval_runtime': 3.7599,
 'eval_samples_per_second': 132.981,
 'eval_steps_per_second': 1.064,
 'epoch': 13.0}

In [24]:
# Persist the distilled student's weights (state_dict only) under ~/models/<DATASET>/.
# The directory is created first so the save does not fail when ~/models/<DATASET>
# does not exist yet on a fresh machine.
model_dir = os.path.join(os.path.expanduser("~"), "models", DATASET)
os.makedirs(model_dir, exist_ok=True)
torch.save(student_model.state_dict(), os.path.join(model_dir, "bert-distil_coarse.pth"))

In [25]:
# Re-seed before the baseline run on the augmented training set.
# NOTE(review): exact seeding behaviour lives in base.reset_seed, which is not visible here.
base.reset_seed()

In [26]:
# Fresh baseline classifier for the augmented-data run (rebinds `model` to a newly loaded
# checkpoint with a 6-label head; the previously trained baseline object is no longer referenced by this name).
model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=6,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Training configuration for the baseline run on the augmented training split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base-aug_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-base-aug_coarse",
    batch_size=128,
    epochs=20,
    lr=4e-5,
    weight_decay=1e-3,
    warmup_steps=18,
)

In [28]:
# Baseline trainer on the augmented training set; validation still uses the original `eval` split.
# Early stopping after 4 evaluations without improvement (a fresh callback instance for this run).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [29]:
# Run baseline training on the augmented data; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.496,1.248581,0.557287,0.601646,0.450059,0.460087
2,0.9763,0.849538,0.72594,0.787096,0.631521,0.63894
3,0.6535,0.705517,0.769019,0.826046,0.676485,0.692036
4,0.4561,0.64284,0.800183,0.827152,0.748976,0.773718
5,0.3393,0.628436,0.80385,0.827224,0.751681,0.775348
6,0.2713,0.635871,0.80385,0.82778,0.752028,0.775711
7,0.2275,0.652949,0.802933,0.821816,0.751576,0.772673
8,0.1941,0.655493,0.810266,0.828087,0.757634,0.778801
9,0.1697,0.671555,0.809349,0.814188,0.766241,0.782916
10,0.154,0.672345,0.811182,0.815579,0.767756,0.784397


TrainOutput(global_step=5795, training_loss=0.31153567876149296, metrics={'train_runtime': 176.1713, 'train_samples_per_second': 4418.428, 'train_steps_per_second': 34.625, 'total_flos': 551174967504000.0, 'train_loss': 0.31153567876149296, 'epoch': 19.0})

In [30]:
# Switch the augmented-data baseline to evaluation mode; the returned module is displayed as the cell result.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [31]:
# Evaluate the augmented-data baseline on the test split; the metrics dict is displayed as the cell result.
trainer.evaluate(test)

{'eval_loss': 0.6101865172386169,
 'eval_accuracy': 0.844,
 'eval_precision': 0.8648928082893651,
 'eval_recall': 0.8401511207356247,
 'eval_f1': 0.8482508432332837,
 'eval_runtime': 3.1758,
 'eval_samples_per_second': 157.442,
 'eval_steps_per_second': 1.26,
 'epoch': 19.0}

In [32]:
# Persist the augmented-data baseline's weights (state_dict only) under ~/models/<DATASET>/.
# The directory is created first so the save does not fail when ~/models/<DATASET>
# does not exist yet on a fresh machine.
model_dir = os.path.join(os.path.expanduser("~"), "models", DATASET)
os.makedirs(model_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_dir, "bert-base-aug_coarse.pth"))

In [33]:
# Re-seed before the distillation run on the augmented training set.
# NOTE(review): exact seeding behaviour lives in base.reset_seed, which is not visible here.
base.reset_seed()

In [34]:
# Fresh student for the augmented-data distillation run (rebinds `student_model` to a newly
# loaded checkpoint with a 6-label head).
student_model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=6,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Training configuration for the distillation run on the augmented training split.
# remove_unused_columns=False keeps the extra dataset columns in each batch.
# NOTE(review): presumably the stored teacher logits are among those columns, and temp / lambda_param
# are the distillation temperature and loss-mixing weight — confirm in base.DistilTrainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill-aug_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-distill-aug_coarse",
    remove_unused_columns=False,
    batch_size=128,
    epochs=20,
    lr=2.5e-4,
    weight_decay=1e-3,
    warmup_steps=2,
    temp=2.5,
    lambda_param=0.7,
)

In [36]:
# Distillation trainer for the student on the augmented training set; validation uses the
# original `eval` split. Early stopping after 4 evaluations without improvement
# (a fresh callback instance for this run).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [37]:
# Run distillation training on the augmented data; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1246,1.558991,0.789184,0.793173,0.704948,0.716738
2,0.4473,1.318299,0.819432,0.822149,0.775491,0.791324
3,0.2465,1.197168,0.828598,0.848837,0.781989,0.80415
4,0.1913,1.177543,0.831347,0.846866,0.784839,0.80382
5,0.1601,1.185049,0.833181,0.851305,0.784687,0.807069
6,0.1401,1.299103,0.819432,0.827712,0.783127,0.799048
7,0.1236,1.260248,0.825848,0.819839,0.78839,0.799696
8,0.1129,1.28625,0.822181,0.8183,0.795847,0.803345
9,0.1055,1.240404,0.834097,0.829876,0.80451,0.814127
10,0.0995,1.230714,0.84143,0.846218,0.810128,0.824107


TrainOutput(global_step=4270, training_loss=0.29244671671954475, metrics={'train_runtime': 196.1706, 'train_samples_per_second': 3967.975, 'train_steps_per_second': 31.095, 'total_flos': 406128923424000.0, 'train_loss': 0.29244671671954475, 'epoch': 14.0})

In [38]:
# Switch the augmented-data student to evaluation mode; the returned module is displayed as the cell result.
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [39]:
# Evaluate the augmented-data student on the test split; the metrics dict is displayed as the cell result.
# NOTE(review): eval_loss here comes from base.DistilTrainer and may not be comparable to the baseline's loss — confirm.
trainer.evaluate(test)

{'eval_loss': 1.4142205715179443,
 'eval_accuracy': 0.844,
 'eval_precision': 0.8192696828241198,
 'eval_recall': 0.8391381419448681,
 'eval_f1': 0.8277745959900488,
 'eval_runtime': 3.4266,
 'eval_samples_per_second': 145.919,
 'eval_steps_per_second': 1.167,
 'epoch': 14.0}

In [40]:
# Persist the augmented-data student's weights (state_dict only) under ~/models/<DATASET>/.
# The directory is created first so the save does not fail when ~/models/<DATASET>
# does not exist yet on a fresh machine.
model_dir = os.path.join(os.path.expanduser("~"), "models", DATASET)
os.makedirs(model_dir, exist_ok=True)
torch.save(student_model.state_dict(), os.path.join(model_dir, "bert-distil-aug_coarse.pth"))