In [1]:
# Torchtext is not available for the latest PyTorch version, so we will use something else instead ...


In [1]:
from transformers import Trainer, BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Reset random state through the project's `base` helper before anything stochastic runs.
# NOTE(review): which RNGs (python / numpy / torch) are seeded, and with what value,
# is decided inside base.reset_seed — confirm there.
base.reset_seed()

In [3]:
# Dataset identifier; it is interpolated into every data/, results/, logs/ and models/ path used in this notebook.
DATASET = "trec"

In [4]:
# Choose the compute device once: CUDA when a GPU is visible, CPU otherwise,
# and report the choice so the run log records the hardware that was used.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if not has_gpu:
    print("GPU is not available, using CPU.")
else:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [5]:
# Load the train / eval / test splits plus the augmented training split from disk.
# NOTE(review): the "-logits" suffix suggests each split also carries stored teacher
# logits for distillation — confirm against the script that produced these folders.
# NOTE: the name `eval` shadows Python's built-in eval() for the rest of the notebook.
data_root = f"~/data/{DATASET}"

train, eval, test = (
    load_from_disk(f"{data_root}/{split}-logits_coarse")
    for split in ("train", "eval", "test")
)

train_aug = load_from_disk(f"{data_root}/train-logits-augmented_coarse")

In [8]:
# Tokenizer shared by all four experiments in this notebook.
# NOTE(review): it is loaded from a different checkpoint than the models trained below
# ("google/bert_uncased_L-2_H-128_A-2") — confirm both use the same uncased BERT vocabulary.
tokenizer = BertTokenizer.from_pretrained("carrassi-ni/bert-base-trec-question-classification")

In [9]:
# Fixed sequence length: every example is truncated / padded to exactly this many tokens.
MAX_LENGTH = 300


def tokenize_batch(batch):
    """Tokenize the ``sentence`` column of one batch of examples.

    Args:
        batch: Mapping handed over by ``Dataset.map(..., batched=True)``; only its
            ``"sentence"`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated and padded to ``MAX_LENGTH``
        tokens; ``Dataset.map`` adds its fields as new columns.
    """
    return tokenizer(
        batch["sentence"],
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        max_length=MAX_LENGTH,
    )


# One shared helper replaces four copies of the same lambda, so all splits are
# guaranteed to be tokenized with identical settings.
train = train.map(tokenize_batch, batched=True, desc="Tokenizing the train dataset")
eval = eval.map(tokenize_batch, batched=True, desc="Tokenizing the eval dataset")
test = test.map(tokenize_batch, batched=True, desc="Tokenizing the test dataset")

train_aug = train_aug.map(tokenize_batch, batched=True, desc="Tokenizing the augmented dataset")

Tokenizing the train dataset:   0%|          | 0/4361 [00:00<?, ? examples/s]

Tokenizing the eval dataset:   0%|          | 0/1091 [00:00<?, ? examples/s]

Tokenizing the test dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing the augmented dataset:   0%|          | 0/38918 [00:00<?, ? examples/s]

In [10]:
# Re-seed immediately before the baseline model is instantiated.
# NOTE(review): presumably this makes the freshly initialized classifier head
# reproducible across runs — confirm what base.reset_seed actually seeds.
base.reset_seed()

In [10]:
# Baseline classifier: the 2-layer, 128-hidden uncased BERT checkpoint with a
# 6-way classification head attached for the coarse labels.
model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=6,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Training configuration for the baseline run (batch size 128, 10 epochs).
# Checkpoints and logs go to the per-dataset "bert-base_coarse" folders.
# NOTE(review): both paths start with a literal "~"; confirm that
# base.get_training_args (or the arguments object it builds) expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-base_coarse",
    batch_size=128,
    epochs=10,
)

In [12]:
# Baseline trainer: fits `model` on the original training split and validates on `eval`.
# Early stopping uses a patience of 3 evaluation rounds; the monitored metric comes
# from `training_args`.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [13]:
# Fine-tune the baseline model; the returned TrainOutput (step count, mean training
# loss, runtime metrics) is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7348,1.665223,0.339138,0.354753,0.259544,0.219486
2,1.6346,1.555312,0.477544,0.35383,0.375485,0.350736
3,1.5366,1.459618,0.505958,0.510923,0.412153,0.413852
4,1.4544,1.391964,0.525206,0.515787,0.433672,0.441531
5,1.3901,1.340444,0.539872,0.519181,0.448466,0.454084
6,1.3377,1.302161,0.554537,0.532215,0.460447,0.468416
7,1.3055,1.274034,0.570119,0.542238,0.473865,0.481685
8,1.2763,1.253692,0.577452,0.549743,0.480515,0.488757
9,1.2636,1.241146,0.581118,0.549899,0.48358,0.491989
10,1.2596,1.237528,0.586618,0.554046,0.488057,0.496499


TrainOutput(global_step=350, training_loss=1.4193327004568916, metrics={'train_runtime': 122.269, 'train_samples_per_second': 356.672, 'train_steps_per_second': 2.863, 'total_flos': 32504922828000.0, 'train_loss': 1.4193327004568916, 'epoch': 10.0})

In [14]:
# Put the baseline model into inference mode (dropout off) before scoring it.
# The trailing semicolon suppresses the module repr, which the bare call would
# otherwise echo as a very long cell output.
model.eval();

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [15]:
# Score the trained baseline on the held-out `test` split.
# The returned metric keys keep the default "eval_" prefix even though this is the
# test split, not the validation split stored in the variable `eval`.
trainer.evaluate(test)

{'eval_loss': 1.2033241987228394,
 'eval_accuracy': 0.652,
 'eval_precision': 0.6378523555056522,
 'eval_recall': 0.5308106638443274,
 'eval_f1': 0.5398035170935934,
 'eval_runtime': 40.5007,
 'eval_samples_per_second': 12.345,
 'eval_steps_per_second': 0.099,
 'epoch': 10.0}

In [16]:
# Persist the baseline weights (state dict only) under ~/models/<DATASET>/.
save_path = os.path.join(os.path.expanduser("~"), "models", DATASET, "bert_coarse.pth")
# torch.save does not create missing parent directories, so make sure the target
# folder exists; otherwise a fresh environment fails only after training has finished.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(model.state_dict(), save_path)

In [17]:
# Re-seed immediately before the student model is instantiated.
# NOTE(review): presumably this gives the student the same initial classifier head
# as the baseline run — confirm what base.reset_seed actually seeds.
base.reset_seed()

In [18]:
# Student for the distillation run: same 2-layer, 128-hidden uncased BERT
# checkpoint and 6-way classification head as the baseline.
student_model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=6,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Training configuration for the distillation run on the original training split.
# remove_unused_columns=False keeps dataset columns that the model's forward() does
# not accept. NOTE(review): presumably the stored teacher logits read by
# base.DistilTrainer — confirm.
# NOTE(review): temp / lambda_param look like the softmax temperature and the loss
# mixing weight; their exact meaning is defined in `base`.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-distill_coarse",
    remove_unused_columns=False,
    batch_size=128,
    epochs=10,
    temp=5,
    lambda_param=0.5,
)

In [20]:
# Distillation trainer: fits `student_model` on the original training split and
# validates on `eval`. Early stopping uses a patience of 3 evaluation rounds.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [21]:
# Train the student with the distillation trainer; the returned TrainOutput is
# displayed as the cell result.
# NOTE(review): the reported losses come from base.DistilTrainer's own loss, so they
# are presumably not on the same scale as the baseline's cross-entropy — confirm.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.941,3.813669,0.309808,0.373039,0.231672,0.172883
2,3.7811,3.633712,0.469294,0.377785,0.365707,0.343194
3,3.6228,3.475004,0.482126,0.507357,0.386129,0.370792
4,3.4847,3.357424,0.516957,0.510199,0.425266,0.431829
5,3.3717,3.264846,0.52429,0.506548,0.435402,0.44104
6,3.2785,3.194957,0.532539,0.510292,0.442741,0.447755
7,3.2207,3.143863,0.548121,0.517228,0.455053,0.458742
8,3.1693,3.105377,0.558203,0.533418,0.463798,0.469221
9,3.1347,3.083539,0.56187,0.530388,0.466939,0.471823
10,3.1354,3.07643,0.56187,0.531516,0.466939,0.472294


TrainOutput(global_step=350, training_loss=3.413985028948103, metrics={'train_runtime': 123.9349, 'train_samples_per_second': 351.878, 'train_steps_per_second': 2.824, 'total_flos': 32504922828000.0, 'train_loss': 3.413985028948103, 'epoch': 10.0})

In [22]:
# Put the student model into inference mode (dropout off) before scoring it.
# The trailing semicolon suppresses the module repr, which the bare call would
# otherwise echo as a very long cell output.
student_model.eval();

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [23]:
# Score the distilled student on the held-out `test` split.
# Metric keys keep the default "eval_" prefix even though this is the test split.
# NOTE(review): eval_loss is whatever loss base.DistilTrainer computes, so compare
# accuracy / F1 with the baseline rather than the loss values — confirm in `base`.
trainer.evaluate(test)

{'eval_loss': 3.0210273265838623,
 'eval_accuracy': 0.642,
 'eval_precision': 0.6287450328459662,
 'eval_recall': 0.5204954782639689,
 'eval_f1': 0.5264013847403092,
 'eval_runtime': 4.0791,
 'eval_samples_per_second': 122.576,
 'eval_steps_per_second': 0.981,
 'epoch': 10.0}

In [24]:
# Persist the distilled student's weights (state dict only) under ~/models/<DATASET>/.
save_path = os.path.join(os.path.expanduser("~"), "models", DATASET, "bert-distil_coarse.pth")
# torch.save does not create missing parent directories, so make sure the target
# folder exists; otherwise a fresh environment fails only after training has finished.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(student_model.state_dict(), save_path)

In [11]:
# Re-seed immediately before the augmented-data baseline model is instantiated.
# NOTE(review): presumably this keeps the initial weights identical to the other
# experiments — confirm what base.reset_seed actually seeds.
base.reset_seed()

In [12]:
# Fresh baseline classifier for the augmented-data run: 2-layer, 128-hidden uncased
# BERT checkpoint with a 6-way classification head. Rebinding `model` replaces any
# previously trained baseline held in this variable.
model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=6,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training configuration for the baseline on the augmented training split
# (batch size 128, 10 epochs); outputs go to the "bert-base-aug_coarse" folders.
# NOTE(review): both paths start with a literal "~"; confirm that
# base.get_training_args (or the arguments object it builds) expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base-aug_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-base-aug_coarse",
    batch_size=128,
    epochs=10,
)

In [14]:
# Baseline trainer on the augmented training split, validated on the unchanged `eval`
# split. Early stopping uses a patience of 4 evaluation rounds here.
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [15]:
# Fine-tune the baseline on the augmented training split; the returned TrainOutput
# is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4237,1.126349,0.632447,0.610722,0.527022,0.545799
2,0.8679,0.792168,0.740605,0.63892,0.633929,0.635285
3,0.5715,0.698528,0.778185,0.828345,0.704712,0.726332
4,0.4101,0.659463,0.802933,0.825186,0.760602,0.782133
5,0.3242,0.659629,0.804766,0.824715,0.752868,0.774712
6,0.2665,0.664385,0.810266,0.831991,0.766545,0.788426
7,0.2336,0.665218,0.811182,0.833146,0.767079,0.789254
8,0.2093,0.664284,0.812099,0.835697,0.767878,0.790937
9,0.199,0.667429,0.816682,0.838508,0.771184,0.793845
10,0.1898,0.670568,0.816682,0.83678,0.771441,0.793259


TrainOutput(global_step=3050, training_loss=0.4695532214055296, metrics={'train_runtime': 464.9114, 'train_samples_per_second': 837.106, 'train_steps_per_second': 6.56, 'total_flos': 290077181064000.0, 'train_loss': 0.4695532214055296, 'epoch': 10.0})

In [16]:
# Put the augmented-data baseline into inference mode (dropout off) before scoring it.
# The trailing semicolon suppresses the module repr, which the bare call would
# otherwise echo as a very long cell output.
model.eval();

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [17]:
# Score the augmented-data baseline on the held-out `test` split.
# The returned metric keys keep the default "eval_" prefix even though this is the
# test split, not the validation split stored in the variable `eval`.
trainer.evaluate(test)

{'eval_loss': 0.49601519107818604,
 'eval_accuracy': 0.856,
 'eval_precision': 0.876493408562744,
 'eval_recall': 0.8472441554550786,
 'eval_f1': 0.8584872846768654,
 'eval_runtime': 3.7067,
 'eval_samples_per_second': 134.892,
 'eval_steps_per_second': 1.079,
 'epoch': 10.0}

In [18]:
# Persist the augmented-data baseline weights (state dict only) under ~/models/<DATASET>/.
save_path = os.path.join(os.path.expanduser("~"), "models", DATASET, "bert-base-aug_coarse.pth")
# torch.save does not create missing parent directories, so make sure the target
# folder exists; otherwise a fresh environment fails only after training has finished.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(model.state_dict(), save_path)

In [19]:
# Re-seed immediately before the augmented-data student model is instantiated.
# NOTE(review): presumably this keeps the initial weights identical to the other
# experiments — confirm what base.reset_seed actually seeds.
base.reset_seed()

In [20]:
# Fresh student for the augmented-data distillation run: 2-layer, 128-hidden uncased
# BERT checkpoint with a 6-way classification head. Rebinding `student_model`
# replaces any previously trained student held in this variable.
student_model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=6,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Training configuration for distillation on the augmented training split.
# remove_unused_columns=False keeps dataset columns that the model's forward() does
# not accept. NOTE(review): presumably the stored teacher logits read by
# base.DistilTrainer — confirm.
# NOTE(review): temp / lambda_param look like the softmax temperature and the loss
# mixing weight; their exact meaning is defined in `base`.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill-aug_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-distill-aug_coarse",
    remove_unused_columns=False,
    batch_size=128,
    epochs=10,
    temp=5,
    lambda_param=0.5,
)

In [22]:
# Distillation trainer on the augmented training split, validated on the unchanged
# `eval` split. Early stopping uses a patience of 4 evaluation rounds here.
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [23]:
# Train the student on the augmented split with the distillation trainer; the
# returned TrainOutput is displayed as the cell result.
# NOTE(review): the reported losses come from base.DistilTrainer's own loss, so they
# are presumably not on the same scale as the baseline's cross-entropy — confirm.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1993,2.81174,0.608616,0.606531,0.501452,0.517117
2,2.1299,2.019408,0.739688,0.641102,0.632493,0.635413
3,1.4628,1.669034,0.762603,0.655168,0.655668,0.654295
4,1.0477,1.47107,0.783685,0.77541,0.69139,0.70114
5,0.8085,1.400081,0.792851,0.815862,0.734683,0.757304
6,0.6545,1.347878,0.800183,0.820297,0.740653,0.762678
7,0.5635,1.329735,0.799267,0.822075,0.749129,0.771299
8,0.4989,1.299253,0.804766,0.83045,0.752908,0.777197
9,0.4665,1.290551,0.805683,0.815317,0.753363,0.77422
10,0.4477,1.293415,0.804766,0.829052,0.753253,0.776767


TrainOutput(global_step=3050, training_loss=1.1279355058513705, metrics={'train_runtime': 426.3986, 'train_samples_per_second': 912.714, 'train_steps_per_second': 7.153, 'total_flos': 290077181064000.0, 'train_loss': 1.1279355058513705, 'epoch': 10.0})

In [24]:
# Put the augmented-data student into inference mode (dropout off) before scoring it.
# The trailing semicolon suppresses the module repr, which the bare call would
# otherwise echo as a very long cell output.
student_model.eval();

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [25]:
# Score the augmented-data student on the held-out `test` split.
# Metric keys keep the default "eval_" prefix even though this is the test split.
# NOTE(review): eval_loss is whatever loss base.DistilTrainer computes, so compare
# accuracy / F1 with the baseline rather than the loss values — confirm in `base`.
trainer.evaluate(test)

{'eval_loss': 1.1607972383499146,
 'eval_accuracy': 0.858,
 'eval_precision': 0.8806367437986035,
 'eval_recall': 0.8304224336048268,
 'eval_f1': 0.8483146240881965,
 'eval_runtime': 3.6698,
 'eval_samples_per_second': 136.246,
 'eval_steps_per_second': 1.09,
 'epoch': 10.0}

In [26]:
# Persist the augmented-data student's weights (state dict only) under ~/models/<DATASET>/.
save_path = os.path.join(os.path.expanduser("~"), "models", DATASET, "bert-distil-aug_coarse.pth")
# torch.save does not create missing parent directories, so make sure the target
# folder exists; otherwise a fresh environment fails only after training has finished.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(student_model.state_dict(), save_path)