In [1]:
# Torchtext is not available for the latest PyTorch version, so we will use something else ...


In [2]:
from transformers import BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
import base
import os 
import copy

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
# Call the project's seed-reset helper before any data or model work.
# NOTE(review): the helper lives in `base`; presumably it re-seeds the RNGs — confirm which libraries it covers.
base.reset_seed()

In [4]:
# Dataset name; interpolated into the data, results, logs and model paths used below.
DATASET = "trec"

In [5]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
_has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if _has_gpu else "cpu")

# Report which device was selected (same messages for either outcome as before).
if _has_gpu:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
# All splits live under the same per-dataset directory.
data_root = f"~/data/{DATASET}"

# Original splits (stored with the "-logits_coarse" suffix).
# NOTE: `eval` shadows the Python builtin; the name is kept because later cells
# pass it to the trainer as `eval_dataset=eval`.
train = load_from_disk(f"{data_root}/train-logits_coarse")
eval = load_from_disk(f"{data_root}/eval-logits_coarse")
test = load_from_disk(f"{data_root}/test-logits_coarse")

# Augmented training split, used by the second training run.
train_aug = load_from_disk(f"{data_root}/train-logits-augmented_coarse")

In [7]:
# Tokenizer shipped with the teacher checkpoint; it is used for every dataset split below.
# NOTE(review): the student checkpoint loaded later reports a different vocabulary size
# (30522 vs. the teacher's 28996 in the printed architectures), so token ids produced here
# may not line up with the student's pre-trained embeddings — confirm this is intended.
tokenizer = BertTokenizer.from_pretrained("carrassi-ni/bert-base-trec-question-classification")

In [8]:
def _tokenize_batch(batch):
    """Tokenize the "sentence" column of a batch, truncating/padding every example to 300 tokens."""
    return tokenizer(
        batch["sentence"],
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        max_length=300,
    )


# Apply the same tokenization to every split; each name is rebound to its tokenized version.
train = train.map(_tokenize_batch, batched=True, desc="Tokenizing the train dataset")
eval = eval.map(_tokenize_batch, batched=True, desc="Tokenizing the eval dataset")
test = test.map(_tokenize_batch, batched=True, desc="Tokenizing the test dataset")

train_aug = train_aug.map(_tokenize_batch, batched=True, desc="Tokenizing the augmented dataset")

In [9]:
# Two differently-placed views of the tokenized training split, consumed by the
# CPU/GPU benchmark cells at the end of the notebook.
# `with_format` returns a re-formatted copy and leaves `train` itself untouched.
tensor_columns = ["input_ids", "attention_mask"]

train_data_gpu = train.with_format(type="torch", columns=tensor_columns, device="cuda")
gpu_data_loader = DataLoader(train_data_gpu, batch_size=1, shuffle=False)

train_data_cpu = train.with_format(type="torch", columns=tensor_columns, device="cpu")
cpu_data_loader = DataLoader(train_data_cpu, batch_size=1, shuffle=False)

In [10]:
# Re-run the project's seed-reset helper right before the teacher model is loaded.
base.reset_seed()

In [11]:
# Load the teacher checkpoint (same repository as the tokenizer) with a 6-label classification head.
teacher_model = BertForSequenceClassification.from_pretrained("carrassi-ni/bert-base-trec-question-classification", num_labels=6)

In [12]:
# Put the teacher into evaluation mode and move it to the selected device.
# Both calls return the module itself, so they can be chained; the cell still
# displays the model architecture as its output.
teacher_model.eval().to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
# Re-run the seed-reset helper before the student is created. The student's classifier
# head is newly initialised (see the warning printed by the next cell), so presumably
# the seed determines its starting weights — TODO confirm.
base.reset_seed()

In [14]:
# Load the small student checkpoint (2 layers, hidden size 128) with a 6-label classification head.
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Training configuration for the distillation run on the original training set.
# NOTE(review): the meaning of temp / lambda_param / alpha_param is defined by the
# project helper in `base` — presumably distillation temperature and loss weights; confirm there.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill-inner_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-distill-inner_coarse",
    remove_unused_columns=False,
    lr=0.0004,
    weight_decay=0.005,
    warmup_steps=3,
    epochs=20,
    temp=5,
    lambda_param=0,
    alpha_param=0.5,
)

In [None]:
# Early stopping with a patience of 4 evaluation rounds.
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

# Project-specific distillation trainer: trains the student against the teacher
# on the original training split and validates on the eval split.
trainer = base.DistilTrainerInner(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [17]:
# Run the distillation training; the cell output is the per-epoch metrics table
# followed by the final TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8182,1.271669,0.566453,0.544126,0.470061,0.475533
2,0.5649,0.887679,0.676444,0.590816,0.575911,0.578539
3,0.3667,0.736229,0.752521,0.63425,0.647419,0.63878
4,0.2432,0.657686,0.773602,0.831547,0.671979,0.680531
5,0.166,0.649586,0.792851,0.824033,0.743211,0.768549
6,0.1227,0.713031,0.791017,0.819637,0.739601,0.764232
7,0.0892,0.727783,0.792851,0.795585,0.754431,0.766882
8,0.0724,0.77219,0.789184,0.808681,0.750701,0.768497


TrainOutput(global_step=280, training_loss=0.3054164315973009, metrics={'train_runtime': 149.1798, 'train_samples_per_second': 584.664, 'train_steps_per_second': 4.692, 'total_flos': 26003938262400.0, 'train_loss': 0.3054164315973009, 'epoch': 8.0})

In [18]:
# Switch the trained student to evaluation mode (the cell displays the model architecture).
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [19]:
# Evaluate the trained student on the test split; the result is a dict of eval_* metrics.
trainer.evaluate(test)

{'eval_loss': 0.5226441621780396,
 'eval_accuracy': 0.852,
 'eval_precision': 0.8786602354723975,
 'eval_recall': 0.8379321919093535,
 'eval_f1': 0.8546809470722515,
 'eval_runtime': 3.6494,
 'eval_samples_per_second': 137.007,
 'eval_steps_per_second': 1.096,
 'epoch': 8.0}

In [20]:
# Persist only the student's weights (state_dict) under ~/models/<DATASET>/.
# torch.save does not create missing parent directories, so make sure the
# target directory exists before writing.
model_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil-inner_coarse.pth"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(student_model.state_dict(), model_path)

In [37]:
# Re-run the seed-reset helper so the second experiment (augmented data) starts from the same seed state.
base.reset_seed()

In [38]:
# Start the second experiment from a fresh copy of the student checkpoint.
# This rebinds `student_model`, so every later cell refers to this new model.
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# Training configuration for the run on the augmented training set.
# Unlike the first run, no warmup_steps is passed here, so whatever default the
# project helper defines is used for it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill-inner-aug_coarse",
    logging_dir=f"~/logs/{DATASET}/bert-distill-inner-aug_coarse",
    remove_unused_columns=False,
    lr=0.00025,
    weight_decay=0.005,
    epochs=20,
    temp=4,
    lambda_param=0.2,
    alpha_param=0.5,
)

In [40]:
# Early stopping with a patience of 3 evaluation rounds for this run.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Same distillation trainer as before, but trained on the augmented split;
# validation still uses the original eval split.
trainer = base.DistilTrainerInner(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [41]:
# Run the distillation training on the augmented data; the cell output is the
# per-epoch metrics table followed by the final TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6607,0.664992,0.815765,0.836306,0.76056,0.783015
2,0.187,0.765432,0.829514,0.839569,0.780079,0.801073
3,0.1172,0.829046,0.814849,0.796037,0.771601,0.779053
4,0.0952,0.830824,0.826764,0.823422,0.788263,0.801588
5,0.084,0.807195,0.828598,0.828044,0.797073,0.810075
6,0.0769,0.774028,0.831347,0.819216,0.802339,0.808523
7,0.0715,0.80365,0.831347,0.826021,0.801746,0.81065
8,0.068,0.799588,0.827681,0.810885,0.808464,0.807403
9,0.065,0.806654,0.830431,0.824503,0.801099,0.810252
10,0.0636,0.825085,0.826764,0.824742,0.798505,0.808063


TrainOutput(global_step=3040, training_loss=0.14890538014863666, metrics={'train_runtime': 1262.2633, 'train_samples_per_second': 615.957, 'train_steps_per_second': 4.817, 'total_flos': 289756678500000.0, 'train_loss': 0.14890538014863666, 'epoch': 10.0})

In [42]:
# Switch the newly trained student to evaluation mode (the cell displays the model architecture).
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [43]:
# Evaluate the augmented-data student on the test split; the result is a dict of eval_* metrics.
trainer.evaluate(test)

{'eval_loss': 0.7492300868034363,
 'eval_accuracy': 0.838,
 'eval_precision': 0.8602475280000678,
 'eval_recall': 0.8398821962172841,
 'eval_f1': 0.8454036677342939,
 'eval_runtime': 3.2333,
 'eval_samples_per_second': 154.639,
 'eval_steps_per_second': 1.237,
 'epoch': 10.0}

In [None]:
# Persist only the student's weights (state_dict) under ~/models/<DATASET>/.
# torch.save does not create missing parent directories, so make sure the
# target directory exists before writing.
# NOTE(review): the file name has a double hyphen ("bert--distill") and spells
# "distill" differently from the first run's "bert-distil-inner_coarse.pth".
# It is kept unchanged here because other code may load this exact path — confirm and unify.
model_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert--distill-inner-aug_coarse.pth"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(student_model.state_dict(), model_path)

In [None]:
# Report the size of the current `student_model` via the project helper.
# NOTE(review): the return/printing format is defined in `base.count_parameters` — not visible here.
base.count_parameters(student_model)

In [None]:
# Benchmark the current student on the CPU-formatted loader (batch size 1).
# NOTE(review): the last argument (1000) is presumably the number of timed
# samples/iterations — confirm in base.BenchMarkRunner.
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
cpu_result = cpu_benchmark.run_benchmark()
print(cpu_result)

In [None]:
# Benchmark the current student on the CUDA-formatted loader (batch size 1).
# The notebook supports a CPU-only fallback for `device`, so skip this cell's
# work instead of crashing when no GPU is present.
# NOTE(review): the last argument (1000) is presumably the number of timed
# samples/iterations — confirm in base.BenchMarkRunner.
if torch.cuda.is_available():
    gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
    print(gpu_benchmark.run_benchmark())
else:
    print("GPU is not available, skipping the CUDA benchmark.")