In [3]:
#Torchtext není k dispozici pro poslední verzi pytorch, budeme tedy využuívat něco jiného ...


In [4]:
from transformers import Trainer, BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback
from datasets import load_from_disk
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [5]:
base.reset_seed()

In [6]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [7]:
DATASET = "dbpedia"

In [8]:
train = load_from_disk(f"~/data/{DATASET}/train-logits")
eval = load_from_disk(f"~/data/{DATASET}/eval-logits")
test = load_from_disk(f"~/data/{DATASET}/test-logits")

train_aug = load_from_disk(f"~/data/{DATASET}/train-logits-augmented")

In [9]:
tokenizer = BertTokenizer.from_pretrained("gchhablani/bert-base-cased-finetuned-sst2")

In [10]:
train = train.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the train dataset")
eval = eval.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the eval dataset")
test = test.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the test dataset")

train_aug = train_aug.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the augmented dataset")

In [11]:
base.reset_seed()

In [12]:
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base", logging_dir=f"~/logs/{DATASET}/bert-base", batch_size=128, epochs=5)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5366,0.094706,0.977107,0.977135,0.977107,0.977107
2,0.0864,0.069196,0.982366,0.98236,0.982366,0.982344
3,0.0624,0.064323,0.984,0.984013,0.984,0.983999
4,0.052,0.060339,0.985009,0.985005,0.985009,0.985005
5,0.0463,0.060093,0.985196,0.985191,0.985196,0.985191


TrainOutput(global_step=17500, training_loss=0.15675776105608258, metrics={'train_runtime': 546.8155, 'train_samples_per_second': 4096.445, 'train_steps_per_second': 32.003, 'total_flos': 1673755776000000.0, 'train_loss': 0.15675776105608258, 'epoch': 5.0})

In [16]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [17]:
trainer.evaluate(test)

{'eval_loss': 0.06053699925541878,
 'eval_accuracy': 0.9853,
 'eval_precision': 0.9852876597564322,
 'eval_recall': 0.9853000000000002,
 'eval_f1': 0.9852890083123975,
 'eval_runtime': 12.3863,
 'eval_samples_per_second': 5651.401,
 'eval_steps_per_second': 44.162,
 'epoch': 5.0}

In [18]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-base.pth")

In [19]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distill", logging_dir=f"~/logs/{DATASET}/bert-distill", remove_unused_columns=False, batch_size=128, epochs=5, temp=5, lambda_param=.5)

In [21]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0714,0.340898,0.975741,0.975717,0.975741,0.975713
2,0.2348,0.13658,0.982071,0.982045,0.982071,0.982042
3,0.1512,0.115859,0.98375,0.983764,0.98375,0.983747
4,0.1291,0.106488,0.984812,0.984808,0.984813,0.984805
5,0.1191,0.104356,0.985161,0.985152,0.985161,0.985151


TrainOutput(global_step=17500, training_loss=0.5411343052455357, metrics={'train_runtime': 564.5835, 'train_samples_per_second': 3967.526, 'train_steps_per_second': 30.996, 'total_flos': 1673755776000000.0, 'train_loss': 0.5411343052455357, 'epoch': 5.0})

In [23]:
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [24]:
trainer.evaluate(test)

{'eval_loss': 0.10508492588996887,
 'eval_accuracy': 0.9848857142857143,
 'eval_precision': 0.9848719828980422,
 'eval_recall': 0.9848857142857143,
 'eval_f1': 0.984872051723931,
 'eval_runtime': 12.7804,
 'eval_samples_per_second': 5477.123,
 'eval_steps_per_second': 42.8,
 'epoch': 5.0}

In [25]:
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-distill.pth")

In [26]:
base.reset_seed()

In [27]:
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base-aug", logging_dir=f"~/logs/{DATASET}/bert-base-aug", batch_size=128, epochs=5)

In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3933,0.072101,0.981429,0.981458,0.981429,0.981412
2,0.0956,0.056516,0.985214,0.985221,0.985214,0.985215
3,0.0725,0.053886,0.986036,0.986023,0.986036,0.986024
4,0.0619,0.052391,0.986402,0.986399,0.986402,0.986398
5,0.0554,0.052037,0.986598,0.98659,0.986598,0.986592


TrainOutput(global_step=34360, training_loss=0.13575237394073095, metrics={'train_runtime': 996.5919, 'train_samples_per_second': 4412.914, 'train_steps_per_second': 34.478, 'total_flos': 3286146733650000.0, 'train_loss': 0.13575237394073095, 'epoch': 5.0})

In [31]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [32]:
trainer.evaluate(test)

{'eval_loss': 0.05249098315834999,
 'eval_accuracy': 0.9868714285714286,
 'eval_precision': 0.9868591629840695,
 'eval_recall': 0.9868714285714285,
 'eval_f1': 0.9868614306386609,
 'eval_runtime': 13.2243,
 'eval_samples_per_second': 5293.274,
 'eval_steps_per_second': 41.363,
 'epoch': 5.0}

In [33]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-base-aug.pth")

In [34]:
base.reset_seed()

In [35]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distil-aug", logging_dir=f"~/logs/{DATASET}/bert-distil-aug", remove_unused_columns=False, batch_size=128, epochs=5, temp=5, lambda_param=.5)

In [37]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [38]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3087,0.13993,0.980964,0.980965,0.980964,0.980933
2,0.2171,0.103737,0.984643,0.984636,0.984643,0.984632
3,0.1695,0.094872,0.985554,0.985539,0.985554,0.985539
4,0.1483,0.089543,0.986402,0.986401,0.986402,0.986399
5,0.1371,0.087687,0.986411,0.986403,0.986411,0.986403


TrainOutput(global_step=34360, training_loss=0.3961544991649765, metrics={'train_runtime': 1002.6131, 'train_samples_per_second': 4386.413, 'train_steps_per_second': 34.27, 'total_flos': 3286146733650000.0, 'train_loss': 0.3961544991649765, 'epoch': 5.0})

In [39]:
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [40]:
trainer.evaluate(test)

{'eval_loss': 0.0893327072262764,
 'eval_accuracy': 0.9864571428571428,
 'eval_precision': 0.9864446110522369,
 'eval_recall': 0.9864571428571429,
 'eval_f1': 0.9864423625174343,
 'eval_runtime': 12.4491,
 'eval_samples_per_second': 5622.899,
 'eval_steps_per_second': 43.939,
 'epoch': 5.0}

In [41]:
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil-aug.pth")