In [1]:
#Torchtext není k dispozici pro poslední verzi pytorch, budeme tedy využuívat něco jiného ...


In [1]:
from transformers import Trainer, BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
base.reset_seed()

In [3]:
DATASET = "sst2"

In [4]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [5]:
train = load_from_disk(f"~/data/{DATASET}/train-logits")
eval = load_from_disk(f"~/data/{DATASET}/eval-logits")
test = load_from_disk(f"~/data/{DATASET}/test-logits")

train_aug = load_from_disk(f"~/data/{DATASET}/train-logits-augmented")
test_blank= load_from_disk(f"~/data/{DATASET}/test-blank-logits")

In [6]:
tokenizer = BertTokenizer.from_pretrained("gchhablani/bert-base-cased-finetuned-sst2")

In [7]:
train = train.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the train dataset")
eval = eval.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the eval dataset")
test = test.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the test dataset")

train_aug = train_aug.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the augmented dataset")
test_blank = test_blank.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the blank test dataset")

In [8]:
base.reset_seed()

In [10]:
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base", logging_dir=f"~/logs/{DATASET}/bert-base", batch_size=128, epochs=10)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6337,0.549533,0.719037,0.719241,0.719258,0.719036
2,0.4653,0.484065,0.774083,0.774053,0.773901,0.773951
3,0.3762,0.477339,0.784404,0.786395,0.783573,0.783634
4,0.3273,0.489337,0.788991,0.791029,0.788162,0.788238
5,0.2941,0.46883,0.801606,0.80154,0.801602,0.801561
6,0.2765,0.473168,0.801606,0.801566,0.801476,0.801511
7,0.2613,0.492721,0.795872,0.797331,0.795171,0.795302
8,0.2531,0.480232,0.808486,0.808564,0.808653,0.80848
9,0.2445,0.491007,0.806193,0.806134,0.806106,0.806119
10,0.2406,0.494819,0.806193,0.806301,0.805938,0.806033


TrainOutput(global_step=4210, training_loss=0.3372740793114886, metrics={'train_runtime': 280.0215, 'train_samples_per_second': 1924.102, 'train_steps_per_second': 15.035, 'total_flos': 401089284540000.0, 'train_loss': 0.3372740793114886, 'epoch': 10.0})

In [14]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [15]:
trainer.evaluate(test)

{'eval_loss': 0.27465716004371643,
 'eval_accuracy': 0.8951744617668894,
 'eval_precision': 0.8931336127022499,
 'eval_recall': 0.8953544607934005,
 'eval_f1': 0.8940722936529324,
 'eval_runtime': 9.3534,
 'eval_samples_per_second': 1440.125,
 'eval_steps_per_second': 11.333,
 'epoch': 10.0}

In [16]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-base.pth")

In [18]:
test_blank.set_format(type="torch", columns=["input_ids", "attention_mask"], device="cuda")
test_blank_dataloader = DataLoader(test_blank, batch_size=128, shuffle=False)
test_blank_logits = base.generate_logits(test_blank_dataloader, model, images=False)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

In [18]:
base.generate_real_test_file_sst2(test_blank_logits, f"{os.path.expanduser('~')}/data/{DATASET}/tiny-bert-base-test.tsv")

Created output file named: /home/jovyan/data/sst2/tiny-bert-base-test.tsv upload it to GLUE benchmark to obtain results!


Real test score

![Real test score (GLUE Benchmark)](../imgs/sst2_TinyBert_base_score.png)

In [19]:
base.reset_seed()

In [20]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distill", logging_dir=f"~/logs/{DATASET}/bert-distill", remove_unused_columns=False, batch_size=128, epochs=10, temp=5, lambda_param=.5)

In [22]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1184,2.39858,0.704128,0.707405,0.705165,0.703565
2,2.3281,1.988954,0.759174,0.76004,0.758546,0.758614
3,1.7785,1.836083,0.779817,0.780384,0.779321,0.779441
4,1.4694,1.852803,0.779817,0.784957,0.778522,0.778219
5,1.2653,1.712452,0.786697,0.787427,0.786162,0.786291
6,1.1496,1.704012,0.797018,0.797054,0.796803,0.796877
7,1.0736,1.813213,0.783257,0.786962,0.782152,0.782046
8,1.023,1.703144,0.801606,0.801831,0.801854,0.801605
9,0.982,1.723111,0.800459,0.800392,0.800392,0.800392
10,0.9621,1.735673,0.799312,0.799481,0.799013,0.799119


TrainOutput(global_step=4210, training_loss=1.5149904518399273, metrics={'train_runtime': 510.9384, 'train_samples_per_second': 1054.511, 'train_steps_per_second': 8.24, 'total_flos': 401089284540000.0, 'train_loss': 1.5149904518399273, 'epoch': 10.0})

In [24]:
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [25]:
trainer.evaluate(test)

{'eval_loss': 1.0659942626953125,
 'eval_accuracy': 0.8930215293244247,
 'eval_precision': 0.8909833152005082,
 'eval_recall': 0.8930765995259711,
 'eval_f1': 0.8918784493379457,
 'eval_runtime': 8.7267,
 'eval_samples_per_second': 1543.54,
 'eval_steps_per_second': 12.147,
 'epoch': 10.0}

In [26]:
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil.pth")

In [27]:
test_blank_logits = base.generate_logits(test_blank_dataloader, model, images=False)
base.generate_real_test_file_sst2(test_blank_logits, f"{os.path.expanduser('~')}/data/{DATASET}/tiny-bert-distill-test.tsv")

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/tiny-bert-distill-test.tsv upload it to GLUE benchmark to obtain results!


Real test score

![Real test score (GLUE Benchmark)](../imgs/sst2_TinyBert_distill_score.png)

In [9]:
base.reset_seed()

In [10]:
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base_aug", logging_dir=f"~/logs/{DATASET}/bert-base_aug", batch_size=128, epochs=10)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3321,0.491531,0.795872,0.79614,0.79614,0.795872
2,0.1891,0.606654,0.794725,0.795422,0.794214,0.794355
3,0.1504,0.663262,0.774083,0.776304,0.773186,0.77318
4,0.1257,0.793031,0.770642,0.775226,0.769386,0.769068
5,0.1078,0.880275,0.768349,0.770619,0.767429,0.767389


TrainOutput(global_step=20845, training_loss=0.1810154677638335, metrics={'train_runtime': 1667.2941, 'train_samples_per_second': 3200.575, 'train_steps_per_second': 25.005, 'total_flos': 1986240231900000.0, 'train_loss': 0.1810154677638335, 'epoch': 5.0})

In [14]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [15]:
trainer.evaluate(test)

{'eval_loss': 0.25607651472091675,
 'eval_accuracy': 0.9058648849294729,
 'eval_precision': 0.9037956165549527,
 'eval_recall': 0.907060167785301,
 'eval_f1': 0.9050152002729163,
 'eval_runtime': 8.9043,
 'eval_samples_per_second': 1512.751,
 'eval_steps_per_second': 11.904,
 'epoch': 5.0}

In [16]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-base-aug.pth")

In [19]:
test_blank_logits = base.generate_logits(test_blank_dataloader, model, images=False)
base.generate_real_test_file_sst2(test_blank_logits, f"{os.path.expanduser('~')}/data/{DATASET}/tiny-bert-base-aug-test.tsv")

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/tiny-bert-base-aug-test.tsv upload it to GLUE benchmark to obtain results!


Real test score

![Real test score (GLUE Benchmark)](../imgs/sst2_TinyBert_base_aug_score.png)

In [20]:
base.reset_seed()

In [21]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distill_aug", logging_dir=f"~/logs/{DATASET}/bert-distill_aug", remove_unused_columns=False, batch_size=128, epochs=10, temp=5, lambda_param=.5)

In [23]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2819,1.544384,0.811927,0.811859,0.811905,0.811878
2,0.605,1.664967,0.805046,0.805267,0.804728,0.804845
3,0.4672,1.751891,0.790138,0.792074,0.78933,0.789417
4,0.3881,1.824437,0.790138,0.791278,0.789499,0.789626
5,0.3348,1.86858,0.787844,0.78851,0.787331,0.787461


TrainOutput(global_step=20845, training_loss=0.6154103975211381, metrics={'train_runtime': 647.1207, 'train_samples_per_second': 8246.221, 'train_steps_per_second': 64.424, 'total_flos': 1986240231900000.0, 'train_loss': 0.6154103975211381, 'epoch': 5.0})

In [25]:
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [26]:
trainer.evaluate(test)

{'eval_loss': 0.8676739931106567,
 'eval_accuracy': 0.9101707498144024,
 'eval_precision': 0.9083149471174303,
 'eval_recall': 0.9103104024491199,
 'eval_f1': 0.9091869445920846,
 'eval_runtime': 5.0014,
 'eval_samples_per_second': 2693.229,
 'eval_steps_per_second': 21.194,
 'epoch': 5.0}

In [27]:
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil-aug.pth")

In [29]:
test_blank_logits = base.generate_logits(test_blank_dataloader, model, images=False)
base.generate_real_test_file_sst2(test_blank_logits, f"{os.path.expanduser('~')}/data/{DATASET}/tiny-bert-distill-aug-test.tsv")

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/tiny-bert-distill-aug-test.tsv upload it to GLUE benchmark to obtain results!


Real test score

![Real test score (GLUE Benchmark)](../imgs/sst2_TinyBert_distill_aug_score.png)

Teacher Real test score 

![Real test score (GLUE Benchmark)](../imgs/sst2_BERT_test_score.png)