In [1]:
# Torchtext is not available for the latest PyTorch release, so we will use something else instead ...


In [1]:
from transformers import Trainer, BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
import base
import os
import copy

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Reset the random seed(s) before any stochastic work in this notebook.
# NOTE(review): reset_seed lives in the project-local `base` module (not shown here) — confirm which RNGs it seeds.
base.reset_seed()

In [3]:
# Dataset identifier; it is interpolated into every data, results, logs and models path used in this notebook.
DATASET = "sst2"

In [4]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
gpu_present = torch.cuda.is_available()
device = torch.device("cuda" if gpu_present else "cpu")

# Report the choice so the recorded cell output documents which hardware was used.
if not gpu_present:
    print("GPU is not available, using CPU.")
else:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [11]:
def _load_split(split_name):
    """Load one saved split of the current DATASET from ``~/data/<DATASET>/<split_name>``."""
    return load_from_disk(f"~/data/{DATASET}/{split_name}")


# Original splits. The "-logits" suffix presumably means each split carries precomputed
# teacher logits — TODO confirm against the script that produced them.
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is kept
# because later cells pass it to the trainers.
train = _load_split("train-logits")
eval = _load_split("eval-logits")
test = _load_split("test-logits")

# Augmented training split and the blank test split used for the GLUE submission files.
train_aug = _load_split("train-logits-augmented")
test_blank = _load_split("test-blank-logits")

In [12]:
# Tokenizer loaded from the *cased* bert-base checkpoint fine-tuned on SST-2.
# NOTE(review): every model trained in this notebook is google/bert_uncased_L-2_H-128_A-2 (uncased,
# 30522-entry embedding table per the recorded model repr). Confirm that feeding it token ids from a
# cased vocabulary is intentional rather than using the small model's own tokenizer.
tokenizer = BertTokenizer.from_pretrained("gchhablani/bert-base-cased-finetuned-sst2")

In [13]:
def _tokenize_sentences(batch):
    """Tokenize the ``sentence`` column of a batch, truncating/padding every example to 300 tokens."""
    return tokenizer(
        batch["sentence"],
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        max_length=300,
    )


# Every split goes through the same tokenization; only the progress-bar label differs.
# Each name is rebound to its tokenized version, so re-running this cell tokenizes again.
train = train.map(_tokenize_sentences, batched=True, desc="Tokenizing the train dataset")
eval = eval.map(_tokenize_sentences, batched=True, desc="Tokenizing the eval dataset")
test = test.map(_tokenize_sentences, batched=True, desc="Tokenizing the test dataset")

train_aug = train_aug.map(_tokenize_sentences, batched=True, desc="Tokenizing the augmented dataset")
test_blank = test_blank.map(_tokenize_sentences, batched=True, desc="Tokenizing the blank test dataset")

Tokenizing the train dataset:   0%|          | 0/53879 [00:00<?, ? examples/s]

Tokenizing the eval dataset:   0%|          | 0/872 [00:00<?, ? examples/s]

Tokenizing the test dataset:   0%|          | 0/13470 [00:00<?, ? examples/s]

Tokenizing the augmented dataset:   0%|          | 0/293636 [00:00<?, ? examples/s]

Tokenizing the blank test dataset:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [14]:
# Two independent, torch-formatted copies of the tokenized training split that differ only in the
# device their tensors are created on. `with_format` returns a formatted copy, so `train` itself
# keeps its current format. Both loaders yield one example at a time, in dataset order.
train_data_gpu = train.with_format(type="torch", columns=["input_ids", "attention_mask"], device="cuda")
gpu_data_loader = DataLoader(dataset=train_data_gpu, shuffle=False, batch_size=1)

train_data_cpu = train.with_format(type="torch", columns=["input_ids", "attention_mask"], device="cpu")
cpu_data_loader = DataLoader(dataset=train_data_cpu, shuffle=False, batch_size=1)

In [24]:
# Reset the seed again right before creating the model, so the newly initialized classifier head
# and the training run start from the same random state each time.
# NOTE(review): behaviour of reset_seed is defined in the project-local `base` module — confirm.
base.reset_seed()

In [25]:
# Baseline model: the small uncased BERT (2 layers, hidden size 128 per the recorded repr) with a
# 2-class classification head. The head weights are newly initialized, as the load warning states.
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Hyper-parameters for the plain (non-distilled) baseline on the original training split.
# NOTE(review): get_training_args is a project-local helper; how it maps these keywords onto the
# trainer configuration is not visible here.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base",
    logging_dir=f"~/logs/{DATASET}/bert-base",
    lr=.00003,
    epochs=20,
    weight_decay=0.008,
    warmup_steps=35,
)

In [27]:
# Standard supervised fine-tuning of `model` on `train`, validated on `eval`.
# Early stopping uses a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [28]:
# Run fine-tuning. The EarlyStoppingCallback (patience 3) attached to the trainer can end the run
# before the 20 epochs requested through get_training_args.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6669,0.618141,0.68578,0.685718,0.685464,0.6855
2,0.5483,0.525342,0.744266,0.746798,0.743233,0.743008
3,0.4489,0.48891,0.770642,0.771295,0.770102,0.770206
4,0.3918,0.489184,0.783257,0.784527,0.782573,0.782678
5,0.35,0.476693,0.799312,0.799304,0.799139,0.799195
6,0.3237,0.462904,0.805046,0.805066,0.804854,0.804922
7,0.303,0.49049,0.795872,0.798748,0.794919,0.794964
8,0.2868,0.474048,0.797018,0.797041,0.797139,0.797005
9,0.2724,0.476259,0.807339,0.808091,0.80778,0.807323
10,0.2627,0.484224,0.806193,0.806144,0.806233,0.806162


TrainOutput(global_step=7157, training_loss=0.32444957303021466, metrics={'train_runtime': 262.4925, 'train_samples_per_second': 4105.184, 'train_steps_per_second': 32.077, 'total_flos': 681851783718000.0, 'train_loss': 0.32444957303021466, 'epoch': 17.0})

In [51]:
# Put the module into evaluation mode (disables the Dropout layers listed in the repr).
# As the last expression of the cell, the returned module is echoed as the cell output.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [52]:
# Evaluate the baseline on the `test` split; returns loss, accuracy, precision, recall and F1.
# NOTE(review): the recorded output reports 'epoch': 20.0 while the recorded training output reports
# 'epoch': 17.0, so the saved outputs come from different runs — re-run the cells in order.
trainer.evaluate(test)

{'eval_loss': 0.2646693289279938,
 'eval_accuracy': 0.8999257609502599,
 'eval_precision': 0.8981981941832378,
 'eval_recall': 0.8992998607658361,
 'eval_f1': 0.898712891806191,
 'eval_runtime': 5.7292,
 'eval_samples_per_second': 2351.127,
 'eval_steps_per_second': 18.502,
 'epoch': 20.0}

In [53]:
# Persist the baseline weights under ~/models/<DATASET>/.
# torch.save does not create missing parent directories, so make sure the target folder exists
# first; otherwise this cell fails on a fresh environment.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert-base.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

In [54]:
# Expose only the model inputs of the blank test split as CUDA tensors. set_format changes
# `test_blank` in place, so every later use of the dataset or this loader sees the torch format.
test_blank.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
    device="cuda",
)

# Fixed order (no shuffling) so the generated logits stay aligned with the rows of the split.
test_blank_dataloader = DataLoader(dataset=test_blank, shuffle=False, batch_size=128)

# Logits of the baseline model for the submission file written in the next cell.
test_blank_logits = base.generate_logits(test_blank_dataloader, model)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

In [55]:
# Write the baseline predictions to a TSV file; per the helper's own message the file is meant to be
# uploaded to the GLUE benchmark for scoring.
base.generate_real_test_file_sst2(test_blank_logits, f"{os.path.expanduser('~')}/data/{DATASET}/tiny-bert-base-test.tsv")

Created output file named: /home/jovyan/data/sst2/tiny-bert-base-test.tsv upload it to GLUE benchmark to obtain results!


In [81]:
# Reset the seed before the distillation experiment so it starts from the same random state as the
# baseline experiment. NOTE(review): reset_seed is defined in the project-local `base` module — confirm.
base.reset_seed()

In [82]:
# Student for knowledge distillation: the same small uncased BERT checkpoint as the baseline, again
# with a newly initialized 2-class head (see the load warning in the cell output).
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
# Hyper-parameters for distillation on the original training split.
# remove_unused_columns=False keeps the extra dataset columns available to the trainer.
# NOTE(review): `temp` and `lambda_param` are presumably the distillation temperature and the
# weight mixing the distillation and label losses — confirm in base.get_training_args / DistilTrainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill",
    logging_dir=f"~/logs/{DATASET}/bert-distill",
    remove_unused_columns=False,
    lr=0.00005,
    weight_decay=0.08,
    epochs=20,
    temp=6.5,
    lambda_param=.7,
)

In [84]:
# Distillation trainer for the student on the original training split.
# No teacher model is passed here; presumably DistilTrainer reads teacher logits stored in the
# dataset — TODO confirm in the project-local `base` module.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [85]:
# Train the student. The EarlyStoppingCallback (patience 3) attached to the trainer can end the run
# before the 20 epochs requested through get_training_args.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.5651,3.393508,0.712156,0.714411,0.71107,0.710646
2,3.2545,2.743101,0.766055,0.766889,0.766524,0.766024
3,2.4044,2.444158,0.786697,0.786638,0.786583,0.786606
4,1.9224,2.426809,0.77867,0.783258,0.777437,0.777193
5,1.6364,2.362908,0.783257,0.783663,0.782826,0.782946
6,1.4412,2.352273,0.788991,0.789473,0.788541,0.78867
7,1.3187,2.393249,0.797018,0.797183,0.796718,0.796824
8,1.2036,2.335956,0.799312,0.799409,0.799055,0.799147
9,1.1279,2.415294,0.800459,0.801064,0.800855,0.800449
10,1.0713,2.436018,0.805046,0.805156,0.805233,0.805042


TrainOutput(global_step=7999, training_loss=1.4788935866739201, metrics={'train_runtime': 448.42, 'train_samples_per_second': 2403.06, 'train_steps_per_second': 18.777, 'total_flos': 762069640626000.0, 'train_loss': 1.4788935866739201, 'epoch': 19.0})

In [86]:
# Put the student into evaluation mode (disables the Dropout layers listed in the repr).
# As the last expression of the cell, the returned module is echoed as the cell output.
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [87]:
# Evaluate the distilled student on the `test` split.
# NOTE(review): the recorded eval_loss (about 1.15) is on a different scale than the non-distilled
# runs (about 0.27), presumably because DistilTrainer reports its own distillation loss. Compare
# accuracy/F1 across experiments rather than loss.
trainer.evaluate(test)

{'eval_loss': 1.1506083011627197,
 'eval_accuracy': 0.9138084632516704,
 'eval_precision': 0.9120214133331037,
 'eval_recall': 0.913884299788369,
 'eval_f1': 0.9128467149013444,
 'eval_runtime': 6.3788,
 'eval_samples_per_second': 2111.697,
 'eval_steps_per_second': 16.618,
 'epoch': 19.0}

In [88]:
# Persist the distilled student's weights under ~/models/<DATASET>/.
# torch.save does not create missing parent directories, so make sure the target folder exists
# first; otherwise this cell fails on a fresh environment.
# NOTE(review): the file name is spelled "distil" while the results/logs directories use "distill";
# it is left unchanged because other code may load this exact path.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(student_model.state_dict(), checkpoint_path)

In [90]:
# Score the blank test split with the distilled student and write its GLUE submission file.
distill_tsv_path = f"{os.path.expanduser('~')}/data/{DATASET}/tiny-bert-distill-test.tsv"
test_blank_logits = base.generate_logits(test_blank_dataloader, student_model)
base.generate_real_test_file_sst2(test_blank_logits, distill_tsv_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/tiny-bert-distill-test.tsv upload it to GLUE benchmark to obtain results!


In [142]:
# Reset the seed before the augmented-data baseline experiment.
# NOTE(review): reset_seed is defined in the project-local `base` module — confirm which RNGs it seeds.
base.reset_seed()

In [143]:
# Fresh baseline model for the augmented-data run. This rebinds `model`, so the baseline trained on
# the original split is no longer reachable under that name (its weights were saved to disk earlier).
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [144]:
# Hyper-parameters for the plain (non-distilled) model trained on the augmented split.
# NOTE(review): get_training_args is a project-local helper; how it maps these keywords onto the
# trainer configuration is not visible here.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base-aug",
    logging_dir=f"~/logs/{DATASET}/bert-base-aug",
    epochs=20,
    lr=0.00001,
    weight_decay=0.007,
    warmup_steps=65,
)

In [145]:
# Supervised fine-tuning of `model` on the augmented training split, validated on the
# (non-augmented) `eval` split. Early stopping uses a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [146]:
# Train on the augmented split. The EarlyStoppingCallback (patience 3) attached to the trainer can
# end the run before the 20 requested epochs; the recorded run stopped at epoch 10.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6036,0.545399,0.72133,0.726085,0.722563,0.720518
2,0.4422,0.481661,0.774083,0.774142,0.773817,0.773897
3,0.3616,0.457512,0.790138,0.790554,0.790467,0.790135
4,0.3157,0.455039,0.802752,0.802862,0.802938,0.802748
5,0.2838,0.463331,0.802752,0.802701,0.802644,0.802668
6,0.2644,0.470651,0.807339,0.807304,0.807401,0.807314
7,0.2477,0.47916,0.808486,0.808438,0.808527,0.808456
8,0.2364,0.483468,0.800459,0.800527,0.800223,0.800307
9,0.2269,0.493203,0.802752,0.802686,0.802686,0.802686
10,0.2193,0.499995,0.802752,0.802683,0.802728,0.802701


TrainOutput(global_step=22950, training_loss=0.32016659273301334, metrics={'train_runtime': 1092.1365, 'train_samples_per_second': 5377.277, 'train_steps_per_second': 42.028, 'total_flos': 2185902729360000.0, 'train_loss': 0.32016659273301334, 'epoch': 10.0})

In [147]:
# Put the module into evaluation mode (disables the Dropout layers listed in the repr).
# As the last expression of the cell, the returned module is echoed as the cell output.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [148]:
# Evaluate the augmented-data baseline on the `test` split; returns loss, accuracy, precision,
# recall and F1.
trainer.evaluate(test)

{'eval_loss': 0.26851242780685425,
 'eval_accuracy': 0.9002227171492205,
 'eval_precision': 0.8984696130765251,
 'eval_recall': 0.8996704696118623,
 'eval_f1': 0.8990266947261107,
 'eval_runtime': 5.4443,
 'eval_samples_per_second': 2474.148,
 'eval_steps_per_second': 19.47,
 'epoch': 10.0}

In [112]:
# Persist the augmented-data baseline weights under ~/models/<DATASET>/.
# torch.save does not create missing parent directories, so make sure the target folder exists
# first; otherwise this cell fails on a fresh environment.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert-base-aug.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

In [141]:
# Score the blank test split with the augmented-data baseline and write its GLUE submission file.
# NOTE(review): in the saved notebook this cell's execution count is lower than that of the cell
# creating `model` for this experiment; re-run in order so the file matches the trained weights.
base_aug_tsv_path = f"{os.path.expanduser('~')}/data/{DATASET}/tiny-bert-base-aug-test.tsv"
test_blank_logits = base.generate_logits(test_blank_dataloader, model)
base.generate_real_test_file_sst2(test_blank_logits, base_aug_tsv_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/tiny-bert-base-aug-test.tsv upload it to GLUE benchmark to obtain results!


In [37]:
# Reset the seed before the augmented-data distillation experiment.
# NOTE(review): reset_seed is defined in the project-local `base` module — confirm which RNGs it seeds.
base.reset_seed()

In [7]:
# Fresh student for distillation on the augmented split. This rebinds `student_model`, so the
# student trained on the original split is no longer reachable under that name.
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hyper-parameters for distillation on the augmented training split.
# remove_unused_columns=False keeps the extra dataset columns available to the trainer.
# NOTE(review): `temp` and `lambda_param` are presumably the distillation temperature and the
# weight mixing the distillation and label losses — confirm in base.get_training_args / DistilTrainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill-aug",
    logging_dir=f"~/logs/{DATASET}/bert-distill-aug",
    remove_unused_columns=False,
    lr=0.00001,
    weight_decay=0.05,
    warmup_steps=20,
    epochs=20,
    temp=5.5,
    lambda_param=.7,
)

In [40]:
# Distillation trainer for the student on the augmented training split.
# No teacher model is passed here; presumably DistilTrainer reads teacher logits stored in the
# dataset — TODO confirm in the project-local `base` module.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [41]:
# Train the student on the augmented split. The EarlyStoppingCallback (patience 3) attached to the
# trainer can end the run before the 20 epochs requested through get_training_args.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9067,3.28397,0.722477,0.725702,0.723478,0.722003
2,2.6868,2.60331,0.772936,0.773361,0.77248,0.77259
3,2.0003,2.288749,0.788991,0.788931,0.789004,0.788951
4,1.6357,2.140533,0.800459,0.800728,0.800728,0.800459
5,1.4098,2.091068,0.800459,0.800568,0.800644,0.800455
6,1.2718,2.050542,0.801606,0.801557,0.801644,0.801574
7,1.1725,2.05986,0.813073,0.81322,0.813284,0.813071
8,1.1031,2.036237,0.81078,0.810731,0.810821,0.81075
9,1.0438,2.041963,0.811927,0.812037,0.812116,0.811923
10,1.0005,2.047816,0.815367,0.815348,0.815452,0.815347


TrainOutput(global_step=41310, training_loss=1.3557164572884517, metrics={'train_runtime': 1952.8108, 'train_samples_per_second': 3007.316, 'train_steps_per_second': 23.505, 'total_flos': 3934624912848000.0, 'train_loss': 1.3557164572884517, 'epoch': 18.0})

In [42]:
# Put the student into evaluation mode (disables the Dropout layers listed in the repr).
# As the last expression of the cell, the returned module is echoed as the cell output.
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [43]:
# Evaluate the distilled, augmented-data student on the `test` split.
# NOTE(review): the recorded eval_loss (about 1.11) is on a different scale than the non-distilled
# runs (about 0.27), presumably because DistilTrainer reports its own distillation loss. Compare
# accuracy/F1 across experiments rather than loss.
trainer.evaluate(test)

{'eval_loss': 1.1050318479537964,
 'eval_accuracy': 0.9155902004454343,
 'eval_precision': 0.9139867735650128,
 'eval_recall': 0.9152724406270596,
 'eval_f1': 0.9145825730990167,
 'eval_runtime': 5.1495,
 'eval_samples_per_second': 2615.808,
 'eval_steps_per_second': 20.585,
 'epoch': 18.0}

In [44]:
# Persist the augmented-data student's weights under ~/models/<DATASET>/.
# torch.save does not create missing parent directories, so make sure the target folder exists
# first; otherwise this cell fails on a fresh environment.
# NOTE(review): the file name is spelled "distil" while the results/logs directories use "distill";
# it is left unchanged because other code may load this exact path.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil-aug.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(student_model.state_dict(), checkpoint_path)

In [45]:
# Score the blank test split with the distilled student trained on the augmented data and write
# its GLUE submission file.
# This must be `student_model`: `model` is the non-distilled bert-base-aug network, and passing it
# here would fill the "distill-aug" file with the wrong model's predictions.
test_blank_logits = base.generate_logits(test_blank_dataloader, student_model)
base.generate_real_test_file_sst2(test_blank_logits, f"{os.path.expanduser('~')}/data/{DATASET}/tiny-bert-distill-aug-test.tsv")

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/tiny-bert-distill-aug-test.tsv upload it to GLUE benchmark to obtain results!


In [8]:
# Report the student's size: per the recorded output the helper prints the model size in MB and the
# total number of trainable parameters, and displays a per-module parameter table.
base.count_parameters(student_model)

model size: 16.740MB.
Total Trainable Params: 4386178.


Unnamed: 0,Modules,Parameters
0,bert.embeddings.word_embeddings.weight,3906816
1,bert.embeddings.position_embeddings.weight,65536
2,bert.embeddings.token_type_embeddings.weight,256
3,bert.embeddings.LayerNorm.weight,128
4,bert.embeddings.LayerNorm.bias,128
5,bert.encoder.layer.0.attention.self.query.weight,16384
6,bert.encoder.layer.0.attention.self.query.bias,128
7,bert.encoder.layer.0.attention.self.key.weight,16384
8,bert.encoder.layer.0.attention.self.key.bias,128
9,bert.encoder.layer.0.attention.self.value.weight,16384


In [21]:
# Time inference of the student on the CPU loader (batch size 1). The recorded output shows a
# torch.utils.benchmark Measurement over 1000 runs, matching the last argument.
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
cpu_measurement = cpu_benchmark.run_benchmark()
print(cpu_measurement)

<torch.utils.benchmark.utils.common.Measurement object at 0x7fad741a9540>
self.infer_speed_comp()
  3.67 ms
  1 measurement, 1000 runs , 4 threads


In [23]:
# Time inference of the student on the CUDA loader (batch size 1). The recorded output shows a
# torch.utils.benchmark Measurement over 1000 runs, matching the last argument.
gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
gpu_measurement = gpu_benchmark.run_benchmark()
print(gpu_measurement)

<torch.utils.benchmark.utils.common.Measurement object at 0x7fac9a608160>
self.infer_speed_comp()
  1.94 ms
  1 measurement, 1000 runs , 4 threads
