In [1]:
# Torchtext is not available for the latest version of PyTorch, so we will use something else ...


In [1]:
from transformers import Trainer, BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
import base
import os 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Seed the run through the project helper `base.reset_seed()`.
# NOTE(review): presumably this re-seeds the Python / NumPy / PyTorch RNGs — confirm in `base`.
base.reset_seed()

In [3]:
# Dataset name; it is interpolated into every data, results, logs and model path used below.
DATASET = "trec"

In [4]:
# Choose the compute device: CUDA when PyTorch can see a GPU, the CPU otherwise,
# and report which one was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    print("GPU is not available, using CPU.")
else:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [5]:
# Load the four pre-built splits from disk in one pass: train / eval / test plus
# the augmented training set. All of them live under ~/data/{DATASET}/.
# NOTE(review): the name `eval` shadows Python's builtin `eval` for the rest of the notebook.
train, eval, test, train_aug = map(
    load_from_disk,
    (
        f"~/data/{DATASET}/train-logits_fine",
        f"~/data/{DATASET}/eval-logits_fine",
        f"~/data/{DATASET}/test-logits_fine",
        f"~/data/{DATASET}/train-logits-augmented_fine",
    ),
)

In [6]:
# Load the BertTokenizer published with the "ndavid/autotrain-trec-fine-bert-739422530" checkpoint.
# NOTE(review): the models trained below start from "google/bert_uncased_L-2_H-128_A-2", not from
# this checkpoint; this assumes both use the same vocabulary — confirm.
tokenizer = BertTokenizer.from_pretrained("ndavid/autotrain-trec-fine-bert-739422530")

In [7]:
# Every example is truncated / padded to exactly this many tokens.
MAX_LENGTH = 300


def tokenize_batch(batch):
    """Tokenize the "sentence" column of a batch with the notebook's tokenizer.

    Args:
        batch: a batch passed in by `Dataset.map(..., batched=True)`; only its
            "sentence" entry is read.

    Returns:
        The tokenizer output for the batch, padded/truncated to MAX_LENGTH tokens.
    """
    # NOTE(review): return_tensors="pt" is kept exactly as in the original calls;
    # it is probably redundant inside Dataset.map — confirm before removing.
    return tokenizer(
        batch["sentence"],
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        max_length=MAX_LENGTH,
    )


# One shared tokenization function replaces four copy-pasted lambdas, so all
# splits are guaranteed to be tokenized with identical settings.
train = train.map(tokenize_batch, batched=True, desc="Tokenizing the train dataset")
eval = eval.map(tokenize_batch, batched=True, desc="Tokenizing the eval dataset")
test = test.map(tokenize_batch, batched=True, desc="Tokenizing the test dataset")

train_aug = train_aug.map(tokenize_batch, batched=True, desc="Tokenizing the augmented dataset")

In [13]:
# Re-seed via the project helper before the baseline model is created.
# NOTE(review): presumably this makes the freshly initialised classifier head reproducible — confirm in `base`.
base.reset_seed()

In [14]:
# Baseline model: the "google/bert_uncased_L-2_H-128_A-2" checkpoint with a 50-label
# sequence-classification head. The checkpoint has no classifier weights, so the head is
# newly initialised (this is what the warning in the cell output reports).
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Training configuration for the baseline run, built by the project helper
# `base.get_training_args`: output under ~/results/{DATASET}/bert-base_fine, logs under
# ~/logs/{DATASET}/bert-base_fine, batch size 128, 20 epochs.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base_fine", logging_dir=f"~/logs/{DATASET}/bert-base_fine", batch_size=128, epochs=20)

In [16]:
# Baseline run: a standard Trainer on the original training split, validated on the
# eval split and scored with base.compute_metrics. An EarlyStoppingCallback with a
# patience of 3 is attached so training may end before the configured epoch count.
stop_early = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=base.compute_metrics,
    train_dataset=train,
    eval_dataset=eval,
    callbacks=[stop_early],
)

In [17]:
# Run the baseline training loop; the per-epoch metrics and the final TrainOutput
# are shown as this cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.782,3.659952,0.180568,0.016891,0.021096,0.008066
2,3.5999,3.50454,0.179652,0.023548,0.020822,0.007605
3,3.4638,3.361355,0.23923,0.075162,0.03832,0.0323
4,3.3248,3.230582,0.368469,0.068824,0.077057,0.063401
5,3.2237,3.117347,0.401467,0.076742,0.086241,0.066248
6,3.104,3.015311,0.417965,0.093694,0.093164,0.073303
7,3.003,2.925939,0.429881,0.088959,0.100196,0.079902
8,2.9274,2.845344,0.448213,0.085833,0.107519,0.084251
9,2.8458,2.773612,0.455545,0.10441,0.111707,0.087127
10,2.7806,2.710714,0.469294,0.104791,0.12183,0.097324


TrainOutput(global_step=700, training_loss=2.881428451538086, metrics={'train_runtime': 125.8131, 'train_samples_per_second': 693.251, 'train_steps_per_second': 5.564, 'total_flos': 65900954952000.0, 'train_loss': 2.881428451538086, 'epoch': 20.0})

In [18]:
# Put the baseline model into evaluation mode. `eval()` returns the module itself,
# which is why the model's repr is echoed as the cell output.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [19]:
# Score the baseline model on the test split (passed explicitly instead of the
# eval split given to the Trainer); the returned metrics dict is displayed below.
trainer.evaluate(test)

{'eval_loss': 2.5368008613586426,
 'eval_accuracy': 0.566,
 'eval_precision': 0.12189557405215357,
 'eval_recall': 0.18359783242761965,
 'eval_f1': 0.12798195561772654,
 'eval_runtime': 3.129,
 'eval_samples_per_second': 159.794,
 'eval_steps_per_second': 1.278,
 'epoch': 20.0}

In [16]:
# Persist the baseline model's weights (state_dict only) under ~/models/{DATASET}/.
save_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert_fine.pth"
# torch.save does not create missing parent directories, so make sure the target
# folder exists first; otherwise the save fails after a full training run.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(model.state_dict(), save_path)

In [17]:
# Re-seed via the project helper before the distillation student is created.
# NOTE(review): presumably so this run starts from the same RNG state as the baseline — confirm in `base`.
base.reset_seed()

In [20]:
# Distillation student: same "google/bert_uncased_L-2_H-128_A-2" checkpoint and 50-label
# head as the baseline model; the classifier weights are newly initialised (see the warning
# in the cell output).
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Training configuration for the distillation run: output under
# ~/results/{DATASET}/bert-distill_fine, logs under ~/logs/{DATASET}/bert-distill_fine,
# batch size 128, 20 epochs.
# NOTE(review): `temp=5` and `lambda_param=.5` are presumably the distillation temperature and
# the weight mixing the hard-label and soft-label losses — confirm in `base`.
# NOTE(review): `remove_unused_columns=False` presumably keeps the stored teacher-logit column
# available to `base.DistilTrainer` — confirm.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distill_fine", logging_dir=f"~/logs/{DATASET}/bert-distill_fine", remove_unused_columns=False, batch_size=128, epochs=20, temp=5, lambda_param=.5)

In [22]:
# Distillation run: the project's DistilTrainer on the original training split,
# validated on the eval split and scored with base.compute_metrics. An
# EarlyStoppingCallback with a patience of 3 is attached.
stop_early = EarlyStoppingCallback(early_stopping_patience=3)
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    compute_metrics=base.compute_metrics,
    train_dataset=train,
    eval_dataset=eval,
    callbacks=[stop_early],
)

In [23]:
# Run the distillation training loop; the per-epoch metrics and the final TrainOutput
# are shown as this cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4076,2.324112,0.176902,0.003538,0.02,0.006012
2,2.2943,2.239185,0.176902,0.003538,0.02,0.006012
3,2.2213,2.169806,0.176902,0.003538,0.02,0.006012
4,2.1539,2.102734,0.293309,0.02665,0.05274,0.033642
5,2.098,2.039486,0.307058,0.035465,0.056396,0.034119
6,2.0331,1.97701,0.337305,0.054202,0.063994,0.042839
7,1.9725,1.914761,0.404216,0.0781,0.090593,0.071845
8,1.917,1.860798,0.430797,0.073561,0.100584,0.077952
9,1.8701,1.811178,0.436297,0.068977,0.103844,0.079061
10,1.8253,1.768889,0.461045,0.104244,0.117568,0.09503


TrainOutput(global_step=700, training_loss=1.8798588562011718, metrics={'train_runtime': 128.1475, 'train_samples_per_second': 680.622, 'train_steps_per_second': 5.462, 'total_flos': 65900954952000.0, 'train_loss': 1.8798588562011718, 'epoch': 20.0})

In [24]:
# Put the distilled student into evaluation mode. `eval()` returns the module itself,
# which is why the model's repr is echoed as the cell output.
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [25]:
# Score the distilled student on the test split; the returned metrics dict is displayed below.
# NOTE(review): eval_loss here comes from DistilTrainer and may not be on the same scale as the
# baseline's loss — confirm before comparing the two loss values directly.
trainer.evaluate(test)

{'eval_loss': 1.6074937582015991,
 'eval_accuracy': 0.556,
 'eval_precision': 0.16645158090077605,
 'eval_recall': 0.20287909490037145,
 'eval_f1': 0.15261276616621097,
 'eval_runtime': 3.4842,
 'eval_samples_per_second': 143.506,
 'eval_steps_per_second': 1.148,
 'epoch': 20.0}

In [24]:
# Persist the distilled student's weights (state_dict only) under ~/models/{DATASET}/.
save_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil_fine.pth"
# torch.save does not create missing parent directories, so make sure the target
# folder exists first; otherwise the save fails after a full training run.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(student_model.state_dict(), save_path)

In [25]:
# Re-seed via the project helper before the augmented-data baseline is created.
# NOTE(review): presumably so this run starts from the same RNG state as the earlier runs — confirm in `base`.
base.reset_seed()

In [28]:
# Fresh baseline model for the augmented-data run. This rebinds `model`, so the first
# baseline is no longer reachable under that name. Same checkpoint and 50-label head as
# before; the classifier weights are newly initialised (see the warning in the cell output).
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Training configuration for the baseline on augmented data: output under
# ~/results/{DATASET}/bert-base-aug_fine, logs under ~/logs/{DATASET}/bert-base-aug_fine,
# batch size 128, 10 epochs (the non-augmented runs above use 20).
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base-aug_fine", logging_dir=f"~/logs/{DATASET}/bert-base-aug_fine", batch_size=128, epochs=10)

In [31]:
# Baseline run on the augmented training set: a standard Trainer, validated on the
# eval split and scored with base.compute_metrics. The EarlyStoppingCallback uses a
# patience of 4 here (3 in the non-augmented runs).
stop_early = EarlyStoppingCallback(early_stopping_patience=4)
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=base.compute_metrics,
    train_dataset=train_aug,
    eval_dataset=eval,
    callbacks=[stop_early],
)

In [32]:
# Train the baseline on the augmented data; the per-epoch metrics and the final
# TrainOutput are shown as this cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8241,2.13618,0.622365,0.301445,0.262651,0.258269
2,1.6022,1.491287,0.736022,0.410205,0.412482,0.392001
3,1.0542,1.241309,0.754354,0.446046,0.438159,0.419732
4,0.7693,1.122047,0.767186,0.481814,0.469729,0.45606
5,0.5986,1.071303,0.769936,0.491996,0.493517,0.483893
6,0.4918,1.041133,0.774519,0.522023,0.515877,0.511503
7,0.4186,1.025562,0.769936,0.517848,0.51347,0.508353
8,0.3742,1.014744,0.767186,0.533035,0.515185,0.50933
9,0.3429,1.013984,0.772686,0.534928,0.518609,0.512396
10,0.3292,1.011537,0.774519,0.556973,0.527321,0.524949


TrainOutput(global_step=5250, training_loss=0.8805165405273437, metrics={'train_runtime': 361.447, 'train_samples_per_second': 1855.874, 'train_steps_per_second': 14.525, 'total_flos': 506837429280000.0, 'train_loss': 0.8805165405273437, 'epoch': 10.0})

In [33]:
# Put the augmented-data baseline into evaluation mode. `eval()` returns the module
# itself, which is why the model's repr is echoed as the cell output.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [34]:
# Score the augmented-data baseline on the test split; the returned metrics dict is displayed below.
trainer.evaluate(test)

{'eval_loss': 1.0670723915100098,
 'eval_accuracy': 0.75,
 'eval_precision': 0.495292131023191,
 'eval_recall': 0.5526976371772174,
 'eval_f1': 0.4961979675487844,
 'eval_runtime': 3.236,
 'eval_samples_per_second': 154.512,
 'eval_steps_per_second': 1.236,
 'epoch': 10.0}

In [None]:
# Persist the augmented-data baseline's weights (state_dict only) under ~/models/{DATASET}/.
save_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert-base-aug_fine.pth"
# torch.save does not create missing parent directories, so make sure the target
# folder exists first; otherwise the save fails after a full training run.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(model.state_dict(), save_path)

In [None]:
# Re-seed via the project helper before the augmented-data distillation student is created.
# NOTE(review): presumably so this run starts from the same RNG state as the earlier runs — confirm in `base`.
base.reset_seed()

In [35]:
# Fresh distillation student for the augmented-data run. This rebinds `student_model`.
# Same checkpoint and 50-label head as before; the classifier weights are newly
# initialised (see the warning in the cell output).
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Training configuration for distillation on augmented data: output under
# ~/results/{DATASET}/bert-distill-aug_fine, logs under ~/logs/{DATASET}/bert-distill-aug_fine,
# batch size 128, 10 epochs.
# NOTE(review): `temp=5` and `lambda_param=.5` are presumably the distillation temperature and
# the weight mixing the hard-label and soft-label losses — confirm in `base`.
# NOTE(review): `remove_unused_columns=False` presumably keeps the stored teacher-logit column
# available to `base.DistilTrainer` — confirm.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distill-aug_fine", logging_dir=f"~/logs/{DATASET}/bert-distill-aug_fine", remove_unused_columns=False, batch_size=128, epochs=10, temp=5, lambda_param=.5)

In [37]:
# Distillation run on the augmented training set: the project's DistilTrainer,
# validated on the eval split and scored with base.compute_metrics. The
# EarlyStoppingCallback uses a patience of 4 here (3 in the non-augmented runs).
stop_early = EarlyStoppingCallback(early_stopping_patience=4)
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    compute_metrics=base.compute_metrics,
    train_dataset=train_aug,
    eval_dataset=eval,
    callbacks=[stop_early],
)

In [38]:
# Train the student on the augmented data; the per-epoch metrics and the final
# TrainOutput are shown as this cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8156,1.408841,0.560953,0.202901,0.197673,0.172608
2,1.0825,1.010816,0.705775,0.295062,0.315894,0.29077
3,0.7563,0.858956,0.732356,0.430171,0.371395,0.359406
4,0.5883,0.778024,0.743355,0.436063,0.402083,0.391438
5,0.4861,0.734973,0.756187,0.448253,0.429689,0.418087
6,0.4202,0.707767,0.761687,0.494127,0.456463,0.455369
7,0.3734,0.689448,0.766269,0.476383,0.455588,0.450042
8,0.345,0.679557,0.765353,0.473426,0.457292,0.451839
9,0.3266,0.676093,0.769936,0.505999,0.469333,0.4669
10,0.3164,0.672138,0.772686,0.517882,0.477052,0.478183


TrainOutput(global_step=5250, training_loss=0.6510405128115699, metrics={'train_runtime': 366.2537, 'train_samples_per_second': 1831.517, 'train_steps_per_second': 14.334, 'total_flos': 506837429280000.0, 'train_loss': 0.6510405128115699, 'epoch': 10.0})

In [39]:
# Put the augmented-data student into evaluation mode. `eval()` returns the module
# itself, which is why the model's repr is echoed as the cell output.
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [40]:
# Score the augmented-data student on the test split; the returned metrics dict is displayed below.
# NOTE(review): eval_loss here comes from DistilTrainer and may not be on the same scale as the
# baseline's loss — confirm before comparing the two loss values directly.
trainer.evaluate(test)

{'eval_loss': 0.6992137432098389,
 'eval_accuracy': 0.742,
 'eval_precision': 0.4123024488061562,
 'eval_recall': 0.48892849367182883,
 'eval_f1': 0.4244696974848007,
 'eval_runtime': 3.8399,
 'eval_samples_per_second': 130.213,
 'eval_steps_per_second': 1.042,
 'epoch': 10.0}

In [None]:
# Persist the augmented-data student's weights (state_dict only) under ~/models/{DATASET}/.
save_path = f"{os.path.expanduser('~')}/models/{DATASET}/bert-distil-aug_fine.pth"
# torch.save does not create missing parent directories, so make sure the target
# folder exists first; otherwise the save fails after a full training run.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(student_model.state_dict(), save_path)

In [27]:
import time
from itertools import islice
from torch.utils.data import DataLoader


def measure_latency(model, dataset, device, n_samples=1000, n_warmup=10):
    """Return the mean single-example forward-pass latency in milliseconds.

    The model is moved to `device` and switched to evaluation mode. It is then
    called on batches of size 1 built from the `input_ids` and `attention_mask`
    columns of `dataset`.

    Args:
        model: module invoked as `model(input_ids=..., attention_mask=...)`.
            It is left on `device` and in eval mode afterwards.
        dataset: Hugging Face dataset providing `input_ids` and `attention_mask`.
            It is not modified; a formatted view is created instead.
        device: device to benchmark on, e.g. "cuda" or "cpu".
        n_samples: maximum number of timed forward passes (the first
            `n_samples` examples of `dataset`, in order).
        n_warmup: number of untimed forward passes run first, so one-off
            start-up cost is not averaged into the result.

    Returns:
        Average latency per forward pass, in milliseconds.

    Raises:
        ValueError: if there is nothing to time (empty dataset or `n_samples` <= 0).
    """
    target = torch.device(device)
    on_gpu = target.type == "cuda"

    model.to(target)
    model.eval()  # make sure dropout is disabled while timing

    # with_format returns a new view, so the shared dataset keeps its original
    # format and the training cells above stay re-runnable.
    view = dataset.with_format(type="torch", columns=["input_ids", "attention_mask"], device=str(target))
    loader = DataLoader(view, batch_size=1, shuffle=False)

    warmup_batch = next(iter(loader), None)
    if warmup_batch is None:
        raise ValueError("Cannot measure latency on an empty dataset.")

    if on_gpu:
        starter = torch.cuda.Event(enable_timing=True)
        ender = torch.cuda.Event(enable_timing=True)

    timings_ms = []
    with torch.no_grad():
        for _ in range(n_warmup):
            model(**warmup_batch)

        for batch in islice(loader, n_samples):
            if on_gpu:
                # CUDA kernels run asynchronously: synchronize around the
                # recorded events so the measured interval covers the forward pass.
                torch.cuda.synchronize()
                starter.record()
                model(**batch)
                ender.record()
                torch.cuda.synchronize()
                timings_ms.append(starter.elapsed_time(ender))
            else:
                started = time.perf_counter()
                model(**batch)
                timings_ms.append((time.perf_counter() - started) * 1000)

    if not timings_ms:
        raise ValueError("No forward passes were timed; n_samples must be positive.")
    return sum(timings_ms) / len(timings_ms)


# Report the size / parameter count of the current `model`.
base.count_parameters(model)

# GPU timing is skipped on CPU-only machines instead of crashing the cell.
if torch.cuda.is_available():
    gpu_ms = measure_latency(model, train, "cuda")
    print(f"Average Inference Time on GPU: {gpu_ms:.3f} ms")
else:
    print("GPU is not available, skipping the GPU measurement.")

# The CPU measurement runs last, so the model ends up on the CPU (as before).
cpu_ms = measure_latency(model, train, "cpu")
print(f"Average Inference Time on CPU: {cpu_ms:.3f} ms")

model size: 16.763MB.
Total Trainable Params: 4392370.
Average Inference Time on GPU: 2.739 ms
Average Inference Time on CPU: 8.063 ms
