In [1]:
%pip install transformers[torch] huggingface_hub datasets evaluate ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting huggingface_hub
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting transformers[torch]
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting accelerate>=0.26.0 (from transformers[torch])
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylin

In [2]:
#Torchtext není k dispozici pro poslední verzi pytorch, budeme tedy využuívat něco jiného ...


In [3]:
from transformers import Trainer, BertForSequenceClassification, BertTokenizer
from datasets import load_from_disk

import torch
import base

In [4]:
base.reset_seed()

In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
train = load_from_disk('./data/sst2/train-logits')
eval = load_from_disk('./data/sst2/eval-logits')
test = load_from_disk('./data/sst2/test-logits')

train_aug = load_from_disk('./data/sst2/train-logits-augmented')

In [7]:
tokenizer = BertTokenizer.from_pretrained("gchhablani/bert-base-cased-finetuned-sst2")

In [8]:
train = train.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the train dataset")
eval = eval.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the eval dataset")
test = test.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the test dataset")

train_aug = train_aug.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the augmented dataset")

In [9]:
base.reset_seed()

In [10]:
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = base.get_training_args(output_dir="./results/bert-base", logging_dir="./logs/bert-base", batch_size=128, epochs=10)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6337,0.549533,0.719037,0.719241,0.719258,0.719036
2,0.4653,0.484065,0.774083,0.774053,0.773901,0.773951
3,0.3762,0.477339,0.784404,0.786395,0.783573,0.783634
4,0.3273,0.489337,0.788991,0.791029,0.788162,0.788238
5,0.2941,0.46883,0.801606,0.80154,0.801602,0.801561
6,0.2765,0.473168,0.801606,0.801566,0.801476,0.801511
7,0.2613,0.492721,0.795872,0.797331,0.795171,0.795302
8,0.2531,0.480232,0.808486,0.808564,0.808653,0.80848
9,0.2445,0.491007,0.806193,0.806134,0.806106,0.806119
10,0.2406,0.494819,0.806193,0.806301,0.805938,0.806033


TrainOutput(global_step=4210, training_loss=0.3372740793114886, metrics={'train_runtime': 318.3798, 'train_samples_per_second': 1692.287, 'train_steps_per_second': 13.223, 'total_flos': 401089284540000.0, 'train_loss': 0.3372740793114886, 'epoch': 10.0})

In [14]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [15]:
trainer.evaluate(test)

{'eval_loss': 0.27465716004371643,
 'eval_accuracy': 0.8951744617668894,
 'eval_precision': 0.8931336127022499,
 'eval_recall': 0.8953544607934005,
 'eval_f1': 0.8940722936529324,
 'eval_runtime': 8.1757,
 'eval_samples_per_second': 1647.567,
 'eval_steps_per_second': 12.965,
 'epoch': 10.0}

In [16]:
torch.save(model, './models/sst2/bert.pth')

In [17]:
base.reset_seed()

In [18]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = base.get_training_args(output_dir="./results/bertTestDistil", logging_dir="./logs/bertTestDistil", remove_unused_columns=False, batch_size=128, epochs=10, temp=5, lambda_param=.5)

In [20]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1185,2.398569,0.704128,0.707405,0.705165,0.703565
2,2.3283,1.98921,0.759174,0.76004,0.758546,0.758614
3,1.7786,1.836033,0.779817,0.780384,0.779321,0.779441
4,1.4694,1.852941,0.779817,0.784957,0.778522,0.778219
5,1.2653,1.712456,0.786697,0.787427,0.786162,0.786291
6,1.1496,1.70417,0.797018,0.797054,0.796803,0.796877
7,1.0736,1.813394,0.783257,0.786962,0.782152,0.782046
8,1.023,1.70336,0.801606,0.801831,0.801854,0.801605
9,0.9819,1.723154,0.800459,0.800392,0.800392,0.800392
10,0.9621,1.736029,0.799312,0.799481,0.799013,0.799119


TrainOutput(global_step=4210, training_loss=1.5150246663218156, metrics={'train_runtime': 382.8306, 'train_samples_per_second': 1407.385, 'train_steps_per_second': 10.997, 'total_flos': 401089284540000.0, 'train_loss': 1.5150246663218156, 'epoch': 10.0})

In [22]:
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [23]:
trainer.evaluate(test)

{'eval_loss': 1.0660582780838013,
 'eval_accuracy': 0.8928730512249443,
 'eval_precision': 0.8908336252121414,
 'eval_recall': 0.8929261081128524,
 'eval_f1': 0.8917283847291158,
 'eval_runtime': 9.0177,
 'eval_samples_per_second': 1493.731,
 'eval_steps_per_second': 11.755,
 'epoch': 10.0}

In [24]:
torch.save(student_model, './models/sst2/bert-distil.pth')

In [25]:
base.reset_seed()

In [26]:
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
training_args = base.get_training_args(output_dir="./results/bert-base", logging_dir="./logs/bert-base", batch_size=128, epochs=10)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.388,0.413457,0.827982,0.827978,0.82784,0.827891
2,0.2557,0.423607,0.825688,0.825688,0.825798,0.825673
3,0.2209,0.428542,0.829128,0.829834,0.829555,0.829117
4,0.1958,0.45746,0.826835,0.827262,0.826461,0.826616
5,0.1758,0.499291,0.815367,0.816155,0.814863,0.815034
6,0.1615,0.522447,0.813073,0.813192,0.812821,0.81292
7,0.1497,0.541583,0.819954,0.819935,0.820041,0.819935
8,0.141,0.56763,0.813073,0.813078,0.812905,0.812965
9,0.1351,0.574606,0.81078,0.811162,0.8104,0.81054
10,0.1312,0.581769,0.813073,0.813078,0.812905,0.812965


TrainOutput(global_step=39660, training_loss=0.1954774442010015, metrics={'train_runtime': 3540.9658, 'train_samples_per_second': 1433.547, 'train_steps_per_second': 11.2, 'total_flos': 3778810595640000.0, 'train_loss': 0.1954774442010015, 'epoch': 10.0})

In [30]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [31]:
trainer.evaluate(test)

{'eval_loss': 0.21611741185188293,
 'eval_accuracy': 0.9204157386785449,
 'eval_precision': 0.9185639385834699,
 'eval_recall': 0.9209728140334594,
 'eval_f1': 0.9195838809560937,
 'eval_runtime': 8.8313,
 'eval_samples_per_second': 1525.249,
 'eval_steps_per_second': 12.003,
 'epoch': 10.0}

In [32]:
torch.save(model, './models/sst2/bert-base-aug.pth')

In [33]:
base.reset_seed()

In [34]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
training_args = base.get_training_args(output_dir="./results/bertTestDistil", logging_dir="./logs/bertTestDistil", remove_unused_columns=False, batch_size=128, epochs=10, temp=5, lambda_param=.5)

In [36]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)

In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4487,1.315928,0.827982,0.827924,0.827924,0.827924
2,0.8025,1.284763,0.826835,0.826773,0.82684,0.826796
3,0.6732,1.266073,0.833716,0.833668,0.833639,0.833652
4,0.5864,1.306508,0.823394,0.823655,0.823082,0.823212
5,0.521,1.358838,0.822248,0.824499,0.821451,0.821638
6,0.4754,1.414337,0.818807,0.819536,0.818325,0.818498
7,0.4408,1.431028,0.819954,0.820162,0.819662,0.819781
8,0.4143,1.457472,0.823394,0.824142,0.822914,0.823093
9,0.4004,1.474662,0.81422,0.815245,0.813652,0.813828
10,0.3874,1.4832,0.816514,0.817094,0.816073,0.816234


TrainOutput(global_step=39660, training_loss=0.6149963132672639, metrics={'train_runtime': 3097.5646, 'train_samples_per_second': 1638.752, 'train_steps_per_second': 12.804, 'total_flos': 3778810595640000.0, 'train_loss': 0.6149963132672639, 'epoch': 10.0})

In [38]:
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [39]:
trainer.evaluate(test)

{'eval_loss': 0.6758776903152466,
 'eval_accuracy': 0.9255382331106162,
 'eval_precision': 0.924193908662698,
 'eval_recall': 0.9250594547219044,
 'eval_f1': 0.924607041195131,
 'eval_runtime': 9.1618,
 'eval_samples_per_second': 1470.237,
 'eval_steps_per_second': 11.57,
 'epoch': 10.0}

In [40]:
torch.save(student_model, './models/sst2/bert-distil-aug.pth')