In [None]:
%pip install transformers[torch] huggingface_hub datasets evaluate

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting huggingface_hub
  Downloading huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting torchtext==0.18.0
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting transformers[torch]
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting accelerate>=0.26.0 (from transformers[torch])
  Downloading accelerate-1.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3

In [3]:
#Torchtext není k dispozici pro poslední verzi pytorch, budeme tedy využuívat něco jiného ...


In [1]:
from transformers import Trainer, BertForSequenceClassification
from datasets import load_from_disk

import torch
import base

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base.reset_seed()

In [11]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [12]:
train = load_from_disk('./data/sst2/train-logits')
eval = load_from_disk('./data/sst2/eval-logits')
test = load_from_disk('./data/sst2/test-logits')

In [42]:
base.reset_seed()

In [43]:
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
#model = base.freeze_model(model)

In [45]:
training_args = base.get_training_args(output_dir="./results/bertTest", logging_dir="./logs/bertTest", batch_size=32, epochs=10)

In [46]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)

In [47]:
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=2170, training_loss=0.49731627081945745, metrics={'train_runtime': 58.6215, 'train_samples_per_second': 1180.454, 'train_steps_per_second': 37.017, 'total_flos': 10989712896000.0, 'train_loss': 0.49731627081945745, 'epoch': 10.0})

In [48]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [49]:
trainer.evaluate(test)

{'eval_loss': 0.5364744067192078,
 'eval_accuracy': 0.7534321801208127,
 'eval_precision': 0.7535666695634542,
 'eval_recall': 0.753412512303862,
 'eval_f1': 0.7533893435762404,
 'eval_runtime': 3.131,
 'eval_samples_per_second': 581.61,
 'eval_steps_per_second': 18.205,
 'epoch': 10.0}

In [71]:
base.reset_seed()

In [72]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [73]:
training_args = base.get_training_args(output_dir="./results/bertTestDistil", logging_dir="./logs/bertTestDistil", remove_unused_columns=False, batch_size=32, epochs=10, temp=5, lambda_param=.5)

In [74]:
trainer = base.ImageDistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)

In [75]:
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=2170, training_loss=2.4708346177905387, metrics={'train_runtime': 71.8087, 'train_samples_per_second': 963.672, 'train_steps_per_second': 30.219, 'total_flos': 10989712896000.0, 'train_loss': 2.4708346177905387, 'epoch': 10.0})

In [76]:
student_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [77]:
trainer.evaluate(test)

{'eval_loss': 2.2033705711364746,
 'eval_accuracy': 0.7468423942888522,
 'eval_precision': 0.7468420163809846,
 'eval_recall': 0.7468426118927682,
 'eval_f1': 0.7468420889151719,
 'eval_runtime': 3.2789,
 'eval_samples_per_second': 555.364,
 'eval_steps_per_second': 17.384,
 'epoch': 10.0}