#  🤗 Transformers Finetuning

__Автор задач: Блохин Н.В. (NVBlokhin@fa.ru)__

Материалы:
* https://huggingface.co/docs/transformers/training
* https://huggingface.co/docs/datasets/main/en/repository_structure
* https://huggingface.co/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset
* https://huggingface.co/docs/transformers/v4.35.2/en/training#prepare-a-dataset
* https://huggingface.co/docs/datasets/process
* https://huggingface.co/docs/evaluate/index
* https://huggingface.co/docs/transformers/main_classes/trainer
* https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/trainer#transformers.TrainingArguments

## Задачи для совместного разбора

1\. Обсудите основные шаги по дообучению моделей из экосистемы 🤗 Transformers.

## Задачи для самостоятельного решения

In [1]:
!pip install datasets

Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [3]:
!pip install torchmetrics

Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.1


In [4]:
import torch as th
import torch.nn as nn
import torch.optim as optim
import transformers
import pandas as pd
import datasets
import torchmetrics as M
from transformers import AutoTokenizer
from torch.utils.data.dataloader import DataLoader
from sklearn.model_selection import train_test_split

<p class="task" id="1"></p>

1\. Разбейте данные из файла `reviews_polarity.csv` на обучающее и валидационное множество в соотношении 80 на 20. Создайте папку `reviews_polarity_dataset` и сохраните в нее полученные фрагменты данных под названием `train.csv` и `test.csv`. Создайте объект `datasets.Dataset`, используя функцию `load_dataset`.

Токенизируйте строки при помощи токенизатора, соотвествующего модели `rubert-base-cased-sentiment`. Удалите из датасета поле `text` после токенизации, замените поле `class` на `labels` и приведите данные к тензорам `torch`.

Создайте два `DataLoader` на основе обучающего и валидационного множества. Получите батч из обучающего множества и выведите его на экран.

- [ ] Проверено на семинаре

In [5]:
df = pd.read_csv('reviews_polarity.csv')
df.head(1)

Unnamed: 0,text,class
0,"Очень хорошо что открылась 5 ка, теперь не над...",1


In [6]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
train.shape, test.shape

((30574, 2), (7644, 2))

In [8]:
train.to_csv('reviews_polarity_dataset/train.csv', index=None)
test.to_csv('reviews_polarity_dataset/test.csv', index=None)

In [9]:
dataset = datasets.load_dataset('reviews_polarity_dataset')

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [10]:
tokenizer = AutoTokenizer.from_pretrained('blanchefort/rubert-base-cased-sentiment')

tokenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
def tokenize_function(ex):
    return tokenizer(ex['text'], padding=True, truncation=True)

dset_tok = dataset.map(tokenize_function, batched=True, batch_size=64)

Map:   0%|          | 0/30574 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/7644 [00:00<?, ? examples/s]

In [12]:
dset_tok = dset_tok.rename_column("class", "labels")
dset_tok = dset_tok.remove_columns('text')
dset_tok.set_format(type='torch')

In [13]:
dset_tok

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 30574
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7644
    })
})

In [14]:
train_loader = DataLoader(dset_tok['train'], batch_size=64)
test_loader = DataLoader(dset_tok['test'], batch_size=64)

for batch in train_loader:
  print(batch)
  break

{'labels': tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0]), 'input_ids': tensor([[  101, 14220,  1297,  ...,     0,     0,     0],
        [  101,  3065, 31617,  ...,     0,     0,     0],
        [  101, 82963,  5187,  ...,     0,     0,     0],
        ...,
        [  101,  1067, 29878,  ...,     0,     0,     0],
        [  101,  2937,   883,  ...,     0,     0,     0],
        [  101,  8104,   102,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0

<p class="task" id="2"></p>

2\. Создайте модель при помощи класса `AutoModelForSequenceClassification`, заменив голову модели в соответствии с задачей бинарной классификации. Используя стандартный цикл обучения `torch`, настройте модель для решения задачи бинарной классификации. Во время обучения выводите на экран значение функции потерь (используйте готовые значения, которые генерирует модель) на обучающем множестве и f1 на валидационном множестве.

Здесь и далее для ускорения процесса обучения вы можете заморозить часть сети или уменьшить размер наборов данных, выбрав небольшое подмножество примеров.

- [ ] Проверено на семинаре

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
model_path = "blanchefort/rubert-base-cased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    ignore_mismatched_sizes=True
)
model.to(device='cuda')
model.requires_grad_(False)
model.classifier.requires_grad_(True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at blanchefort/rubert-base-cased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Linear(in_features=768, out_features=2, bias=True)

In [None]:
n_epoch = 5
lr = 0.01
optimizer = optim.Adam(model.parameters(), lr)

for epoch in range(1, n_epoch+1):
  batch_count = 50
  batch_calculated = 0
  f1_train = M.F1Score(task='binary', num_classes=2).to(device='cuda')
  for batch in train_loader:
    for k in batch.keys():
      batch[k] = batch[k].to(device='cuda')
    out = model(**batch)
    f1_train.update(out.logits.argmax(dim=1).detach(), batch['labels'])
    loss = out.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if batch_calculated == batch_count:
      break
    else:
      batch_calculated += 1

  f1_test = M.F1Score(task='binary', num_classes=2).to(device='cuda')
  for batch in test_loader:
    batch = {k: v.to(device='cuda') for k, v in batch.items()}
    out = model(**batch)
    f1_test.update(out.logits.argmax(dim=1).detach(), batch['labels'])
  print(f'{epoch=} {loss.item()=:.5f} f1_train={f1_train.compute().item()} f1_test={f1_test.compute().item()}')

epoch=1 loss.item()=0.70060 f1_train=0.8751919865608215 f1_test=0.866378664970398
epoch=2 loss.item()=0.53935 f1_train=0.8796539902687073 f1_test=0.8676685690879822
epoch=3 loss.item()=0.55161 f1_train=0.8804864287376404 f1_test=0.8736593723297119
epoch=4 loss.item()=0.56151 f1_train=0.8776484131813049 f1_test=0.8775827288627625
epoch=5 loss.item()=0.55406 f1_train=0.8784868121147156 f1_test=0.8757280111312866


<p class="task" id="3"></p>

3\. Создайте модель при помощи класса `AutoModelForSequenceClassification`, заменив голову модели в соответствии с задачей бинарной классификации. Используя `transformers.Trainer`, настройте модель для решения задачи бинарной классификации. При настройке `Trainer` укажите количество эпох, равное 5. Во время обучения выводите на экран значение функции потерь на обучающем множестве и f1 на валидационном множестве.  

- [ ] Проверено на семинаре


In [None]:
model_path = "blanchefort/rubert-base-cased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    ignore_mismatched_sizes=True
)
model.to(device='cuda')
model.requires_grad_(False)
model.classifier.requires_grad_(True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at blanchefort/rubert-base-cased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Linear(in_features=768, out_features=2, bias=True)

In [None]:
!pip install accelerate -U

In [None]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds)
    return {'f1_score': f1}

In [None]:
optimizer = optim.Adam(model.parameters(), 0.01)
args = transformers.TrainingArguments(
    output_dir='out',
    learning_rate=0.01,
    num_train_epochs=3,
    evaluation_strategy='epoch'
)

trainer = transformers.Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=dset_tok['train'],
    eval_dataset=dset_tok['test'],
    tokenizer=tokenizer,
    optimizers=[optimizer, None]
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Score
1,0.7381,0.850403,0.898968
2,0.5388,0.47076,0.89804
3,0.4266,0.368147,0.899733


TrainOutput(global_step=11466, training_loss=0.5993851988223675, metrics={'train_runtime': 449.8143, 'train_samples_per_second': 203.911, 'train_steps_per_second': 25.491, 'total_flos': 2157179709577680.0, 'train_loss': 0.5993851988223675, 'epoch': 3.0})

<p class="task" id="4"></p>

4\. Используя эмбеддинги `distiluse-base-multilingual-cased-v1` из пакета `sentence_transformers`, решите задачу бинарной классификации. Для этого добавьте несколько полносвязных слоев поверх модели `SentenceTransformer`. Заморозьте часть модели, отвечающей за генерацию эмбеддингов. Во время обучения выводите на экран значение функции потерь на обучающем множестве и f1 на валидационном множестве.  

- [ ] Проверено на семинаре

In [15]:
!pip install sentence_transformers

Installing collected packages: sentencepiece, sentence_transformers
Successfully installed sentence_transformers-2.2.2 sentencepiece-0.1.99


In [16]:
from sentence_transformers import SentenceTransformer

In [None]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [48]:
class Net(nn.Module):
  def __init__(self):
    super().__init__()
    self.base_model = SentenceTransformer('distiluse-base-multilingual-cased-v1', device='cuda')
    self.base_model.requires_grad_(False)
    self.classifier  = nn.Sequential(
        nn.Linear(512, 64),
        nn.ReLU(),
        nn.Linear(64, 2)
    )

  def forward(self, X):
    emb = self.base_model(X)['sentence_embedding']
    out = self.classifier(emb)
    return out

In [53]:
n_epoch = 5
lr = 0.001
model = Net()
model.to(device='cuda')
optimizer = optim.Adam(model.parameters(), lr)
crit = nn.CrossEntropyLoss(ignore_index=0)

for epoch in range(1, n_epoch+1):
  for batch in train_loader:
    batch = {k: v.to(device='cuda') for k, v in batch.items()}
    y = batch.pop('labels')
    batch.pop('token_type_ids')
    out = model(batch)
    loss = crit(out, y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

  f1_test = M.F1Score(task='binary', num_classes=2).to(device='cuda')
  for batch in test_loader:
    batch = {k: v.to(device='cuda') for k, v in batch.items()}
    y = batch.pop('labels')
    batch.pop('token_type_ids')
    out = model(batch)
    f1_test.update(out.argmax(dim=1), y)

  print(f'{epoch=} loss={loss.item():.5f} f1_test={f1_test.compute().item():.5f}')

epoch=1 loss=0.00069 f1_test=0.88295
epoch=2 loss=0.00015 f1_test=0.88295
epoch=3 loss=0.00006 f1_test=0.88295
epoch=4 loss=0.00003 f1_test=0.88295
epoch=5 loss=0.00002 f1_test=0.88295


## Обратная связь
- [ ] Хочу получить обратную связь по решению