<a href="https://colab.research.google.com/github/AndreySerdyukov/ML-projects/blob/main/bert_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Импорт библиотек

In [None]:
# Third-party packages this notebook imports; uncomment to install them on a fresh runtime.
# !pip install transformers
# !pip install datasets

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

# Препроцессинг

## Загружаю датасет

In [None]:
# Load the "imdb" dataset; later cells index the result by its "train" and "test" splits.
dataset = load_dataset("imdb")

In [None]:
# Keep a shuffled 20% subset of each split (fixed seed) to shorten fine-tuning.
for split_name in ("train", "test"):
    full_split = dataset[split_name]
    subset_size = int(0.2 * len(full_split))
    dataset[split_name] = full_split.shuffle(seed=42).select(range(subset_size))

## Сплит на трейн/валидацию/тест

In [None]:
# Hold out 20% of the training subset for validation. The fixed seed makes the
# split reproducible across kernel restarts (the subsampling above is seeded too).
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]  # training portion
val_dataset = split_dataset["test"]  # validation portion (train_test_split names it "test")
test_dataset = dataset["test"]  # held-out test subset, untouched by the split above

In [None]:
# Tokenizer for the "bert-base-uncased" checkpoint, the same checkpoint name the classifier is loaded from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def tokenize_function(example, max_length=512):
    """Tokenize the ``"text"`` field of a (batched) example to a fixed length.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    Padding to the longest sequence instead would be applied per ``map`` batch,
    so rows produced by different ``map`` batches could end up with different
    lengths and could no longer be stacked by the default DataLoader collation.

    Args:
        example: Mapping with a ``"text"`` entry (a string or a list of strings).
        max_length: Target sequence length in tokens; defaults to 512.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the given text.
    """
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Tokenize all three splits in batched mode, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Return only the model inputs and the target column, as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(type="torch", columns=tensor_columns)

In [None]:
# Only the training loader shuffles; validation and test keep the dataset order.
loader_batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=loader_batch_size)
    for split in (val_dataset, test_dataset)
)

In [None]:
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from tqdm.auto import tqdm

class SentimentTrainer:
    """Fine-tuning loop for a sequence-classification model.

    Uses AdamW with a learning-rate schedule that warms up linearly and then
    decays linearly to zero. After each epoch the model is evaluated on the
    validation loader and its weights are saved whenever the validation loss
    improves. ``test()`` evaluates the weights currently held in memory (those
    of the last epoch), not the saved checkpoint.

    Args:
        model: Model whose forward pass accepts the batch entries as keyword
            arguments (including ``labels``) and returns an object exposing
            ``loss`` and ``logits``.
        train_loader: Iterable of training batches (dicts of tensors); must
            support ``len()``.
        val_loader: Iterable of validation batches.
        test_loader: Iterable of test batches.
        device: Device the model and every batch are moved to.
        learning_rate: Peak learning rate reached at the end of warmup.
        num_epochs: Number of passes over ``train_loader``.
        weight_decay: Weight decay passed to AdamW.
        warmup_ratio: Fraction of all optimisation steps spent warming up.
        checkpoint_path: File the best weights (by validation loss) are saved to.
    """

    def __init__(
        self,
        model,
        train_loader,
        val_loader,
        test_loader,
        device,
        learning_rate=3e-5,
        num_epochs=4,
        weight_decay=0.01,
        warmup_ratio=0.1,
        checkpoint_path="best_model.pt",
    ):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.device = device
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.weight_decay = weight_decay
        self.warmup_ratio = warmup_ratio
        self.checkpoint_path = checkpoint_path

        self.optimizer = AdamW(
            self.model.parameters(),
            lr=self.learning_rate,
            weight_decay=self.weight_decay,
        )

        # The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
        total_steps = len(self.train_loader) * self.num_epochs
        warmup_steps = int(total_steps * warmup_ratio)

        def lr_lambda(current_step):
            """Learning-rate multiplier: linear ramp 0 -> 1 during warmup, then linear decay to 0."""
            if current_step < warmup_steps:
                return float(current_step) / float(max(1, warmup_steps))
            return max(0.0, float(total_steps - current_step) / float(max(1, total_steps - warmup_steps)))

        self.scheduler = LambdaLR(self.optimizer, lr_lambda=lr_lambda)

    def _prepare_batch(self, batch):
        """Move a batch to the target device and expose the targets under ``labels``.

        The model is called with the batch entries as keyword arguments and
        ``evaluate`` reads ``batch["labels"]``, so a batch that still carries
        the target column as ``label`` is normalised here. The incoming dict is
        not mutated.
        """
        prepared = {name: tensor.to(self.device) for name, tensor in batch.items()}
        if "label" in prepared and "labels" not in prepared:
            prepared["labels"] = prepared.pop("label")
        return prepared

    def train_epoch(self):
        """Run one optimisation pass over ``train_loader``.

        Returns:
            Mean of the per-batch training losses.
        """
        self.model.train()
        total_loss = 0
        for batch in tqdm(self.train_loader, desc="Training"):
            self.optimizer.zero_grad()
            batch = self._prepare_batch(batch)
            outputs = self.model(**batch)
            loss = outputs.loss
            loss.backward()
            # Clip the gradient norm to stabilise training.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(self.train_loader)
        return avg_loss

    def evaluate(self, data_loader):
        """Evaluate the model on ``data_loader`` without computing gradients.

        Returns:
            Tuple ``(avg_loss, all_preds, all_labels)``: the mean per-batch
            loss, the argmax predictions and the true labels, both as flat lists.
        """
        self.model.eval()
        total_loss = 0
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in tqdm(data_loader, desc="Evaluating"):
                batch = self._prepare_batch(batch)
                outputs = self.model(**batch)
                total_loss += outputs.loss.item()
                # Predicted class = index of the largest logit.
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(batch["labels"].cpu().numpy())
        avg_loss = total_loss / len(data_loader)
        return avg_loss, all_preds, all_labels

    def train(self):
        """Train for ``num_epochs`` epochs, validating and checkpointing after each one."""
        best_val_loss = float("inf")
        for epoch in range(self.num_epochs):
            print(f"Epoch {epoch + 1}/{self.num_epochs}")
            train_loss = self.train_epoch()
            print(f"Training loss: {train_loss:.4f}")

            val_loss, preds, labels = self.evaluate(self.val_loader)
            print(f"Validation loss: {val_loss:.4f}")
            if labels:
                correct = sum(int(pred == target) for pred, target in zip(preds, labels))
                print(f"Validation accuracy: {correct / len(labels):.4f}")

            # Keep the weights of the epoch with the lowest validation loss on disk.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(self.model.state_dict(), self.checkpoint_path)
                print("Saved best model.")

    def test(self):
        """Evaluate on ``test_loader``, print the loss and return ``(preds, labels)``."""
        test_loss, preds, labels = self.evaluate(self.test_loader)
        print(f"Test loss: {test_loss:.4f}")
        return preds, labels

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Inspect the column names of the training split (the recorded output shows the target column is "label").
print(dataset["train"].column_names)


['text', 'label']


In [None]:
# The model's forward pass takes the targets as `labels`, while the dataset column
# is called `label`. Rename it on the tokenised splits that feed the loaders.
def _expose_labels(split):
    """Return ``split`` with the target column named ``labels`` and torch formatting applied."""
    # Guarded so that re-running this cell does not fail once the column is renamed.
    if "label" in split.column_names:
        split = split.rename_column("label", "labels")
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return split

train_dataset = _expose_labels(train_dataset)
val_dataset = _expose_labels(val_dataset)
test_dataset = _expose_labels(test_dataset)

# rename_column returns new dataset objects, so the loaders have to be rebuilt on top of them.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [None]:
 model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build the trainer with its default hyperparameters.
trainer = SentimentTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    device=device,
)

# Start the training process.
trainer.train()

Epoch 1/4


Training:   0%|          | 0/250 [00:00<?, ?it/s]

TypeError: BertForSequenceClassification.forward() got an unexpected keyword argument 'label'

In [None]:
test_preds, test_labels = trainer.test()

# Score the held-out predictions; trainer.test() itself only prints the loss.
print(f"Test accuracy: {accuracy_score(test_labels, test_preds):.4f}")
print(classification_report(test_labels, test_preds, digits=4))

In [None]:
# Optional scratch snippet (disabled): print the configuration of the
# `textattack/bert-base-uncased-imdb` checkpoint.
# from transformers import AutoConfig

# config = AutoConfig.from_pretrained("textattack/bert-base-uncased-imdb")
# print(config)