In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import torch
from torch import nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import sklearn
from sklearn.metrics import f1_score, confusion_matrix

# импортируем трансформеры
import transformers
from transformers import AutoTokenizer, AutoModel
import warnings

from torchmetrics.classification import (
    BinaryAccuracy,
    BinaryPrecision,
    BinaryRecall,
    BinaryF1Score,
    MulticlassPrecision,
    MulticlassAccuracy,
    MulticlassConfusionMatrix,
    MulticlassRecall,
    MulticlassF1Score,
)

In [4]:
import multiprocessing as mp
from dataclasses import dataclass
from typing import Union
from tqdm.auto import tqdm
import mlflow
from time import time
import os
from tqdm.auto import tqdm
import re

sklearn.set_config(transform_output="pandas")

In [5]:
if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# GENERATOR = (
#     torch.Generator(device=DEVICE) if torch.cuda.is_available() else torch.Generator()
# )
GENERATOR = torch.Generator()

use_mlflow = True
mlflow.set_tracking_uri("http://localhost:5000")
CURR_DIR = os.curdir

In [6]:
%load_ext autoreload
%autoreload 2

In [11]:
# with open(os.path.join(CURR_DIR, "data", "healthcare_facilities_reviews.jsonl")) as f:
#     df = pd.read_json(f)
df = pd.read_json(
    os.path.join(CURR_DIR, "..", "data", "healthcare_facilities_reviews.jsonl"),
    lines=True,
)

In [12]:
df.head()

Unnamed: 0,review_id,category,title,content,sentiment,source_url
0,0,Поликлиники стоматологические,Классный мастер,Огромное спасибо за чудесное удаление двух зуб...,positive,http://www.spr.ru/forum_vyvod.php?id_tema=2727539
1,1,Поликлиники стоматологические,Замечательный врач,Хочу выразить особую благодарность замечательн...,positive,http://www.spr.ru/forum_vyvod.php?id_tema=2302877
2,2,Поликлиники стоматологические,Благодарность работникам рентгена,Добрый вечер! Хотелось бы поблагодарить сотруд...,positive,http://www.spr.ru/forum_vyvod.php?id_tema=2815031
3,3,Поликлиники стоматологические,Доктор Рабинович,Женщины советского образца в регистратуре не и...,negative,http://www.spr.ru/forum_vyvod.php?id_tema=3443161
4,4,Поликлиники стоматологические,Есть кому сказать спасибо,У меня с детства очень плохие зубы (тонкая и х...,positive,http://www.spr.ru/forum_vyvod.php?id_tema=2592430


In [13]:
df["sentiment"].nunique()

2

In [16]:
labels = df["sentiment"].copy().apply(lambda x: 1 if x == "positive" else 0)
df["labels"] = labels

In [None]:
data = df.loc[:, ["content", "labels"]].copy()
print(data.shape)
data.head()

(70597, 7)


Unnamed: 0,content,labels
0,Огромное спасибо за чудесное удаление двух зуб...,1
1,Хочу выразить особую благодарность замечательн...,1
2,Добрый вечер! Хотелось бы поблагодарить сотруд...,1
3,Женщины советского образца в регистратуре не и...,0
4,У меня с детства очень плохие зубы (тонкая и х...,1


In [24]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
# model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

In [19]:
encoded_posts = (
    data["content"]
    .apply(lambda x: tokenizer(x, max_length=64, truncation=True, padding="max_length"))
    .values
)

In [20]:
class TinyBertCLSInputs(torch.utils.data.Dataset):
    def __init__(self, encoded_text, y_true):
        super().__init__()
        self.inputs = encoded_text
        self.labels = y_true

    def __len__(self):
        return self.inputs.shape[0]

    def __getitem__(self, idx):
        # print(self.inputs[idx])
        return (
            torch.tensor(self.inputs[idx]["input_ids"]).long(),
            torch.tensor(self.inputs[idx]["attention_mask"]).long(),
            torch.tensor(self.labels[idx]).long(),  # добавили для классификации
        )


dataset = TinyBertCLSInputs(encoded_text=encoded_posts, y_true=data["labels"])

In [21]:
BATCH_SIZE = 128

In [46]:
from torch.utils.data import Subset

train_idx, valid_idx = train_test_split(range(len(labels)), test_size=0.15)
train_ds = Subset(dataset, train_idx)
valid_ds = Subset(dataset, valid_idx)

train_loader_2 = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True
)
valid_loader_2 = DataLoader(
    valid_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True
)

In [47]:
class MyPersonalTinyBert(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # и снова грузим
        self.bert = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
        # заморозим параметры
        for param in self.bert.parameters():
            param.requires_grad = False
        # делаем свой слой для классификации
        self.linear = nn.Sequential(
            nn.Linear(312, 256),  # начинаем с длины embedding, которые делает модель
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(256, 64),  # это добавил для души))
            nn.ReLU(),
            nn.Linear(64, 1),  # заканчиваем кол-вом классов
        )

    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        vector = bert_out.last_hidden_state[:, 0, :]
        classes = self.linear(vector)
        return classes

In [48]:
model = MyPersonalTinyBert()
model.to(DEVICE)

MyPersonalTinyBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, element

In [49]:
criterion = nn.BCEWithLogitsLoss()
LR = 2e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=3e-5)

In [50]:
def binary_metrics(outputs, labels, device, num_classes=2):
    acc = BinaryAccuracy().to(device)
    prec = BinaryPrecision().to(device)
    rec = BinaryRecall().to(device)
    f1 = BinaryF1Score().to(device)

    preds = outputs.squeeze().float()
    labels = labels.squeeze().float()
    return (
        acc(preds, labels).item(),
        prec(preds, labels).item(),
        rec(preds, labels).item(),
        f1(preds, labels).item(),
    )


def fit_model(
    epochs: int,
    model: nn.Module,
    model_name: str,
    optimizer: torch.optim.Optimizer,
    criterion,
    train_loader,
    valid_loader,
    device,
    metrics_func,
    use_mlflow=False,
    num_classes=2,
):

    log = dict()
    log["train_loss"] = []
    log["valid_loss"] = []
    log["train_accuracy"] = []
    log["valid_accuracy"] = []
    log["train_precision"] = []
    log["valid_precision"] = []
    log["train_recall"] = []
    log["valid_recall"] = []
    log["train_f1"] = []
    log["valid_f1"] = []

    time_start = time()

    start_epoch = len(log["train_loss"])

    ### Создаем папку для записи весов
    # -----------------------------------------------------------------
    # Создаём корневую папку weights, если её нет
    folder_path = f"weights/"
    model_folder_path = os.path.join(folder_path, f"{model_name}")

    os.makedirs(model_folder_path, exist_ok=True)

    # Список номеров run_*
    run_nums = []

    # Ищем все подпапки с именем run_число
    for item_name in os.listdir(model_folder_path):
        full_path = os.path.join(model_folder_path, item_name)
        if os.path.isdir(full_path):
            match = re.search(r"run_(\d+)", item_name)
            if match:
                run_nums.append(int(match.group(1)))

    # Определяем следующий номер
    run = max(run_nums) + 1 if run_nums else 1

    # Создаём новую папку
    new_folder = os.path.join(model_folder_path, f"run_{run}")
    os.makedirs(new_folder, exist_ok=True)
    # -----------------------------------------------------------------

    ### Цикл обучения
    # -----------------------------------------------------------------
    for epoch in range(start_epoch + 1, start_epoch + epochs + 1):

        curr_run_path = os.path.join(folder_path, model_name, f"run_{run}")

        epoch_time_start = time()

        print(f'{"-"*13} Epoch {epoch} {"-"*13}')

        ### Обучение

        batch_acc = []
        batch_prec = []
        batch_recall = []
        batch_loss = []
        batch_f1 = []

        model.train()

        # Прогресс бар

        train_pbar = tqdm(
            train_loader, desc=f"Epoch {epoch}/{epochs} [Train]", leave=True
        )

        for inputs, masks, labels in train_pbar:

            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)

            optimizer.zero_grad()

            # Функции потерь

            outputs = model(inputs, masks)

            loss = criterion(outputs, labels.unsqueeze(1).to(outputs.dtype))  # .long())
            batch_loss.append(loss.item())

            # Метрики
            acc, prec, rec, f1 = metrics_func(
                outputs, labels, device=device, num_classes=num_classes
            )

            batch_acc.append(acc)
            batch_prec.append(prec)
            batch_recall.append(rec)
            batch_f1.append(f1)

            loss.backward()
            optimizer.step()

        train_pbar.set_postfix(
            {
                "Loss": loss,
                "Accuracy": acc,
                "Precision": prec,
                "Recall": rec,
                "F1-score": f1,
            }
        )

        log["train_loss"].append(np.mean(batch_loss))
        log["train_accuracy"].append(np.mean(batch_acc))
        log["train_precision"].append(np.mean(batch_prec))
        log["train_recall"].append(np.mean(batch_recall))
        log["train_f1"].append(np.mean(batch_f1))

        ### Валидация

        batch_acc = []
        batch_prec = []
        batch_recall = []
        batch_loss = []
        batch_f1 = []

        model.eval()

        valid_pbar = tqdm(
            valid_loader, desc=f"Epoch {epoch}/{epochs} [Test]", leave=True
        )
        for inputs, masks, labels in valid_pbar:

            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)

            with torch.no_grad():
                outputs = model(inputs, masks)

            loss = criterion(outputs, labels.unsqueeze(1).to(outputs.dtype))  # .long())
            batch_loss.append(loss.item())

            # Метрики
            acc, prec, rec, f1 = metrics_func(
                outputs, labels, device=device, num_classes=num_classes
            )

            batch_acc.append(acc)
            batch_prec.append(prec)
            batch_recall.append(rec)
            batch_f1.append(f1)

        valid_pbar.set_postfix(
            {
                "Loss": loss,
                "Accuracy": acc,
                "Precision": prec,
                "Recall": rec,
                "F1-score": f1,
            }
        )
        ### Метрики и логирование

        log["valid_loss"].append(np.mean(batch_loss))
        log["valid_accuracy"].append(np.mean(batch_acc))
        log["valid_precision"].append(np.mean(batch_prec))
        log["valid_recall"].append(np.mean(batch_recall))
        log["valid_f1"].append(np.mean(batch_f1))

        # [MLflow] Логируем метрики
        if use_mlflow:
            # epoch – номер шага (можно указывать step=epoch)
            for c in log.keys():
                mlflow.log_metric(c, log[c][-1], step=epoch)

        epoch_time = time() - epoch_time_start

        ### Выводим результаты эпохи
        # Train stage
        print(
            f"Train stage: "
            f"loss: {log['train_loss'][-1]:>6.3f}  "
            f"Accuracy: {log['train_accuracy'][-1]:>6.3f}  "
            f"Precision: {log['train_precision'][-1]:>6.3f}  "
            f"Recall: {log['train_recall'][-1]:>6.3f}  "
            f"F1-score: {log['train_f1'][-1]:>6.3f}  "
        )

        # Valid stage
        print(
            f"Valid stage: "
            f"loss: {log['valid_loss'][-1]:>6.3f}  "
            f"Accuracy: {log['valid_accuracy'][-1]:>6.3f}  "
            f"Precision: {log['valid_precision'][-1]:>6.3f}  "
            f"Recall: {log['valid_recall'][-1]:>6.3f}  "
            f"F1-score: {log['valid_f1'][-1]:>6.3f}  "
        )
        print(f"Time: {epoch_time}")

        print(f'{"-"*35}\n')
        torch.save(
            model.state_dict(), os.path.join(curr_run_path, f"weight_epoch_{epoch}.pth")
        )

    total_training_time = time() - time_start
    print(f"Total time = {total_training_time:>5.1f} сек")
    # -----------------------------------------------------------------

    return log, total_training_time, run


def fit_with_mlflow(
    model,
    model_name,
    epochs,
    optimizer,
    criterion,
    train_loader,
    valid_loader,
    device,
    batch_size,
    lr,
    metrics_func,
    num_classes=2,
):
    mlflow.set_experiment(
        f"{model_name} experiment"
    )  # установить (или создать) эксперимент
    with mlflow.start_run(run_name=f"{model_name}_BS = {batch_size}_lr_{lr}"):
        # Логируем гиперпараметры из config
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("learning_rate", lr)
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("device", device)
        mlflow.log_param("optimizer", optimizer)
        mlflow.log_param("criterion", criterion)

        # mlflow.pytorch.autolog(
        #     checkpoint=True,
        #     checkpoint_save_best_only=False,
        #     checkpoint_save_weights_only=False,
        #     checkpoint_save_freq="epoch",
        # )
        # mlflow.log_param("augmentation", ("Yes" if augmentation else "No"))
        print("начало обучения...")
        # Запускаем обучение
        logs, tot_time, run = fit_model(
            model=model,
            model_name=model_name,
            epochs=epochs,
            optimizer=optimizer,
            criterion=criterion,
            train_loader=train_loader,
            valid_loader=valid_loader,
            device=device,
            use_mlflow=True,
            metrics_func=metrics_func,
            num_classes=num_classes,
        )
        mlflow.log_param("Total time", tot_time)

        # Сохраняем модель в MLflow (опционально)
        # mlflow.pytorch.log_model(base_cnn, "model")

    # После выхода из `with` Run автоматически завершается
    return logs, tot_time, run

In [51]:
logs, run_time, run = fit_with_mlflow(
    model=model,
    model_name="rubert-tiny2 + classifier",
    epochs=10,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader_2,
    valid_loader=valid_loader_2,
    device=DEVICE,
    batch_size=128,
    lr=LR,
    metrics_func=binary_metrics,
    num_classes=2,
)

начало обучения...
------------- Epoch 1 -------------


Epoch 1/10 [Train]: 100%|██████████| 469/469 [00:08<00:00, 58.43it/s]
Epoch 1/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 63.97it/s]


Train stage: loss:  0.373  Accuracy:  0.823  Precision:  0.860  Recall:  0.836  F1-score:  0.837  
Valid stage: loss:  0.306  Accuracy:  0.869  Precision:  0.910  Recall:  0.864  F1-score:  0.885  
Time: 10.61057996749878
-----------------------------------

------------- Epoch 2 -------------


Epoch 2/10 [Train]: 100%|██████████| 469/469 [00:08<00:00, 58.54it/s]
Epoch 2/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 66.53it/s]


Train stage: loss:  0.319  Accuracy:  0.861  Precision:  0.892  Recall:  0.868  F1-score:  0.879  
Valid stage: loss:  0.297  Accuracy:  0.873  Precision:  0.911  Recall:  0.871  F1-score:  0.890  
Time: 10.545008420944214
-----------------------------------

------------- Epoch 3 -------------


Epoch 3/10 [Train]: 100%|██████████| 469/469 [00:08<00:00, 58.09it/s]
Epoch 3/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 63.91it/s]


Train stage: loss:  0.310  Accuracy:  0.865  Precision:  0.892  Recall:  0.876  F1-score:  0.883  
Valid stage: loss:  0.290  Accuracy:  0.879  Precision:  0.909  Recall:  0.883  F1-score:  0.895  
Time: 10.654560327529907
-----------------------------------

------------- Epoch 4 -------------


Epoch 4/10 [Train]: 100%|██████████| 469/469 [00:07<00:00, 58.87it/s]
Epoch 4/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 64.58it/s]


Train stage: loss:  0.304  Accuracy:  0.869  Precision:  0.896  Recall:  0.879  F1-score:  0.886  
Valid stage: loss:  0.285  Accuracy:  0.879  Precision:  0.908  Recall:  0.885  F1-score:  0.896  
Time: 10.54303503036499
-----------------------------------

------------- Epoch 5 -------------


Epoch 5/10 [Train]: 100%|██████████| 469/469 [00:07<00:00, 58.92it/s]
Epoch 5/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 64.78it/s]


Train stage: loss:  0.298  Accuracy:  0.872  Precision:  0.898  Recall:  0.882  F1-score:  0.889  
Valid stage: loss:  0.281  Accuracy:  0.882  Precision:  0.913  Recall:  0.886  F1-score:  0.899  
Time: 10.530460834503174
-----------------------------------

------------- Epoch 6 -------------


Epoch 6/10 [Train]: 100%|██████████| 469/469 [00:07<00:00, 59.60it/s]
Epoch 6/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 67.09it/s]


Train stage: loss:  0.294  Accuracy:  0.873  Precision:  0.897  Recall:  0.885  F1-score:  0.890  
Valid stage: loss:  0.276  Accuracy:  0.885  Precision:  0.905  Recall:  0.899  F1-score:  0.901  
Time: 10.390114068984985
-----------------------------------

------------- Epoch 7 -------------


Epoch 7/10 [Train]: 100%|██████████| 469/469 [00:07<00:00, 58.96it/s]
Epoch 7/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 64.43it/s]


Train stage: loss:  0.291  Accuracy:  0.877  Precision:  0.901  Recall:  0.888  F1-score:  0.893  
Valid stage: loss:  0.274  Accuracy:  0.884  Precision:  0.909  Recall:  0.893  F1-score:  0.901  
Time: 10.52360486984253
-----------------------------------

------------- Epoch 8 -------------


Epoch 8/10 [Train]: 100%|██████████| 469/469 [00:08<00:00, 58.53it/s]
Epoch 8/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 65.19it/s]


Train stage: loss:  0.290  Accuracy:  0.875  Precision:  0.899  Recall:  0.888  F1-score:  0.892  
Valid stage: loss:  0.272  Accuracy:  0.886  Precision:  0.911  Recall:  0.893  F1-score:  0.902  
Time: 10.575334310531616
-----------------------------------

------------- Epoch 9 -------------


Epoch 9/10 [Train]: 100%|██████████| 469/469 [00:07<00:00, 60.25it/s]
Epoch 9/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 65.21it/s]


Train stage: loss:  0.285  Accuracy:  0.878  Precision:  0.902  Recall:  0.889  F1-score:  0.895  
Valid stage: loss:  0.268  Accuracy:  0.889  Precision:  0.911  Recall:  0.900  F1-score:  0.905  
Time: 10.344475030899048
-----------------------------------

------------- Epoch 10 -------------


Epoch 10/10 [Train]: 100%|██████████| 469/469 [00:08<00:00, 58.13it/s]
Epoch 10/10 [Test]: 100%|██████████| 83/83 [00:01<00:00, 64.67it/s]


Train stage: loss:  0.282  Accuracy:  0.879  Precision:  0.902  Recall:  0.891  F1-score:  0.896  
Valid stage: loss:  0.268  Accuracy:  0.889  Precision:  0.911  Recall:  0.900  F1-score:  0.905  
Time: 10.638150930404663
-----------------------------------

Total time = 106.3 сек
🏃 View run rubert-tiny2 + classifier_BS = 128_lr_0.0002 at: http://localhost:5000/#/experiments/795024553813846834/runs/80d18be25eb840d1aae25480a3699ebf
🧪 View experiment at: http://localhost:5000/#/experiments/795024553813846834
