In [7]:
import ast
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoTokenizer
from tqdm import tqdm

# Загружаем данные (пример)
df = pd.read_csv('../data/aug_train.csv', sep=';')
df.columns = ['text', 'spans']
df['spans'] = df['spans'].apply(lambda x: [{'start': span[0], 'end': span[1], 'label': span[2].replace('0', 'O')} for span in ast.literal_eval(x)])
df = df.iloc[[i for i, s in enumerate(df['spans'].values) if s[0]['start'] == 0]]

# словарь меток
label2id = {
    "O": 0, 
    "B-BRAND": 1, "B-TYPE": 2, "B-VOLUME": 3, "B-PERCENT": 4,
    "I-BRAND": 5, "I-TYPE": 6, "I-VOLUME": 7, "I-PERCENT": 8,
}
id2label = {v: k for k, v in label2id.items()}

# ---------------------------
# 2. Dataset
# ---------------------------

# model_name = "cointegrated/rubert-tiny2"
# model_name = 'DeepPavlov/rubert-base-cased'
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
MAX_LEN = 128

class NERDataset(Dataset):
    def __init__(self, texts, spans, tokenizer, label2id, max_len=128):
        self.texts = texts
        self.spans = spans
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        spans = self.spans[idx]

        # посимвольные метки
        char_labels = ["O"] * len(text)
        for span in spans:
            start, end, tag = span["start"], span["end"], span["label"]
            char_labels[start] = tag
            for i in range(start+1, end):
                char_labels[i] = tag.replace("B-", "I-")

        # токенизация
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        # метки на токенах
        labels = []
        numerics = []
        offsets = enc["offset_mapping"][0]
        for start, end in offsets:
            if start == end:
                labels.append(-100)
            else:
                labels.append(self.label2id[char_labels[start]] if start < len(char_labels) else -100)
            
            if text[start : end].isdigit():
                numerics.append(1)
            else:
                numerics.append(0)

        item = {k: v.squeeze(0) for k,v in enc.items() if k != "offset_mapping"}
        item["labels"] = torch.tensor(labels)
        item['is_numeric'] = torch.tensor(numerics)
        return item


# разделение train/test
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(train_df.index)

full_dataset = NERDataset(df["text"].tolist(), df["spans"].tolist(), tokenizer, label2id, MAX_LEN)
train_dataset = NERDataset(train_df["text"].tolist(), train_df["spans"].tolist(), tokenizer, label2id, MAX_LEN)
test_dataset = NERDataset(test_df["text"].tolist(), test_df["spans"].tolist(), tokenizer, label2id, MAX_LEN)

full_loader = DataLoader(df, batch_size=16, shuffle=True)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [18]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from transformers import AutoModel
from sklearn.metrics import f1_score
from torchmetrics import F1Score

class NERLightning(pl.LightningModule):
    def __init__(self, model_name: str, num_labels: int, id2label: dict, label2id: dict, lr: float = 2e-4):
        super().__init__()
        self.save_hyperparameters()

        # BERT
        self.bert = AutoModel.from_pretrained(model_name)

        # Заморозим все слои, кроме последних 2
        for name, param in self.bert.named_parameters():
            param.requires_grad = False
        for layer in self.bert.encoder.layer[-1:]:
            for param in layer.parameters():
                param.requires_grad = True

        # Классификатор
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.bert.config.hidden_size + 1, num_labels)

        # Взвешенный loss (меньше вес для класса "O")
        class_weights = torch.ones(num_labels)
        class_weights[label2id["O"]] = 0.1
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)

        self.id2label = id2label
        self.label2id = label2id
        self.lr = lr

        # Хранилище предсказаний для метрики
        self.train_preds, self.train_labels = [], []
        self.val_preds, self.val_labels = [], []

        self.f1 = F1Score(task="multiclass", num_classes=num_labels, average="macro")

    def forward(self, input_ids, attention_mask=None, is_numeric=None, labels=None, **kwargs):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)

        if is_numeric is not None:
            is_numeric = is_numeric.unsqueeze(-1).float()  # [B, L, 1]
            sequence_output = torch.cat([sequence_output, is_numeric], dim=-1)

        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits.view(-1, self.hparams.num_labels), labels.view(-1))

        return {"loss": loss, "logits": logits}

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss, logits = outputs["loss"], outputs["logits"]

        preds = torch.argmax(logits, dim=-1).detach().cpu().tolist()
        labels = batch["labels"].detach().cpu().tolist()

        self.train_preds.extend(preds)
        self.train_labels.extend(labels)

        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(**batch)
        val_loss, logits = outputs["loss"], outputs["logits"]

        preds = torch.argmax(logits, dim=-1).detach().cpu()
        labels = batch["labels"].detach().cpu().tolist()

        self.val_preds.extend(preds)
        self.val_labels.extend(labels)

        self.log("val_loss", val_loss, prog_bar=True)
        return val_loss

    def on_train_epoch_end(self):
        f1 = self.compute_f1(self.train_labels, self.train_preds)
        self.log("train_f1", f1, prog_bar=True)
        self.train_preds, self.train_labels = [], []  # очистка

    def on_validation_epoch_end(self):
        f1 = self.compute_f1(self.val_labels, self.val_preds)
        self.log("val_f1", f1, prog_bar=True)
        self.val_preds, self.val_labels = [], []

    def compute_f1(self, labels, preds):
        # выравнивание + удаление -100
        y_true, y_pred = [], []
        for yt, yp in zip(labels, preds):
            for t, p in zip(yt, yp):
                if t == -100:
                    continue
                y_true.append(t)
                y_pred.append(p)

        return f1_score(y_true, y_pred, average="macro")

    def configure_optimizers(self):
        return torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=self.lr)

In [19]:
import os
from sklearn.model_selection import KFold
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torch.utils.data import DataLoader, Subset

# === Кросс-валидация с сохранением модели ===
def run_kfold_crossval(
    dataset, 
    model_name: str, 
    num_labels: int,
    label2id: dict[str, int],
    id2label: dict[int, str],
    batch_size=16, 
    lr=2e-5, 
    k=5, 
    max_epochs=5,
    save_dir="../models"
):
    os.makedirs(save_dir, exist_ok=True)
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)
    results = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
        print(f"\n===== Fold {fold+1} / {k} =====")

        # сабсеты
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)

        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=batch_size)

        # новая модель на каждом фолде
        model = NERLightning(
            model_name=model_name, 
            num_labels=num_labels, 
            label2id=label2id,
            id2label=id2label,
            lr=lr,
        )

        # коллбэки
        checkpoint_callback = ModelCheckpoint(
            dirpath=f"{save_dir}/fold_{fold+1}",
            filename="best-checkpoint",
            save_top_k=1,
            verbose=True,
            monitor="val_f1",
            mode="max",
        )

        early_stop_callback = EarlyStopping(
            monitor="val_f1",
            patience=2,
            mode="max",
            verbose=True,
        )

        trainer = pl.Trainer(
            accelerator="gpu" if torch.cuda.is_available() else "cpu",
            devices=1,
            max_epochs=max_epochs,
            log_every_n_steps=10,
            callbacks=[checkpoint_callback, early_stop_callback],
        )

        trainer.fit(model, train_loader, val_loader)

        # загрузка лучшей модели
        best_model_path = checkpoint_callback.best_model_path
        print(f"Best model saved at {best_model_path}")

        val_metrics = trainer.callback_metrics
        results.append(val_metrics["val_f1"].item())

    avg_f1 = sum(results) / len(results)
    print(f"\n===== Mean Macro-F1 across {k} folds: {avg_f1:.4f} =====")
    return results, avg_f1

In [None]:
model_name = "cointegrated/rubert-tiny2"
# model_name = 'DeepPavlov/rubert-base-cased'
# model_name = 'xlm-roberta-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)
full_dataset = NERDataset(df["text"].tolist(), df["spans"].tolist(), tokenizer, label2id, MAX_LEN)
run_kfold_crossval(
    full_dataset, 
    model_name, 
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
    max_epochs=10,
)


===== Fold 1 / 5 =====


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name       | Type              | Params | Mode 
---------------------------------------------------------
0 | bert       | BertModel         | 29.2 M | eval 
1 | dropout    | Dropout           | 0      | train
2 | classifier | Linear            | 2.8 K  | train
3 | loss_fn    | CrossEntropyLoss  | 0      | train
4 | f1         | MulticlassF1Score | 0      | train
---------------------------------------------------------
770 K     Trainable params
28.4 M    Non-trainable params
29.2 M    Total params
116.786   Total estimated model params size (MB)
4         Modules in train mode
66        Modules in eval mode


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [01:26<00:00, 15.88it/s, v_num=25, train_loss=0.200, val_loss=0.357, val_f1=0.443]

Metric val_f1 improved. New best score: 0.443
Epoch 0, global step 1369: 'val_f1' reached 0.44328 (best 0.44328), saving model to '/root/hack-x5/models/fold_1/best-checkpoint.ckpt' as top 1


Epoch 0:   2%|▏         | 23/1369 [06:02<5:54:03,  0.06it/s, v_num=21, train_loss=1.720]val_loss=0.357, val_f1=0.443, train_f1=0.322] 
Epoch 1: 100%|██████████| 1369/1369 [01:25<00:00, 15.98it/s, v_num=25, train_loss=0.284, val_loss=0.285, val_f1=0.616, train_f1=0.322] 

Metric val_f1 improved by 0.173 >= min_delta = 0.0. New best score: 0.616
Epoch 1, global step 2738: 'val_f1' reached 0.61595 (best 0.61595), saving model to '/root/hack-x5/models/fold_1/best-checkpoint.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [01:27<00:00, 15.73it/s, v_num=25, train_loss=0.307, val_loss=0.257, val_f1=0.684, train_f1=0.601] 

Metric val_f1 improved by 0.068 >= min_delta = 0.0. New best score: 0.684
Epoch 2, global step 4107: 'val_f1' reached 0.68403 (best 0.68403), saving model to '/root/hack-x5/models/fold_1/best-checkpoint.ckpt' as top 1


Epoch 3: 100%|██████████| 1369/1369 [01:25<00:00, 15.95it/s, v_num=25, train_loss=0.178, val_loss=0.238, val_f1=0.729, train_f1=0.673] 

Metric val_f1 improved by 0.045 >= min_delta = 0.0. New best score: 0.729
Epoch 3, global step 5476: 'val_f1' reached 0.72942 (best 0.72942), saving model to '/root/hack-x5/models/fold_1/best-checkpoint.ckpt' as top 1


Epoch 4: 100%|██████████| 1369/1369 [01:25<00:00, 15.96it/s, v_num=25, train_loss=0.279, val_loss=0.225, val_f1=0.774, train_f1=0.727] 

Metric val_f1 improved by 0.044 >= min_delta = 0.0. New best score: 0.774
Epoch 4, global step 6845: 'val_f1' reached 0.77387 (best 0.77387), saving model to '/root/hack-x5/models/fold_1/best-checkpoint.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1369/1369 [01:26<00:00, 15.90it/s, v_num=25, train_loss=0.279, val_loss=0.225, val_f1=0.774, train_f1=0.727]
Best model saved at /root/hack-x5/models/fold_1/best-checkpoint.ckpt

===== Fold 2 / 5 =====


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name       | Type              | Params | Mode 
---------------------------------------------------------
0 | bert       | BertModel         | 29.2 M | eval 
1 | dropout    | Dropout           | 0      | train
2 | classifier | Linear            | 2.8 K  | train
3 | loss_fn    | CrossEntropyLoss  | 0      | train
4 | f1         | MulticlassF1Score | 0      | train
---------------------------------------------------------
770 K     Trainable params
28.4 M    Non-trainable params
29.2 M    Total params
116.786   Total estimated model params size (MB)
4         Modules in train mode
66        Modules in eval mode


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [01:28<00:00, 15.51it/s, v_num=26, train_loss=0.315, val_loss=0.353, val_f1=0.521]

Metric val_f1 improved. New best score: 0.521
Epoch 0, global step 1369: 'val_f1' reached 0.52098 (best 0.52098), saving model to '/root/hack-x5/models/fold_2/best-checkpoint.ckpt' as top 1


Epoch 1: 100%|██████████| 1369/1369 [01:26<00:00, 15.84it/s, v_num=26, train_loss=0.360, val_loss=0.280, val_f1=0.642, train_f1=0.322] 

Metric val_f1 improved by 0.121 >= min_delta = 0.0. New best score: 0.642
Epoch 1, global step 2738: 'val_f1' reached 0.64247 (best 0.64247), saving model to '/root/hack-x5/models/fold_2/best-checkpoint.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [01:25<00:00, 16.02it/s, v_num=26, train_loss=0.422, val_loss=0.250, val_f1=0.680, train_f1=0.622] 

Metric val_f1 improved by 0.038 >= min_delta = 0.0. New best score: 0.680
Epoch 2, global step 4107: 'val_f1' reached 0.68042 (best 0.68042), saving model to '/root/hack-x5/models/fold_2/best-checkpoint.ckpt' as top 1


Epoch 3: 100%|██████████| 1369/1369 [01:27<00:00, 15.69it/s, v_num=26, train_loss=0.168, val_loss=0.236, val_f1=0.733, train_f1=0.714] 

Metric val_f1 improved by 0.053 >= min_delta = 0.0. New best score: 0.733
Epoch 3, global step 5476: 'val_f1' reached 0.73293 (best 0.73293), saving model to '/root/hack-x5/models/fold_2/best-checkpoint.ckpt' as top 1


Epoch 4: 100%|██████████| 1369/1369 [01:26<00:00, 15.74it/s, v_num=26, train_loss=0.201, val_loss=0.220, val_f1=0.758, train_f1=0.751] 

Metric val_f1 improved by 0.026 >= min_delta = 0.0. New best score: 0.758
Epoch 4, global step 6845: 'val_f1' reached 0.75848 (best 0.75848), saving model to '/root/hack-x5/models/fold_2/best-checkpoint.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1369/1369 [01:27<00:00, 15.68it/s, v_num=26, train_loss=0.201, val_loss=0.220, val_f1=0.758, train_f1=0.751]
Best model saved at /root/hack-x5/models/fold_2/best-checkpoint.ckpt

===== Fold 3 / 5 =====


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name       | Type              | Params | Mode 
---------------------------------------------------------
0 | bert       | BertModel         | 29.2 M | eval 
1 | dropout    | Dropout           | 0      | train
2 | classifier | Linear            | 2.8 K  | train
3 | loss_fn    | CrossEntropyLoss  | 0      | train
4 | f1         | MulticlassF1Score | 0      | train
---------------------------------------------------------
770 K     Trainable params
28.4 M    Non-trainable params
29.2 M    Total params
116.786   Total estimated model params size (MB)
4         Modules in train mode
66        Modules in eval mode


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [01:26<00:00, 15.75it/s, v_num=27, train_loss=0.549, val_loss=0.352, val_f1=0.422]

Metric val_f1 improved. New best score: 0.422
Epoch 0, global step 1369: 'val_f1' reached 0.42167 (best 0.42167), saving model to '/root/hack-x5/models/fold_3/best-checkpoint.ckpt' as top 1


Epoch 1: 100%|██████████| 1369/1369 [01:28<00:00, 15.50it/s, v_num=27, train_loss=0.382, val_loss=0.284, val_f1=0.629, train_f1=0.314] 

Metric val_f1 improved by 0.207 >= min_delta = 0.0. New best score: 0.629
Epoch 1, global step 2738: 'val_f1' reached 0.62883 (best 0.62883), saving model to '/root/hack-x5/models/fold_3/best-checkpoint.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [01:25<00:00, 16.03it/s, v_num=27, train_loss=0.600, val_loss=0.255, val_f1=0.760, train_f1=0.595] 

Metric val_f1 improved by 0.132 >= min_delta = 0.0. New best score: 0.760
Epoch 2, global step 4107: 'val_f1' reached 0.76049 (best 0.76049), saving model to '/root/hack-x5/models/fold_3/best-checkpoint.ckpt' as top 1


Epoch 3: 100%|██████████| 1369/1369 [01:26<00:00, 15.90it/s, v_num=27, train_loss=0.0905, val_loss=0.238, val_f1=0.812, train_f1=0.705]

Metric val_f1 improved by 0.051 >= min_delta = 0.0. New best score: 0.812
Epoch 3, global step 5476: 'val_f1' reached 0.81177 (best 0.81177), saving model to '/root/hack-x5/models/fold_3/best-checkpoint.ckpt' as top 1


Epoch 4: 100%|██████████| 1369/1369 [01:24<00:00, 16.23it/s, v_num=27, train_loss=0.143, val_loss=0.229, val_f1=0.809, train_f1=0.775] 

Epoch 4, global step 6845: 'val_f1' was not in top 1
`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1369/1369 [01:24<00:00, 16.21it/s, v_num=27, train_loss=0.143, val_loss=0.229, val_f1=0.809, train_f1=0.775]
Best model saved at /root/hack-x5/models/fold_3/best-checkpoint.ckpt

===== Fold 4 / 5 =====


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name       | Type              | Params | Mode 
---------------------------------------------------------
0 | bert       | BertModel         | 29.2 M | eval 
1 | dropout    | Dropout           | 0      | train
2 | classifier | Linear            | 2.8 K  | train
3 | loss_fn    | CrossEntropyLoss  | 0      | train
4 | f1         | MulticlassF1Score | 0      | train
---------------------------------------------------------
770 K     Trainable params
28.4 M    Non-trainable params
29.2 M    Total params
116.786   Total estimated model params size (MB)
4         Modules in train mode
66        Modules in eval mode


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [01:25<00:00, 15.95it/s, v_num=28, train_loss=0.230, val_loss=0.363, val_f1=0.413]

Metric val_f1 improved. New best score: 0.413
Epoch 0, global step 1369: 'val_f1' reached 0.41253 (best 0.41253), saving model to '/root/hack-x5/models/fold_4/best-checkpoint.ckpt' as top 1


Epoch 1: 100%|██████████| 1369/1369 [01:26<00:00, 15.80it/s, v_num=28, train_loss=0.256, val_loss=0.293, val_f1=0.614, train_f1=0.318] 

Metric val_f1 improved by 0.201 >= min_delta = 0.0. New best score: 0.614
Epoch 1, global step 2738: 'val_f1' reached 0.61357 (best 0.61357), saving model to '/root/hack-x5/models/fold_4/best-checkpoint.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [01:38<00:00, 13.95it/s, v_num=28, train_loss=0.0532, val_loss=0.263, val_f1=0.706, train_f1=0.559]

Metric val_f1 improved by 0.092 >= min_delta = 0.0. New best score: 0.706
Epoch 2, global step 4107: 'val_f1' reached 0.70582 (best 0.70582), saving model to '/root/hack-x5/models/fold_4/best-checkpoint.ckpt' as top 1


Epoch 3: 100%|██████████| 1369/1369 [01:27<00:00, 15.65it/s, v_num=28, train_loss=0.373, val_loss=0.244, val_f1=0.749, train_f1=0.634] 

Metric val_f1 improved by 0.043 >= min_delta = 0.0. New best score: 0.749
Epoch 3, global step 5476: 'val_f1' reached 0.74859 (best 0.74859), saving model to '/root/hack-x5/models/fold_4/best-checkpoint.ckpt' as top 1


Epoch 4: 100%|██████████| 1369/1369 [01:26<00:00, 15.88it/s, v_num=28, train_loss=0.208, val_loss=0.232, val_f1=0.768, train_f1=0.731] 

Metric val_f1 improved by 0.019 >= min_delta = 0.0. New best score: 0.768
Epoch 4, global step 6845: 'val_f1' reached 0.76775 (best 0.76775), saving model to '/root/hack-x5/models/fold_4/best-checkpoint.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1369/1369 [01:26<00:00, 15.82it/s, v_num=28, train_loss=0.208, val_loss=0.232, val_f1=0.768, train_f1=0.731]
Best model saved at /root/hack-x5/models/fold_4/best-checkpoint.ckpt

===== Fold 5 / 5 =====


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name       | Type              | Params | Mode 
---------------------------------------------------------
0 | bert       | BertModel         | 29.2 M | eval 
1 | dropout    | Dropout           | 0      | train
2 | classifier | Linear            | 2.8 K  | train
3 | loss_fn    | CrossEntropyLoss  | 0      | train
4 | f1         | MulticlassF1Score | 0      | train
---------------------------------------------------------
770 K     Trainable params
28.4 M    Non-trainable params
29.2 M    Total params
116.786   Total estimated model params size (MB)
4         Modules in train mode
66        Modules in eval mode


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [01:26<00:00, 15.76it/s, v_num=29, train_loss=0.211, val_loss=0.370, val_f1=0.476]

Metric val_f1 improved. New best score: 0.476
Epoch 0, global step 1369: 'val_f1' reached 0.47597 (best 0.47597), saving model to '/root/hack-x5/models/fold_5/best-checkpoint.ckpt' as top 1


Epoch 1: 100%|██████████| 1369/1369 [01:25<00:00, 16.10it/s, v_num=29, train_loss=0.304, val_loss=0.313, val_f1=0.603, train_f1=0.336] 

Metric val_f1 improved by 0.127 >= min_delta = 0.0. New best score: 0.603
Epoch 1, global step 2738: 'val_f1' reached 0.60320 (best 0.60320), saving model to '/root/hack-x5/models/fold_5/best-checkpoint.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [01:25<00:00, 15.92it/s, v_num=29, train_loss=0.231, val_loss=0.277, val_f1=0.691, train_f1=0.587] 

Metric val_f1 improved by 0.087 >= min_delta = 0.0. New best score: 0.691
Epoch 2, global step 4107: 'val_f1' reached 0.69067 (best 0.69067), saving model to '/root/hack-x5/models/fold_5/best-checkpoint.ckpt' as top 1


Epoch 3: 100%|██████████| 1369/1369 [01:26<00:00, 15.80it/s, v_num=29, train_loss=0.226, val_loss=0.258, val_f1=0.749, train_f1=0.681] 

Metric val_f1 improved by 0.058 >= min_delta = 0.0. New best score: 0.749
Epoch 3, global step 5476: 'val_f1' reached 0.74871 (best 0.74871), saving model to '/root/hack-x5/models/fold_5/best-checkpoint.ckpt' as top 1


Epoch 4: 100%|██████████| 1369/1369 [01:26<00:00, 15.74it/s, v_num=29, train_loss=0.136, val_loss=0.247, val_f1=0.778, train_f1=0.731] 

Metric val_f1 improved by 0.029 >= min_delta = 0.0. New best score: 0.778
Epoch 4, global step 6845: 'val_f1' reached 0.77819 (best 0.77819), saving model to '/root/hack-x5/models/fold_5/best-checkpoint.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1369/1369 [01:27<00:00, 15.69it/s, v_num=29, train_loss=0.136, val_loss=0.247, val_f1=0.778, train_f1=0.731]
Best model saved at /root/hack-x5/models/fold_5/best-checkpoint.ckpt

===== Mean Macro-F1 across 5 folds: 0.7774 =====


([0.7738693952560425,
  0.7584832906723022,
  0.8086845278739929,
  0.7677515149116516,
  0.7781853675842285],
 0.7773948192596436)

In [None]:
from pytorch_lightning import Trainer

model = NERLightning(
    model_name=model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    lr=2e-4,
)

trainer = Trainer(max_epochs=4, accelerator="auto", devices=1)
trainer.fit(model, train_loader, test_loader)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | bert       | XLMRobertaModel  | 278 M 

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0:   6%|▌         | 88/1540 [00:57<15:52,  1.52it/s, v_num=17, train_loss=0.444]


Detected KeyboardInterrupt, attempting graceful shutdown ...


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [42]:
import torch
import json
import os
from transformers import AutoTokenizer

def save_model(model, save_dir="../models"):
    os.makedirs(save_dir, exist_ok=True)

    # 1. веса модели
    torch.save(model.state_dict(), os.path.join(save_dir, "ner_model.bin"))

    # 2. метаданные
    metadata = {
        "model_name": model.hparams.model_name,
        "num_labels": model.hparams.num_labels,
        "label2id": model.hparams.label2id,
        "id2label": model.hparams.id2label
    }
    with open(os.path.join(save_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)

    print(f"✅ Model saved to {save_dir}")

def load_model(save_dir="../models"):
    # 1. метаданные
    with open(os.path.join(save_dir, "config.json"), "r", encoding="utf-8") as f:
        metadata = json.load(f)

    # 2. инициализация модели
    model = NERLightning(
        model_name=metadata["model_name"],
        num_labels=metadata["num_labels"],
        id2label=metadata["id2label"],
        label2id=metadata["label2id"]
    )

    # 3. загрузка весов
    state_dict = torch.load(os.path.join(save_dir, "ner_model.bin"), map_location="cpu")
    model.load_state_dict(state_dict)

    # 4. токенайзер
    tokenizer = AutoTokenizer.from_pretrained(metadata["model_name"])

    print(f"✅ Model loaded from {save_dir}")
    return model, tokenizer

In [45]:
save_model(model, '../models/deeppavlov_ner_model')

✅ Model saved to ../models/deeppavlov_ner_model


In [47]:
l_model, l_tokenizer = load_model()

✅ Model loaded from ../models


In [26]:
# пример использования
def predict(text: str):
    text = text.replace("\xa0", " ")
    model.eval()
    enc = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, truncation=True, padding="max_length")
    input_ids = enc["input_ids"].to('cpu')
    attention_mask = enc["attention_mask"].to('cpu')
    offsets = enc["offset_mapping"][0]
    is_numeric = []
    for start, end in offsets:
        if text[start : end].isdigit():
            is_numeric.append(1)
        else:
            is_numeric.append(0)

    is_numeric = torch.tensor([is_numeric])

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask, is_numeric=is_numeric)['logits'].argmax(dim=-1)[0].cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])

    return logits, offsets, tokens

def decode_predictions(text, offsets, labels):
    tokens = text.split(' ')
    token_offsets = []
    cur_i = 0
    for token in tokens:
        start = cur_i
        end = start + len(token)
        token_offsets.append((start, end))
        cur_i += len(token) + 1

    bio_start_offsets = [int(o[0]) for o in offsets if o[0] != o[1]]
    res = []
    for token, (start, end) in zip(tokens, token_offsets):
        idx_token_label = bio_start_offsets.index(start) + 1
        label = labels[idx_token_label]
        res.append((token, (start, end, id2label[label])))

    return res

text = "сливки 13 процентов"
logits, offsets, tokens = predict(text)
print(decode_predictions(text, offsets, logits))

[('сливки', (0, 6, 'B-TYPE')), ('13', (7, 9, 'B-PERCENT')), ('процентов', (10, 19, 'I-PERCENT'))]


In [27]:
import time
text = "корм влажный purina one"
logits, offsets, tokens = predict(text)
print(decode_predictions(text, offsets, logits))

[('корм', (0, 4, 'B-TYPE')), ('влажный', (5, 12, 'I-TYPE')), ('purina', (13, 19, 'B-BRAND')), ('one', (20, 23, 'I-BRAND'))]


In [56]:
row = test_df.sample(1).values[0]
text = row[0]
print(text, [s['label'] for s in row[1]])
logits, offsets, tokens = predict(text)
print(decode_predictions(text, offsets, logits))

яблоков ['B-TYPE']
[('яблоков', (0, 7, 'B-TYPE'))]


In [57]:
from collections import defaultdict

def compute_macro_f1(y_true, y_pred, entity_types=("TYPE","BRAND","VOLUME","PERCENT")):
    """
    y_true, y_pred: списки предсказанных и эталонных сущностей для всех примеров
    Каждое значение — список кортежей: (start, end, label)
    label в формате 'B-TYPE', 'I-BRAND' и т.д.
    """
    
    # счётчики TP, FP, FN для каждого типа
    stats = {etype: {"TP":0, "FP":0, "FN":0} for etype in entity_types}

    for true_spans, pred_spans in zip(y_true, y_pred):
        # создаём словари по типу сущности
        true_by_type = defaultdict(list)
        for start, end, label in true_spans:
            etype = label.split("-")[-1]
            true_by_type[etype].append((start,end))
        
        pred_by_type = defaultdict(list)
        for start, end, label in pred_spans:
            etype = label.split("-")[-1]
            pred_by_type[etype].append((start,end))
        
        for etype in entity_types:
            true_set = set(true_by_type.get(etype, []))
            pred_set = set(pred_by_type.get(etype, []))
            TP = len(true_set & pred_set)
            FP = len(pred_set - true_set)
            FN = len(true_set - pred_set)
            stats[etype]["TP"] += TP
            stats[etype]["FP"] += FP
            stats[etype]["FN"] += FN

    # вычисляем F1 для каждого типа
    f1_scores = []
    for etype in entity_types:
        TP = stats[etype]["TP"]
        FP = stats[etype]["FP"]
        FN = stats[etype]["FN"]
        precision = TP / (TP + FP) if TP + FP > 0 else 0.0
        recall = TP / (TP + FN) if TP + FN > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
        f1_scores.append(f1)

    macro_f1 = sum(f1_scores)/len(f1_scores)
    return macro_f1

In [13]:
texts = test_df['text'].tolist()

In [14]:
text_pred_bio = []
for text in tqdm(texts):
    logits, offsets, tokens = predict(text)
    bio = decode_predictions(text, offsets, logits) 
    text_pred_bio.append(bio)



Epoch 0:   1%|▏         | 20/1533 [22:22<28:13:06,  0.01it/s, v_num=1]






100%|██████████| 2725/2725 [03:34<00:00, 12.71it/s]


In [15]:
text_true_bio = [
    [(span['start'], span['end'], span['label']) for span in spans]
    for spans in test_df['spans'].tolist()
]

In [16]:
score = compute_macro_f1(text_true_bio, [[s[1] for s in spans] for spans in text_pred_bio])
print("Macro F1:", score)

Macro F1: 0.8704645610683013


In [24]:
test_texts = test_df['text'].tolist()
test_spans = [
    [(s['start'], s['end'], s['label']) for s in spans]
    for spans in test_df['spans'].tolist()
]

In [46]:
sdf = pd.read_csv('../data/submission.csv', sep=';')
sdf.columns = ['text', 'spans']
sdf['spans'] = sdf['spans'].apply(lambda x: [{'start': span[0], 'end': span[1], 'label': span[2].replace('0', 'O')} for span in ast.literal_eval(x)])
sdf = sdf.iloc[[i for i, s in enumerate(sdf['spans'].values) if s[0]['start'] == 0]]

In [47]:
sdf.head(2)

Unnamed: 0,text,spans
0,форма для выпечки,"[{'start': 0, 'end': 5, 'label': 'B-TYPE'}, {'..."
1,фарш свиной,"[{'start': 0, 'end': 4, 'label': 'B-TYPE'}, {'..."


In [None]:
texts = sdf['text'].tolist()
s_text_pred_bio = []
for text in tqdm(texts):
    logits, offsets, tokens = predict(text)
    bio = decode_predictions(text, offsets, logits) 
    s_text_pred_bio.append(bio)

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [31]:
sbio = [[s[1] for s in spans] for spans in s_text_pred_bio]

In [32]:
sdf['annotation'] = sbio

In [33]:
sdf = sdf.rename(columns={'text': 'sample'}).drop(columns='spans')

In [34]:
sdf.head(2)

Unnamed: 0,sample,annotation
0,форма для выпечки,"[(0, 5, B-TYPE), (6, 9, I-TYPE), (10, 17, I-TY..."
1,фарш свиной,"[(0, 4, B-TYPE), (5, 11, I-TYPE)]"


In [36]:
sdf.to_csv('../data/test.csv', sep=';', index=False)