In [1]:
import ast
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoTokenizer
from tqdm import tqdm

# Загружаем данные (пример)
df = pd.read_csv('../data/aug_train.csv', sep=';')
df.columns = ['text', 'spans']
df['spans'] = df['spans'].apply(lambda x: [{'start': span[0], 'end': span[1], 'label': span[2].replace('0', 'O')} for span in ast.literal_eval(x)])
df = df.iloc[[i for i, s in enumerate(df['spans'].values) if s[0]['start'] == 0]]

# словарь меток
label2id = {
    "O": 0, 
    "B-BRAND": 1, "B-TYPE": 2, "B-VOLUME": 3, "B-PERCENT": 4,
    "I-BRAND": 5, "I-TYPE": 6, "I-VOLUME": 7, "I-PERCENT": 8,
}
id2label = {v: k for k, v in label2id.items()}

# ---------------------------
# 2. Dataset
# ---------------------------

# model_name = "cointegrated/rubert-tiny2"
# model_name = 'DeepPavlov/rubert-base-cased'
# model_name = 'xlm-roberta-base'
# model_name = 'ai-forever/ruBert-large'
model_name = 'ai-forever/ruRoberta-large'
# model_name = "sberbank-ai/ruBert-large"
# model_name = "sberbank-ai/ruRoberta-large"
# model_name = 'ai-forever/FRED-T5-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
MAX_LEN = 128

class NERDataset(Dataset):
    def __init__(self, texts, spans, tokenizer, label2id, max_len=128):
        self.texts = texts
        self.spans = spans
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        spans = self.spans[idx]

        # посимвольные метки
        char_labels = ["O"] * len(text)
        for span in spans:
            start, end, tag = span["start"], span["end"], span["label"]
            char_labels[start] = tag
            for i in range(start+1, end):
                char_labels[i] = tag.replace("B-", "I-")

        # токенизация
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_offsets_mapping=True,
            return_tensors="pt",
        )

        # метки на токенах
        labels = []
        numerics = []
        offsets = enc["offset_mapping"][0]
        for start, end in offsets:
            if start == end:
                labels.append(-100)
            else:
                labels.append(self.label2id[char_labels[start]] if start < len(char_labels) else -100)
            
            if text[start : end].isdigit():
                numerics.append(1)
            else:
                numerics.append(0)

        item = {k: v.squeeze(0) for k,v in enc.items() if k != "offset_mapping"}
        item["labels"] = torch.tensor(labels)
        item['is_numeric'] = torch.tensor(numerics)
        return item


# разделение train/test
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(train_df.index)

full_dataset = NERDataset(df["text"].tolist(), df["spans"].tolist(), tokenizer, label2id, MAX_LEN)
train_dataset = NERDataset(train_df["text"].tolist(), train_df["spans"].tolist(), tokenizer, label2id, MAX_LEN)
test_dataset = NERDataset(test_df["text"].tolist(), test_df["spans"].tolist(), tokenizer, label2id, MAX_LEN)

full_loader = DataLoader(full_dataset, batch_size=64, shuffle=True)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from transformers import AutoModel
from sklearn.metrics import f1_score
from torchmetrics import F1Score

class NERLightning(pl.LightningModule):
    def __init__(self, model_name: str, num_labels: int, id2label: dict, label2id: dict, lr: float = 2e-4):
        super().__init__()
        self.save_hyperparameters()

        self.encoder = AutoModel.from_pretrained(model_name)

        # Заморозим все слои, кроме последних 2
        for _, param in self.encoder.named_parameters():
            param.requires_grad = False

        n_unfreeze_layers = 2
        if 'bert' in self.encoder.name_or_path.lower():
            for layer in self.encoder.encoder.layer[-n_unfreeze_layers:]:
                for param in layer.parameters():
                    param.requires_grad = True
        elif 't5' in self.encoder.name_or_path.lower():
            self.encoder = self.encoder.encoder
            for layer in self.encoder.block[-n_unfreeze_layers:]:
                for param in layer.parameters():
                    param.requires_grad = True
        else:
            raise TypeError(f'Invalid encoder name: {self.encoder.name_or_path}')

        self.encoder_dropout = nn.Dropout(0.2)

        # Классификатор
        self.classifier = nn.Linear(self.encoder.config.hidden_size + 1, num_labels)
        torch.nn.init.xavier_uniform(self.classifier.weight)

        # Взвешенный loss (меньше вес для класса "O")
        class_weights = torch.ones(num_labels)
        class_weights[label2id["O"]] = 0.9
        class_weights[label2id["B-TYPE"]] = 0.2
        class_weights[label2id["I-TYPE"]] = 1.0
        class_weights[label2id["B-BRAND"]] = 0.35
        class_weights[label2id["I-BRAND"]] = 5.0
        class_weights[label2id["B-PERCENT"]] = 10.0   # было 15 → ограничили
        class_weights[label2id["I-PERCENT"]] = 10.0   # было 98 → ограничили
        class_weights[label2id["B-VOLUME"]] = 10.0    # было 41 → ограничили
        class_weights[label2id["I-VOLUME"]] = 10.0
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)

        self.id2label = id2label
        self.label2id = label2id
        self.lr = lr

        # Хранилище предсказаний для метрики
        self.train_preds, self.train_labels = [], []
        self.val_preds, self.val_labels = [], []

        self.f1 = F1Score(task="multiclass", num_classes=num_labels, average="macro")

    def forward(self, input_ids,  attention_mask, is_numeric, labels=None, **kwargs):
        x = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        x = self.encoder_dropout(x.last_hidden_state)
        x = torch.cat([x, is_numeric.unsqueeze(-1).float()], dim=-1)
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits.view(-1, self.hparams.num_labels), labels.view(-1))

        return {"loss": loss, "logits": logits}

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss, logits = outputs["loss"], outputs["logits"]

        preds = torch.argmax(logits, dim=-1).detach().cpu().tolist()
        labels = batch["labels"].detach().cpu().tolist()

        self.train_preds.extend(preds)
        self.train_labels.extend(labels)

        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(**batch)
        val_loss, logits = outputs["loss"], outputs["logits"]

        preds = torch.argmax(logits, dim=-1).detach().cpu()
        labels = batch["labels"].detach().cpu().tolist()

        self.val_preds.extend(preds)
        self.val_labels.extend(labels)

        self.log("val_loss", val_loss, prog_bar=True)
        return val_loss

    def on_train_epoch_end(self):
        f1 = self.compute_f1(self.train_labels, self.train_preds)
        self.log("train_f1", f1, prog_bar=True)
        self.train_preds, self.train_labels = [], []  # очистка

    def on_validation_epoch_end(self):
        f1 = self.compute_f1(self.val_labels, self.val_preds)
        self.log("val_f1", f1, prog_bar=True)
        self.val_preds, self.val_labels = [], []

    def compute_f1(self, labels, preds):
        # выравнивание + удаление -100
        y_true, y_pred = [], []
        for yt, yp in zip(labels, preds):
            for t, p in zip(yt, yp):
                if t == -100:
                    continue
                y_true.append(t)
                y_pred.append(p)

        return f1_score(y_true, y_pred, average="macro")

    def configure_optimizers(self):
        return torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=self.lr)

In [49]:
import os
from sklearn.model_selection import KFold
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torch.utils.data import DataLoader, Subset

# пример использования
def predict(text: str, model):
    text = text.replace("\xa0", " ")
    model.eval()
    enc = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, truncation=True, padding="max_length")
    input_ids = enc["input_ids"].to('cpu')
    attention_mask = enc["attention_mask"].to('cpu')
    offsets = enc["offset_mapping"][0]
    is_numeric = []
    for start, end in offsets:
        if text[start : end].isdigit():
            is_numeric.append(1)
        else:
            is_numeric.append(0)

    is_numeric = torch.tensor([is_numeric])

    with torch.no_grad():
        logits = model(
            input_ids=input_ids.to('cuda'),
            attention_mask=attention_mask.to('cuda'),
            is_numeric=is_numeric.to('cuda'),
        )['logits'].argmax(dim=-1)[0].cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])

    return logits, offsets, tokens

def decode_predictions(text, offsets, labels):
    tokens = text.split(' ')
    token_offsets = []
    cur_i = 0
    for token in tokens:
        start = cur_i
        end = start + len(token)
        token_offsets.append((start, end))
        cur_i += len(token) + 1

    bio_start_offsets = [int(o[0]) for o in offsets if o[0] != o[1]]
    res = []
    for token, (start, end) in zip(tokens, token_offsets):
        idx_token_label = bio_start_offsets.index(start) + 1
        label = labels[idx_token_label]
        res.append((start, end, id2label[label]))

    return res

# === Кросс-валидация с сохранением модели ===
def run_kfold_crossval(
    df: pd.DataFrame,
    model_name: str,
    num_labels: int,
    label2id: dict[str, int],
    id2label: dict[int, str],
    batch_size=16,
    lr=2e-5,
    k=5,
    max_epochs=5,
    save_dir="../models",
    max_len: int = 128,
):
    os.makedirs(save_dir, exist_ok=True)
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)
    results = []
    dataset = NERDataset(df["text"].tolist(), df["spans"].tolist(), tokenizer, label2id, max_len)
    validates = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
        print(f"\n===== Fold {fold+1} / {k} =====")

        # сабсеты
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)

        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=batch_size)

        # новая модель на каждом фолде
        model = NERLightning(
            model_name=model_name, 
            num_labels=num_labels, 
            label2id=label2id,
            id2label=id2label,
            lr=lr,
        ).to('cuda')

        # коллбэки
        checkpoint_callback = ModelCheckpoint(
            dirpath=f"{save_dir}/fold_{fold+1}",
            filename="best-checkpoint",
            save_top_k=1,
            verbose=True,
            monitor="val_f1",
            mode="max",
        )

        early_stop_callback = EarlyStopping(
            monitor="val_f1",
            patience=1,
            mode="max",
            verbose=True,
        )

        trainer = pl.Trainer(
            accelerator="gpu" if torch.cuda.is_available() else "cpu",
            devices=1,
            max_epochs=max_epochs,
            log_every_n_steps=10,
            callbacks=[checkpoint_callback, early_stop_callback],
        )

        trainer.fit(model, train_loader, val_loader)

        val_metrics = trainer.callback_metrics
        results.append(val_metrics["val_f1"].item())

        # загрузка лучшей модели
        best_model_path = checkpoint_callback.best_model_path
        print(f"Best model saved at {best_model_path}")

        best_model = NERLightning.load_from_checkpoint(
            best_model_path, 
            num_labels=len(label2id), 
            id2label=id2label,
            label2id=label2id,
        )
        best_model.eval().to('cuda')

        for text, span in tqdm(df.iloc[val_idx, :].values, desc='Pred'):
            logits, offsets, tokens = predict(text, best_model)
            pred_span = decode_predictions(text, offsets, logits) 
            span = [(s['start'], s['end'], s['label']) for s in span]
            validates.append((text, span, pred_span, fold))

    avg_f1 = sum(results) / len(results)
    print(f"\n===== Mean Macro-F1 across {k} folds: {avg_f1:.4f} =====")
    return results, validates

In [None]:
df['spans'] \
    .apply(lambda x: [s['label'] for s in x]) \
    .explode() \
    .reset_index() \
    .drop(columns='index') \
    .groupby(['spans']).agg({'spans': 'count'}) \
    .rename(columns={'spans': 'count'}) \
    .reset_index() \
    .sort_values('count') \
    ['spans'].tolist()

['I-PERCENT',
 'I-VOLUME',
 'B-VOLUME',
 'B-PERCENT',
 'I-BRAND',
 'I-TYPE',
 'O',
 'B-BRAND',
 'B-TYPE']

In [50]:
results, validate_rows = run_kfold_crossval(
    df,
    model_name=model_name, 
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
    k=5,
    max_epochs=10,
)


===== Fold 1 / 5 =====


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.nn.init.xavier_uniform(self.classifier.weight)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [02:02<00:00, 11.21it/s, v_num=8, train_loss=0.205, val_loss=0.336, val_f1=0.823]

Metric val_f1 improved. New best score: 0.823
Epoch 0, global step 1369: 'val_f1' reached 0.82345 (best 0.82345), saving model to '/root/hack-x5/models/fold_1/best-checkpoint-v4.ckpt' as top 1


Epoch 1: 100%|██████████| 1369/1369 [02:02<00:00, 11.15it/s, v_num=8, train_loss=0.219, val_loss=0.312, val_f1=0.874, train_f1=0.680] 

Metric val_f1 improved by 0.051 >= min_delta = 0.0. New best score: 0.874
Epoch 1, global step 2738: 'val_f1' reached 0.87422 (best 0.87422), saving model to '/root/hack-x5/models/fold_1/best-checkpoint-v4.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [02:02<00:00, 11.21it/s, v_num=8, train_loss=0.108, val_loss=0.288, val_f1=0.882, train_f1=0.875] 

Metric val_f1 improved by 0.008 >= min_delta = 0.0. New best score: 0.882
Epoch 2, global step 4107: 'val_f1' reached 0.88229 (best 0.88229), saving model to '/root/hack-x5/models/fold_1/best-checkpoint-v4.ckpt' as top 1


Epoch 3: 100%|██████████| 1369/1369 [02:01<00:00, 11.23it/s, v_num=8, train_loss=0.0685, val_loss=0.373, val_f1=0.882, train_f1=0.913] 

Monitored metric val_f1 did not improve in the last 1 records. Best score: 0.882. Signaling Trainer to stop.
Epoch 3, global step 5476: 'val_f1' was not in top 1


Epoch 3: 100%|██████████| 1369/1369 [02:02<00:00, 11.20it/s, v_num=8, train_loss=0.0685, val_loss=0.373, val_f1=0.882, train_f1=0.913]
Best model saved at /root/hack-x5/models/fold_1/best-checkpoint-v4.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.nn.init.xavier_uniform(self.classifier.weight)
Pred:   0%|          | 0/5476 [00:00<?, ?it/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Pred: 100%|██████████| 5476/5476 [01:28<00:00, 61.53it/s]



===== Fold 2 / 5 =====


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /root/hack-x5/models/fold_2 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type              | Params | Mode 
--------------------------------------------------------------
0 | encoder         | RobertaModel      | 355 M  | eval 
1 | encoder_dropout | Dropout           | 0      | train
2 | classifier      | Linear            | 9.2 K  | train
3 | loss_fn         | CrossEntropyLoss  | 0      | train
4 | f1              | Multic

                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [02:00<00:00, 11.33it/s, v_num=9, train_loss=0.249, val_loss=0.322, val_f1=0.844]

Metric val_f1 improved. New best score: 0.844
Epoch 0, global step 1369: 'val_f1' reached 0.84368 (best 0.84368), saving model to '/root/hack-x5/models/fold_2/best-checkpoint-v2.ckpt' as top 1


Epoch 1: 100%|██████████| 1369/1369 [02:01<00:00, 11.27it/s, v_num=9, train_loss=0.164, val_loss=0.275, val_f1=0.864, train_f1=0.684] 

Metric val_f1 improved by 0.020 >= min_delta = 0.0. New best score: 0.864
Epoch 1, global step 2738: 'val_f1' reached 0.86410 (best 0.86410), saving model to '/root/hack-x5/models/fold_2/best-checkpoint-v2.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [02:00<00:00, 11.32it/s, v_num=9, train_loss=0.065, val_loss=0.290, val_f1=0.872, train_f1=0.867] 

Metric val_f1 improved by 0.008 >= min_delta = 0.0. New best score: 0.872
Epoch 2, global step 4107: 'val_f1' reached 0.87219 (best 0.87219), saving model to '/root/hack-x5/models/fold_2/best-checkpoint-v2.ckpt' as top 1


Epoch 3: 100%|██████████| 1369/1369 [02:00<00:00, 11.33it/s, v_num=9, train_loss=0.047, val_loss=0.308, val_f1=0.895, train_f1=0.909]  

Metric val_f1 improved by 0.023 >= min_delta = 0.0. New best score: 0.895
Epoch 3, global step 5476: 'val_f1' reached 0.89507 (best 0.89507), saving model to '/root/hack-x5/models/fold_2/best-checkpoint-v2.ckpt' as top 1


Epoch 4: 100%|██████████| 1369/1369 [02:00<00:00, 11.34it/s, v_num=9, train_loss=0.139, val_loss=0.404, val_f1=0.904, train_f1=0.932]  

Metric val_f1 improved by 0.009 >= min_delta = 0.0. New best score: 0.904
Epoch 4, global step 6845: 'val_f1' reached 0.90445 (best 0.90445), saving model to '/root/hack-x5/models/fold_2/best-checkpoint-v2.ckpt' as top 1


Epoch 5: 100%|██████████| 1369/1369 [02:01<00:00, 11.30it/s, v_num=9, train_loss=0.0105, val_loss=0.402, val_f1=0.909, train_f1=0.958]  

Metric val_f1 improved by 0.004 >= min_delta = 0.0. New best score: 0.909
Epoch 5, global step 8214: 'val_f1' reached 0.90870 (best 0.90870), saving model to '/root/hack-x5/models/fold_2/best-checkpoint-v2.ckpt' as top 1


Epoch 6: 100%|██████████| 1369/1369 [02:01<00:00, 11.30it/s, v_num=9, train_loss=0.0629, val_loss=0.432, val_f1=0.905, train_f1=0.971] 

Monitored metric val_f1 did not improve in the last 1 records. Best score: 0.909. Signaling Trainer to stop.
Epoch 6, global step 9583: 'val_f1' was not in top 1


Epoch 6: 100%|██████████| 1369/1369 [02:01<00:00, 11.27it/s, v_num=9, train_loss=0.0629, val_loss=0.432, val_f1=0.905, train_f1=0.971]
Best model saved at /root/hack-x5/models/fold_2/best-checkpoint-v2.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.nn.init.xavier_uniform(self.classifier.weight)
Pred: 100%|██████████| 5476/5476 [01:30<00:00, 60.27it/s]



===== Fold 3 / 5 =====


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type              | Params | Mode 
--------------------------------------------------------------
0 | encoder         | RobertaModel      | 355 M  | eval 
1 | encoder_dropout | Dropout           | 0      | train
2 | classifier      | Linear            | 9.2 K  | train
3 | loss_fn         | CrossEntropyLoss  | 0      | train
4 | f1              | MulticlassF1Score | 0      | train
--------------------------------------------------------------
25.2 M    Trainable params
330 M     Non-trainable params
355 M     Total params
1,4

Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 18.98it/s]

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [02:02<00:00, 11.21it/s, v_num=10, train_loss=0.188, val_loss=0.331, val_f1=0.808]

Metric val_f1 improved. New best score: 0.808
Epoch 0, global step 1369: 'val_f1' reached 0.80766 (best 0.80766), saving model to '/root/hack-x5/models/fold_3/best-checkpoint.ckpt' as top 1


Epoch 1: 100%|██████████| 1369/1369 [02:02<00:00, 11.18it/s, v_num=10, train_loss=0.143, val_loss=0.280, val_f1=0.850, train_f1=0.669] 

Metric val_f1 improved by 0.042 >= min_delta = 0.0. New best score: 0.850
Epoch 1, global step 2738: 'val_f1' reached 0.84970 (best 0.84970), saving model to '/root/hack-x5/models/fold_3/best-checkpoint.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [02:01<00:00, 11.27it/s, v_num=10, train_loss=0.0648, val_loss=0.266, val_f1=0.863, train_f1=0.868]

Metric val_f1 improved by 0.014 >= min_delta = 0.0. New best score: 0.863
Epoch 2, global step 4107: 'val_f1' reached 0.86322 (best 0.86322), saving model to '/root/hack-x5/models/fold_3/best-checkpoint.ckpt' as top 1


Epoch 3: 100%|██████████| 1369/1369 [02:01<00:00, 11.24it/s, v_num=10, train_loss=0.168, val_loss=0.336, val_f1=0.881, train_f1=0.911]  

Metric val_f1 improved by 0.018 >= min_delta = 0.0. New best score: 0.881
Epoch 3, global step 5476: 'val_f1' reached 0.88143 (best 0.88143), saving model to '/root/hack-x5/models/fold_3/best-checkpoint.ckpt' as top 1


Epoch 4: 100%|██████████| 1369/1369 [02:02<00:00, 11.16it/s, v_num=10, train_loss=0.0151, val_loss=0.354, val_f1=0.875, train_f1=0.943] 

Monitored metric val_f1 did not improve in the last 1 records. Best score: 0.881. Signaling Trainer to stop.
Epoch 4, global step 6845: 'val_f1' was not in top 1


Epoch 4: 100%|██████████| 1369/1369 [02:03<00:00, 11.13it/s, v_num=10, train_loss=0.0151, val_loss=0.354, val_f1=0.875, train_f1=0.943]
Best model saved at /root/hack-x5/models/fold_3/best-checkpoint.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.nn.init.xavier_uniform(self.classifier.weight)
Pred: 100%|██████████| 5476/5476 [01:29<00:00, 61.08it/s]



===== Fold 4 / 5 =====


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type              | Params | Mode 
--------------------------------------------------------------
0 | encoder         | RobertaModel      | 355 M  | eval 
1 | encoder_dropout | Dropout           | 0      | train
2 | classifier      | Linear            | 9.2 K  | train
3 | loss_fn         | CrossEntropyLoss  | 0      | train
4 | f1              | MulticlassF1Score | 0      | train
--------------------------------------------------------------
25.2 M    Trainable params
330 M     Non-trainable params
355 M     Total params
1,4

Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 19.18it/s]

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [02:01<00:00, 11.30it/s, v_num=11, train_loss=0.479, val_loss=0.304, val_f1=0.865]

Metric val_f1 improved. New best score: 0.865
Epoch 0, global step 1369: 'val_f1' reached 0.86500 (best 0.86500), saving model to '/root/hack-x5/models/fold_4/best-checkpoint.ckpt' as top 1


Epoch 1: 100%|██████████| 1369/1369 [02:02<00:00, 11.20it/s, v_num=11, train_loss=0.215, val_loss=0.293, val_f1=0.881, train_f1=0.673] 

Metric val_f1 improved by 0.016 >= min_delta = 0.0. New best score: 0.881
Epoch 1, global step 2738: 'val_f1' reached 0.88136 (best 0.88136), saving model to '/root/hack-x5/models/fold_4/best-checkpoint.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [02:04<00:00, 10.99it/s, v_num=11, train_loss=0.265, val_loss=0.264, val_f1=0.891, train_f1=0.849] 

Metric val_f1 improved by 0.010 >= min_delta = 0.0. New best score: 0.891
Epoch 2, global step 4107: 'val_f1' reached 0.89086 (best 0.89086), saving model to '/root/hack-x5/models/fold_4/best-checkpoint.ckpt' as top 1


Epoch 3: 100%|██████████| 1369/1369 [02:03<00:00, 11.06it/s, v_num=11, train_loss=0.0203, val_loss=0.278, val_f1=0.901, train_f1=0.902] 

Metric val_f1 improved by 0.010 >= min_delta = 0.0. New best score: 0.901
Epoch 3, global step 5476: 'val_f1' reached 0.90051 (best 0.90051), saving model to '/root/hack-x5/models/fold_4/best-checkpoint.ckpt' as top 1


Epoch 4: 100%|██████████| 1369/1369 [02:01<00:00, 11.25it/s, v_num=11, train_loss=0.192, val_loss=0.364, val_f1=0.893, train_f1=0.935]  

Monitored metric val_f1 did not improve in the last 1 records. Best score: 0.901. Signaling Trainer to stop.
Epoch 4, global step 6845: 'val_f1' was not in top 1


Epoch 4: 100%|██████████| 1369/1369 [02:02<00:00, 11.22it/s, v_num=11, train_loss=0.192, val_loss=0.364, val_f1=0.893, train_f1=0.935]
Best model saved at /root/hack-x5/models/fold_4/best-checkpoint.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.nn.init.xavier_uniform(self.classifier.weight)
Pred: 100%|██████████| 5475/5475 [01:30<00:00, 60.40it/s]



===== Fold 5 / 5 =====


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type              | Params | Mode 
--------------------------------------------------------------
0 | encoder         | RobertaModel      | 355 M  | eval 
1 | encoder_dropout | Dropout           | 0      | train
2 | classifier      | Linear            | 9.2 K  | train
3 | loss_fn         | CrossEntropyLoss  | 0      | train
4 | f1              | MulticlassF1Score | 0      | train
--------------------------------------------------------------
25.2 M    Trainable params
330 M     Non-trainable params
355 M     Total params
1,4

Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00, 23.50it/s]

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1369/1369 [02:01<00:00, 11.25it/s, v_num=12, train_loss=0.162, val_loss=0.353, val_f1=0.832]

Metric val_f1 improved. New best score: 0.832
Epoch 0, global step 1369: 'val_f1' reached 0.83163 (best 0.83163), saving model to '/root/hack-x5/models/fold_5/best-checkpoint.ckpt' as top 1


Epoch 1: 100%|██████████| 1369/1369 [02:01<00:00, 11.29it/s, v_num=12, train_loss=0.115, val_loss=0.310, val_f1=0.865, train_f1=0.689] 

Metric val_f1 improved by 0.033 >= min_delta = 0.0. New best score: 0.865
Epoch 1, global step 2738: 'val_f1' reached 0.86506 (best 0.86506), saving model to '/root/hack-x5/models/fold_5/best-checkpoint.ckpt' as top 1


Epoch 2: 100%|██████████| 1369/1369 [02:00<00:00, 11.35it/s, v_num=12, train_loss=0.303, val_loss=0.303, val_f1=0.862, train_f1=0.875] 

Monitored metric val_f1 did not improve in the last 1 records. Best score: 0.865. Signaling Trainer to stop.
Epoch 2, global step 4107: 'val_f1' was not in top 1


Epoch 2: 100%|██████████| 1369/1369 [02:00<00:00, 11.32it/s, v_num=12, train_loss=0.303, val_loss=0.303, val_f1=0.862, train_f1=0.875]
Best model saved at /root/hack-x5/models/fold_5/best-checkpoint.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.nn.init.xavier_uniform(self.classifier.weight)
Pred: 100%|██████████| 5475/5475 [01:30<00:00, 60.31it/s]


===== Mean Macro-F1 across 5 folds: 0.8833 =====





In [52]:
pred_val_df = pd.DataFrame(validate_rows, columns=['text', 'spans', 'pspans', 'fold'])
pred_val_df['spans'] = pred_val_df['spans'].apply(lambda x: str(x))
pred_val_df['pspans'] = pred_val_df['pspans'].apply(lambda x: str(x))
pred_val_df.to_csv('../data/pred_val_spans.csv', sep=';', index=False)
pred_val_df.head(2)

Unnamed: 0,text,spans,pspans,fold
0,abon,"[(0, 4, 'O')]","[(0, 4, 'O')]",0
1,abtoys игрушк,"[(0, 6, 'B-BRAND'), (7, 13, 'B-TYPE')]","[(0, 6, 'B-BRAND'), (7, 13, 'I-TYPE')]",0


In [54]:
pred_val_df = pd.DataFrame(validate_rows, columns=['text', 'spans', 'pspans', 'fold'])
pred_val_df

Unnamed: 0,text,spans,pspans,fold
0,abon,"[(0, 4, O)]","[(0, 4, O)]",0
1,abtoys игрушк,"[(0, 6, B-BRAND), (7, 13, B-TYPE)]","[(0, 6, B-BRAND), (7, 13, I-TYPE)]",0
2,active,"[(0, 6, B-BRAND)]","[(0, 6, B-BRAND)]",0
3,agata,"[(0, 5, B-BRAND)]","[(0, 5, B-BRAND)]",0
4,agnesi пше,"[(0, 6, B-BRAND), (7, 10, B-TYPE)]","[(0, 6, B-BRAND), (7, 10, B-TYPE)]",0
...,...,...,...,...
27373,яыц,"[(0, 3, B-TYPE)]","[(0, 3, B-TYPE)]",4
27374,яыца,"[(0, 4, B-TYPE)]","[(0, 4, B-TYPE)]",4
27375,№1 газе,"[(0, 2, B-BRAND), (3, 7, B-TYPE)]","[(0, 2, B-BRAND), (3, 7, B-BRAND)]",4
27376,№1 кофейник,"[(0, 2, B-BRAND), (3, 11, B-TYPE)]","[(0, 2, B-BRAND), (3, 11, I-TYPE)]",4


In [56]:
pred_val_df[pred_val_df['spans'] != pred_val_df['pspans']].sample(10)

Unnamed: 0,text,spans,pspans,fold
4266,соус кисло,"[(0, 4, B-TYPE), (5, 10, O)]","[(0, 4, B-TYPE), (5, 10, I-TYPE)]",0
20412,сделай с,"[(0, 6, B-TYPE), (7, 8, I-TYPE)]","[(0, 6, B-TYPE), (7, 8, O)]",3
12777,киндер молочный ломтик,"[(0, 6, B-BRAND), (7, 15, B-TYPE), (16, 22, I-...","[(0, 6, B-BRAND), (7, 15, I-TYPE), (16, 22, I-...",2
13614,молоко молочная станция,"[(0, 6, B-TYPE), (7, 15, B-BRAND), (16, 23, I-...","[(0, 6, B-TYPE), (7, 15, I-TYPE), (16, 23, I-B...",2
21056,тан царицы,"[(0, 3, B-TYPE), (4, 10, B-BRAND)]","[(0, 3, B-TYPE), (4, 10, I-BRAND)]",3
22007,crem,"[(0, 4, B-TYPE)]","[(0, 4, B-BRAND)]",4
2486,маркет мёд,"[(0, 6, B-BRAND), (7, 10, B-TYPE)]","[(0, 6, O), (7, 10, B-TYPE)]",0
6338,в маринаде,"[(0, 1, B-TYPE), (2, 10, I-TYPE)]","[(0, 1, O), (2, 10, O)]",1
3773,русие колбсы,"[(0, 5, O), (6, 12, O)]","[(0, 5, O), (6, 12, I-TYPE)]",0
25524,редбулл,"[(0, 7, B-BRAND)]","[(0, 7, B-TYPE)]",4


In [None]:
from pathlib import Path
best_model_path = Path.cwd().parent / 'models' / 'fold' / 'best_checkpoint.ckpt'
print(f"Best model saved at: {best_model_path}")

best_model = NERLightning.load_from_checkpoint(
    best_model_path, 
    num_labels=len(label2id), 
    id2label=id2label,
    label2id=label2id,
)
best_model.eval()

Best model saved at: /root/models/fold/best_checkpoint.ckpt


FileNotFoundError: [Errno 2] No such file or directory: '/root/models/fold/best_checkpoint.ckpt'

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

early_stop_callback = EarlyStopping(
    monitor="val_f1",
    patience=2,
    mode="max",
    verbose=True,
)

checkpoint_callback = ModelCheckpoint(
    dirpath="../models/checkpoints",
    filename=model_name + "-ner-{epoch:02d}-{val_f1:.4f}",
    save_top_k=1,
    monitor="val_f1",
    mode="max",
    save_weights_only=True,
)

model = NERLightning(
    model_name=model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    lr=2e-4,
)

trainer = Trainer(max_epochs=10, accelerator="gpu", devices=1, callbacks=[early_stop_callback, checkpoint_callback])
trainer.fit(model, train_loader, test_loader)

best_model_path = checkpoint_callback.best_model_path
print(f"Best model saved at: {best_model_path}")

best_model = NERLightning.load_from_checkpoint(
    best_model_path, 
    num_labels=len(label2id), 
    id2label=id2label,
    label2id=label2id,
)
best_model.eval()
print()

Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.nn.init.xavier_uniform(self.classifier.weight)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /root/hack-x5/models/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type              | Params | Mode 
--------------------------------------------------------------
0 | encoder         | RobertaModel      | 355 M  | eval 
1 | encoder_dropout | Dropout           | 0      | train
2 | classifier      | Linear            | 9.2 K  | train
3 | loss_fn         | Cro

Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  7.55it/s]

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 385/385 [01:30<00:00,  4.26it/s, v_num=5, train_loss=0.277, val_loss=0.356, val_f1=0.728]

Metric val_f1 improved. New best score: 0.728


Epoch 1: 100%|██████████| 385/385 [01:31<00:00,  4.20it/s, v_num=5, train_loss=0.149, val_loss=0.257, val_f1=0.865, train_f1=0.663] 

Metric val_f1 improved by 0.137 >= min_delta = 0.0. New best score: 0.865


Epoch 2: 100%|██████████| 385/385 [01:31<00:00,  4.20it/s, v_num=5, train_loss=0.192, val_loss=0.268, val_f1=0.866, train_f1=0.863] 

Metric val_f1 improved by 0.001 >= min_delta = 0.0. New best score: 0.866


Epoch 3: 100%|██████████| 385/385 [01:30<00:00,  4.24it/s, v_num=5, train_loss=0.130, val_loss=0.298, val_f1=0.875, train_f1=0.926] 

Metric val_f1 improved by 0.009 >= min_delta = 0.0. New best score: 0.875


Epoch 5: 100%|██████████| 385/385 [01:30<00:00,  4.24it/s, v_num=5, train_loss=0.0223, val_loss=0.451, val_f1=0.873, train_f1=0.968] 

Monitored metric val_f1 did not improve in the last 2 records. Best score: 0.875. Signaling Trainer to stop.


Epoch 5: 100%|██████████| 385/385 [01:30<00:00,  4.24it/s, v_num=5, train_loss=0.0223, val_loss=0.451, val_f1=0.873, train_f1=0.968]
Best model saved at: /root/hack-x5/models/checkpoints/ai-forever/ruRoberta-large-ner-epoch=03-val_f1=0.8747.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.nn.init.xavier_uniform(self.classifier.weight)





In [None]:
model = NERLightning(
    model_name=model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    lr=2e-4,
)

trainer = Trainer(max_epochs=4, accelerator="gpu", devices=1)
trainer.fit(model, full_loader)

Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.nn.init.xavier_uniform(self.classifier.weight)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/root/hack-x5/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type              | Params | Mode
-------------------------------------------------

Epoch 3: 100%|██████████| 428/428 [01:31<00:00,  4.69it/s, v_num=7, train_loss=0.0105, train_f1=0.933]  

`Trainer.fit` stopped: `max_epochs=4` reached.


Epoch 3: 100%|██████████| 428/428 [01:46<00:00,  4.03it/s, v_num=7, train_loss=0.0105, train_f1=0.933]


In [98]:
import torch
import json
import os
from transformers import AutoTokenizer

def save_model(model, save_dir="../models"):
    os.makedirs(save_dir, exist_ok=True)

    # 1. веса модели
    torch.save(model.state_dict(), os.path.join(save_dir, "ner_model.bin"))

    # 2. метаданные
    metadata = {
        "model_name": model.hparams.model_name,
        "num_labels": model.hparams.num_labels,
        "label2id": model.hparams.label2id,
        "id2label": model.hparams.id2label
    }
    with open(os.path.join(save_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)

    print(f"✅ Model saved to {save_dir}")

def load_model(save_dir="../models"):
    # 1. метаданные
    with open(os.path.join(save_dir, "config.json"), "r", encoding="utf-8") as f:
        metadata = json.load(f)

    # 2. инициализация модели
    model = NERLightning(
        model_name=metadata["model_name"],
        num_labels=metadata["num_labels"],
        id2label=metadata["id2label"],
        label2id=metadata["label2id"]
    )

    # 3. загрузка весов
    state_dict = torch.load(os.path.join(save_dir, "ner_model.bin"), map_location="cpu")
    model.load_state_dict(state_dict)

    # 4. токенайзер
    tokenizer = AutoTokenizer.from_pretrained(metadata["model_name"])

    print(f"✅ Model loaded from {save_dir}")
    return model, tokenizer

In [45]:
save_model(model, '../models/deeppavlov_ner_model')

✅ Model saved to ../models/deeppavlov_ner_model


In [47]:
l_model, l_tokenizer = load_model()

✅ Model loaded from ../models


In [None]:
# пример использования
def predict(text: str):
    text = text.replace("\xa0", " ")
    model.eval()
    enc = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, truncation=True, padding="max_length")
    input_ids = enc["input_ids"].to('cpu')
    attention_mask = enc["attention_mask"].to('cpu')
    offsets = enc["offset_mapping"][0]
    is_numeric = []
    for start, end in offsets:
        if text[start : end].isdigit():
            is_numeric.append(1)
        else:
            is_numeric.append(0)

    is_numeric = torch.tensor([is_numeric])

    with torch.no_grad():
        logits = model(
            input_ids=input_ids.to('cuda'),
            attention_mask=attention_mask.to('cuda'),
            is_numeric=is_numeric.to('cuda'),
        )['logits'].argmax(dim=-1)[0].cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])

    return logits, offsets, tokens

def decode_predictions(text, offsets, labels):
    tokens = text.split(' ')
    token_offsets = []
    cur_i = 0
    for token in tokens:
        start = cur_i
        end = start + len(token)
        token_offsets.append((start, end))
        cur_i += len(token) + 1

    bio_start_offsets = [int(o[0]) for o in offsets if o[0] != o[1]]
    res = []
    for token, (start, end) in zip(tokens, token_offsets):
        idx_token_label = bio_start_offsets.index(start) + 1
        label = labels[idx_token_label]
        res.append((token, (start, end, id2label[label])))

    return res

text = "сливки 13 процентов"
logits, offsets, tokens = predict(text)
print(decode_predictions(text, offsets, logits))

NameError: name 'model' is not defined

In [139]:
import time
text = "корм влажный purina one"
logits, offsets, tokens = predict(text)
print(decode_predictions(text, offsets, logits))

[('корм', (0, 4, 'B-TYPE')), ('влажный', (5, 12, 'I-TYPE')), ('purina', (13, 19, 'B-BRAND')), ('one', (20, 23, 'I-BRAND'))]


In [127]:
row = test_df.sample(1).values[0]
text = row[0]
print(text, [s['label'] for s in row[1]])
logits, offsets, tokens = predict(text)
print(decode_predictions(text, offsets, logits))

бананаааа ['B-TYPE']
[('бананаааа', (0, 9, 'B-TYPE'))]


In [51]:
sdf = pd.read_csv('../data/submission.csv', sep=';')
sdf.columns = ['text', 'spans']
sdf['spans'] = sdf['spans'].apply(lambda x: [{'start': span[0], 'end': span[1], 'label': span[2].replace('0', 'O')} for span in ast.literal_eval(x)])
sdf = sdf.iloc[[i for i, s in enumerate(sdf['spans'].values) if s[0]['start'] == 0]]
sdf.head(2)

Unnamed: 0,text,spans
0,форма для выпечки,"[{'start': 0, 'end': 5, 'label': 'B-TYPE'}, {'..."
1,фарш свиной,"[{'start': 0, 'end': 4, 'label': 'B-TYPE'}, {'..."


In [52]:
texts = sdf['text'].tolist()
s_text_pred_bio = []
for text in tqdm(texts, desc='Pred'):
    logits, offsets, tokens = predict(text)
    bio = decode_predictions(text, offsets, logits) 
    s_text_pred_bio.append(bio)

Pred: 100%|██████████| 4999/4999 [00:37<00:00, 132.33it/s]


In [54]:
sdf['annotation'] = [[s[1] for s in spans] for spans in s_text_pred_bio]
sdf = sdf.rename(columns={'text': 'sample'}).drop(columns='spans')
sdf.to_csv('../data/test.csv', sep=';', index=False)
sdf.head(2)

KeyError: "['spans'] not found in axis"

In [None]:
from collections import defaultdict

def compute_macro_f1(y_true, y_pred, entity_types=("TYPE","BRAND","VOLUME","PERCENT")):
    """
    y_true, y_pred: списки предсказанных и эталонных сущностей для всех примеров
    Каждое значение — список кортежей: (start, end, label)
    label в формате 'B-TYPE', 'I-BRAND' и т.д.
    """
    
    # счётчики TP, FP, FN для каждого типа
    stats = {etype: {"TP":0, "FP":0, "FN":0} for etype in entity_types}

    for true_spans, pred_spans in zip(y_true, y_pred):
        # создаём словари по типу сущности
        true_by_type = defaultdict(list)
        for start, end, label in true_spans:
            etype = label.split("-")[-1]
            true_by_type[etype].append((start,end))
        
        pred_by_type = defaultdict(list)
        for start, end, label in pred_spans:
            etype = label.split("-")[-1]
            pred_by_type[etype].append((start,end))
        
        for etype in entity_types:
            true_set = set(true_by_type.get(etype, []))
            pred_set = set(pred_by_type.get(etype, []))
            TP = len(true_set & pred_set)
            FP = len(pred_set - true_set)
            FN = len(true_set - pred_set)
            stats[etype]["TP"] += TP
            stats[etype]["FP"] += FP
            stats[etype]["FN"] += FN

    # вычисляем F1 для каждого типа
    f1_scores = []
    for etype in entity_types:
        TP = stats[etype]["TP"]
        FP = stats[etype]["FP"]
        FN = stats[etype]["FN"]
        precision = TP / (TP + FP) if TP + FP > 0 else 0.0
        recall = TP / (TP + FN) if TP + FN > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
        f1_scores.append(f1)

    macro_f1 = sum(f1_scores)/len(f1_scores)
    return macro_f1

In [None]:
texts = test_df['text'].tolist()
text_pred_bio = []
for text in tqdm(texts):
    logits, offsets, tokens = predict(text)
    bio = decode_predictions(text, offsets, logits) 
    text_pred_bio.append(bio)

text_true_bio = [
    [(span['start'], span['end'], span['label']) for span in spans]
    for spans in test_df['spans'].tolist()
]

100%|██████████| 2738/2738 [01:17<00:00, 35.27it/s]


In [96]:
test_df['pred_spans'] = [
    [{'start': span[0], 'end': span[1], 'label': span[2]} for _, span in items] 
    for items in text_pred_bio
]
test_df.head(2)

Unnamed: 0,text,spans,pred_spans,spansl,pred_spansl
9,actimal,"[{'start': 0, 'end': 7, 'label': 'B-BRAND'}]","[{'start': 0, 'end': 7, 'label': 'B-BRAND'}]","[(0, 7, B-BRAND)]","[(0, 7, B-BRAND)]"
11,actimeuno,"[{'start': 0, 'end': 9, 'label': 'B-BRAND'}]","[{'start': 0, 'end': 9, 'label': 'B-BRAND'}]","[(0, 9, B-BRAND)]","[(0, 9, B-BRAND)]"


In [97]:
test_df['spansl'] = test_df['spans'].apply(lambda spans: [(s['start'], s['end'], s['label']) for s in spans])
test_df['pred_spansl'] = test_df['pred_spans'].apply(lambda spans: [(s['start'], s['end'], s['label']) for s in spans])
test_df.head(2)

Unnamed: 0,text,spans,pred_spans,spansl,pred_spansl
9,actimal,"[{'start': 0, 'end': 7, 'label': 'B-BRAND'}]","[{'start': 0, 'end': 7, 'label': 'B-BRAND'}]","[(0, 7, B-BRAND)]","[(0, 7, B-BRAND)]"
11,actimeuno,"[{'start': 0, 'end': 9, 'label': 'B-BRAND'}]","[{'start': 0, 'end': 9, 'label': 'B-BRAND'}]","[(0, 9, B-BRAND)]","[(0, 9, B-BRAND)]"


In [102]:
test_df[test_df['spansl'] != test_df['pred_spansl']].sample(10)[['text', 'spansl', 'pred_spansl']]

Unnamed: 0,text,spansl,pred_spansl
22823,сырокопченя кобаса,"[(0, 11, O), (12, 18, O)]","[(0, 11, B-TYPE), (12, 18, B-BRAND)]"
16163,перец черный горошком,"[(0, 5, B-TYPE), (6, 12, I-TYPE), (13, 21, O)]","[(0, 5, B-TYPE), (6, 12, I-TYPE), (13, 21, I-T..."
11260,кэннон труcы,"[(0, 6, B-BRAND), (7, 12, B-TYPE)]","[(0, 6, B-BRAND), (7, 12, B-BRAND)]"
17625,приправа хмели сунели,"[(0, 8, B-TYPE), (9, 14, I-TYPE), (15, 21, O)]","[(0, 8, B-TYPE), (9, 14, B-BRAND), (15, 21, I-..."
13584,мороженое фруктовый лед !,"[(0, 9, B-TYPE), (10, 19, I-TYPE), (20, 23, O)...","[(0, 9, B-TYPE), (10, 19, I-TYPE), (20, 23, I-..."
14559,нпро,"[(0, 4, O)]","[(0, 4, B-TYPE)]"
23398,телятины охлжденне,"[(0, 8, O), (9, 18, O)]","[(0, 8, B-TYPE), (9, 18, B-BRAND)]"
10756,"кртки, вероки","[(0, 6, O), (7, 13, O)]","[(0, 6, B-TYPE), (7, 13, I-TYPE)]"
24300,"удобреия, подкорма","[(0, 9, O), (10, 18, O)]","[(0, 9, B-TYPE), (10, 18, I-TYPE)]"
25438,хрусteam,"[(0, 8, B-BRAND)]","[(0, 8, B-TYPE)]"


In [111]:
from itertools import chain
from collections import Counter
c = Counter(chain(*[[s['label'] for s in spans] for spans in train_df['spans'].values]))
c

Counter({'B-TYPE': 22183,
         'B-BRAND': 6523,
         'O': 4820,
         'I-TYPE': 4109,
         'I-BRAND': 438,
         'B-PERCENT': 142,
         'B-VOLUME': 53,
         'I-VOLUME': 25,
         'I-PERCENT': 22})

In [98]:
score = compute_macro_f1(text_true_bio, [[s[1] for s in spans] for spans in text_pred_bio])
print("Macro F1:", score)

Macro F1: 0.8676535272656256


In [None]:
test_texts = test_df['text'].tolist()
test_spans = [
    [(s['start'], s['end'], s['label']) for s in spans]
    for spans in test_df['spans'].tolist()
]