In [None]:
# В ячейке Jupyter поставь с ! перед командой
!pip install -q transformers datasets seqeval torch torchvision torchaudio fastapi uvicorn python-multipart
# Опционально (если хочешь ONNX Runtime для ускорения inference на CPU)
!pip install -q onnxruntime


zsh:1: command not found: pip
zsh:1: command not found: pip


In [None]:
!pip install evaluate

zsh:1: command not found: pip


In [None]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install --upgrade accelerate

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Cell 1: импорты и константы
import ast
import re
import json
from typing import List, Tuple, Dict, Any
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
import evaluate

# Название модели (можешь менять)
MODEL_NAME = "DeepPavlov/rubert-base-cased"

# Метки (как в задании — BIO по 4 типам)
LABELS = [
    "O",
    "B-TYPE","I-TYPE",
    "B-BRAND","I-BRAND",
    "B-VOLUME","I-VOLUME",
    "B-PERCENT","I-PERCENT",
]
label2id = {lbl: idx for idx, lbl in enumerate(LABELS)}
id2label = {idx: lbl for lbl, idx in label2id.items()}

# Entity types для финальной метрики
ENTITY_TYPES = ["TYPE","BRAND","VOLUME","PERCENT"]


In [7]:
# Cell 2: парсер разметки
def parse_annotation_str(s: str) -> List[Tuple[int,int,str]]:
    """
    Разбирает строку вида "[(0, 7, 'B-TYPE'), (9, 13, 'B-VOLUME')]" в список кортежей.
    Возвращает [] если пусто или недоступно.
    """
    if not s or str(s).strip() in ("", "[]", "None"):
        return []
    try:
        parsed = ast.literal_eval(s)
        out = []
        for item in parsed:
            if len(item) < 3:
                continue
            start = int(item[0])
            end = int(item[1])
            tag = str(item[2])
            out.append((start, end, tag))
        return out
    except Exception as e:
        # если парсинг не удался — вернем пустой список, но можно логировать
        print("Warning: can't parse annotation:", s, " -> ", e)
        return []

# Тестовый пример
print(parse_annotation_str("[(0, 7, 'B-TYPE')]"))
print(parse_annotation_str("[]"))
print(parse_annotation_str(""))


[(0, 7, 'B-TYPE')]
[]
[]


In [8]:
# Cell 3: загрузка файла и ручная проверка
df = pd.read_csv("/Users/marina/Documents/projects/X5/Датасет 2/train.csv", sep=";")  # если csv большой, можно использовать chunksize
df.head(5)


Unnamed: 0,sample,annotation
0,aa,"[(0, 2, 'O')]"
1,aala,"[(0, 4, 'O')]"
2,aarcca,"[(0, 6, 'O')]"
3,abon,"[(0, 4, 'O')]"
4,abso,"[(0, 4, 'B-BRAND')]"


In [9]:
# Cell 4: функция токенизации + выравнивания label'ов
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Cell X: безопасная токенизация и выравнивание меток
from typing import List, Tuple

def tokenize_and_align_labels_safe(
    examples: List[str],
    annotations: List[List[Tuple[int,int,str]]],
    max_length: int = 128
):
    """
    Преобразует список текстов и их аннотаций в формат, пригодный для обучения NER.
    Проверяет разметку и безопасно обрабатывает 'O' или некорректные теги.
    
    Args:
        examples: список строк (текстов)
        annotations: список списков кортежей (start, end, tag) для каждой строки
        max_length: максимальная длина токенизированного текста
    
    Returns:
        dict с ключами: input_ids, attention_mask, labels, offset_mapping
    """
    enc = tokenizer(
        examples,
        truncation=True,
        padding=False,   # паддинг будем делать позже в DataCollator
        max_length=max_length,
        return_offsets_mapping=True,
        return_attention_mask=True
    )

    all_labels = []

    for i, offsets in enumerate(enc["offset_mapping"]):
        spans = annotations[i]  # разметка для текущего текста
        token_labels = []

        for tidx, (t_s, t_e) in enumerate(offsets):
            # Специальные токены ([CLS], [SEP] и т.д.)
            if t_s == t_e:
                token_labels.append(-100)
                continue

            # Найдём пересечение токена с сущностью
            assigned_label = "O"
            found_span = None
            for ent_s, ent_e, ent_tag in spans:
                if t_e <= ent_s or t_s >= ent_e:
                    continue  # токен левее или правее сущности
                found_span = (ent_s, ent_e, ent_tag)
                break

            if found_span is None:
                token_labels.append(label2id["O"])
            else:
                ent_s, ent_e, ent_tag = found_span

                # безопасная проверка: есть ли дефис
                if "-" in ent_tag:
                    ent_type = ent_tag.split("-",1)[1]  # TYPE, BRAND, ...
                    if t_s <= ent_s < t_e:
                        lbl = f"B-{ent_type}"
                    else:
                        lbl = f"I-{ent_type}"
                    token_labels.append(label2id.get(lbl, label2id["O"]))
                else:
                    # случай O или некорректного тега
                    token_labels.append(label2id["O"])

        all_labels.append(token_labels)

    # сохраняем и возвращаем offsets для отладки
    enc["labels"] = all_labels
    return enc



In [10]:
from sklearn.model_selection import train_test_split

# Подготовим списки текстов и аннотаций
texts = df["sample"].astype(str).tolist()           # у тебя колонка называется 'sample'
annotations_parsed = [parse_annotation_str(s) for s in df["annotation"].tolist()]

# Разделение на train и validation
train_texts, val_texts, train_ann, val_ann = train_test_split(
    texts, annotations_parsed, test_size=0.1, random_state=42
)

print("Train examples:", len(train_texts))
print("Validation examples:", len(val_texts))


Train examples: 24525
Validation examples: 2726


In [11]:
# Cell 5: split train/val and prepare HF Dataset
train_enc = tokenize_and_align_labels_safe(train_texts, train_ann, max_length=128)
val_enc = tokenize_and_align_labels_safe(val_texts, val_ann, max_length=128)

In [12]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id
)

data_collator = DataCollatorForTokenClassification(tokenizer)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Cell 7: TrainingArguments и Trainer (baseline)
training_args = TrainingArguments(
    output_dir="hf_out",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=100,
    save_total_limit=2
)


# Для промежуточной метрики используем seqeval (token-level).
import evaluate

seq_metric = evaluate.load("seqeval")

def compute_token_metrics(pred):
    """
    Функция считает token-level metrics через seqeval.
    Это полезно для мониторинга, но финальная проверка будет entity-level.
    """
    preds = np.argmax(pred.predictions, axis=2)
    label_list = LABELS
    # Преобразуем, пропуская метки с -100
    true_nested = []
    pred_nested = []
    for i, lab in enumerate(pred.label_ids):
        true_seq = []
        pred_seq = []
        for j, label_id in enumerate(lab):
            if label_id == -100:
                continue
            true_seq.append(label_list[label_id])
            pred_seq.append(label_list[preds[i][j]])
        true_nested.append(true_seq)
        pred_nested.append(pred_seq)
    results = seq_metric.compute(predictions=pred_nested, references=true_nested)
    return {
        "eval_precision": results.get("overall_precision", 0.0),
        "eval_recall": results.get("overall_recall", 0.0),
        "eval_f1": results.get("overall_f1", 0.0)
    }
from datasets import Dataset

train_dataset = Dataset.from_dict({
    "input_ids": train_enc["input_ids"],
    "attention_mask": train_enc["attention_mask"],
    "labels": train_enc["labels"],
    "offset_mapping": train_enc["offset_mapping"],  # для отладки, можно убрать
    "text": train_texts
})

val_dataset = Dataset.from_dict({
    "input_ids": val_enc["input_ids"],
    "attention_mask": val_enc["attention_mask"],
    "labels": val_enc["labels"],
    "offset_mapping": val_enc["offset_mapping"],
    "text": val_texts
})

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_token_metrics
)

# Запуск тренировки (выполняй, когда готов)
trainer.train()


  trainer = Trainer(


Step,Training Loss
100,0.6497
200,0.4351
300,0.3608
400,0.381
500,0.3508
600,0.344
700,0.3742
800,0.2852
900,0.342
1000,0.2959




TrainOutput(global_step=3066, training_loss=0.2968198604546414, metrics={'train_runtime': 982.786, 'train_samples_per_second': 24.955, 'train_steps_per_second': 3.12, 'total_flos': 91732194258084.0, 'train_loss': 0.2968198604546414, 'epoch': 1.0})

In [14]:
def tokens_to_spans(pred_label_ids: List[int], offsets: List[Tuple[int,int]]) -> List[Tuple[int,int,str]]:
    """
    pred_label_ids: список id меток для токенов (без -100)
    offsets: список (start_char, end_char) для токенов
    Возвращает список (start_char, end_char, 'B-TYPE') spans
    """
    spans = []
    cur = None  # [start_char, end_char, typ]
    for lid, (t_s, t_e) in zip(pred_label_ids, offsets):
        if t_s == t_e:
            continue
        label = id2label[int(lid)]
        if label == "O":
            if cur is not None:
                spans.append((cur[0], cur[1], f"B-{cur[2]}"))
                cur = None
        else:
            pref, typ = label.split("-", 1)
            if pref == "B":
                if cur is not None:
                    spans.append((cur[0], cur[1], f"B-{cur[2]}"))
                cur = [t_s, t_e, typ]
            else:  # I-
                if cur is None:
                    # некорректный I без B — начнём новую сущность
                    cur = [t_s, t_e, typ]
                else:
                    # расширяем текущую сущность вправо
                    cur[1] = t_e
    if cur is not None:
        spans.append((cur[0], cur[1], f"B-{cur[2]}"))
    return spans

def compute_entity_macro_f1(y_true_spans_list: List[List[Tuple[int,int,str]]],
                            y_pred_spans_list: List[List[Tuple[int,int,str]]]) -> Tuple[float, Dict[str,float]]:
    """
    y_true_spans_list и y_pred_spans_list: списки списков span-ов для каждого примера.
    Возвращает (macro_f1, per_type_f1_dict).
    """
    stats = {t: {"TP":0,"FP":0,"FN":0} for t in ENTITY_TYPES}
    for true_spans, pred_spans in zip(y_true_spans_list, y_pred_spans_list):
        true_by_type = defaultdict(set)
        pred_by_type = defaultdict(set)
        for s,e,tag in true_spans:
            typ = tag.split("-",1)[1]
            true_by_type[typ].add((s,e))
        for s,e,tag in pred_spans:
            typ = tag.split("-",1)[1]
            pred_by_type[typ].add((s,e))
        for t in ENTITY_TYPES:
            tp = len(true_by_type[t] & pred_by_type[t])
            fp = len(pred_by_type[t] - true_by_type[t])
            fn = len(true_by_type[t] - pred_by_type[t])
            stats[t]["TP"] += tp
            stats[t]["FP"] += fp
            stats[t]["FN"] += fn

    f1s = {}
    for t in ENTITY_TYPES:
        TP = stats[t]["TP"]; FP = stats[t]["FP"]; FN = stats[t]["FN"]
        prec = TP / (TP + FP) if (TP + FP) > 0 else 0.0
        rec  = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        f1 = 2*prec*rec/(prec+rec) if (prec+rec) > 0 else 0.0
        f1s[t] = f1
    macro_f1 = sum(f1s.values()) / len(f1s)
    return macro_f1, f1s

In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

y_true_spans = []
y_pred_spans = []

for i in tqdm(range(len(val_dataset))):
    item = val_dataset[i]
    text = item["text"]

    # Токенизация с возвратом offset_mapping
    enc = tokenizer(
        text,
        truncation=True,
        max_length=128,
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    # Сохраняем смещения для постобработки
    offsets_cpu = enc["offset_mapping"][0].cpu().numpy().tolist()

    # Убираем offset_mapping перед передачей в модель
    enc = {k: v.to(device) for k, v in enc.items() if k != "offset_mapping"}

    # Прогон через модель
    with torch.inference_mode():
        logits = model(**enc).logits  # (1, seq_len, num_labels)

    preds = logits.argmax(dim=-1)[0].cpu().numpy().tolist()

    # Преобразуем предсказания в spans
    pred_spans = tokens_to_spans(preds, offsets_cpu)

    # Ground truth (char spans) из val_ann
    true_spans = val_ann[i]

    y_true_spans.append(true_spans)
    y_pred_spans.append(pred_spans)

# Финальные метрики
macro_f1, per_type = compute_entity_macro_f1(y_true_spans, y_pred_spans)
print("Macro-F1:", macro_f1)
print("Per-type F1:", per_type)



  0%|          | 0/2726 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


IndexError: list index out of range