In [8]:
from corus import load_ne5
import corus.sources.ne5 as ne5

def load_text_utf8(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

ne5.load_text = load_text_utf8

obj = load_ne5("Collection5")

for i, o in enumerate(obj):
    if i == 0:
        print(o)
        break


Ne5Markup(id='001', text='Россия рассчитывает на конструктивное воздействие США на Грузию\n\n04/08/2008 12:08\n\nМОСКВА, 4 авг - РИА Новости. Россия рассчитывает, что США воздействуют на Тбилиси в связи с обострением ситуации в зоне грузино-осетинского конфликта. Об этом статс-секретарь - заместитель министра иностранных дел России Григорий Карасин заявил в телефонном разговоре с заместителем госсекретаря США Дэниэлом Фридом.\n\n"С российской стороны выражена глубокая озабоченность в связи с новым витком напряженности вокруг Южной Осетии, противозаконными действиями грузинской стороны по наращиванию своих вооруженных сил в регионе, бесконтрольным строительством фортификационных сооружений", - говорится в сообщении.\n\n"Россия уже призвала Тбилиси к ответственной линии и рассчитывает также на конструктивное воздействие со стороны Вашингтона", - сообщил МИД России. ', spans=[Ne5Span(index='T1', type='GEOPOLIT', start=0, stop=6, text='Россия'), Ne5Span(index='T2', type='GEOPOLIT', start=5

### Загрузка и подготовка набора данных Collection5

In [32]:
from corus import load_ne5
import corus.sources.ne5 as ne5
from datasets import Dataset
from transformers import AutoTokenizer

def load_text_utf8(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

ne5.load_text = load_text_utf8
obj = load_ne5("Collection5")
docs = list(obj)

# Собираем все типы сущностей
types = set()
for doc in docs:
    for span in doc.spans:
        types.add(span.type)

label_list = ['O']
for entity_type in sorted(types):
    label_list.extend([f'B-{entity_type}', f'I-{entity_type}'])
label2id = {label: idx for idx, label in enumerate(label_list)}

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

def process_doc(doc):
    text = doc.text
    spans = sorted(doc.spans, key=lambda x: x.stop - x.start, reverse=True)  # Приоритет длинным
    
    # Токенизация
    tokenized = tokenizer(
        text, 
        return_offsets_mapping=True,
        truncation=False,
        add_special_tokens=False
    )
    
    tokens = []
    for token, offset in zip(tokenized.tokens(), tokenized.offset_mapping):
        if token in tokenizer.all_special_tokens:
            continue
        tokens.append({
            'token': token,
            'start': offset[0],
            'end': offset[1],
            'is_word_start': not token.startswith('##')
        })
    
    # Инициализация меток
    token_labels = ['O'] * len(tokens)
    
    # Обработка спанов в порядке приоритета
    for span in spans:
        span_type = span.type
        span_start = span.start
        span_end = span.stop
        
        for i, token in enumerate(tokens):
            if token_labels[i] != 'O':
                continue  # Уже занято более приоритетной сущностью
                
            if token['start'] >= span_end or token['end'] <= span_start:
                continue
                
            # Определение B/I
            prev_token = tokens[i-1] if i > 0 else None
            if prev_token and (prev_token['end'] > span_start):
                prefix = 'I-'
            else:
                prefix = 'B-'
                
            if token['is_word_start']:
                token_labels[i] = f'{prefix}{span_type}'
            else:
                token_labels[i] = f'I-{span_type}'
    
    return {
        'tokens': [t['token'] for t in tokens],
        'ner_tags': [label2id[label] for label in token_labels]
    }

# Обрабатываем все документы
processed_data = [process_doc(doc) for doc in docs]

# Создаем Dataset
dataset = Dataset.from_list(processed_data)
dataset.features['ner_tags'].feature.names = label_list

print(dataset)
print(dataset[0])
print(f"{label_list=}")

Token indices sequence length is longer than the specified maximum sequence length for this model (2392 > 2048). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1000
})
{'tokens': ['Россия', 'рассчитывает', 'на', 'конструктивно', '##е', 'воздействие', 'США', 'на', 'Грузию', '04', '/', '08', '/', '2008', '12', ':', '08', 'МО', '##СК', '##ВА', ',', '4', 'ав', '##г', '-', 'РИА', 'Новости', '.', 'Россия', 'рассчитывает', ',', 'что', 'США', 'воздейс', '##твуют', 'на', 'Тбилиси', 'в', 'связи', 'с', 'обострение', '##м', 'ситуации', 'в', 'зоне', 'груз', '##ино', '-', 'осети', '##нского', 'конфликта', '.', 'Об', 'этом', 'стат', '##с', '-', 'секретарь', '-', 'заместитель', 'министра', 'иностранных', 'дел', 'России', 'Григорий', 'Кара', '##син', 'заявил', 'в', 'телефон', '##ном', 'разговоре', 'с', 'заместителем', 'госсекретаря', 'США', 'Дэни', '##эл', '##ом', 'Фрид', '##ом', '.', '"', 'С', 'российской', 'стороны', 'выражена', 'глубокая', 'озабоченность', 'в', 'связи', 'с', 'новым', 'ви', '##тком', 'напряженности', 'вокруг', 'Южной', 'Осетии', ',', 'противо', '##зак', '##онными', 'действиями', 

Метод `compute_metrics` для вычисления метрик на основе предсказаний модели и истинных меток.

In [None]:
import evaluate
import numpy as np

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for l in label if l != -100]
        for label in labels
    ]

    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

### Дообучение модели rubert-tiny2 на train-части корпуса для решения NER-задачи и замеры качества NER-метрик до и после дообучения

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
from transformers import EarlyStoppingCallback
from datasets import Dataset, DatasetDict
import tabulate

# Загрузка модели и токенизатора
model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id=label2id
)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

splitted_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42) if not isinstance(dataset, DatasetDict) else dataset
tokenized_dataset = splitted_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=splitted_dataset["train"].column_names
)




# Конфигурация обучения
training_args = TrainingArguments(
    output_dir="./ner_results",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=10000,
    logging_steps=100,
    weight_decay=0.01,
    metric_for_best_model="f1",
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=2,
    # lr_scheduler_type="cosine",
    # warmup_steps=500
)

# Дообучение
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.add_callback(EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.001,
))

baseline_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ДО дообучения:")
print(tabulate.tabulate(
    baseline_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))


trainer.train()

final_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ПОСЛЕ дообучения:")
print(tabulate.tabulate(
    final_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Метрики ДО дообучения:
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     2.5222 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0010 |
+-----------------------------+------------+
| eval_precision              |     0.0062 |
+-----------------------------+------------+
| eval_recall                 |     0.0475 |
+-----------------------------+------------+
| eval_f1                     |     0.0110 |
+-----------------------------+------------+
| eval_accuracy               |     0.0502 |
+-----------------------------+------------+
| eval_runtime                |     0.7425 |
+-----------------------------+------------+
| eval_samples_per_second     |   269.3460 |
+-----------------------------+------------+
| eval_steps_per_second       |    17.5070 |
+-----------------------------+------------+


Step,Training Loss,Validation Loss,Model Preparation Time,Precision,Recall,F1,Accuracy
100,1.3522,0.923296,0.001,0.072864,0.043027,0.054104,0.730649
200,0.8064,0.650835,0.001,0.215002,0.222552,0.218712,0.799958
300,0.6236,0.521369,0.001,0.26776,0.315035,0.28948,0.844135
400,0.5135,0.443436,0.001,0.329834,0.372898,0.350046,0.864716
500,0.4347,0.393364,0.001,0.384615,0.427794,0.405057,0.880535
600,0.3765,0.361805,0.001,0.412869,0.456973,0.433803,0.889688
700,0.3328,0.341149,0.001,0.427995,0.47181,0.448836,0.893339
800,0.2972,0.327853,0.001,0.443596,0.488131,0.464799,0.89736
900,0.2691,0.320625,0.001,0.451512,0.501978,0.47541,0.899053
1000,0.2465,0.312153,0.001,0.468807,0.512859,0.489844,0.901963


Метрики ПОСЛЕ дообучения:
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     0.3104 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0010 |
+-----------------------------+------------+
| eval_precision              |     0.4989 |
+-----------------------------+------------+
| eval_recall                 |     0.5455 |
+-----------------------------+------------+
| eval_f1                     |     0.5211 |
+-----------------------------+------------+
| eval_accuracy               |     0.9109 |
+-----------------------------+------------+
| eval_runtime                |     0.5073 |
+-----------------------------+------------+
| eval_samples_per_second     |   394.2640 |
+-----------------------------+------------+
| eval_steps_per_second       |    25.6270 |
+-----------------------------+------------+
| epoch                       |    82.0000 |
+----------------------------

Будем ориентироваться на F1-меру, потому что она учитывает как точность, так и полноту.     
F1 до обучения: 0.0110      
F1 после обучения: 0.5211       

### Предварительно дообучение на train-части в MLM режиме, а потом дообучение на NER-задачу

In [None]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
import tabulate


# Разделение данных на train и test
train_docs, test_docs = train_test_split(docs, test_size=0.2, shuffle=True, random_state=42)

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")


# Подготовка данных для MLM
mlm_texts = [doc.text for doc in train_docs]
mlm_dataset = Dataset.from_dict({"text": mlm_texts})

def tokenize_mlm(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_special_tokens_mask=True
    )

tokenized_mlm = mlm_dataset.map(tokenize_mlm, batched=True, remove_columns=["text"])

# Обучение MLM
mlm_model = AutoModelForMaskedLM.from_pretrained("cointegrated/rubert-tiny2")
data_collator_mlm = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

training_args_mlm = TrainingArguments(
    output_dir="./mlm_results",
    overwrite_output_dir=True,
    max_steps=3000,
    per_device_train_batch_size=16,
    save_steps=1000,
    save_total_limit=2,
    logging_dir='./mlm_logs',
    report_to="none"
)

trainer_mlm = Trainer(
    model=mlm_model,
    args=training_args_mlm,
    train_dataset=tokenized_mlm,
    data_collator=data_collator_mlm
)

trainer_mlm.train()
mlm_model.save_pretrained("./mlm_model")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Создаем Dataset
dataset = Dataset.from_list(processed_data)
dataset.features['ner_tags'].feature.names = label_list
splitted_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
tokenized_dataset = splitted_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=splitted_dataset["train"].column_names
)

# Инициализация модели для NER
model = AutoModelForTokenClassification.from_pretrained(
    "./mlm_model",
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id=label2id
)



# Обучение NER
training_args = TrainingArguments(
    output_dir="./ner_results",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps =10000,
    logging_steps=100,
    weight_decay=0.01,
    metric_for_best_model="f1",
    logging_dir="./logs",
    report_to="none",
    save_total_limit=2,
    load_best_model_at_end=True,
    # lr_scheduler_type="cosine",
    # warmup_steps=500
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

trainer.add_callback(EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.001
))

baseline_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ДО дообучения:")
print(tabulate.tabulate(
    baseline_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))

trainer.train()

final_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ПОСЛЕ дообучения:")
print(tabulate.tabulate(
    final_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Step,Training Loss
500,2.8592
1000,2.4485
1500,2.241
2000,2.1049
2500,2.0321
3000,1.9703


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ./mlm_model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Метрики ДО дообучения:
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     2.3312 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0010 |
+-----------------------------+------------+
| eval_precision              |     0.0039 |
+-----------------------------+------------+
| eval_recall                 |     0.0242 |
+-----------------------------+------------+
| eval_f1                     |     0.0067 |
+-----------------------------+------------+
| eval_accuracy               |     0.1649 |
+-----------------------------+------------+
| eval_runtime                |     0.7155 |
+-----------------------------+------------+
| eval_samples_per_second     |   279.5100 |
+-----------------------------+------------+
| eval_steps_per_second       |    18.1680 |
+-----------------------------+------------+


Step,Training Loss,Validation Loss,Model Preparation Time,Precision,Recall,F1,Accuracy
100,1.33,0.87238,0.001,0.085932,0.055885,0.067726,0.74377
200,0.7581,0.594611,0.001,0.242105,0.238872,0.240478,0.818634
300,0.5776,0.482423,0.001,0.305792,0.355094,0.328604,0.859531
400,0.4748,0.412696,0.001,0.351614,0.393175,0.371235,0.875615
500,0.4029,0.368551,0.001,0.401336,0.445598,0.422311,0.887837
600,0.3478,0.34232,0.001,0.43719,0.480218,0.457695,0.895773
700,0.3088,0.326711,0.001,0.450831,0.496538,0.472582,0.898312
800,0.2748,0.313859,0.001,0.465635,0.505935,0.484949,0.90191
900,0.2489,0.308543,0.001,0.460854,0.512364,0.485246,0.90392
1000,0.2267,0.300886,0.001,0.48,0.522255,0.500237,0.905931


Метрики ПОСЛЕ дообучения:
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     0.3437 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0010 |
+-----------------------------+------------+
| eval_precision              |     0.5219 |
+-----------------------------+------------+
| eval_recall                 |     0.5767 |
+-----------------------------+------------+
| eval_f1                     |     0.5479 |
+-----------------------------+------------+
| eval_accuracy               |     0.9137 |
+-----------------------------+------------+
| eval_runtime                |     0.5073 |
+-----------------------------+------------+
| eval_samples_per_second     |   394.2370 |
+-----------------------------+------------+
| eval_steps_per_second       |    25.6250 |
+-----------------------------+------------+
| epoch                       |    82.0000 |
+----------------------------

Также будем ориентироваться на F1-меру.     
F1 до обучения: 0.0067       
F1 после обучения: 0.5211->0.5479 

### Генерация синтетической разметки новостного `lenta-ru-news.csv.gz` корпуса `xlm-roberta-large-finetuned-conll03-english` NER моделью и использование ее для дообучения rubert-tiny2 вместе с основным набором данных.

Формирование выборки из 15000 семплов.

In [2]:
from corus import load_lenta
import pandas as pd

path = 'lenta-ru-news.csv.gz'
records = load_lenta(path)
# next(records)

data = []
for record in records:
    if record.topic is None:
        continue
    data.append({
        'title': record.title,
        'text': record.text,
        'topic': record.topic
    })
    if len(data) >= 15_000:
        break
df = pd.DataFrame(data)

Разметка текста умной моделью и сохранение в pandas датафрейм

In [None]:
from transformers import pipeline
from tqdm import tqdm
import torch
import pandas as pd

ner_model = pipeline(
    "ner",
    model="xlm-roberta-large-finetuned-conll03-english",
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

def process_text(text):
    """Обрабатывает текст и извлекает N-сущности с индексами"""
    entities = ner_model(text)
    return [
        {
            "ner_token": ent["word"],
            "label": ent["entity_group"],
            "start": ent["start"],
            "end": ent["end"]
        }
        for ent in entities
    ]

tqdm.pandas()
df["ner_entities"] = df["text"].progress_apply(process_text)

result_df = pd.DataFrame({
    "text": df["text"],
    "ner_entities": df["ner_entities"]
})

result_df.to_csv("ner_dataset.csv", index=False)

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  0%|          | 11/15000 [00:01<27:42,  9.01it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 15000/15000 [26:55<00:0

Формирование размеченного датасета для обучения

In [None]:
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer

# Загрузка токенизатора
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

def result_df_to_dataset(result_df, label2id):
    processed_data = []
    
    for _, row in tqdm(result_df.iterrows(), total=len(result_df)):
        text = row['text']
        entities = row['ner_entities']
        
        # Токенизация текста
        tokenized = tokenizer(
            text,
            return_offsets_mapping=True,
            truncation=False,
            add_special_tokens=False
        )
        
        # Собираем информацию о токенах
        tokens_info = []
        for token, offset in zip(tokenized.tokens(), tokenized.offset_mapping):
            if token in tokenizer.all_special_tokens:
                continue
            tokens_info.append({
                'token': token,
                'start': offset[0],
                'end': offset[1],
                'is_word_start': not token.startswith('##')
            })
        
        # Создаем список меток
        token_labels = ['O'] * len(tokens_info)
        
        # Сортируем сущности по длине (длинные сначала)
        sorted_entities = sorted(
            entities,
            key=lambda x: x['end'] - x['start'],
            reverse=True
        )
        
        # Размечаем сущности
        for ent in sorted_entities:
            ent_start = ent['start']
            ent_end = ent['end']
            ent_type = ent['label']
            
            # Ищем пересекающиеся токены
            for i, token in enumerate(tokens_info):
                if token_labels[i] != 'O':
                    continue
                
                # Проверяем пересечение токена с сущностью
                if (token['start'] >= ent_end) or (token['end'] <= ent_start):
                    continue
                
                # Определяем B/I
                if token['is_word_start']:
                    # Проверяем начало сущности
                    if token['start'] == ent_start:
                        prefix = 'B-'
                    else:
                        prefix = 'I-'
                else:
                    prefix = 'I-'
                
                # Проверяем существование метки
                full_label = f"{prefix}{ent_type}"
                if full_label not in label2id:
                    continue  # Пропускаем неизвестные метки
                
                token_labels[i] = full_label
        
        # Конвертируем метки в ID
        ner_tags = [label2id.get(label, label2id['O']) for label in token_labels]
        
        processed_data.append({
            'tokens': [t['token'] for t in tokens_info],
            'ner_tags': ner_tags
        })
    
    features = dataset.features
    new_dataset = Dataset.from_list(processed_data, features=features)
    
    return new_dataset

label2id = {label: idx for idx, label in enumerate(label_list)}
new_dataset = result_df_to_dataset(result_df, label2id)

print(new_dataset)

100%|██████████| 15000/15000 [00:43<00:00, 341.50it/s]


Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 15000
})


Объединение с оригинальным датасетом

In [None]:

splitted_orig_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42) if not isinstance(dataset, DatasetDict) else dataset
combined_train = concatenate_datasets([
    splitted_orig_dataset['train'], 
    new_dataset 
])

combined_dataset = DatasetDict({
    'train': combined_train,
    'test': splitted_orig_dataset['test']
})

print(combined_dataset)



Дообучение rubert-tiny2 вместе на новом наборе данных вместе с основным

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
from transformers import EarlyStoppingCallback
from datasets import Dataset, DatasetDict
import tabulate

# Загрузка модели и токенизатора
model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id=label2id
)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = combined_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=combined_dataset["train"].column_names
)



training_args = TrainingArguments(
    output_dir="./ner_upd_dataset_results",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=3000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=30000,
    logging_steps=1000,
    weight_decay=0.01,
    metric_for_best_model="f1",
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=2,
    # lr_scheduler_type="cosine",
    # warmup_steps=500
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# trainer.add_callback(EarlyStoppingCallback(
#     early_stopping_patience=10,
#     early_stopping_threshold=0.001,
# ))

baseline_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ДО дообучения:")
print(tabulate.tabulate(
    baseline_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))


trainer.train()

final_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ПОСЛЕ дообучения:")
print(tabulate.tabulate(
    final_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Метрики ДО дообучения:
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     2.4345 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0010 |
+-----------------------------+------------+
| eval_precision              |     0.0049 |
+-----------------------------+------------+
| eval_recall                 |     0.0381 |
+-----------------------------+------------+
| eval_f1                     |     0.0087 |
+-----------------------------+------------+
| eval_accuracy               |     0.0716 |
+-----------------------------+------------+
| eval_runtime                |     0.7654 |
+-----------------------------+------------+
| eval_samples_per_second     |   261.3100 |
+-----------------------------+------------+
| eval_steps_per_second       |    16.9850 |
+-----------------------------+------------+


Step,Training Loss,Validation Loss,Model Preparation Time,Precision,Recall,F1,Accuracy
1000,0.315,0.790659,0.001,0.152301,0.18002,0.165005,0.80583
2000,0.136,0.636909,0.001,0.215755,0.243818,0.22893,0.827469
3000,0.107,0.538011,0.001,0.239067,0.283877,0.259552,0.841437
4000,0.0896,0.451956,0.001,0.283076,0.325915,0.302989,0.861912
5000,0.078,0.401337,0.001,0.325571,0.366469,0.344812,0.875721
6000,0.0684,0.385799,0.001,0.356067,0.397626,0.375701,0.88281
7000,0.0625,0.352127,0.001,0.389088,0.430267,0.408643,0.890323
8000,0.0561,0.357688,0.001,0.387986,0.424827,0.405571,0.890323
9000,0.0522,0.338247,0.001,0.408476,0.448071,0.427358,0.895085
10000,0.0477,0.334893,0.001,0.427419,0.47181,0.448519,0.897572


Метрики ПОСЛЕ дообучения:
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     0.3383 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0010 |
+-----------------------------+------------+
| eval_precision              |     0.4762 |
+-----------------------------+------------+
| eval_recall                 |     0.5054 |
+-----------------------------+------------+
| eval_f1                     |     0.4904 |
+-----------------------------+------------+
| eval_accuracy               |     0.9080 |
+-----------------------------+------------+
| eval_runtime                |     0.5023 |
+-----------------------------+------------+
| eval_samples_per_second     |   398.1370 |
+-----------------------------+------------+
| eval_steps_per_second       |    25.8790 |
+-----------------------------+------------+
| epoch                       |    30.3644 |
+----------------------------

Также будем ориентироваться на F1-меру.     
F1 до обучения: 0.0087     
F1 после обучения: 0.5211->0.5479->0.4904 