In [1]:
import tabulate

In [2]:
from corus import load_ne5
import corus.sources.ne5 as ne5

def load_text_utf8(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

ne5.load_text = load_text_utf8

obj = load_ne5("Collection5")

for i, o in enumerate(obj):
    if i == 0:
        print(o)
        break


Ne5Markup(id='001', text='Россия рассчитывает на конструктивное воздействие США на Грузию\n\n04/08/2008 12:08\n\nМОСКВА, 4 авг - РИА Новости. Россия рассчитывает, что США воздействуют на Тбилиси в связи с обострением ситуации в зоне грузино-осетинского конфликта. Об этом статс-секретарь - заместитель министра иностранных дел России Григорий Карасин заявил в телефонном разговоре с заместителем госсекретаря США Дэниэлом Фридом.\n\n"С российской стороны выражена глубокая озабоченность в связи с новым витком напряженности вокруг Южной Осетии, противозаконными действиями грузинской стороны по наращиванию своих вооруженных сил в регионе, бесконтрольным строительством фортификационных сооружений", - говорится в сообщении.\n\n"Россия уже призвала Тбилиси к ответственной линии и рассчитывает также на конструктивное воздействие со стороны Вашингтона", - сообщил МИД России. ', spans=[Ne5Span(index='T1', type='GEOPOLIT', start=0, stop=6, text='Россия'), Ne5Span(index='T2', type='GEOPOLIT', start=5

In [3]:
from corus import load_ne5
import corus.sources.ne5 as ne5
import re
from datasets import Dataset
from transformers import AutoTokenizer

def load_text_utf8(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

ne5.load_text = load_text_utf8
obj = load_ne5("Collection5")
docs = list(obj)

# Собираем все типы сущностей
types = set()
for doc in docs:
    for span in doc.spans:
        types.add(span.type)

label_list = ['O']
for entity_type in sorted(types):
    label_list.extend([f'B-{entity_type}', f'I-{entity_type}'])
label2id = {label: idx for idx, label in enumerate(label_list)}

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

def process_doc(doc):
    text = doc.text
    spans = sorted(doc.spans, key=lambda x: x.stop - x.start, reverse=True)  # Приоритет длинным
    
    # Токенизация
    tokenized = tokenizer(
        text, 
        return_offsets_mapping=True,
        truncation=False,
        add_special_tokens=False
    )
    
    tokens = []
    for token, offset in zip(tokenized.tokens(), tokenized.offset_mapping):
        if token in tokenizer.all_special_tokens:
            continue
        tokens.append({
            'token': token,
            'start': offset[0],
            'end': offset[1],
            'is_word_start': not token.startswith('##')
        })
    
    # Инициализация меток
    token_labels = ['O'] * len(tokens)
    
    # Обработка спанов в порядке приоритета
    for span in spans:
        span_type = span.type
        span_start = span.start
        span_end = span.stop
        
        for i, token in enumerate(tokens):
            if token_labels[i] != 'O':
                continue  # Уже занято более приоритетной сущностью
                
            if token['start'] >= span_end or token['end'] <= span_start:
                continue
                
            # Определение B/I
            prev_token = tokens[i-1] if i > 0 else None
            if prev_token and (prev_token['end'] > span_start):
                prefix = 'I-'
            else:
                prefix = 'B-'
                
            if token['is_word_start']:
                token_labels[i] = f'{prefix}{span_type}'
            else:
                token_labels[i] = f'I-{span_type}'
    
    return {
        'tokens': [t['token'] for t in tokens],
        'ner_tags': [label2id[label] for label in token_labels]
    }

# Обрабатываем все документы
processed_data = [process_doc(doc) for doc in docs]

# Создаем Dataset
dataset = Dataset.from_list(processed_data)
dataset.features['ner_tags'].feature.names = label_list

print(dataset)
print(f"{label_list=}")

Token indices sequence length is longer than the specified maximum sequence length for this model (2392 > 2048). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1000
})
label_list=['O', 'B-GEOPOLIT', 'I-GEOPOLIT', 'B-LOC', 'I-LOC', 'B-MEDIA', 'I-MEDIA', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']


In [4]:
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
from transformers import EarlyStoppingCallback
from datasets import Dataset, DatasetDict
import evaluate
import numpy as np
import torch

seqeval = evaluate.load("seqeval")

# Загрузка модели и токенизатора
model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id=label2id
)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

splitted_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42) if not isinstance(dataset, DatasetDict) else dataset
tokenized_dataset = splitted_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=splitted_dataset["train"].column_names
)



def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for l in label if l != -100]
        for label in labels
    ]

    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }



# Конфигурация обучения
training_args = TrainingArguments(
    output_dir="./ner_results",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=10000,
    logging_steps=100,
    weight_decay=0.01,
    metric_for_best_model="f1",
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True
)

# Дообучение
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.add_callback(EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.001,
))

baseline_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ДО дообучения:")
print(tabulate.tabulate(
    baseline_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))


trainer.train()

final_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ПОСЛЕ дообучения:")
print(tabulate.tabulate(
    final_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))





Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Метрики ДО дообучения:
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     2.4641 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0027 |
+-----------------------------+------------+
| eval_precision              |     0.0028 |
+-----------------------------+------------+
| eval_recall                 |     0.0193 |
+-----------------------------+------------+
| eval_f1                     |     0.0049 |
+-----------------------------+------------+
| eval_accuracy               |     0.0680 |
+-----------------------------+------------+
| eval_runtime                |     0.7206 |
+-----------------------------+------------+
| eval_samples_per_second     |   277.5540 |
+-----------------------------+------------+
| eval_steps_per_second       |    18.0410 |
+-----------------------------+------------+


Step,Training Loss,Validation Loss,Model Preparation Time,Precision,Recall,F1,Accuracy
100,1.3183,0.889212,0.0027,0.110502,0.069733,0.085506,0.738903
200,0.7915,0.636209,0.0027,0.213055,0.20178,0.207264,0.799799
300,0.6131,0.512768,0.0027,0.277874,0.318002,0.296587,0.845246
400,0.505,0.436167,0.0027,0.339482,0.382295,0.359619,0.867891
500,0.4288,0.388828,0.0027,0.383893,0.424332,0.403101,0.880271
600,0.3715,0.356753,0.0027,0.422113,0.46637,0.443139,0.889953
700,0.3296,0.337679,0.0027,0.428252,0.473788,0.449871,0.893498
800,0.2948,0.324441,0.0027,0.450749,0.491098,0.470059,0.898154
900,0.2666,0.315963,0.0027,0.452158,0.502473,0.47599,0.900693
1000,0.2441,0.308002,0.0027,0.458707,0.50544,0.480941,0.902174


Метрики ПОСЛЕ дообучения:
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     0.3419 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0027 |
+-----------------------------+------------+
| eval_precision              |     0.5132 |
+-----------------------------+------------+
| eval_recall                 |     0.5663 |
+-----------------------------+------------+
| eval_f1                     |     0.5384 |
+-----------------------------+------------+
| eval_accuracy               |     0.9129 |
+-----------------------------+------------+
| eval_runtime                |     0.5007 |
+-----------------------------+------------+
| eval_samples_per_second     |   399.4760 |
+-----------------------------+------------+
| eval_steps_per_second       |    25.9660 |
+-----------------------------+------------+
| epoch                       |   102.0000 |
+----------------------------

In [None]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split



# Разделение данных на train и test
train_docs, test_docs = train_test_split(docs, test_size=0.2, shuffle=True, random_state=42)

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")


# Подготовка данных для MLM
mlm_texts = [doc.text for doc in train_docs]
mlm_dataset = Dataset.from_dict({"text": mlm_texts})

def tokenize_mlm(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_special_tokens_mask=True
    )

tokenized_mlm = mlm_dataset.map(tokenize_mlm, batched=True, remove_columns=["text"])

# Обучение MLM
mlm_model = AutoModelForMaskedLM.from_pretrained("cointegrated/rubert-tiny2")
data_collator_mlm = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

training_args_mlm = TrainingArguments(
    output_dir="./mlm_results",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=500,
    save_total_limit=2,
    logging_dir='./mlm_logs',
    report_to="none"
)

trainer_mlm = Trainer(
    model=mlm_model,
    args=training_args_mlm,
    train_dataset=tokenized_mlm,
    data_collator=data_collator_mlm
)

trainer_mlm.train()
mlm_model.save_pretrained("./mlm_model")



# Создаем Dataset
dataset = Dataset.from_list(processed_data)
dataset.features['ner_tags'].feature.names = label_list
splitted_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
tokenized_dataset = splitted_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=splitted_dataset["train"].column_names
)

# Инициализация модели для NER
model = AutoModelForTokenClassification.from_pretrained(
    "./mlm_model",
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id=label2id
)



# Обучение NER
training_args = TrainingArguments(
    output_dir="./ner_results",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps =10000,
    logging_steps=100,
    weight_decay=0.01,
    metric_for_best_model="f1",
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

trainer.add_callback(EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.001
))

baseline_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ДО дообучения:")
print(tabulate.tabulate(
    baseline_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))

trainer.train()

final_metrics = trainer.evaluate(tokenized_dataset["test"])
print("Метрики ПОСЛЕ дообучения:")
print(tabulate.tabulate(
    final_metrics.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))

Map:   0%|          | 0/800 [00:00<?, ? examples/s]