In [None]:
!pip install seqeval

In [None]:
!pip install evaluate

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import numpy as np
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from datasets import load_dataset, DatasetDict, Dataset, load_from_disk
import evaluate
import os

# Предобработка

In [2]:
def split_text_by_entity(text, entity):
    if entity not in text:
        return [text], [0]

    parts = text.split(entity)
    tokens = []
    tags = []

    for i, part in enumerate(parts):
        if part:
            tokens.append(part.strip())
            tags.append(0)
        if i < len(parts) - 1:
            tokens.append(entity.strip())
            tags.append(1)

    return tokens, tags

file_path = "/content/выгрузка_артикулов.xlsx"
df = pd.read_excel(file_path)


dataset = []
for _, row in df.iterrows():
    tokens, tags = split_text_by_entity(str(row['text']), str(row['entity']))
    dataset.append({
        "tokens": tokens,
        "tags": tags
    })

for item in dataset[:3]:
    print(item)


{'tokens': ['Тест-система для идентификации линий ГМО "Соя A2704-12 идентификация" (50 тестов) Артикул:Sintol-', 'GM-202-50'], 'tags': [0, 1]}
{'tokens': ['Адаптер/муфта', 'F603462/0'], 'tags': [0, 1]}
{'tokens': ['БОЛТ', 'KG00468961'], 'tags': [0, 1]}


In [9]:
dataset = Dataset.from_list(dataset)
dataset = dataset.train_test_split(test_size=0.06)

In [None]:
dataset

# Токенизация

In [3]:
label_names = ['O', 'B-ENT', 'I-ENT']

In [4]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)

        elif word_id is None:
            new_labels.append(-100)

        else:

            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [5]:
model_checkpoint = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
inputs = tokenizer(dataset[0]['tokens'], is_split_into_words=True)
inputs.tokens()

In [None]:
labels = dataset[0]['tags']
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [11]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/9625 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/615 [00:00<?, ? examples/s]

In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

# Метрики

In [15]:
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [16]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Обучение

In [17]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [18]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model.config.num_labels

3

In [20]:
from transformers import TrainingArguments
from transformers import Trainer

In [None]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['WANDB_DISABLED'] = 'true'

In [21]:
training_args = TrainingArguments(
    output_dir="ner_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="logs",
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33malmaz-aziat[0m ([33malmaz-aziat-azb[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0467,0.028137,0.975535,0.981538,0.978528,0.994056
2,0.0279,0.021514,0.975422,0.976923,0.976172,0.995283
3,0.0101,0.021824,0.981538,0.981538,0.981538,0.995849


TrainOutput(global_step=3612, training_loss=0.03262746858834428, metrics={'train_runtime': 531.2234, 'train_samples_per_second': 54.356, 'train_steps_per_second': 6.799, 'total_flos': 512163625338468.0, 'train_loss': 0.03262746858834428, 'epoch': 3.0})

In [34]:
model.save_pretrained("ner_model")
tokenizer.save_pretrained("ner_model")

Модель сохранена в 'ner_model'


# Инференс

In [106]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from collections import defaultdict
from typing import Union, List

labels = ["O", "B-ENT", "I-ENT"]
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_path = "ner_model"
NER_tokenizer = AutoTokenizer.from_pretrained("ner_model")
NER_model = AutoModelForTokenClassification.from_pretrained(f"ner_model", num_labels=len(labels)).to(device)

def predict_NER(texts: Union[str, List[str]])-> defaultdict(list):

    if isinstance(texts, str):
        texts = [texts]

    encoded_input = NER_tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    output = NER_model(**encoded_input)

    predictions = torch.argmax(output.logits, dim=-1)

    predicted_labels = [
        [id2label[label_id.item()] for label_id in sentence_predictions]
        for sentence_predictions in predictions
    ]
    list_of_entities = []
    tokens = [NER_tokenizer.convert_ids_to_tokens(ids) for ids in encoded_input['input_ids']]
    for i, sentence in enumerate(texts):

        current_entity = []
        current_label = None
        for token, label in zip(tokens[i], predicted_labels[i]):

            if token in NER_tokenizer.all_special_tokens:
                continue
            if label == 'O':
                if current_entity:
                    entity_output = ''.join(current_entity)
                    list_of_entities.append({current_label: entity_output})
                    current_entity = []
                    current_label = None
                continue

            if token.startswith("##"):
                if current_label is not None and current_label == label:
                    current_entity.append(token[2:])
                else:
                    if current_entity:
                        entity_output = ''.join(current_entity)
                        list_of_entities.append({current_label: entity_output})
                    current_entity = [token[2:]]
                    current_label = label
            else:
                if current_entity and current_label == label:
                    if not current_entity[-1].endswith(('-', '/', '_')) and not token.startswith(('-', '/', '_')):
                        current_entity.append(f' {token}')
                    else:
                        current_entity.append(token)
                else:
                    if current_entity:
                        entity_output = ''.join(current_entity)
                        list_of_entities.append({current_label: entity_output})
                    current_entity = [token]
                    current_label = label

        if current_entity:
            entity_output = ''.join(current_entity)
            list_of_entities.append({current_label: entity_output})


    ents = []
    j=0
    for i in range(len(list_of_entities)):
      entity = ''
      if list_of_entities[i].get('B-ENT'):
        entity = list_of_entities[i].get('B-ENT')
        j=i
        if i+2 != len(list_of_entities):
            while list_of_entities[j+1].get('I-ENT'):

              entity+=list_of_entities[j+1]['I-ENT']
              j+=1
        else:
            entity+=list_of_entities[j+1]['I-ENT']
        ents.append(entity)

    return list_of_entities, ents

In [None]:
examples = [
    'Тест-система для идентификации линий ГМО "Соя A2704-12 идентификация" (50 тестов) Артикул:Sintol-GM-202-50',
    'Кольцо № 76086022',
    'ТРУБА 12X2X1120 ИЗОГ. 475 9853',
    'Нипель МОМ 209-38-11270_Артикул: 209-38-11270'
]

predictions, ents = predict_NER(examples)