In [1]:
import spacy
import random
import json
import re
from tqdm.notebook import tqdm
from collections import defaultdict
from spacy.training import Example
from spacy.scorer import Scorer
from spacy_transformers import TransformerModel
from typing import List, Tuple, Dict, Any, Optional, Union
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

In [21]:
class DataProcessor:
    def load_data(self, 
                file_path: str,
                text_key: str = "text",
                possible_entity_keys: List[str] = ["label", "entities", "annotations"],
                comment_key: str = "Comments") -> List[Dict]:
        """
        –£–Ω–∏–≤–µ—Ä—Å–∞–ª—å–Ω–∞—è –∑–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏–∑ JSONL —Å –ø–æ–¥–¥–µ—Ä–∂–∫–æ–π —Ä–∞–∑–Ω—ã—Ö —Ñ–æ—Ä–º–∞—Ç–æ–≤ Doccano
        
        Args:
            file_path: –ü—É—Ç—å –∫ —Ñ–∞–π–ª—É
            text_key: –ö–ª—é—á –¥–ª—è —Ç–µ–∫—Å—Ç–∞
            possible_entity_keys: –í–æ–∑–º–æ–∂–Ω—ã–µ –∫–ª—é—á–∏ –¥–ª—è —Ä–∞–∑–º–µ—Ç–∫–∏ (–ø—Ä–æ–±—É—é—Ç—Å—è –ø–æ –ø–æ—Ä—è–¥–∫—É)
            comment_key: –ö–ª—é—á –¥–ª—è –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–µ–≤
            
        Returns:
            –°–ø–∏—Å–æ–∫ —Å–ª–æ–≤–∞—Ä–µ–π —Å –¥–∞–Ω–Ω—ã–º–∏ –≤ —É–Ω–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω–æ–º —Ñ–æ—Ä–º–∞—Ç–µ
        """
        data = []
        line_num = 0
        
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                for line in tqdm(f, desc="üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö"):
                    line_num += 1
                    line = line.strip()
                    if not line:
                        continue
                    
                    try:
                        item = json.loads(line)
                        
                        # –ü—Ä–æ–≤–µ—Ä–∫–∞ –æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ–≥–æ –ø–æ–ª—è —Å —Ç–µ–∫—Å—Ç–æ–º
                        if text_key not in item:
                            print(f"‚ö†Ô∏è –°—Ç—Ä–æ–∫–∞ {line_num}: –Ω–µ—Ç –∫–ª—é—á–∞ '{text_key}'")
                            continue
                        
                        # –ü–æ–∏—Å–∫ –∫–ª—é—á–∞ —Å —Ä–∞–∑–º–µ—Ç–∫–æ–π
                        entities = []
                        for key in possible_entity_keys:
                            if key in item:
                                entities = item[key]
                                break
                        
                        # –ö–æ–Ω–≤–µ—Ä—Ç–∞—Ü–∏—è —Ä–∞–∑–Ω—ã—Ö —Ñ–æ—Ä–º–∞—Ç–æ–≤ –≤ –µ–¥–∏–Ω—ã–π
                        unified_entities = self._convert_entities(entities, item[text_key])
                        
                        data.append({
                            "text": item[text_key],
                            "entities": unified_entities,
                            "comments": item.get(comment_key, [])
                        })
                        
                    except json.JSONDecodeError:
                        print(f"‚ùå –°—Ç—Ä–æ–∫–∞ {line_num}: –æ—à–∏–±–∫–∞ JSON (–ø—Ä–æ–ø—É—â–µ–Ω–∞)")
                    except Exception as e:
                        print(f"‚ö†Ô∏è –°—Ç—Ä–æ–∫–∞ {line_num}: {str(e)}")
        
        except FileNotFoundError:
            raise FileNotFoundError(f"–§–∞–π–ª –Ω–µ –Ω–∞–π–¥–µ–Ω: {file_path}")
        
        print(f"‚úÖ –£—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω–æ {len(data)} –∑–∞–ø–∏—Å–µ–π")
        return data
    
    def _convert_entities(self, 
                        entities: Union[List, Dict], 
                        text: str) -> List[Dict]:
        """
        –ö–æ–Ω–≤–µ—Ä—Ç–∏—Ä—É–µ—Ç —Ä–∞–∑–Ω—ã–µ —Ñ–æ—Ä–º–∞—Ç—ã —Ä–∞–∑–º–µ—Ç–∫–∏ –≤ –µ–¥–∏–Ω—ã–π
        
        –ü–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ—Ç:
        - Doccano v1: [[start, end, label], ...]
        - Doccano v2: [{"start": X, "end": Y, "label": Z}, ...]
        - –°—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–π: [{"start_offset": X, "end_offset": Y, "tag": Z}, ...]
        """
        converted = []
        
        if not entities:
            return converted
        
        first_item = entities[0]
        
        # –§–æ—Ä–º–∞—Ç Doccano v1: [start, end, label]
        if isinstance(first_item, list) and len(first_item) == 3:
            for start, end, label in entities:
                converted.append({
                    "start": int(start),
                    "end": int(end),
                    "label": str(label)
                })
        
        # –§–æ—Ä–º–∞—Ç Doccano v2/—Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–π: {"start": X, "end": Y, ...}
        elif isinstance(first_item, dict):
            for ent in entities:
                # –û–ø—Ä–µ–¥–µ–ª—è–µ–º –∫–ª—é—á–∏
                start_key = "start" if "start" in ent else "start_offset"
                end_key = "end" if "end" in ent else "end_offset"
                label_key = "label" if "label" in ent else "tag"
                
                converted.append({
                    "start": int(ent[start_key]),
                    "end": int(ent[end_key]),
                    "label": str(ent[label_key])
                })
        
        return converted
    
    def prepare_data(self, 
                    data: List[Dict],
                    validate: bool = True) -> List[Tuple[str, Dict]]:
        """
        –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö –≤ —Ñ–æ—Ä–º–∞—Ç–µ –¥–ª—è –æ–±—É—á–µ–Ω–∏—è spaCy
        
        Args:
            data: –î–∞–Ω–Ω—ã–µ –≤ —É–Ω–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω–æ–º —Ñ–æ—Ä–º–∞—Ç–µ
            validate: –ü—Ä–æ–≤–µ—Ä—è—Ç—å –ª–∏ –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ—Å—Ç—å —Ä–∞–∑–º–µ—Ç–∫–∏
            
        Returns:
            –î–∞–Ω–Ω—ã–µ –≤ —Ñ–æ—Ä–º–∞—Ç–µ (text, {"entities": [(start, end, label), ...]})
        """
        formatted_data = []
        error_count = 0
        
        for item in tqdm(data, desc="üîß –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö"):
            try:
                text = item["text"]
                entities = []
                
                for ent in item["entities"]:
                    start = ent["start"]
                    end = ent["end"]
                    label = ent["label"]
                    
                    if validate:
                        # –ü—Ä–æ–≤–µ—Ä–∫–∞ –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ—Å—Ç–∏ –∞–Ω–Ω–æ—Ç–∞—Ü–∏–π
                        if not (0 <= start <= end <= len(text)):
                            raise ValueError(
                                f"–ù–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–µ –ø–æ–∑–∏—Ü–∏–∏: {start}-{end} "
                                f"–¥–ª—è —Ç–µ–∫—Å—Ç–∞ –¥–ª–∏–Ω—ã {len(text)}. –¢–µ–∫—Å—Ç: '{text[start:end]}'"
                            )
                        
                        # –ü—Ä–æ–≤–µ—Ä–∫–∞, —á—Ç–æ –º–µ—Ç–∫–∞ –Ω–µ –ø—É—Å—Ç–∞—è
                        if not label.strip():
                            raise ValueError(f"–ü—É—Å—Ç–∞—è –º–µ—Ç–∫–∞ –¥–ª—è –ø–æ–∑–∏—Ü–∏–π {start}-{end}")
                    
                    entities.append((start, end, label))
                
                formatted_data.append((text, {"entities": entities}))
            except Exception as e:
                error_count += 1
                print(f"‚ö†Ô∏è –û—à–∏–±–∫–∞ –≤ —ç–ª–µ–º–µ–Ω—Ç–µ: {str(e)}")
                continue
        
        if error_count > 0:
            print(f"üî¥ –í—Å–µ–≥–æ –æ—à–∏–±–æ–∫: {error_count} (–∏–∑ {len(data)})")
        
        return formatted_data

    def save_to_jsonl(self, 
                     data: List[Dict], 
                     output_path: str,
                     format: str = "doccano") -> None:
        """
        –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö –≤ JSONL —Ñ–æ—Ä–º–∞—Ç–µ
        
        Args:
            data: –î–∞–Ω–Ω—ã–µ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è
            output_path: –ü—É—Ç—å –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è
            format: –§–æ—Ä–º–∞—Ç ("doccano" –∏–ª–∏ "standard")
        """
        with open(output_path, "w", encoding="utf-8") as f:
            for item in tqdm(data, desc="üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö"):
                if format == "doccano":
                    # –§–æ—Ä–º–∞—Ç Doccano: {"text": "...", "label": [[start, end, tag], ...]}
                    labels = [
                        [ent["start"], ent["end"], ent["label"]]
                        for ent in item["entities"]
                    ]
                    json.dump({
                        "text": item["text"],
                        "label": labels,
                        "Comments": item.get("comments", [])
                    }, f, ensure_ascii=False)
                else:
                    # –°—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–π —Ñ–æ—Ä–º–∞—Ç
                    json.dump(item, f, ensure_ascii=False)
                f.write("\n")

In [19]:
def clean_text(text):
    """–û—á–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞ –æ—Ç –ª–∏—à–Ω–∏—Ö –ø–µ—Ä–µ–Ω–æ—Å–æ–≤ –∏ –ø—Ä–æ–±–µ–ª–æ–≤"""
    text = re.sub(r'\n+', ' ', text)  # –ó–∞–º–µ–Ω—è–µ–º –ø–µ—Ä–µ–Ω–æ—Å—ã –Ω–∞ –ø—Ä–æ–±–µ–ª—ã
    text = re.sub(r'\s+', ' ', text)  # –£–¥–∞–ª—è–µ–º –º–Ω–æ–∂–µ—Å—Ç–≤–µ–Ω–Ω—ã–µ –ø—Ä–æ–±–µ–ª—ã
    return text.strip()

def adjust_annotations(text, annotations):
    """–ö–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∫–∞ —Ä–∞–∑–º–µ—Ç–∫–∏ –ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏ —Ç–µ–∫—Å—Ç–∞"""
    cleaned_text = clean_text(text)
    new_entities = []
    
    # –°–æ–∑–¥–∞–µ–º —Å–ø–∏—Å–æ–∫ –∫–æ—Ä—Ç–µ–∂–µ–π (start, end, label, entity_text)
    entities_info = [(start, end, label, text[start:end]) 
                    for start, end, label in annotations['entities']]
    
    # –°–æ—Ä—Ç–∏—Ä—É–µ–º —Å—É—â–Ω–æ—Å—Ç–∏ –ø–æ –Ω–∞—á–∞–ª—å–Ω–æ–π –ø–æ–∑–∏—Ü–∏–∏
    entities_info.sort(key=lambda x: x[0])
    
    # –î–ª—è –∫–∞–∂–¥–æ–π —Å—É—â–Ω–æ—Å—Ç–∏ –Ω–∞—Ö–æ–¥–∏–º –µ–µ –ø–æ–∑–∏—Ü–∏—é –≤ –æ—á–∏—â–µ–Ω–Ω–æ–º —Ç–µ–∫—Å—Ç–µ
    for start, end, label, entity_text in entities_info:
        # –ù–∞—Ö–æ–¥–∏–º –≤—Å–µ –≤—Ö–æ–∂–¥–µ–Ω–∏—è —ç—Ç–æ–π –ø–æ–¥—Å—Ç—Ä–æ–∫–∏ –≤ –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω–æ–º —Ç–µ–∫—Å—Ç–µ
        occurrences = [m.start() for m in re.finditer(re.escape(entity_text), text)]
        
        # –î–ª—è –∫–∞–∂–¥–æ–≥–æ –≤—Ö–æ–∂–¥–µ–Ω–∏—è –ø—Ä–æ–≤–µ—Ä—è–µ–º, —Å–æ–≤–ø–∞–¥–∞–µ—Ç –ª–∏ –æ–Ω–æ —Å –Ω–∞—à–µ–π –∞–Ω–Ω–æ—Ç–∞—Ü–∏–µ–π
        for occ_start in occurrences:
            if occ_start == start:  # –≠—Ç–æ –Ω–∞—à–∞ –∞–Ω–Ω–æ—Ç–∞—Ü–∏—è
                # –ù–∞—Ö–æ–¥–∏–º –ø–æ–∑–∏—Ü–∏—é –≤ –æ—á–∏—â–µ–Ω–Ω–æ–º —Ç–µ–∫—Å—Ç–µ
                new_start = cleaned_text.find(entity_text)
                if new_start != -1:
                    new_end = new_start + len(entity_text)
                    new_entities.append((new_start, new_end, label))
                    # –ó–∞–º–µ–Ω—è–µ–º –Ω–∞–π–¥–µ–Ω–Ω—ã–π —Ñ—Ä–∞–≥–º–µ–Ω—Ç –Ω–∞ –ø—Ä–æ–±–µ–ª—ã, —á—Ç–æ–±—ã –Ω–µ –Ω–∞—Ö–æ–¥–∏—Ç—å –µ–≥–æ —Å–Ω–æ–≤–∞
                    cleaned_text = cleaned_text[:new_start] + ' ' * len(entity_text) + cleaned_text[new_end:]
                break
    
    return {'entities': new_entities}

class NERTrainer:
    def __init__(self, 
                 model_name: str = "ru_core_news_sm", 
                 use_gpu: bool = True,
                 blank_language: str = "ru",
                 disable_pipes: Optional[List[str]] = None,
                 transformer_name: Optional[str] = None):
        """
        Args:
            transformer_name: –ò–º—è —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä–Ω–æ–π –º–æ–¥–µ–ª–∏ (–Ω–∞–ø—Ä–∏–º–µ—Ä "DeepPavlov/rubert-base-cased")
        """
        self.use_gpu = use_gpu
        self._setup_device()
        
        if transformer_name:
            self.nlp = self._create_transformer_model(blank_language, transformer_name)
        else:
            self.nlp = self._load_model(model_name, blank_language, disable_pipes)
            
        self.ner = self._setup_ner_pipe(transformer_name)

    def _create_transformer_model(self, lang: str, transformer_name: str):
        """–°–æ–∑–¥–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ —Å —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä–æ–º"""
        nlp = spacy.blank(lang)
        
        # –î–æ–±–∞–≤–ª—è–µ–º –∫–æ–º–ø–æ–Ω–µ–Ω—Ç —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä–∞
        config = {
            "model": {
                "@architectures": "spacy-transformers.TransformerModel.v3",
                "name": transformer_name,
                "tokenizer_config": {"use_fast": True},
                "transformer_config": {"output_hidden_states": True}
            }
        }
        nlp.add_pipe("transformer", config=config)
        
        print(f"‚úÖ –°–æ–∑–¥–∞–Ω–∞ —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä–Ω–∞—è –º–æ–¥–µ–ª—å —Å {transformer_name}")
        return nlp
        
    def _setup_device(self) -> None:
        """–ù–∞—Å—Ç—Ä–æ–π–∫–∞ —É—Å—Ç—Ä–æ–π—Å—Ç–≤–∞ (CPU/GPU)"""
        if self.use_gpu and spacy.prefer_gpu():
            spacy.require_gpu()
            print("‚úÖ –ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è GPU")
        else:
            print("‚ö†Ô∏è GPU –Ω–µ –¥–æ—Å—Ç—É–ø–µ–Ω, –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è CPU")
    
    def _load_model(self, model_name: str, blank_language: str, disable_pipes: List[str]):
        """–ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ —Å –æ–±—Ä–∞–±–æ—Ç–∫–æ–π –æ—à–∏–±–æ–∫"""
        try:
            if model_name.lower() == "blank":
                return spacy.blank(blank_language)
            elif Path(model_name).exists():
                return spacy.load(model_name, disable=disable_pipes or [])
            else:
                return spacy.load(model_name, disable=disable_pipes or [])
        except Exception as e:
            raise ValueError(f"–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –º–æ–¥–µ–ª—å '{model_name}': {str(e)}")
    
    def _setup_ner_pipe(self, transformer_name):
        """–ù–∞—Å—Ç—Ä–æ–π–∫–∞ NER –∫–æ–º–ø–æ–Ω–µ–Ω—Ç–∞ —Å —É—á–µ—Ç–æ–º —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä–∞"""
        if "ner" in self.nlp.pipe_names:
            return self.nlp.get_pipe("ner")
    
        # –î–ª—è —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä–Ω—ã—Ö –º–æ–¥–µ–ª–µ–π
        if "transformer" in self.nlp.pipe_names:
            return self.nlp.add_pipe(
                "ner",
                after="transformer",
                config={
                    "model": {
                        "@architectures": "spacy.TransitionBasedParser.v2",
                        "hidden_width": 128,
                        "maxout_pieces": 2,
                        "use_upper": True
                    }
                }
            )
    
        # –î–ª—è –æ–±—ã—á–Ω—ã—Ö –º–æ–¥–µ–ª–µ–π
        return self.nlp.add_pipe("ner")
    
    def prepare_data(self, 
                    data: List[Dict],
                    text_key: str = "text",
                    entities_key: str = "label",
                    start_offset_key: str = "start_offset",
                    end_offset_key: str = "end_offset",
                    label_key: str = "label") -> List[Tuple[str, Dict]]:
        """
        –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö –≤ —Ñ–æ—Ä–º–∞—Ç–µ spaCy —Å –≤–∞–ª–∏–¥–∞—Ü–∏–µ–π
        
        Args:
            data: –°—ã—Ä—ã–µ –¥–∞–Ω–Ω—ã–µ
            ...keys: –ö–ª—é—á–∏ –¥–ª—è –¥–æ—Å—Ç—É–ø–∞ –∫ –¥–∞–Ω–Ω—ã–º
            
        Returns:
            –î–∞–Ω–Ω—ã–µ –≤ —Ñ–æ—Ä–º–∞—Ç–µ (text, {"entities": [...]})
        """
        formatted_data = []
        error_count = 0
        
        for item in tqdm(data, desc="üîß –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö"):
            try:
                text = item[text_key]
                entities = []
                
                for ent in item.get(entities_key, []):
                    # –ü—Ä–æ–≤–µ—Ä–∫–∞ –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ—Å—Ç–∏ –∞–Ω–Ω–æ—Ç–∞—Ü–∏–π
                    if not all(k in ent for k in [start_offset_key, end_offset_key, label_key]):
                        raise ValueError(f"–ù–µ–ø–æ–ª–Ω–∞—è –∞–Ω–Ω–æ—Ç–∞—Ü–∏—è: {ent}")
                    
                    start = ent[start_offset_key]
                    end = ent[end_offset_key]
                    
                    # –í–∞–ª–∏–¥–∞—Ü–∏—è –ø–æ–∑–∏—Ü–∏–π
                    if not (0 <= start <= end <= len(text)):
                        raise ValueError(f"–ù–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–µ –ø–æ–∑–∏—Ü–∏–∏: {start}-{end} –¥–ª—è —Ç–µ–∫—Å—Ç–∞ –¥–ª–∏–Ω—ã {len(text)}")
                    
                    entities.append((start, end, ent[label_key]))
                
                formatted_data.append((text, {"entities": entities}))
            except Exception as e:
                error_count += 1
                print(f"‚ö†Ô∏è –û—à–∏–±–∫–∞ –≤ —ç–ª–µ–º–µ–Ω—Ç–µ: {str(e)}")
                continue
        
        if error_count > 0:
            print(f"üî¥ –í—Å–µ–≥–æ –æ—à–∏–±–æ–∫: {error_count} (–∏–∑ {len(data)})")
        
        return formatted_data
    
    def train_test_split(self, 
                       data: List,
                       test_size: float = 0.2,
                       random_state: Optional[int] = None) -> Tuple[List, List]:
        """
        –£–ª—É—á—à–µ–Ω–Ω–æ–µ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö —Å —Ñ–∏–∫—Å–∞—Ü–∏–µ–π —Å–ª—É—á–∞–π–Ω–æ—Å—Ç–∏
        
        Args:
            data: –ü–æ–ª–Ω—ã–π –Ω–∞–±–æ—Ä –¥–∞–Ω–Ω—ã—Ö
            test_size: –î–æ–ª—è —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö
            random_state: –§–∏–∫—Å–∞—Ç–æ—Ä —Å–ª—É—á–∞–π–Ω–æ—Å—Ç–∏
            
        Returns:
            –ö–æ—Ä—Ç–µ–∂ (train_data, test_data)
        """
        if random_state is not None:
            random.seed(random_state)
            
        if not 0 < test_size < 1:
            raise ValueError("test_size –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –º–µ–∂–¥—É 0 –∏ 1")
            
        shuffled = random.sample(data, len(data))
        split_idx = int(len(data) * (1 - test_size))
        return shuffled[:split_idx], shuffled[split_idx:]
    
    def add_labels(self, 
                  data: Optional[List[Tuple[str, Dict]]] = None,
                  labels: Optional[List[str]] = None) -> None:
        """
        –ì–∏–±–∫–æ–µ –¥–æ–±–∞–≤–ª–µ–Ω–∏–µ –º–µ—Ç–æ–∫
        
        Args:
            data: –î–∞–Ω–Ω—ã–µ –¥–ª—è –∏–∑–≤–ª–µ—á–µ–Ω–∏—è –º–µ—Ç–æ–∫ (–µ—Å–ª–∏ –Ω–µ —É–∫–∞–∑–∞–Ω—ã —è–≤–Ω–æ)
            labels: –°–ø–∏—Å–æ–∫ –º–µ—Ç–æ–∫ –¥–ª—è –¥–æ–±–∞–≤–ª–µ–Ω–∏—è (–µ—Å–ª–∏ —É–∫–∞–∑–∞–Ω, –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –≤–º–µ—Å—Ç–æ data)
        """
        if labels is not None:
            unique_labels = set(labels)
        elif data is not None:
            unique_labels = set()
            for _, annotations in data:
                for _, _, label in annotations["entities"]:
                    unique_labels.add(label)
        else:
            raise ValueError("–ù–µ–æ–±—Ö–æ–¥–∏–º–æ —É–∫–∞–∑–∞—Ç—å –ª–∏–±–æ data, –ª–∏–±–æ labels")
        
        for label in unique_labels:
            self.ner.add_label(label)
        
        print(f"üè∑ –î–æ–±–∞–≤–ª–µ–Ω—ã –º–µ—Ç–∫–∏: {sorted(unique_labels)}")
    
    def train(self, 
             train_data: List[Tuple[str, Dict]],
             epochs: int = 10,
             batch_size: int = 8,
             dropout: float = 0.5,
             learning_rate: float = 0.001,
             save_path: Optional[str] = None,
             eval_data: Optional[List[Tuple[str, Dict]]] = None,
             early_stopping: Optional[int] = None):
    
        # 1. –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –æ–ø—Ç–∏–º–∏–∑–∞—Ç–æ—Ä–∞
        optimizer = self.nlp.initialize()
    
        # 2. –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ –¥–ª—è —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä–æ–≤
        if "transformer" in self.nlp.pipe_names:
            batch_size = min(batch_size, 4)
            learning_rate = 1e-4
            dropout = 0.1
    
        # 3. –£—Å—Ç–∞–Ω–æ–≤–∫–∞ learning rate
        if hasattr(optimizer, "learn_rate"):
            optimizer.learn_rate = learning_rate
    
        # 4. –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –ø–µ—Ä–µ–º–µ–Ω–Ω—ã—Ö –¥–ª—è –æ—Ç—Å–ª–µ–∂–∏–≤–∞–Ω–∏—è –ø—Ä–æ–≥—Ä–µ—Å—Å–∞
        best_f1 = -1
        best_epoch = 0
        history = {"loss": [], "f1": []}
    
        # 5. –¶–∏–∫–ª –æ–±—É—á–µ–Ω–∏—è
        for epoch in range(epochs):
            losses = {}
            random.shuffle(train_data)
        
            # –ü–∞–∫–µ—Ç–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
            batches = [train_data[i:i+batch_size] for i in range(0, len(train_data), batch_size)]
        
            for batch in tqdm(batches, desc=f"–≠–ø–æ—Ö–∞ {epoch+1}/{epochs}"):
                examples = []
                for text, annotations in batch:
                    doc = self.nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    examples.append(example)
            
                self.nlp.update(examples, drop=dropout, losses=losses, sgd=optimizer)
        
            # –õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ –≤–∞–ª–∏–¥–∞—Ü–∏—è
            epoch_loss = losses.get("ner", 0)
            history["loss"].append(epoch_loss)
            print(f"\n–≠–ø–æ—Ö–∞ {epoch+1}/{epochs}")
            print(f"üìâ Loss: {epoch_loss:.3f}")
        
            if eval_data:
                metrics = self.evaluate(eval_data, verbose=False)
                avg_f1 = sum(m["f"] for m in metrics.values()) / len(metrics)
                history["f1"].append(avg_f1)
                print(f"üìä F1-score: {avg_f1:.3f}")
            
                if save_path and avg_f1 > best_f1:
                    best_f1 = avg_f1
                    best_epoch = epoch
                    self.nlp.to_disk(save_path)
                    print(f"üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –ª—É—á—à–∞—è –º–æ–¥–µ–ª—å (F1: {best_f1:.3f})")
            
                if early_stopping and (epoch - best_epoch) >= early_stopping:
                    print(f"üõë –†–∞–Ω–Ω—è—è –æ—Å—Ç–∞–Ω–æ–≤–∫–∞ –ø–æ—Å–ª–µ {early_stopping} —ç–ø–æ—Ö –±–µ–∑ —É–ª—É—á—à–µ–Ω–∏–π")
                    break
    
        if eval_data is None and save_path:
            self.nlp.to_disk(save_path)
            print("üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∞ —Ñ–∏–Ω–∞–ª—å–Ω–∞—è –º–æ–¥–µ–ª—å")
    
        return history
    
    def evaluate(self, test_data: List[Tuple[str, Dict]], verbose: bool = True) -> Dict:
        """
        –ò—Å–ø—Ä–∞–≤–ª–µ–Ω–Ω–∞—è –æ—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏ —Å —Ç–æ—á–Ω—ã–º –ø–æ–¥—Å—á–µ—Ç–æ–º –º–µ—Ç—Ä–∏–∫
        """
        true_entities = []
        pred_entities = []
    
        for text, annotations in tqdm(test_data, desc="üîç –û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏"):
            # –ò—Å—Ç–∏–Ω–Ω—ã–µ —Å—É—â–Ω–æ—Å—Ç–∏ (–∏–∑ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö)
            true = [(start, end, label) for start, end, label in annotations["entities"]]
            true_entities.append((text, true))
        
            # –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ —Å—É—â–Ω–æ—Å—Ç–∏
            doc = self.nlp(text)
            pred = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
            pred_entities.append((text, pred))
    
        # –í—ã—á–∏—Å–ª–µ–Ω–∏–µ –º–µ—Ç—Ä–∏–∫
        metrics = self._calculate_metrics(true_entities, pred_entities)
    
        if verbose:
            self._print_detailed_metrics(metrics)
    
        return metrics

    def _calculate_metrics(self, true_entities, pred_entities):
        """–¢–æ—á–Ω—ã–π —Ä–∞—Å—á–µ—Ç –º–µ—Ç—Ä–∏–∫ –ø–æ —Ç–∏–ø–∞–º —Å—É—â–Ω–æ—Å—Ç–µ–π"""
        metrics = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
    
        for (true_text, true), (pred_text, pred) in zip(true_entities, pred_entities):
            # –ü—Ä–æ–≤–µ—Ä–∫–∞ —Å–æ–≤–ø–∞–¥–µ–Ω–∏—è —Ç–µ–∫—Å—Ç–æ–≤
            if true_text != pred_text:
                raise ValueError("–¢–µ–∫—Å—Ç—ã –≤ –¥–∞–Ω–Ω—ã—Ö –∏ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è—Ö –Ω–µ —Å–æ–≤–ø–∞–¥–∞—é—Ç!")
        
            true_set = set(true)
            pred_set = set(pred)
        
            # –°—á–∏—Ç–∞–µ–º TP, FP, FN –¥–ª—è –∫–∞–∂–¥–æ–≥–æ —Ç–∏–ø–∞
            for label in set([e[2] for e in true] + [e[2] for e in pred]):
                tp = len([e for e in true if e in pred and e[2] == label])
                fp = len([e for e in pred if e not in true and e[2] == label])
                fn = len([e for e in true if e not in pred and e[2] == label])
            
                metrics[label]["tp"] += tp
                metrics[label]["fp"] += fp
                metrics[label]["fn"] += fn
    
        # –†–∞—Å—Å—á–µ—Ç precision, recall, f1 –¥–ª—è –∫–∞–∂–¥–æ–≥–æ —Ç–∏–ø–∞
        result = {}
        for label, counts in metrics.items():
            tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]
            p = tp / (tp + fp) if (tp + fp) > 0 else 0
            r = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = 2 * (p * r) / (p + r) if (p + r) > 0 else 0
        
            result[label] = {
                "precision": p,
                "recall": r,
                "f1": f1,
                "support": tp + fn,
                "tp": tp,
                "fp": fp,
                "fn": fn
            }
    
        return result

    def _print_detailed_metrics(self, metrics):
        """–î–µ—Ç–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –≤—ã–≤–æ–¥ –º–µ—Ç—Ä–∏–∫"""
        print("\nüìä –ü–æ–¥—Ä–æ–±–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏:")
        print("{:<20} {:<10} {:<10} {:<10} {:<10}".format(
            "–¢–∏–ø", "Precision", "Recall", "F1", "–ü–æ–¥–¥–µ—Ä–∂–∫–∞"))
        print("-" * 60)
    
        for label, values in metrics.items():
            print("{:<20} {:<10.3f} {:<10.3f} {:<10.3f} {:<10}".format(
                label,
                values["precision"],
                values["recall"],
                values["f1"],
                values["support"]))
    
        # Micro-average
        total_tp = sum(m["tp"] for m in metrics.values())
        total_fp = sum(m["fp"] for m in metrics.values())
        total_fn = sum(m["fn"] for m in metrics.values())
    
        micro_p = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
        micro_r = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
        micro_f1 = 2 * (micro_p * micro_r) / (micro_p + micro_r) if (micro_p + micro_r) > 0 else 0
    
        print("\nüîç –ò—Ç–æ–≥–æ–≤—ã–µ –º–µ—Ç—Ä–∏–∫–∏ (micro-average):")
        print(f"Precision: {micro_p:.3f}")
        print(f"Recall: {micro_r:.3f}")
        print(f"F1-score: {micro_f1:.3f}")

    def _print_detailed_metrics(self, metrics):
        """–î–µ—Ç–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –≤—ã–≤–æ–¥ –º–µ—Ç—Ä–∏–∫"""
        print("\nüìä –ü–æ–¥—Ä–æ–±–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏:")
        print("{:<20} {:<10} {:<10} {:<10} {:<10}".format(
            "–¢–∏–ø", "Precision", "Recall", "F1", "–ü–æ–¥–¥–µ—Ä–∂–∫–∞"))
        print("-" * 60)
    
        for label, values in metrics.items():
            print("{:<20} {:<10.3f} {:<10.3f} {:<10.3f} {:<10}".format(
                label,
                values["precision"],
                values["recall"],
                values["f1"],
                values["support"]))
    
        # Micro-average
        total_tp = sum(m["tp"] for m in metrics.values())
        total_fp = sum(m["fp"] for m in metrics.values())
        total_fn = sum(m["fn"] for m in metrics.values())
    
        micro_p = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
        micro_r = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
        micro_f1 = 2 * (micro_p * micro_r) / (micro_p + micro_r) if (micro_p + micro_r) > 0 else 0
    
        print("\nüîç –ò—Ç–æ–≥–æ–≤—ã–µ –º–µ—Ç—Ä–∏–∫–∏ (micro-average):")
        print(f"Precision: {micro_p:.3f}")
        print(f"Recall: {micro_r:.3f}")
        print(f"F1-score: {micro_f1:.3f}")
    
    def predict(self, 
               text: str,
               return_doc: bool = False) -> Union[List[Tuple[str, str, int, int]], "spacy.tokens.Doc"]:
        """
        –£–ª—É—á—à–µ–Ω–Ω–æ–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ —Å –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å—é –≤–æ–∑–≤—Ä–∞—Ç–∞ Doc –æ–±—ä–µ–∫—Ç–∞
        
        Args:
            text: –¢–µ–∫—Å—Ç –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞
            return_doc: –í–æ–∑–≤—Ä–∞—â–∞—Ç—å –ø–æ–ª–Ω—ã–π Doc –æ–±—ä–µ–∫—Ç
            
        Returns:
            –°–ø–∏—Å–æ–∫ —Å—É—â–Ω–æ—Å—Ç–µ–π –∏–ª–∏ Doc –æ–±—ä–µ–∫—Ç
        """
        doc = self.nlp(text)
        if return_doc:
            return doc
        return [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]



In [34]:
import time

processor = DataProcessor()

# –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö
train_data = processor.load_data("../../data/jsonl/dataset_jsonl_train.jsonl")
test_data = processor.load_data("../../data/jsonl/dataset_jsonl_test.jsonl")

# –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–ª—è –æ–±—É—á–µ–Ω–∏—è
formatted_train = processor.prepare_data(train_data)
formatted_test = processor.prepare_data(test_data) if test_data else None

time_start = time.time()
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è —Ç—Ä–µ–Ω–µ—Ä–∞
trainer_blank_empty = NERTrainer(
    model_name="blank",
    blank_language="ru",
)

# –î–æ–±–∞–≤–ª–µ–Ω–∏–µ –º–µ—Ç–æ–∫ –î–û —Ä–∞–∑–¥–µ–ª–µ–Ω–∏—è (—Ç–µ–ø–µ—Ä—å –∏—Å–ø–æ–ª—å–∑—É–µ–º –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ)
print("–ü—Ä–∏–º–µ—Ä—ã –æ–±—É—á–∞—é—â–∏—Ö –¥–∞–Ω–Ω—ã—Ö:", formatted_train[:1])  # –ø–æ–∫–∞–∂–µ–º –ø–µ—Ä–≤—ã–π –ø—Ä–∏–º–µ—Ä –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏
trainer_blank_empty.add_labels(formatted_train)

print(f"\n–†–∞–∑–º–µ—Ä –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–∏: {len(formatted_train)}")
print(f"–†–∞–∑–º–µ—Ä —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–∏: {len(formatted_test) if formatted_test else 0}")

# –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ (—É–≤–µ–ª–∏—á–∏–º epochs –¥–ª—è –ª—É—á—à–µ–≥–æ –∫–∞—á–µ—Å—Ç–≤–∞)
trainer_blank_empty.train(
    formatted_train, 
    epochs=40,
    batch_size=1,  
    save_path=r"C:\Users\mezhonnyy\Desktop\–†–µ—à–µ–Ω–∏—è\NER\model\NER_final\data\jsonl\model"
)
print('Train time: ', time.time() - time_start, ' c')
# –û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏ (–µ—Å–ª–∏ –µ—Å—Ç—å —Ç–µ—Å—Ç–æ–≤–∞—è –≤—ã–±–æ—Ä–∫–∞)
time_start = time.time()
if formatted_test:
    metrics = trainer_blank_empty.evaluate(formatted_test)
    print("\n–ú–µ—Ç—Ä–∏–∫–∏ –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–µ:")
    print(f"Precision: {metrics['precision']:.2f}")
    print(f"Recall: {metrics['recall']:.2f}")
    print(f"F1-score: {metrics['f1_score']:.2f}")
else:
    print("\n–¢–µ—Å—Ç–æ–≤–∞—è –≤—ã–±–æ—Ä–∫–∞ –Ω–µ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–∞, –æ—Ü–µ–Ω–∫–∞ –Ω–µ –≤—ã–ø–æ–ª–Ω–µ–Ω–∞")
print('Inference time: ', time.time() - time_start, ' c')    

üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö: 0it [00:00, ?it/s]

‚úÖ –£—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω–æ 2051 –∑–∞–ø–∏—Å–µ–π


üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö: 0it [00:00, ?it/s]

‚úÖ –£—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω–æ 879 –∑–∞–ø–∏—Å–µ–π


üîß –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö:   0%|          | 0/2051 [00:00<?, ?it/s]

‚ö†Ô∏è –û—à–∏–±–∫–∞ –≤ —ç–ª–µ–º–µ–Ω—Ç–µ: –ù–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–µ –ø–æ–∑–∏—Ü–∏–∏: 117-121 –¥–ª—è —Ç–µ–∫—Å—Ç–∞ –¥–ª–∏–Ω—ã 114. –¢–µ–∫—Å—Ç: ''
‚ö†Ô∏è –û—à–∏–±–∫–∞ –≤ —ç–ª–µ–º–µ–Ω—Ç–µ: –ù–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–µ –ø–æ–∑–∏—Ü–∏–∏: 262-267 –¥–ª—è —Ç–µ–∫—Å—Ç–∞ –¥–ª–∏–Ω—ã 259. –¢–µ–∫—Å—Ç: ''
üî¥ –í—Å–µ–≥–æ –æ—à–∏–±–æ–∫: 2 (–∏–∑ 2051)


üîß –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö:   0%|          | 0/879 [00:00<?, ?it/s]

‚ö†Ô∏è GPU –Ω–µ –¥–æ—Å—Ç—É–ø–µ–Ω, –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è CPU
–ü—Ä–∏–º–µ—Ä—ã –æ–±—É—á–∞—é—â–∏—Ö –¥–∞–Ω–Ω—ã—Ö: [('–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 4 x 1500 x 3000 mm aisi 201 12—Ö15–≥9–Ω–¥ –≥ / –∫ –º–∞—Ç–æ–≤—ã–π\n\n\n–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 8 x 1500 x 3000 mm aisi 201 12—Ö15–≥9–Ω–¥ –≥ / –∫ –º–∞—Ç–æ–≤—ã–π\n\n\n–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 1.5 x 1000 x 2000 mm aisi 201 12—Ö15–≥9–Ω–¥ —Ö / –∫ —à–ª–∏—Ñ–æ–≤–∞–Ω–Ω—ã–π\n\n\n–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 2 x 1000 x 2000 mm aisi 201 12—Ö15–≥9–Ω–¥ –≥ / –∫ –ø—Ä–æ—Å–µ—á–Ω–æ - –≤—ã—Ç—è–∂–Ω–æ–π –ø–≤–ª\n\n\n–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 4 x 1000 x 2000 mm aisi 201 12—Ö15–≥9–Ω–¥ –≥ / –∫ –ø—Ä–æ—Å–µ—á–Ω–æ - –≤—ã—Ç—è–∂–Ω–æ–π –ø–≤–ª', {'entities': [(0, 4, 'product'), (5, 16, 'material'), (17, 18, 'thickness'), (21, 25, 'width'), (28, 32, 'length'), (36, 44, 'mark_steel_aisi'), (45, 54, 'mark_steal'), (55, 60, 'tehnology'), (61, 68, 'color'), (71, 75, 'product'), (76, 87, 'material'), (88, 89, 'thickness'), (92, 96, 'width'), (99, 103, 'length

–≠–ø–æ—Ö–∞ 1/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 1/40
üìâ Loss: 34751.421


–≠–ø–æ—Ö–∞ 2/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 2/40
üìâ Loss: 17453.279


–≠–ø–æ—Ö–∞ 3/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 3/40
üìâ Loss: 13180.380


–≠–ø–æ—Ö–∞ 4/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 4/40
üìâ Loss: 11255.004


–≠–ø–æ—Ö–∞ 5/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 5/40
üìâ Loss: 9928.187


–≠–ø–æ—Ö–∞ 6/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 6/40
üìâ Loss: 9012.005


–≠–ø–æ—Ö–∞ 7/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 7/40
üìâ Loss: 7986.002


–≠–ø–æ—Ö–∞ 8/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 8/40
üìâ Loss: 7156.043


–≠–ø–æ—Ö–∞ 9/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 9/40
üìâ Loss: 6830.694


–≠–ø–æ—Ö–∞ 10/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 10/40
üìâ Loss: 6312.031


–≠–ø–æ—Ö–∞ 11/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 11/40
üìâ Loss: 5962.034


–≠–ø–æ—Ö–∞ 12/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 12/40
üìâ Loss: 5555.889


–≠–ø–æ—Ö–∞ 13/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 13/40
üìâ Loss: 5437.939


–≠–ø–æ—Ö–∞ 14/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 14/40
üìâ Loss: 5274.134


–≠–ø–æ—Ö–∞ 15/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 15/40
üìâ Loss: 4979.238


–≠–ø–æ—Ö–∞ 16/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 16/40
üìâ Loss: 4765.407


–≠–ø–æ—Ö–∞ 17/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 17/40
üìâ Loss: 4654.535


–≠–ø–æ—Ö–∞ 18/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 18/40
üìâ Loss: 4375.937


–≠–ø–æ—Ö–∞ 19/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 19/40
üìâ Loss: 4364.238


–≠–ø–æ—Ö–∞ 20/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 20/40
üìâ Loss: 4283.655


–≠–ø–æ—Ö–∞ 21/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 21/40
üìâ Loss: 4241.335


–≠–ø–æ—Ö–∞ 22/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 22/40
üìâ Loss: 3984.289


–≠–ø–æ—Ö–∞ 23/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 23/40
üìâ Loss: 3888.364


–≠–ø–æ—Ö–∞ 24/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 24/40
üìâ Loss: 4042.827


–≠–ø–æ—Ö–∞ 25/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 25/40
üìâ Loss: 3962.687


–≠–ø–æ—Ö–∞ 26/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 26/40
üìâ Loss: 3801.571


–≠–ø–æ—Ö–∞ 27/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 27/40
üìâ Loss: 3854.330


–≠–ø–æ—Ö–∞ 28/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 28/40
üìâ Loss: 3610.508


–≠–ø–æ—Ö–∞ 29/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 29/40
üìâ Loss: 3688.979


–≠–ø–æ—Ö–∞ 30/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 30/40
üìâ Loss: 3604.925


–≠–ø–æ—Ö–∞ 31/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 31/40
üìâ Loss: 3501.070


–≠–ø–æ—Ö–∞ 32/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 32/40
üìâ Loss: 3567.380


–≠–ø–æ—Ö–∞ 33/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 33/40
üìâ Loss: 3380.647


–≠–ø–æ—Ö–∞ 34/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 34/40
üìâ Loss: 3541.794


–≠–ø–æ—Ö–∞ 35/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 35/40
üìâ Loss: 3528.954


–≠–ø–æ—Ö–∞ 36/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 36/40
üìâ Loss: 3452.260


–≠–ø–æ—Ö–∞ 37/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 37/40
üìâ Loss: 3507.874


–≠–ø–æ—Ö–∞ 38/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 38/40
üìâ Loss: 3424.018


–≠–ø–æ—Ö–∞ 39/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 39/40
üìâ Loss: 3345.640


–≠–ø–æ—Ö–∞ 40/40:   0%|          | 0/2049 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 40/40
üìâ Loss: 3399.545
üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∞ —Ñ–∏–Ω–∞–ª—å–Ω–∞—è –º–æ–¥–µ–ª—å
Train time:  7203.317417383194  c


üîç –û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏:   0%|          | 0/879 [00:00<?, ?it/s]


üìä –ü–æ–¥—Ä–æ–±–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏:
–¢–∏–ø                  Precision  Recall     F1         –ü–æ–¥–¥–µ—Ä–∂–∫–∞ 
------------------------------------------------------------
length               0.965      0.953      0.959      1236      
tehnology            0.977      0.977      0.977      1017      
product              0.994      0.991      0.993      4421      
thickness            0.991      0.989      0.990      2812      
color                0.960      0.960      0.960      100       
coating              0.970      0.959      0.965      171       
mark_steal           0.980      0.979      0.980      2724      
material             0.980      0.992      0.986      1940      
mark_steel_aisi      0.988      0.988      0.988      603       
width                0.967      0.975      0.971      1869      
type                 0.982      0.989      0.985      1862      
mark                 0.915      0.933      0.924      403       
height               0.942      0.921    

TypeError: unsupported format string passed to dict.__format__

In [44]:
import pandas as pd

print('Train time: ', time.time() - time_start, ' c')
# –û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏ (–µ—Å–ª–∏ –µ—Å—Ç—å —Ç–µ—Å—Ç–æ–≤–∞—è –≤—ã–±–æ—Ä–∫–∞)
time_start = time.time()
if formatted_test:
    metrics = trainer_blank_empty.evaluate(formatted_test)
print('Inference time: ', time.time() - time_start, ' c')    

# –°–æ–∑–¥–∞–µ–º DataFrame –∏ —Ñ–∏–ª—å—Ç—Ä—É–µ–º –∫–ª–∞—Å—Å—ã –±–µ–∑ –ø–æ–¥–¥–µ—Ä–∂–∫–∏
df = pd.DataFrame.from_dict(metrics, orient='index')
#df = df[df['support'] > 0]  # –ò–≥–Ω–æ—Ä–∏—Ä—É–µ–º –∫–ª–∞—Å—Å—ã —Å support=0

# 1. –ú–∞–∫—Ä–æ-—É—Å—Ä–µ–¥–Ω–µ–Ω–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ (–≤—Å–µ –∫–ª–∞—Å—Å—ã —Ä–∞–≤–Ω–æ–∑–Ω–∞—á–Ω—ã)
macro_precision = df['precision'].mean()
macro_recall = df['recall'].mean()
macro_f1 = df['f1'].mean()

print("–ú–∞–∫—Ä–æ-—É—Å—Ä–µ–¥–Ω–µ–Ω–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ (–≤—Å–µ –∫–ª–∞—Å—Å—ã —Ä–∞–≤–Ω—ã):")
print(f"Macro-Precision: {macro_precision:.4f}")
print(f"Macro-Recall: {macro_recall:.4f}")
print(f"Macro-F1: {macro_f1:.4f}\n")

# 2. –í–∑–≤–µ—à–µ–Ω–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ (—É—á–∏—Ç—ã–≤–∞—é—Ç —Ä–∞–∑–º–µ—Ä –∫–ª–∞—Å—Å–æ–≤)
total_support = df['support'].sum()
weighted_precision = (df['precision'] * df['support']).sum() / total_support
weighted_recall = (df['recall'] * df['support']).sum() / total_support
weighted_f1 = (df['f1'] * df['support']).sum() / total_support

print("–í–∑–≤–µ—à–µ–Ω–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ (—É—á–∏—Ç—ã–≤–∞—é—Ç —Ä–∞–∑–º–µ—Ä –∫–ª–∞—Å—Å–æ–≤):")
print(f"Weighted Precision: {weighted_precision:.4f}")
print(f"Weighted Recall: {weighted_recall:.4f}")
print(f"Weighted F1: {weighted_f1:.4f}\n")

# 3. –ú–∏–∫—Ä–æ-—É—Å—Ä–µ–¥–Ω–µ–Ω–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ (–∞–ª—å—Ç–µ—Ä–Ω–∞—Ç–∏–≤–Ω—ã–π —Ä–∞—Å—á–µ—Ç)
micro_precision = weighted_precision  # –î–ª—è classification report –æ–Ω–∏ —Å–æ–≤–ø–∞–¥–∞—é—Ç
micro_recall = weighted_recall
micro_f1 = weighted_f1

print("–ú–∏–∫—Ä–æ-—É—Å—Ä–µ–¥–Ω–µ–Ω–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏:")
print(f"Micro-Precision: {micro_precision:.4f}")
print(f"Micro-Recall: {micro_recall:.4f}")
print(f"Micro-F1: {micro_f1:.4f}")

Train time:  122.83794498443604  c


üîç –û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏:   0%|          | 0/879 [00:00<?, ?it/s]


üìä –ü–æ–¥—Ä–æ–±–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏:
–¢–∏–ø                  Precision  Recall     F1         –ü–æ–¥–¥–µ—Ä–∂–∫–∞ 
------------------------------------------------------------
length               0.965      0.953      0.959      1236      
tehnology            0.977      0.977      0.977      1017      
product              0.994      0.991      0.993      4421      
thickness            0.991      0.989      0.990      2812      
color                0.960      0.960      0.960      100       
coating              0.970      0.959      0.965      171       
mark_steal           0.980      0.979      0.980      2724      
material             0.980      0.992      0.986      1940      
mark_steel_aisi      0.988      0.988      0.988      603       
width                0.967      0.975      0.971      1869      
type                 0.982      0.989      0.985      1862      
mark                 0.915      0.933      0.924      403       
height               0.942      0.921    

In [None]:
processor = DataProcessor()

# –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö
train_data = processor.load_data("../../data/jsonl/dataset_jsonl_train.jsonl")
test_data = processor.load_data("../../data/jsonl/dataset_jsonl_test.jsonl")

# –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–ª—è –æ–±—É—á–µ–Ω–∏—è
formatted_train = processor.prepare_data(train_data)
formatted_test = processor.prepare_data(test_data) if test_data else None

time_start = time.time()
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è —Ç—Ä–µ–Ω–µ—Ä–∞
trainer_blank_empty = NERTrainer(
    blank_language="ru",
    transformer_name="DeepPavlov/rubert-base-cased"
)

# –î–æ–±–∞–≤–ª–µ–Ω–∏–µ –º–µ—Ç–æ–∫ –î–û —Ä–∞–∑–¥–µ–ª–µ–Ω–∏—è (—Ç–µ–ø–µ—Ä—å –∏—Å–ø–æ–ª—å–∑—É–µ–º –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ)
print("–ü—Ä–∏–º–µ—Ä—ã –æ–±—É—á–∞—é—â–∏—Ö –¥–∞–Ω–Ω—ã—Ö:", formatted_train[:1])  # –ø–æ–∫–∞–∂–µ–º –ø–µ—Ä–≤—ã–π –ø—Ä–∏–º–µ—Ä –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏
trainer_blank_empty.add_labels(formatted_train)

print(f"\n–†–∞–∑–º–µ—Ä –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–∏: {len(formatted_train)}")
print(f"–†–∞–∑–º–µ—Ä —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–∏: {len(formatted_test) if formatted_test else 0}")

# –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ (—É–≤–µ–ª–∏—á–∏–º epochs –¥–ª—è –ª—É—á—à–µ–≥–æ –∫–∞—á–µ—Å—Ç–≤–∞)
trainer_blank_empty.train(
    formatted_train, 
    epochs=10,
    batch_size=16,  
    save_path=r"C:\Users\mezhonnyy\Desktop\–†–µ—à–µ–Ω–∏—è\NER\model\NER_final\data\jsonl\model"
)
print('Train time: ', time.time() - time_start, ' c')
# –û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏ (–µ—Å–ª–∏ –µ—Å—Ç—å —Ç–µ—Å—Ç–æ–≤–∞—è –≤—ã–±–æ—Ä–∫–∞)
time_start = time.time()
if formatted_test:
    metrics = trainer_blank_empty.evaluate(formatted_test)
    print("\n–ú–µ—Ç—Ä–∏–∫–∏ –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–µ:")
    print(f"Precision: {metrics['precision']:.2f}")
    print(f"Recall: {metrics['recall']:.2f}")
    print(f"F1-score: {metrics['f1_score']:.2f}")
else:
    print("\n–¢–µ—Å—Ç–æ–≤–∞—è –≤—ã–±–æ—Ä–∫–∞ –Ω–µ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–∞, –æ—Ü–µ–Ω–∫–∞ –Ω–µ –≤—ã–ø–æ–ª–Ω–µ–Ω–∞")
print('Inference time: ', time.time() - time_start, ' c')    

üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö: 0it [00:00, ?it/s]

‚úÖ –£—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω–æ 2051 –∑–∞–ø–∏—Å–µ–π


üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö: 0it [00:00, ?it/s]

‚úÖ –£—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω–æ 879 –∑–∞–ø–∏—Å–µ–π


üîß –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö:   0%|          | 0/2051 [00:00<?, ?it/s]

‚ö†Ô∏è –û—à–∏–±–∫–∞ –≤ —ç–ª–µ–º–µ–Ω—Ç–µ: –ù–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–µ –ø–æ–∑–∏—Ü–∏–∏: 117-121 –¥–ª—è —Ç–µ–∫—Å—Ç–∞ –¥–ª–∏–Ω—ã 114. –¢–µ–∫—Å—Ç: ''
‚ö†Ô∏è –û—à–∏–±–∫–∞ –≤ —ç–ª–µ–º–µ–Ω—Ç–µ: –ù–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–µ –ø–æ–∑–∏—Ü–∏–∏: 262-267 –¥–ª—è —Ç–µ–∫—Å—Ç–∞ –¥–ª–∏–Ω—ã 259. –¢–µ–∫—Å—Ç: ''
üî¥ –í—Å–µ–≥–æ –æ—à–∏–±–æ–∫: 2 (–∏–∑ 2051)


üîß –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö:   0%|          | 0/879 [00:00<?, ?it/s]

‚ö†Ô∏è GPU –Ω–µ –¥–æ—Å—Ç—É–ø–µ–Ω, –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è CPU
‚úÖ –°–æ–∑–¥–∞–Ω–∞ —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä–Ω–∞—è –º–æ–¥–µ–ª—å —Å DeepPavlov/rubert-base-cased
–ü—Ä–∏–º–µ—Ä—ã –æ–±—É—á–∞—é—â–∏—Ö –¥–∞–Ω–Ω—ã—Ö: [('–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 4 x 1500 x 3000 mm aisi 201 12—Ö15–≥9–Ω–¥ –≥ / –∫ –º–∞—Ç–æ–≤—ã–π\n\n\n–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 8 x 1500 x 3000 mm aisi 201 12—Ö15–≥9–Ω–¥ –≥ / –∫ –º–∞—Ç–æ–≤—ã–π\n\n\n–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 1.5 x 1000 x 2000 mm aisi 201 12—Ö15–≥9–Ω–¥ —Ö / –∫ —à–ª–∏—Ñ–æ–≤–∞–Ω–Ω—ã–π\n\n\n–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 2 x 1000 x 2000 mm aisi 201 12—Ö15–≥9–Ω–¥ –≥ / –∫ –ø—Ä–æ—Å–µ—á–Ω–æ - –≤—ã—Ç—è–∂–Ω–æ–π –ø–≤–ª\n\n\n–ª–∏—Å—Ç –Ω–µ—Ä–∂–∞–≤–µ—é—â–∏–π 4 x 1000 x 2000 mm aisi 201 12—Ö15–≥9–Ω–¥ –≥ / –∫ –ø—Ä–æ—Å–µ—á–Ω–æ - –≤—ã—Ç—è–∂–Ω–æ–π –ø–≤–ª', {'entities': [(0, 4, 'product'), (5, 16, 'material'), (17, 18, 'thickness'), (21, 25, 'width'), (28, 32, 'length'), (36, 44, 'mark_steel_aisi'), (45, 54, 'mark_steal'), (55, 60, 'tehnology'), (61, 68, 'color'), (71, 75, 

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


–≠–ø–æ—Ö–∞ 1/10:   0%|          | 0/513 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 1/10
üìâ Loss: 59980.936


–≠–ø–æ—Ö–∞ 2/10:   0%|          | 0/513 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 2/10
üìâ Loss: 21648.147


–≠–ø–æ—Ö–∞ 3/10:   0%|          | 0/513 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 3/10
üìâ Loss: 12628.520


–≠–ø–æ—Ö–∞ 4/10:   0%|          | 0/513 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 4/10
üìâ Loss: 8729.204


–≠–ø–æ—Ö–∞ 5/10:   0%|          | 0/513 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 5/10
üìâ Loss: 6783.503


–≠–ø–æ—Ö–∞ 6/10:   0%|          | 0/513 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 6/10
üìâ Loss: 5536.863


–≠–ø–æ—Ö–∞ 7/10:   0%|          | 0/513 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 7/10
üìâ Loss: 4647.616


–≠–ø–æ—Ö–∞ 8/10:   0%|          | 0/513 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 8/10
üìâ Loss: 4039.089


–≠–ø–æ—Ö–∞ 9/10:   0%|          | 0/513 [00:00<?, ?it/s]


–≠–ø–æ—Ö–∞ 9/10
üìâ Loss: 3319.055


–≠–ø–æ—Ö–∞ 10/10:   0%|          | 0/513 [00:00<?, ?it/s]

In [133]:
trainer_blank.predict('–∫—É–ø–∏—Ç—å —à–µ—Å—Ç–∏–≥—Ä–∞–Ω–Ω–∏–∫ –∫–∞–ª–∏–±—Ä —Å—Ç40x 27 –≥–æ—Å—Ç 8560 - 78, 4543 - 2016 —Ü–µ–Ω–∞ 013568-95')

[('–∫—É–ø–∏—Ç—å', 'product', 0, 6),
 ('—à–µ—Å—Ç–∏–≥—Ä–∞–Ω–Ω–∏–∫', 'product', 7, 19),
 ('–∫–∞–ª–∏–±—Ä', 'type', 20, 26),
 ('—Å—Ç40x 27', 'mark_steal', 27, 35),
 ('–≥–æ—Å—Ç 8560 - 78', 'standart_gost', 36, 50),
 ('4543 - 2016', 'standart_gost', 52, 63),
 ('95', 'width', 76, 78)]

In [None]:
trainer_blank_empty.predict('–∫—É–ø–∏—Ç—å —à–µ—Å—Ç–∏–≥—Ä–∞–Ω–Ω–∏–∫ –∫–∞–ª–∏–±—Ä —Å—Ç40x 27 –≥–æ—Å—Ç 8560 - 78, 4543 - 2016 —Ü–µ–Ω–∞ 013568-95')

