In [1]:
import spacy
import random
import json
import re
from tqdm.notebook import tqdm
from collections import defaultdict
from spacy.training import Example
from spacy.scorer import Scorer
from spacy_transformers import TransformerModel
from typing import List, Tuple, Dict, Any, Optional, Union
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

In [21]:
class DataProcessor:
    def load_data(self, 
                file_path: str,
                text_key: str = "text",
                possible_entity_keys: List[str] = ["label", "entities", "annotations"],
                comment_key: str = "Comments") -> List[Dict]:
        data = []
        line_num = 0
        
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                for line in tqdm(f, desc="Download data"):
                    line_num += 1
                    line = line.strip()
                    if not line:
                        continue
                    
                    try:
                        item = json.loads(line)
                        
                        # Check field with text
                        if text_key not in item:
                            print(f"⚠️ Lime {line_num}: not key '{text_key}'")
                            continue
                        
                        # Find key with annotation
                        entities = []
                        for key in possible_entity_keys:
                            if key in item:
                                entities = item[key]
                                break
                        
                        # Convert format
                        unified_entities = self._convert_entities(entities, item[text_key])
                        
                        data.append({
                            "text": item[text_key],
                            "entities": unified_entities,
                            "comments": item.get(comment_key, [])
                        })
                        
                    except json.JSONDecodeError:
                        print(f"Line {line_num}: Error JSON (skipped)")
                    except Exception as e:
                        print(f"Line {line_num}: {str(e)}")
        
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {file_path}")
        
        print(f"Succes download {len(data)} row")
        return data
    
    def _convert_entities(self, 
                        entities: Union[List, Dict], 
                        text: str) -> List[Dict]:
        converted = []
        
        if not entities:
            return converted
        
        first_item = entities[0]
        
        # Format Doccano v1: [start, end, label]
        if isinstance(first_item, list) and len(first_item) == 3:
            for start, end, label in entities:
                converted.append({
                    "start": int(start),
                    "end": int(end),
                    "label": str(label)
                })
        
        # Format Doccano v2/standart: {"start": X, "end": Y, ...}
        elif isinstance(first_item, dict):
            for ent in entities:
                # Extract key
                start_key = "start" if "start" in ent else "start_offset"
                end_key = "end" if "end" in ent else "end_offset"
                label_key = "label" if "label" in ent else "tag"
                
                converted.append({
                    "start": int(ent[start_key]),
                    "end": int(ent[end_key]),
                    "label": str(ent[label_key])
                })
        
        return converted
    
    def prepare_data(self, 
                    data: List[Dict],
                    validate: bool = True) -> List[Tuple[str, Dict]]:
        formatted_data = []
        error_count = 0
        
        for item in tqdm(data, desc="Prepare data"):
            try:
                text = item["text"]
                entities = []
                
                for ent in item["entities"]:
                    start = ent["start"]
                    end = ent["end"]
                    label = ent["label"]
                    
                    if validate:
                        # Check correct annotation
                        if not (0 <= start <= end <= len(text)):
                            raise ValueError(
                                f"Incorrect position: {start}-{end} "
                                f"for text line {len(text)}. Text: '{text[start:end]}'"
                            )
                        
                        # Check mark not empty
                        if not label.strip():
                            raise ValueError(f"Empty mark for position {start}-{end}")
                    
                    entities.append((start, end, label))
                
                formatted_data.append((text, {"entities": entities}))
            except Exception as e:
                error_count += 1
                print(f"Error in element: {str(e)}")
                continue
        
        if error_count > 0:
            print(f"🔴 All error number: {error_count} (из {len(data)})")
        
        return formatted_data

    def save_to_jsonl(self, 
                     data: List[Dict], 
                     output_path: str,
                     format: str = "doccano") -> None:
        with open(output_path, "w", encoding="utf-8") as f:
            for item in tqdm(data, desc="Saved data"):
                if format == "doccano":
                    # Format Doccano: {"text": "...", "label": [[start, end, tag], ...]}
                    labels = [
                        [ent["start"], ent["end"], ent["label"]]
                        for ent in item["entities"]
                    ]
                    json.dump({
                        "text": item["text"],
                        "label": labels,
                        "Comments": item.get("comments", [])
                    }, f, ensure_ascii=False)
                else:
                    # Standart format
                    json.dump(item, f, ensure_ascii=False)
                f.write("\n")

In [19]:
def clean_text(text):
    """Clear text"""
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def adjust_annotations(text, annotations):
    """Correct annotation"""
    cleaned_text = clean_text(text)
    new_entities = []
    
    # Create list (start, end, label, entity_text)
    entities_info = [(start, end, label, text[start:end]) 
                    for start, end, label in annotations['entities']]
    
    # Sort entity
    entities_info.sort(key=lambda x: x[0])
    
    for start, end, label, entity_text in entities_info:
        occurrences = [m.start() for m in re.finditer(re.escape(entity_text), text)]
        
        for occ_start in occurrences:
            if occ_start == start:  
                new_start = cleaned_text.find(entity_text)
                if new_start != -1:
                    new_end = new_start + len(entity_text)
                    new_entities.append((new_start, new_end, label))
                    cleaned_text = cleaned_text[:new_start] + ' ' * len(entity_text) + cleaned_text[new_end:]
                break
    
    return {'entities': new_entities}

class NERTrainer:
    def __init__(self, 
                 model_name: str = "ru_core_news_sm", 
                 use_gpu: bool = True,
                 blank_language: str = "ru",
                 disable_pipes: Optional[List[str]] = None,
                 transformer_name: Optional[str] = None):
        self.use_gpu = use_gpu
        self._setup_device()
        
        if transformer_name:
            self.nlp = self._create_transformer_model(blank_language, transformer_name)
        else:
            self.nlp = self._load_model(model_name, blank_language, disable_pipes)
            
        self.ner = self._setup_ner_pipe(transformer_name)

    def _create_transformer_model(self, lang: str, transformer_name: str):
        """Create model with transformer"""
        nlp = spacy.blank(lang)
        
        # Add component transformer
        config = {
            "model": {
                "@architectures": "spacy-transformers.TransformerModel.v3",
                "name": transformer_name,
                "tokenizer_config": {"use_fast": True},
                "transformer_config": {"output_hidden_states": True}
            }
        }
        nlp.add_pipe("transformer", config=config)
        
        print(f"Create transformer model {transformer_name}")
        return nlp
        
    def _setup_device(self) -> None:
        """Setting device (CPU/GPU)"""
        if self.use_gpu and spacy.prefer_gpu():
            spacy.require_gpu()
            print("Used GPU")
        else:
            print("⚠️ GPU not found, used CPU")
    
    def _load_model(self, model_name: str, blank_language: str, disable_pipes: List[str]):
        """Load model and processing error"""
        try:
            if model_name.lower() == "blank":
                return spacy.blank(blank_language)
            elif Path(model_name).exists():
                return spacy.load(model_name, disable=disable_pipes or [])
            else:
                return spacy.load(model_name, disable=disable_pipes or [])
        except Exception as e:
            raise ValueError(f"Model not found '{model_name}': {str(e)}")
    
    def _setup_ner_pipe(self, transformer_name):
        """Setting NER component with transformer"""
        if "ner" in self.nlp.pipe_names:
            return self.nlp.get_pipe("ner")
    
        # ДFor transformer model
        if "transformer" in self.nlp.pipe_names:
            return self.nlp.add_pipe(
                "ner",
                after="transformer",
                config={
                    "model": {
                        "@architectures": "spacy.TransitionBasedParser.v2",
                        "hidden_width": 128,
                        "maxout_pieces": 2,
                        "use_upper": True
                    }
                }
            )
    
        # For simple model
        return self.nlp.add_pipe("ner")
    
    def prepare_data(self, 
                    data: List[Dict],
                    text_key: str = "text",
                    entities_key: str = "label",
                    start_offset_key: str = "start_offset",
                    end_offset_key: str = "end_offset",
                    label_key: str = "label") -> List[Tuple[str, Dict]]:
        formatted_data = []
        error_count = 0
        
        for item in tqdm(data, desc="Data preparation"):
            try:
                text = item[text_key]
                entities = []
                
                for ent in item.get(entities_key, []):
                    # Check correct annotation
                    if not all(k in ent for k in [start_offset_key, end_offset_key, label_key]):
                        raise ValueError(f"Incorrect annotation: {ent}")
                    
                    start = ent[start_offset_key]
                    end = ent[end_offset_key]
                    
                    # Validate position
                    if not (0 <= start <= end <= len(text)):
                        raise ValueError(f"Incorrect position: {start}-{end} for text length {len(text)}")
                    
                    entities.append((start, end, ent[label_key]))
                
                formatted_data.append((text, {"entities": entities}))
            except Exception as e:
                error_count += 1
                print(f"Error in element: {str(e)}")
                continue
        
        if error_count > 0:
            print(f"All error count: {error_count} (for {len(data)})")
        
        return formatted_data
    
    def train_test_split(self, 
                       data: List,
                       test_size: float = 0.2,
                       random_state: Optional[int] = None) -> Tuple[List, List]:
        if random_state is not None:
            random.seed(random_state)
            
        if not 0 < test_size < 1:
            raise ValueError("test_size must be between 0 and 1")
            
        shuffled = random.sample(data, len(data))
        split_idx = int(len(data) * (1 - test_size))
        return shuffled[:split_idx], shuffled[split_idx:]
    
    def add_labels(self, 
                  data: Optional[List[Tuple[str, Dict]]] = None,
                  labels: Optional[List[str]] = None) -> None:
        if labels is not None:
            unique_labels = set(labels)
        elif data is not None:
            unique_labels = set()
            for _, annotations in data:
                for _, _, label in annotations["entities"]:
                    unique_labels.add(label)
        else:
            raise ValueError("You need data or labels")
        
        for label in unique_labels:
            self.ner.add_label(label)
        
        print(f"Added labels: {sorted(unique_labels)}")
    
    def train(self, 
             train_data: List[Tuple[str, Dict]],
             epochs: int = 10,
             batch_size: int = 8,
             dropout: float = 0.5,
             learning_rate: float = 0.001,
             save_path: Optional[str] = None,
             eval_data: Optional[List[Tuple[str, Dict]]] = None,
             early_stopping: Optional[int] = None):
    
        # Init optimizer
        optimizer = self.nlp.initialize()
    
        # Setting parameters for transformers
        if "transformer" in self.nlp.pipe_names:
            batch_size = min(batch_size, 4)
            learning_rate = 1e-4
            dropout = 0.1
    
        # Install learning rate
        if hasattr(optimizer, "learn_rate"):
            optimizer.learn_rate = learning_rate
    
        # init progress bar settings
        best_f1 = -1
        best_epoch = 0
        history = {"loss": [], "f1": []}
    
        # Cycle train 
        for epoch in range(epochs):
            losses = {}
            random.shuffle(train_data)
        
            # Butch proccessing
            batches = [train_data[i:i+batch_size] for i in range(0, len(train_data), batch_size)]
        
            for batch in tqdm(batches, desc=f"Epoch {epoch+1}/{epochs}"):
                examples = []
                for text, annotations in batch:
                    doc = self.nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    examples.append(example)
            
                self.nlp.update(examples, drop=dropout, losses=losses, sgd=optimizer)
        
            # Log and validation
            epoch_loss = losses.get("ner", 0)
            history["loss"].append(epoch_loss)
            print(f"\nEpoch {epoch+1}/{epochs}")
            print(f"📉 Loss: {epoch_loss:.3f}")
        
            if eval_data:
                metrics = self.evaluate(eval_data, verbose=False)
                avg_f1 = sum(m["f"] for m in metrics.values()) / len(metrics)
                history["f1"].append(avg_f1)
                print(f"📊 F1-score: {avg_f1:.3f}")
            
                if save_path and avg_f1 > best_f1:
                    best_f1 = avg_f1
                    best_epoch = epoch
                    self.nlp.to_disk(save_path)
                    print(f"Saved best model (F1: {best_f1:.3f})")
            
                if early_stopping and (epoch - best_epoch) >= early_stopping:
                    print(f"🛑 Early stopping after {early_stopping} epoch with improvement")
                    break
    
        if eval_data is None and save_path:
            self.nlp.to_disk(save_path)
            print("Saved final model")
    
        return history
    
    def evaluate(self, test_data: List[Tuple[str, Dict]], verbose: bool = True) -> Dict:
        true_entities = []
        pred_entities = []
    
        for text, annotations in tqdm(test_data, desc="🔍 Оценка модели"):
            # Entity from annotation
            true = [(start, end, label) for start, end, label in annotations["entities"]]
            true_entities.append((text, true))
        
            # predicted entity
            doc = self.nlp(text)
            pred = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
            pred_entities.append((text, pred))
    
        # Calc metrics
        metrics = self._calculate_metrics(true_entities, pred_entities)
    
        if verbose:
            self._print_detailed_metrics(metrics)
    
        return metrics

    def _calculate_metrics(self, true_entities, pred_entities):
        """Calc metrics for entities"""
        metrics = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
    
        for (true_text, true), (pred_text, pred) in zip(true_entities, pred_entities):
            # Check position in text
            if true_text != pred_text:
                raise ValueError("The texts don't match!")
        
            true_set = set(true)
            pred_set = set(pred)
        
            # Calc TP, FP, FN for entity type
            for label in set([e[2] for e in true] + [e[2] for e in pred]):
                tp = len([e for e in true if e in pred and e[2] == label])
                fp = len([e for e in pred if e not in true and e[2] == label])
                fn = len([e for e in true if e not in pred and e[2] == label])
            
                metrics[label]["tp"] += tp
                metrics[label]["fp"] += fp
                metrics[label]["fn"] += fn
    
        # Calc precision, recall, f1 for each entity type
        result = {}
        for label, counts in metrics.items():
            tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]
            p = tp / (tp + fp) if (tp + fp) > 0 else 0
            r = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = 2 * (p * r) / (p + r) if (p + r) > 0 else 0
        
            result[label] = {
                "precision": p,
                "recall": r,
                "f1": f1,
                "support": tp + fn,
                "tp": tp,
                "fp": fp,
                "fn": fn
            }
    
        return result

    def _print_detailed_metrics(self, metrics):
        print("\nMetrics details:")
        print("{:<20} {:<10} {:<10} {:<10} {:<10}".format(
            "Тип", "Precision", "Recall", "F1", "Support"))
        print("-" * 60)
    
        for label, values in metrics.items():
            print("{:<20} {:<10.3f} {:<10.3f} {:<10.3f} {:<10}".format(
                label,
                values["precision"],
                values["recall"],
                values["f1"],
                values["support"]))
    
        # Micro-average
        total_tp = sum(m["tp"] for m in metrics.values())
        total_fp = sum(m["fp"] for m in metrics.values())
        total_fn = sum(m["fn"] for m in metrics.values())
    
        micro_p = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
        micro_r = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
        micro_f1 = 2 * (micro_p * micro_r) / (micro_p + micro_r) if (micro_p + micro_r) > 0 else 0
    
        print("\nTotal final metrics (micro-average):")
        print(f"Precision: {micro_p:.3f}")
        print(f"Recall: {micro_r:.3f}")
        print(f"F1-score: {micro_f1:.3f}")

    def _print_detailed_metrics(self, metrics):
        """Details"""
        print("\nDetails:")
        print("{:<20} {:<10} {:<10} {:<10} {:<10}".format(
            "Тип", "Precision", "Recall", "F1", "Support"))
        print("-" * 60)
    
        for label, values in metrics.items():
            print("{:<20} {:<10.3f} {:<10.3f} {:<10.3f} {:<10}".format(
                label,
                values["precision"],
                values["recall"],
                values["f1"],
                values["support"]))
    
        # Micro-average
        total_tp = sum(m["tp"] for m in metrics.values())
        total_fp = sum(m["fp"] for m in metrics.values())
        total_fn = sum(m["fn"] for m in metrics.values())
    
        micro_p = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
        micro_r = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
        micro_f1 = 2 * (micro_p * micro_r) / (micro_p + micro_r) if (micro_p + micro_r) > 0 else 0
    
        print("\nTotal final metrics (micro-average):")
        print(f"Precision: {micro_p:.3f}")
        print(f"Recall: {micro_r:.3f}")
        print(f"F1-score: {micro_f1:.3f}")
    
    def predict(self, 
               text: str,
               return_doc: bool = False) -> Union[List[Tuple[str, str, int, int]], "spacy.tokens.Doc"]:
        doc = self.nlp(text)
        if return_doc:
            return doc
        return [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]



In [34]:
import time

processor = DataProcessor()

# Upload data
train_data = processor.load_data("../../data/jsonl/dataset_jsonl_train.jsonl")
test_data = processor.load_data("../../data/jsonl/dataset_jsonl_test.jsonl")

# Подготовка для обучения
formatted_train = processor.prepare_data(train_data)
formatted_test = processor.prepare_data(test_data) if test_data else None

time_start = time.time()
# Инициализация тренера
trainer_blank_empty = NERTrainer(
    model_name="blank",
    blank_language="ru",
)

# Добавление меток ДО разделения (теперь используем подготовленные данные)
print("Примеры обучающих данных:", formatted_train[:1])  # покажем первый пример для проверки
trainer_blank_empty.add_labels(formatted_train)

print(f"\nРазмер обучающей выборки: {len(formatted_train)}")
print(f"Размер тестовой выборки: {len(formatted_test) if formatted_test else 0}")

# Обучение модели (увеличим epochs для лучшего качества)
trainer_blank_empty.train(
    formatted_train, 
    epochs=40,
    batch_size=1,  
    save_path=r"C:\Users\mezhonnyy\Desktop\Решения\NER\model\NER_final\data\jsonl\model"
)
print('Train time: ', time.time() - time_start, ' c')
# Оценка модели (если есть тестовая выборка)
time_start = time.time()
if formatted_test:
    metrics = trainer_blank_empty.evaluate(formatted_test)
    print("\nМетрики на тестовой выборке:")
    print(f"Precision: {metrics['precision']:.2f}")
    print(f"Recall: {metrics['recall']:.2f}")
    print(f"F1-score: {metrics['f1_score']:.2f}")
else:
    print("\nТестовая выборка не предоставлена, оценка не выполнена")
print('Inference time: ', time.time() - time_start, ' c')    

Upload data: 0it [00:00, ?it/s]

Succes upload 2051 rows


Upload data: 0it [00:00, ?it/s]

Succes upload 879 raw


Prepare data:   0%|          | 0/2051 [00:00<?, ?it/s]

Error in element: incorrect position: 117-121 for text length 114. Text: ''
Error in element: incorrect position: 262-267 for text length 259. Text: ''
Total error count: 2 (from 2051)


Prepare data:   0%|          | 0/879 [00:00<?, ?it/s]

⚠️ GPU not support, used CPU
Примеры обучающих данных: [('лист нержавеющий 4 x 1500 x 3000 mm aisi 201 12х15г9нд г / к матовый\n\n\nлист нержавеющий 8 x 1500 x 3000 mm aisi 201 12х15г9нд г / к матовый\n\n\nлист нержавеющий 1.5 x 1000 x 2000 mm aisi 201 12х15г9нд х / к шлифованный\n\n\nлист нержавеющий 2 x 1000 x 2000 mm aisi 201 12х15г9нд г / к просечно - вытяжной пвл\n\n\nлист нержавеющий 4 x 1000 x 2000 mm aisi 201 12х15г9нд г / к просечно - вытяжной пвл', {'entities': [(0, 4, 'product'), (5, 16, 'material'), (17, 18, 'thickness'), (21, 25, 'width'), (28, 32, 'length'), (36, 44, 'mark_steel_aisi'), (45, 54, 'mark_steal'), (55, 60, 'tehnology'), (61, 68, 'color'), (71, 75, 'product'), (76, 87, 'material'), (88, 89, 'thickness'), (92, 96, 'width'), (99, 103, 'length'), (107, 115, 'mark_steel_aisi'), (116, 125, 'mark_steal'), (126, 131, 'tehnology'), (132, 139, 'color'), (142, 146, 'product'), (147, 158, 'material'), (159, 162, 'thickness'), (165, 169, 'width'), (172, 176, 'length'), (1

Epoch 1/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 1/40
📉 Loss: 34751.421


Epoch 2/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 2/40
📉 Loss: 17453.279


Эпоха 3/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 3/40
📉 Loss: 13180.380


Epoch 4/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 4/40
📉 Loss: 11255.004


Epoch 5/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 5/40
📉 Loss: 9928.187


Epoch 6/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 6/40
📉 Loss: 9012.005


Epoch 7/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 7/40
📉 Loss: 7986.002


Epoch 8/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 8/40
📉 Loss: 7156.043


Epoch 9/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 9/40
📉 Loss: 6830.694


Epoch 10/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 10/40
📉 Loss: 6312.031


Epoch 11/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 11/40
📉 Loss: 5962.034


Epoch 12/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 12/40
📉 Loss: 5555.889


Epoch 13/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 13/40
📉 Loss: 5437.939


Epoch 14/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 14/40
📉 Loss: 5274.134


Эпоха 15/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 15/40
📉 Loss: 4979.238


Epoch 16/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 16/40
📉 Loss: 4765.407


Epoch 17/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 17/40
📉 Loss: 4654.535


Epoch 18/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 18/40
📉 Loss: 4375.937


Epoch 19/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 19/40
📉 Loss: 4364.238


Epoch 20/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 20/40
📉 Loss: 4283.655


Epoch 21/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 21/40
📉 Loss: 4241.335


Epoch 22/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 22/40
📉 Loss: 3984.289


Epoch 23/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 23/40
📉 Loss: 3888.364


Epoch 24/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 24/40
📉 Loss: 4042.827


Epoch 25/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 25/40
📉 Loss: 3962.687


Epoch 26/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 26/40
📉 Loss: 3801.571


Epoch 27/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 27/40
📉 Loss: 3854.330


Epoch 28/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 28/40
📉 Loss: 3610.508


Epoch 29/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 29/40
📉 Loss: 3688.979


Epoch 30/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 30/40
📉 Loss: 3604.925


Epoch 31/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 31/40
📉 Loss: 3501.070


Epoch 32/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 32/40
📉 Loss: 3567.380


Epoch 33/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 33/40
📉 Loss: 3380.647


Epoch 34/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 34/40
📉 Loss: 3541.794


Epoch 35/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 35/40
📉 Loss: 3528.954


Epoch 36/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 36/40
📉 Loss: 3452.260


Epoch 37/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 37/40
📉 Loss: 3507.874


Epoch 38/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 38/40
📉 Loss: 3424.018


Epoch 39/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 39/40
📉 Loss: 3345.640


Epoch 40/40:   0%|          | 0/2049 [00:00<?, ?it/s]


Epoch 40/40
📉 Loss: 3399.545
Saved final model
Train time:  7203.317417383194  c


🔍 Evaluate model:   0%|          | 0/879 [00:00<?, ?it/s]


Details:
Тип                  Precision  Recall     F1         Support 
------------------------------------------------------------
length               0.965      0.953      0.959      1236      
tehnology            0.977      0.977      0.977      1017      
product              0.994      0.991      0.993      4421      
thickness            0.991      0.989      0.990      2812      
color                0.960      0.960      0.960      100       
coating              0.970      0.959      0.965      171       
mark_steal           0.980      0.979      0.980      2724      
material             0.980      0.992      0.986      1940      
mark_steel_aisi      0.988      0.988      0.988      603       
width                0.967      0.975      0.971      1869      
type                 0.982      0.989      0.985      1862      
mark                 0.915      0.933      0.924      403       
height               0.942      0.921      0.931      579       
form                 

TypeError: unsupported format string passed to dict.__format__

In [44]:
import pandas as pd

print('Train time: ', time.time() - time_start, ' c')
# Оценка модели (если есть тестовая выборка)
time_start = time.time()
if formatted_test:
    metrics = trainer_blank_empty.evaluate(formatted_test)
print('Inference time: ', time.time() - time_start, ' c')    

# Создаем DataFrame и фильтруем классы без поддержки
df = pd.DataFrame.from_dict(metrics, orient='index')
#df = df[df['support'] > 0]  # Игнорируем классы с support=0

# 1. Макро-усредненные метрики (все классы равнозначны)
macro_precision = df['precision'].mean()
macro_recall = df['recall'].mean()
macro_f1 = df['f1'].mean()

print("Макро-усредненные метрики (все классы равны):")
print(f"Macro-Precision: {macro_precision:.4f}")
print(f"Macro-Recall: {macro_recall:.4f}")
print(f"Macro-F1: {macro_f1:.4f}\n")

# 2. Взвешенные метрики (учитывают размер классов)
total_support = df['support'].sum()
weighted_precision = (df['precision'] * df['support']).sum() / total_support
weighted_recall = (df['recall'] * df['support']).sum() / total_support
weighted_f1 = (df['f1'] * df['support']).sum() / total_support

print("Взвешенные метрики (учитывают размер классов):")
print(f"Weighted Precision: {weighted_precision:.4f}")
print(f"Weighted Recall: {weighted_recall:.4f}")
print(f"Weighted F1: {weighted_f1:.4f}\n")

# 3. Микро-усредненные метрики (альтернативный расчет)
micro_precision = weighted_precision  # Для classification report они совпадают
micro_recall = weighted_recall
micro_f1 = weighted_f1

print("Микро-усредненные метрики:")
print(f"Micro-Precision: {micro_precision:.4f}")
print(f"Micro-Recall: {micro_recall:.4f}")
print(f"Micro-F1: {micro_f1:.4f}")

Train time:  122.83794498443604  c


🔍 Оценка модели:   0%|          | 0/879 [00:00<?, ?it/s]


📊 Подробные метрики:
Тип                  Precision  Recall     F1         Поддержка 
------------------------------------------------------------
length               0.965      0.953      0.959      1236      
tehnology            0.977      0.977      0.977      1017      
product              0.994      0.991      0.993      4421      
thickness            0.991      0.989      0.990      2812      
color                0.960      0.960      0.960      100       
coating              0.970      0.959      0.965      171       
mark_steal           0.980      0.979      0.980      2724      
material             0.980      0.992      0.986      1940      
mark_steel_aisi      0.988      0.988      0.988      603       
width                0.967      0.975      0.971      1869      
type                 0.982      0.989      0.985      1862      
mark                 0.915      0.933      0.924      403       
height               0.942      0.921      0.931      579       
form   

In [None]:
processor = DataProcessor()

# Загрузка данных
train_data = processor.load_data("../../data/jsonl/dataset_jsonl_train.jsonl")
test_data = processor.load_data("../../data/jsonl/dataset_jsonl_test.jsonl")

# Подготовка для обучения
formatted_train = processor.prepare_data(train_data)
formatted_test = processor.prepare_data(test_data) if test_data else None

time_start = time.time()
# Инициализация тренера
trainer_blank_empty = NERTrainer(
    blank_language="ru",
    transformer_name="DeepPavlov/rubert-base-cased"
)

# Добавление меток ДО разделения (теперь используем подготовленные данные)
print("Примеры обучающих данных:", formatted_train[:1])  # покажем первый пример для проверки
trainer_blank_empty.add_labels(formatted_train)

print(f"\nРазмер обучающей выборки: {len(formatted_train)}")
print(f"Размер тестовой выборки: {len(formatted_test) if formatted_test else 0}")

# Обучение модели (увеличим epochs для лучшего качества)
trainer_blank_empty.train(
    formatted_train, 
    epochs=10,
    batch_size=16,  
    save_path=r"C:\Users\mezhonnyy\Desktop\Решения\NER\model\NER_final\data\jsonl\model"
)
print('Train time: ', time.time() - time_start, ' c')
# Оценка модели (если есть тестовая выборка)
time_start = time.time()
if formatted_test:
    metrics = trainer_blank_empty.evaluate(formatted_test)
    print("\nМетрики на тестовой выборке:")
    print(f"Precision: {metrics['precision']:.2f}")
    print(f"Recall: {metrics['recall']:.2f}")
    print(f"F1-score: {metrics['f1_score']:.2f}")
else:
    print("\nТестовая выборка не предоставлена, оценка не выполнена")
print('Inference time: ', time.time() - time_start, ' c')    

📥 Загрузка данных: 0it [00:00, ?it/s]

✅ Успешно загружено 2051 записей


📥 Загрузка данных: 0it [00:00, ?it/s]

✅ Успешно загружено 879 записей


🔧 Подготовка данных:   0%|          | 0/2051 [00:00<?, ?it/s]

⚠️ Ошибка в элементе: Некорректные позиции: 117-121 для текста длины 114. Текст: ''
⚠️ Ошибка в элементе: Некорректные позиции: 262-267 для текста длины 259. Текст: ''
🔴 Всего ошибок: 2 (из 2051)


🔧 Подготовка данных:   0%|          | 0/879 [00:00<?, ?it/s]

⚠️ GPU не доступен, используется CPU
✅ Создана трансформерная модель с DeepPavlov/rubert-base-cased
Примеры обучающих данных: [('лист нержавеющий 4 x 1500 x 3000 mm aisi 201 12х15г9нд г / к матовый\n\n\nлист нержавеющий 8 x 1500 x 3000 mm aisi 201 12х15г9нд г / к матовый\n\n\nлист нержавеющий 1.5 x 1000 x 2000 mm aisi 201 12х15г9нд х / к шлифованный\n\n\nлист нержавеющий 2 x 1000 x 2000 mm aisi 201 12х15г9нд г / к просечно - вытяжной пвл\n\n\nлист нержавеющий 4 x 1000 x 2000 mm aisi 201 12х15г9нд г / к просечно - вытяжной пвл', {'entities': [(0, 4, 'product'), (5, 16, 'material'), (17, 18, 'thickness'), (21, 25, 'width'), (28, 32, 'length'), (36, 44, 'mark_steel_aisi'), (45, 54, 'mark_steal'), (55, 60, 'tehnology'), (61, 68, 'color'), (71, 75, 'product'), (76, 87, 'material'), (88, 89, 'thickness'), (92, 96, 'width'), (99, 103, 'length'), (107, 115, 'mark_steel_aisi'), (116, 125, 'mark_steal'), (126, 131, 'tehnology'), (132, 139, 'color'), (142, 146, 'product'), (147, 158, 'material'),

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Эпоха 1/10:   0%|          | 0/513 [00:00<?, ?it/s]


Эпоха 1/10
📉 Loss: 59980.936


Эпоха 2/10:   0%|          | 0/513 [00:00<?, ?it/s]


Эпоха 2/10
📉 Loss: 21648.147


Эпоха 3/10:   0%|          | 0/513 [00:00<?, ?it/s]


Эпоха 3/10
📉 Loss: 12628.520


Эпоха 4/10:   0%|          | 0/513 [00:00<?, ?it/s]


Эпоха 4/10
📉 Loss: 8729.204


Эпоха 5/10:   0%|          | 0/513 [00:00<?, ?it/s]


Эпоха 5/10
📉 Loss: 6783.503


Эпоха 6/10:   0%|          | 0/513 [00:00<?, ?it/s]


Эпоха 6/10
📉 Loss: 5536.863


Эпоха 7/10:   0%|          | 0/513 [00:00<?, ?it/s]


Эпоха 7/10
📉 Loss: 4647.616


Эпоха 8/10:   0%|          | 0/513 [00:00<?, ?it/s]


Эпоха 8/10
📉 Loss: 4039.089


Эпоха 9/10:   0%|          | 0/513 [00:00<?, ?it/s]


Эпоха 9/10
📉 Loss: 3319.055


Эпоха 10/10:   0%|          | 0/513 [00:00<?, ?it/s]

In [133]:
trainer_blank.predict('купить шестигранник калибр ст40x 27 гост 8560 - 78, 4543 - 2016 цена 013568-95')

[('купить', 'product', 0, 6),
 ('шестигранник', 'product', 7, 19),
 ('калибр', 'type', 20, 26),
 ('ст40x 27', 'mark_steal', 27, 35),
 ('гост 8560 - 78', 'standart_gost', 36, 50),
 ('4543 - 2016', 'standart_gost', 52, 63),
 ('95', 'width', 76, 78)]

In [None]:
trainer_blank_empty.predict('купить шестигранник калибр ст40x 27 гост 8560 - 78, 4543 - 2016 цена 013568-95')

