In [1]:
!pip install tf-keras



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

import random
import os
import ast
import json
import spacy
from spacy.training import Example

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import pickle
from tqdm import tqdm
import gc

from torch.utils.data import Dataset, DataLoader
import time




In [3]:
MAX_LEN = 128
BATCH_SIZE = 32
EMB_DIM = 128
LATENT_DIM = 64
N_HEADS = 8
FF_DIM = 512
NUM_LAYERS = 2
VOCAB_SPECIAL = ['<pad>', '<bos>', '<eos>', '<unk>']

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"–ò—Å–ø–æ–ª—å–∑—É–µ–º —É—Å—Ç—Ä–æ–π—Å—Ç–≤–æ: {DEVICE}")
print(torch.cuda.is_available())
print(torch.version.cuda)
print(torch.cuda.get_device_name(0))

–ò—Å–ø–æ–ª—å–∑—É–µ–º —É—Å—Ç—Ä–æ–π—Å—Ç–≤–æ: cuda
True
12.8
NVIDIA GeForce RTX 3060 Laptop GPU


In [4]:
# –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç
df = pd.read_csv("data/polyOne_aa.csv")

# –ü—Ä–æ–≤–µ—Ä—è–µ–º –Ω–∞–ª–∏—á–∏–µ —Å—Ç–æ–ª–±—Ü–∞ SMILES (–≤ –≤–∞—à–µ–º –¥–∞—Ç–∞—Å–µ—Ç–µ –æ–Ω –Ω–∞–∑—ã–≤–∞–µ—Ç—Å—è –∏–º–µ–Ω–Ω–æ "smiles")
if "smiles" not in df.columns:
    raise ValueError("–°—Ç–æ–ª–±–µ—Ü 'smiles' –Ω–µ –Ω–∞–π–¥–µ–Ω –≤ –¥–∞—Ç–∞—Å–µ—Ç–µ. –ü—Ä–æ–≤–µ—Ä—å—Ç–µ –Ω–∞–∑–≤–∞–Ω–∏–µ —Å—Ç–æ–ª–±—Ü–∞.")

# –£–¥–∞–ª—è–µ–º —Å—Ç—Ä–æ–∫–∏ —Å –ø—É—Å—Ç—ã–º–∏ SMILES
df = df.dropna(subset=["smiles"])

# –ü–æ–ª–Ω—ã–π –º–∞–ø–ø–∏–Ω–≥ –≤—Å–µ—Ö —Å–≤–æ–π—Å—Ç–≤ –∏–∑ –¥–∞—Ç–∞—Å–µ—Ç–∞ —Å –∏—Ö —Ä–∞—Å—à–∏—Ñ—Ä–æ–≤–∫–æ–π
with open("data/user_request_dataset/property_mapping.json", "r", encoding="utf-8") as f:
    PROPERTY_MAPPING = json.load(f)

# –í—Å–µ —á–∏—Å–ª–æ–≤—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏ –∏–∑ –¥–∞—Ç–∞—Å–µ—Ç–∞ (–≤ –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã—Ö –Ω–∞–∑–≤–∞–Ω–∏—è—Ö)
NUM_FEATURES = list(PROPERTY_MAPPING.keys())
print(NUM_FEATURES)

['PROPERTY_MAPPING']


In [5]:
def categorize_values(series, property_name):
    # –ü—Ä–æ–≤–µ—Ä–∫–∞ –Ω–∞ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è
    nan_count = series.isna().sum()
    if nan_count > 0:
        print(f"‚ö†Ô∏è –í–ù–ò–ú–ê–ù–ò–ï: –í –∫–æ–ª–æ–Ω–∫–µ {property_name} –Ω–∞–π–¥–µ–Ω–æ {nan_count} –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π (NaN). –û–Ω–∏ –±—É–¥—É—Ç –∏—Å–∫–ª—é—á–µ–Ω—ã –∏–∑ –∞–Ω–∞–ª–∏–∑–∞.")
        series = series.dropna()
    
    # –ü—Ä–æ–≤–µ—Ä–∫–∞ –Ω–∞ –µ–¥–∏–Ω—Å—Ç–≤–µ–Ω–Ω–æ–µ —É–Ω–∏–∫–∞–ª—å–Ω–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ
    if series.nunique() == 1:
        print(f"‚ö†Ô∏è –í–ù–ò–ú–ê–ù–ò–ï: –í –∫–æ–ª–æ–Ω–∫–µ {property_name} –≤—Å–µ –∑–Ω–∞—á–µ–Ω–∏—è –æ–¥–∏–Ω–∞–∫–æ–≤—ã ({series.iloc[0]}). –í—Å–µ –¥–∞–Ω–Ω—ã–µ –±—É–¥—É—Ç –æ—Ç–Ω–µ—Å–µ–Ω—ã –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ '—Å—Ä–µ–¥–Ω–µ–µ'.")
        categories = pd.Series(['—Å—Ä–µ–¥–Ω–µ–µ'] * len(series), index=series.index)
        return categories, ['–Ω–∏–∑–∫–æ–µ', '—Å—Ä–µ–¥–Ω–µ–µ', '–≤—ã—Å–æ–∫–æ–µ']
    
    # –ò—Å–ø–æ–ª—å–∑—É–µ–º –∫–≤–∞–Ω—Ç–∏–ª–∏ –¥–ª—è –±–∞–ª–∞–Ω—Å–∏—Ä–æ–≤–∫–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏–π
    low_bound = series.quantile(0.33)
    high_bound = series.quantile(0.67)
    
    # –°–æ–∑–¥–∞–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–π
    bins = [-np.inf, low_bound, high_bound, np.inf]
    labels = ['–Ω–∏–∑–∫–æ–µ', '—Å—Ä–µ–¥–Ω–µ–µ', '–≤—ã—Å–æ–∫–æ–µ']
    categories = pd.cut(series, bins=bins, labels=labels)
    
    # –ü—Ä–æ–≤–µ—Ä–∫–∞ –±–∞–ª–∞–Ω—Å–∞ –∫–∞—Ç–µ–≥–æ—Ä–∏–π
    value_counts = categories.value_counts().sort_index()
    total = len(categories)
    
    # –í—ã–≤–æ–¥ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏
    print(f"–î–∏–∞–ø–∞–∑–æ–Ω –∑–Ω–∞—á–µ–Ω–∏–π –¥–ª—è '{property_name}': {series.min():.4f} - {series.max():.4f}")
    print(f"–ü–æ—Ä–æ–≥–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏–π: –Ω–∏–∑–∫–æ–µ <= {low_bound:.4f}, —Å—Ä–µ–¥–Ω–µ–µ <= {high_bound:.4f}, –≤—ã—Å–æ–∫–æ–µ > {high_bound:.4f}")
    print(f"–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–π: –Ω–∏–∑–∫–æ–µ={value_counts['–Ω–∏–∑–∫–æ–µ']} ({value_counts['–Ω–∏–∑–∫–æ–µ']/total:.1%}), "
          f"—Å—Ä–µ–¥–Ω–µ–µ={value_counts['—Å—Ä–µ–¥–Ω–µ–µ']} ({value_counts['—Å—Ä–µ–¥–Ω–µ–µ']/total:.1%}), "
          f"–≤—ã—Å–æ–∫–æ–µ={value_counts['–≤—ã—Å–æ–∫–æ–µ']} ({value_counts['–≤—ã—Å–æ–∫–æ–µ']/total:.1%})")
    
    return categories, labels

# –û–ø—Ä–µ–¥–µ–ª—è–µ–º –∫–æ–ª–æ–Ω–∫–∏ –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏
common_columns = ['Egc','Egb','Eib','CED','Ei','Eea','nc','ne','epse_6.0','epsc','epse_3.0','epse_1.78','epse_15.0','epse_4.0','epse_5.0','epse_2.0','epse_9.0','epse_7.0','TSb','TSy','epsb','YM','permCH4','permCO2','permH2','permO2','permN2','permHe','Eat','rho','LOI','Xc','Xe','Cp','Td','Tg','Tm']

In [48]:
# –ü—Ä–∏–º–µ–Ω—è–µ–º –∫–∞—Ç–µ–≥–æ—Ä–∏–∑–∞—Ü–∏—é –∫–æ –≤—Å–µ–º –∫–æ–ª–æ–Ω–∫–∞–º
for col in common_columns:
    if col in df:
        print(f"\n–û–±—Ä–∞–±–æ—Ç–∫–∞ –∫–æ–ª–æ–Ω–∫–∏ '{col}' –≤ df:")
        values = df[col]
        categories, labels = categorize_values(values, {col})
        df[f'{col}_category'] = categories
        
        # –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è –∫–∞—Ç–µ–≥–æ—Ä–∏–π
        print("–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–π:")
        print(df[f'{col}_category'].value_counts(dropna=False))
    else:
        print(f"‚ö†Ô∏è –ö–æ–ª–æ–Ω–∫–∞ '{col}' –æ—Ç—Å—É—Ç—Å—Ç–≤—É–µ—Ç")


–û–±—Ä–∞–±–æ—Ç–∫–∞ –∫–æ–ª–æ–Ω–∫–∏ 'Egc' –≤ df:
–î–∏–∞–ø–∞–∑–æ–Ω –∑–Ω–∞—á–µ–Ω–∏–π –¥–ª—è '{'Egc'}': 0.4376 - 7.4522
–ü–æ—Ä–æ–≥–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏–π: –Ω–∏–∑–∫–æ–µ <= 3.1779, —Å—Ä–µ–¥–Ω–µ–µ <= 3.6253, –≤—ã—Å–æ–∫–æ–µ > 3.6253
–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–π: –Ω–∏–∑–∫–æ–µ=165000 (33.0%), —Å—Ä–µ–¥–Ω–µ–µ=170000 (34.0%), –≤—ã—Å–æ–∫–æ–µ=165000 (33.0%)
–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–π:
Egc_category
—Å—Ä–µ–¥–Ω–µ–µ    170000
–Ω–∏–∑–∫–æ–µ     165000
–≤—ã—Å–æ–∫–æ–µ    165000
Name: count, dtype: int64

–û–±—Ä–∞–±–æ—Ç–∫–∞ –∫–æ–ª–æ–Ω–∫–∏ 'Egb' –≤ df:
–î–∏–∞–ø–∞–∑–æ–Ω –∑–Ω–∞—á–µ–Ω–∏–π –¥–ª—è '{'Egb'}': 0.7974 - 7.6678
–ü–æ—Ä–æ–≥–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏–π: –Ω–∏–∑–∫–æ–µ <= 2.7763, —Å—Ä–µ–¥–Ω–µ–µ <= 3.2696, –≤—ã—Å–æ–∫–æ–µ > 3.2696
–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–π: –Ω–∏–∑–∫–æ–µ=165000 (33.0%), —Å—Ä–µ–¥–Ω–µ–µ=170000 (34.0%), –≤—ã—Å–æ–∫–æ–µ=165000 (33.0%)
–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–π:
Egb_category
—Å—Ä–µ–¥–Ω–µ–µ    170000
–Ω–∏–∑–∫–æ–µ     165000
–≤—ã—Å–æ–∫–

In [7]:
# –ì–ª–æ–±–∞–ª—å–Ω—ã–µ –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–æ–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ (–≤—ã–Ω–µ—Å–µ–Ω—ã –¥–ª—è —É–¥–æ–±—Å—Ç–≤–∞)
GENDER_SYNONYMS = {
    'm': {  # –º—É–∂—Å–∫–æ–π —Ä–æ–¥ (–∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç, —Ä–∞–¥–∏—É—Å)
        '–Ω–∏–∑–∫–∞—è': ['–Ω–∏–∑–∫–∏–π', '–º–∞–ª–µ–Ω—å–∫–∏–π', '–Ω–µ–±–æ–ª—å—à–æ–π', '—Å–∫—Ä–æ–º–Ω—ã–π', '–Ω–µ–≤—ã—Å–æ–∫–∏–π'],
        '—Å—Ä–µ–¥–Ω—è—è': ['—Å—Ä–µ–¥–Ω–∏–π', '—É–º–µ—Ä–µ–Ω–Ω—ã–π', '–Ω–æ—Ä–º–∞–ª—å–Ω—ã–π', '—Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–π'],
        '–≤—ã—Å–æ–∫–∞—è': ['–≤—ã—Å–æ–∫–∏–π', '–±–æ–ª—å—à–æ–π', '–∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω—ã–π', '–ø—Ä–µ–≤–æ—Å—Ö–æ–¥–Ω—ã–π', '–≤—ã–¥–∞—é—â–∏–π—Å—è']
    },
    'f': {  # –∂–µ–Ω—Å–∫–∏–π —Ä–æ–¥ (–∞–∫—Ç–∏–≤–Ω–æ—Å—Ç—å, –ø–ª–æ—Ç–Ω–æ—Å—Ç—å)
        '–Ω–∏–∑–∫–∞—è': ['–Ω–∏–∑–∫–∞—è', '–º–∞–ª–µ–Ω—å–∫–∞—è', '–Ω–µ–±–æ–ª—å—à–∞—è', '—Å–∫—Ä–æ–º–Ω–∞—è', '–Ω–µ–≤—ã—Å–æ–∫–∞—è'],
        '—Å—Ä–µ–¥–Ω—è—è': ['—Å—Ä–µ–¥–Ω—è—è', '—É–º–µ—Ä–µ–Ω–Ω–∞—è', '–Ω–æ—Ä–º–∞–ª—å–Ω–∞—è', '—Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω–∞—è'],
        '–≤—ã—Å–æ–∫–∞—è': ['–≤—ã—Å–æ–∫–∞—è', '–±–æ–ª—å—à–∞—è', '–∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–∞—è', '–ø—Ä–µ–≤–æ—Å—Ö–æ–¥–Ω–∞—è', '–≤—ã–¥–∞—é—â–∞—è—Å—è']
    },
    'n': {  # —Å—Ä–µ–¥–Ω–∏–π —Ä–æ–¥ (—Å–≤–æ–π—Å—Ç–≤–æ, —Å–æ—Å—Ç–æ—è–Ω–∏–µ)
        '–Ω–∏–∑–∫–∞—è': ['–Ω–∏–∑–∫–æ–µ', '–º–∞–ª–µ–Ω—å–∫–æ–µ', '–Ω–µ–±–æ–ª—å—à–æ–µ', '—Å–∫—Ä–æ–º–Ω–æ–µ', '–Ω–µ–≤—ã—Å–æ–∫–æ–µ'],
        '—Å—Ä–µ–¥–Ω—è—è': ['—Å—Ä–µ–¥–Ω–µ–µ', '—É–º–µ—Ä–µ–Ω–Ω–æ–µ', '–Ω–æ—Ä–º–∞–ª—å–Ω–æ–µ', '—Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω–æ–µ'],
        '–≤—ã—Å–æ–∫–∞—è': ['–≤—ã—Å–æ–∫–æ–µ', '–±–æ–ª—å—à–æ–µ', '–∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ–µ', '–ø—Ä–µ–≤–æ—Å—Ö–æ–¥–Ω–æ–µ', '–≤—ã–¥–∞—é—â–µ–µ—Å—è']
    }
}

In [8]:
import json

with open("data/user_request_dataset/param_config.json", "r", encoding="utf-8") as f:
    PARAM_CONFIG = json.load(f)

In [9]:
# –®–∞–±–ª–æ–Ω—ã –¥–ª—è –Ω–∞—á–∞–ª–∞ –∑–∞–ø—Ä–æ—Å–∞
with open("data/user_request_dataset/start_phrases.json", "r", encoding="utf-8") as f:
    data = json.load(f)
START_PHRASES = data["START_PHRASES"]

In [10]:
# –¶–µ–ª–∏ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
with open("data/user_request_dataset/purposes.json", "r", encoding="utf-8") as f:
    data = json.load(f)
PURPOSES = data["PURPOSES"]

In [11]:
# –°–æ–µ–¥–∏–Ω–∏—Ç–µ–ª–∏ –¥–ª—è –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤
CONNECTORS = [
        ", –∞ —Ç–∞–∫–∂–µ ", ", –ø—Ä–∏ —ç—Ç–æ–º ", ", –Ω–æ –ø—Ä–∏ —ç—Ç–æ–º ", ", –∏ –ø—Ä–∏ —ç—Ç–æ–º ", 
        ", –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–æ ", ", —á—Ç–æ –≤–∞–∂–Ω–æ ", ", —á—Ç–æ –∫—Ä–∏—Ç–∏—á–Ω–æ ", ", –µ—â–µ "
        ", —á—Ç–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ ", ", —á—Ç–æ —Ç—Ä–µ–±—É–µ—Ç—Å—è ", ", —á—Ç–æ –≤–∞–∂–Ω–æ –¥–ª—è ", ", –∏ ", ", –∫–æ—Ç–æ—Ä—ã–π –∏–º–µ–µ—Ç "
    ]

In [13]:
def calculate_thresholds_sampled(df, sample_size=100_000):
    """
    –†–∞—Å—Å—á–∏—Ç—ã–≤–∞–µ—Ç –ø–æ—Ä–æ–≥–∏ –Ω–∞ –æ—Å–Ω–æ–≤–µ –≤—ã–±–æ—Ä–∫–∏ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —ç–∫–æ–Ω–æ–º–∏–∏ –ø–∞–º—è—Ç–∏
    """
    thresholds = {}
    for param in df.columns:
        if param not in PARAM_CONFIG:
            continue
            
        # –ë–µ—Ä–µ–º –≤—ã–±–æ—Ä–∫—É –≤–º–µ—Å—Ç–æ –ø–æ–ª–Ω–æ–≥–æ –¥–∞—Ç–∞—Ñ—Ä–µ–π–º–∞
        sample = df[param].dropna()
        if len(sample) > sample_size:
            sample = sample.sample(n=sample_size, random_state=42)
            
        if len(sample) < 10:  # –ú–∏–Ω–∏–º—É–º 10 –∑–Ω–∞—á–µ–Ω–∏–π –¥–ª—è —Ä–∞—Å—á–µ—Ç–∞ –ø–æ—Ä–æ–≥–æ–≤
            continue
            
        thresholds[param] = {
            'low': sample.quantile(0.33),
            'high': sample.quantile(0.67)
        }
    
    return thresholds

In [14]:
def get_value_category_safe(value, thresholds, param):
    """–ë–µ–∑–æ–ø–∞—Å–Ω–æ–µ –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –∑–Ω–∞—á–µ–Ω–∏—è"""
    if pd.isna(value) or param not in thresholds:
        return None
        
    if value <= thresholds[param]['low']:
        return '–Ω–∏–∑–∫–∞—è'
    elif value >= thresholds[param]['high']:
        return '–≤—ã—Å–æ–∫–∞—è'
    else:
        return '—Å—Ä–µ–¥–Ω—è—è'

In [15]:
def generate_batch_memory_efficient(batch_size, df, param_config, thresholds, min_params=2, max_params=5):
    """
    –ì–µ–Ω–µ—Ä–∏—Ä—É–µ—Ç –±–∞—Ç—á –ø—Ä–∏–º–µ—Ä–æ–≤ —Å –º–∏–Ω–∏–º–∞–ª—å–Ω—ã–º –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º –ø–∞–º—è—Ç–∏
    
    –í–º–µ—Å—Ç–æ –ø—Ä–µ–¥–∑–∞–≥—Ä—É–∑–∫–∏ –≤—Å–µ–≥–æ –¥–∞—Ç–∞—Ñ—Ä–µ–π–º–∞, –∏—Å–ø–æ–ª—å–∑—É–µ–º –≤—ã–±–æ—Ä–∫–∏
    """
    results = []
    valid_params = [p for p in param_config.keys() if p in df.columns]
    
    if not valid_params:
        return []
    
    # –ü—Ä–µ–¥–∑–∞–≥—Ä—É–∂–∞–µ–º –∫–æ–Ω—Ñ–∏–≥–∏ –¥–ª—è –≤—Å–µ—Ö –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ –æ–¥–∏–Ω —Ä–∞–∑
    param_templates = {}
    for param in valid_params:
        config = param_config[param]
        gender = config['gender']
        param_templates[param] = {
            'names': config['names'],
            'phrases': config['phrases'],
            'codes': config['codes'],
            'gender': gender,
            'adjectives': GENDER_SYNONYMS[gender]
        }
    
    for _ in range(batch_size):
        # –í—ã–±–∏—Ä–∞–µ–º —Å–ª—É—á–∞–π–Ω—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã
        num_params = random.randint(min_params, min(max_params, len(valid_params)))
        selected_params = random.sample(valid_params, num_params)
        
        param_phrases = []
        annotations = []
        
        for param in selected_params:
            config = param_templates[param]
            gender = config['gender']
            
            # –ü–æ–ª—É—á–∞–µ–º –≤—ã–±–æ—Ä–∫—É –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —ç—Ç–æ–≥–æ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞
            param_data = df[param].dropna()
            if len(param_data) == 0:
                continue
                
            # –ë–µ—Ä–µ–º —Å–ª—É—á–∞–π–Ω–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ –∏–∑ –¥–∞–Ω–Ω—ã—Ö
            idx = random.choice(param_data.index)
            value = param_data.loc[idx]
            
            # –û–ø—Ä–µ–¥–µ–ª—è–µ–º –∫–∞—Ç–µ–≥–æ—Ä–∏—é
            category = get_value_category_safe(value, thresholds, param)
            if not category:
                continue
                
            # –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º —Ñ—Ä–∞–∑—É –±–µ–∑ –∫—ç—à–∏—Ä–æ–≤–∞–Ω–∏—è –≤—Å–µ—Ö –≤–æ–∑–º–æ–∂–Ω—ã—Ö –∫–æ–º–±–∏–Ω–∞—Ü–∏–π
            adjective = random.choice(config['adjectives'][category])
            name = random.choice(config['names'])
            phrase_template = random.choice(config['phrases'])
            phrase = phrase_template.format(adjective=adjective, name=name)
            
            param_phrases.append(phrase)
            annotations.append({
                "param": param,
                "label": category,
                "phrase": phrase,
                "code": random.choice(config["codes"])
            })
        
        if len(param_phrases) < min_params:
            continue
            
        # –§–æ—Ä–º–∏—Ä—É–µ–º —Ç–µ–∫—Å—Ç –∑–∞–ø—Ä–æ—Å–∞
        params_text = param_phrases[0]
        for i in range(1, len(param_phrases)):
            connector = random.choice(CONNECTORS)
            params_text += connector + param_phrases[i]
        
        start_phrase = random.choice(START_PHRASES)
        purpose = random.choice(PURPOSES)
        text = start_phrase.format(params=params_text, purpose=purpose)
        
        params_str = "|".join([f"{ann['code']}:{ann['label']}" for ann in annotations])
        
        results.append({
            "text": text,
            "annotations": annotations,
            "params_str": params_str
        })
    
    return results

In [16]:
def generate_batch(batch_size, preloaded_data, min_params=2, max_params=10):
    """
    –ì–µ–Ω–µ—Ä–∏—Ä—É–µ—Ç –±–∞—Ç—á –ø—Ä–∏–º–µ—Ä–æ–≤ –∑–∞ –æ–¥–∏–Ω –≤—ã–∑–æ–≤ (–æ–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞–Ω–æ –¥–ª—è —Å–∫–æ—Ä–æ—Å—Ç–∏)
    """
    results = []
    param_cache = preloaded_data['param_cache']
    valid_params = preloaded_data['valid_params']
    connectors = preloaded_data['connectors']
    start_phrases = preloaded_data['start_phrases']
    purposes = preloaded_data['purposes']
    
    if not valid_params:
        return []
    
    for _ in range(batch_size):
        # –í—ã–±–∏—Ä–∞–µ–º —Å–ª—É—á–∞–π–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤
        num_params = random.randint(min_params, min(max_params, len(valid_params)))
        selected_params = random.sample(valid_params, num_params)
        
        param_phrases = []
        annotations = []
        
        for param in selected_params:
            # –ë—ã—Å—Ç—Ä–æ –≤—ã–±–∏—Ä–∞–µ–º —Å–ª—É—á–∞–π–Ω—É—é –∫–∞—Ç–µ–≥–æ—Ä–∏—é –¥–ª—è –ø–∞—Ä–∞–º–µ—Ç—Ä–∞
            categories = list(param_cache[param].keys())
            if not categories:
                continue
                
            category = random.choice(categories)
            # –ë—ã—Å—Ç—Ä–æ –≤—ã–±–∏—Ä–∞–µ–º —Å–ª—É—á–∞–π–Ω—É—é —Ñ—Ä–∞–∑—É –∏–∑ –∫—ç—à–∞
            phrase_data = random.choice(param_cache[param][category])
            
            param_phrases.append(phrase_data["phrase"])
            annotations.append({
                "param": param,
                "label": category,
                "phrase": phrase_data["phrase"],
                "code": phrase_data["code"]
            })
        
        if len(param_phrases) < min_params:
            continue
            
        # –§–æ—Ä–º–∏—Ä—É–µ–º —Ç–µ–∫—Å—Ç –∑–∞–ø—Ä–æ—Å–∞ (–≤–µ–∫—Ç–æ—Ä–∏–∑–æ–≤–∞–Ω–Ω–æ)
        params_text = param_phrases[0]
        for i in range(1, len(param_phrases)):
            connector = random.choice(connectors)
            params_text += connector + param_phrases[i]
        
        start_phrase = random.choice(start_phrases)
        purpose = random.choice(purposes)
        text = start_phrase.format(params=params_text, purpose=purpose)
        
        params_str = "|".join([f"{ann['code']}:{ann['label']}" for ann in annotations])
        
        results.append({
            "text": text,
            "annotations": annotations,
            "params_str": params_str
        })
    
    return results

In [17]:
def generate_ner_dataset_memory_efficient(df, param_config, n_samples=20_000, batch_size=500, n_workers=4):
    """
    –ì–µ–Ω–µ—Ä–∏—Ä—É–µ—Ç NER-–¥–∞—Ç–∞—Å–µ—Ç —Å –æ–ø—Ç–∏–º–∞–ª—å–Ω—ã–º –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º –ø–∞–º—è—Ç–∏
    """
    # 1. –†–∞—Å—Å—á–∏—Ç—ã–≤–∞–µ–º –ø–æ—Ä–æ–≥–∏ –Ω–∞ –≤—ã–±–æ—Ä–∫–µ
    print("–†–∞—Å—Å—á–∏—Ç—ã–≤–∞–µ–º –ø–æ—Ä–æ–≥–æ–≤—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –Ω–∞ –≤—ã–±–æ—Ä–∫–µ –¥–∞–Ω–Ω—ã—Ö...")
    thresholds = calculate_thresholds_sampled(df, sample_size=50_000)
    
    # 2. –û–ø—Ä–µ–¥–µ–ª—è–µ–º –≤–∞–ª–∏–¥–Ω—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã
    valid_params = [p for p in param_config.keys() if p in df.columns]
    print(f"–í–∞–ª–∏–¥–Ω—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –¥–ª—è –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏: {len(valid_params)}")
    
    if not valid_params:
        print("‚ö†Ô∏è –ù–µ—Ç –≤–∞–ª–∏–¥–Ω—ã—Ö –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ –¥–ª—è –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏!")
        return pd.DataFrame()
    
    # 3. –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –¥–∞–Ω–Ω—ã–µ –±–∞—Ç—á–∞–º–∏
    print(f"üöÄ –ì–µ–Ω–µ—Ä–∞—Ü–∏—è {n_samples} –ø—Ä–∏–º–µ—Ä–æ–≤ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º {n_workers} –ø–æ—Ç–æ–∫–æ–≤...")
    
    total_batches = n_samples // batch_size + (1 if n_samples % batch_size else 0)
    all_results = []
    
    # –ò—Å–ø–æ–ª—å–∑—É–µ–º ThreadPoolExecutor –≤–º–µ—Å—Ç–æ ProcessPoolExecutor –¥–ª—è –∏–∑–±–µ–∂–∞–Ω–∏—è –∫–æ–ø–∏—Ä–æ–≤–∞–Ω–∏—è –¥–∞–Ω–Ω—ã—Ö
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        # –°–æ–∑–¥–∞–µ–º —Å–ø–∏—Å–æ–∫ –∑–∞–¥–∞—á –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –±–∞—Ç—á–∞
        futures = []
        for batch_idx in range(total_batches):
            current_batch_size = batch_size if batch_idx < total_batches - 1 else n_samples % batch_size
            if current_batch_size <= 0:
                continue
                
            future = executor.submit(
                generate_batch_memory_efficient,
                current_batch_size,
                df,
                param_config,
                thresholds
            )
            futures.append(future)
        
        # –ü–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ –ø–æ–ª—É—á–∞–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å –ø—Ä–æ–≥—Ä–µ—Å—Å-–±–∞—Ä–æ–º
        for future in tqdm(futures, desc="–ì–µ–Ω–µ—Ä–∞—Ü–∏—è –¥–∞–Ω–Ω—ã—Ö"):
            try:
                batch_results = future.result()
                all_results.extend(batch_results)
                
                # –ü—Ä–∏–Ω—É–¥–∏—Ç–µ–ª—å–Ω–æ –∑–∞–ø—É—Å–∫–∞–µ–º —Å–±–æ—Ä—â–∏–∫ –º—É—Å–æ—Ä–∞ –ø–æ—Å–ª–µ –∫–∞–∂–¥–æ–≥–æ –±–∞—Ç—á–∞
                gc.collect()
                
            except Exception as e:
                print(f"‚ö†Ô∏è –û—à–∏–±–∫–∞ –ø—Ä–∏ –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏ –±–∞—Ç—á–∞: {str(e)}")
                continue
    
    # 4. –û–±—Ä–µ–∑–∞–µ–º –¥–æ —Ç–æ—á–Ω–æ–≥–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ –∑–∞–ø—Ä–æ—à–µ–Ω–Ω—ã—Ö –ø—Ä–∏–º–µ—Ä–æ–≤
    all_results = all_results[:n_samples]
    
    print(f"‚úÖ –°–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω–æ {len(all_results)} –ø—Ä–∏–º–µ—Ä–æ–≤")
    return pd.DataFrame(all_results)

In [18]:
def cached_generate_dataset_memory_efficient(df, param_config, n_samples=20_000, cache_file="ner_dataset_cache.pkl"):
    """
    –ì–µ–Ω–µ—Ä–∏—Ä—É–µ—Ç –¥–∞—Ç–∞—Å–µ—Ç —Å –∫—ç—à–∏—Ä–æ–≤–∞–Ω–∏–µ–º –∏ –æ–ø—Ç–∏–º–∞–ª—å–Ω—ã–º –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º –ø–∞–º—è—Ç–∏
    """
    # –ü—Ä–æ–≤–µ—Ä—è–µ–º –Ω–∞–ª–∏—á–∏–µ –∫—ç—à–∞
    if os.path.exists(cache_file):
        try:
            print(f"üì• –ó–∞–≥—Ä—É–∂–∞–µ–º –∫—ç—à–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –¥–∞—Ç–∞—Å–µ—Ç –∏–∑ {cache_file}")
            with open(cache_file, 'rb') as f:
                return pickle.load(f)
        except Exception as e:
            print(f"‚ö†Ô∏è –û—à–∏–±–∫–∞ –ø—Ä–∏ –∑–∞–≥—Ä—É–∑–∫–µ –∫—ç—à–∞: {str(e)}. –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –Ω–æ–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç.")
    
    # –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –Ω–æ–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç —Å —É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ–º –ø–∞–º—è—Ç—å—é
    print(f"üîÑ –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –Ω–æ–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç (–æ–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞–Ω–æ –¥–ª—è –ø–∞–º—è—Ç–∏)...")
    
    # –†–∞–±–æ—Ç–∞–µ–º —Ç–æ–ª—å–∫–æ —Å –Ω—É–∂–Ω—ã–º–∏ —Å—Ç–æ–ª–±—Ü–∞–º–∏ –¥–ª—è —ç–∫–æ–Ω–æ–º–∏–∏ –ø–∞–º—è—Ç–∏
    needed_columns = [p for p in param_config.keys() if p in df.columns]
    if not needed_columns:
        needed_columns = df.columns[:10].tolist()  # –ë–µ—Ä–µ–º –ø–µ—Ä–≤—ã–µ 10 —Å—Ç–æ–ª–±—Ü–æ–≤ –∫–∞–∫ –∑–∞–ø–∞—Å–Ω–æ–π –≤–∞—Ä–∏–∞–Ω—Ç
        
    # –°–æ–∑–¥–∞–µ–º –º–∞–ª–µ–Ω—å–∫–∏–π –¥–∞—Ç–∞—Ñ—Ä–µ–π–º —Ç–æ–ª—å–∫–æ —Å –Ω—É–∂–Ω—ã–º–∏ —Å—Ç–æ–ª–±—Ü–∞–º–∏
    small_df = df[needed_columns].copy()
    
    # –û—á–∏—â–∞–µ–º –ø–∞–º—è—Ç—å
    gc.collect()
    
    # –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –¥–∞—Ç–∞—Å–µ—Ç
    dataset = generate_ner_dataset_memory_efficient(
        small_df, 
        param_config, 
        n_samples=n_samples,
        batch_size=200,  # –£–º–µ–Ω—å—à–∞–µ–º —Ä–∞–∑–º–µ—Ä –±–∞—Ç—á–∞ –¥–ª—è —ç–∫–æ–Ω–æ–º–∏–∏ –ø–∞–º—è—Ç–∏
        n_workers=min(4, os.cpu_count() or 4)  # –ù–µ –±–æ–ª–µ–µ 4 –ø–æ—Ç–æ–∫–æ–≤
    )
    
    # –°–æ—Ö—Ä–∞–Ω—è–µ–º –≤ –∫—ç—à —Ç–æ–ª—å–∫–æ –µ—Å–ª–∏ –¥–∞—Ç–∞—Å–µ—Ç –Ω–µ –ø—É—Å—Ç–æ–π
    if not dataset.empty:
        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(dataset, f)
            print(f"üíæ –ö—ç—à —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤ {cache_file}")
        except Exception as e:
            print(f"‚ö†Ô∏è –ù–µ —É–¥–∞–ª–æ—Å—å —Å–æ—Ö—Ä–∞–Ω–∏—Ç—å –∫—ç—à: {str(e)}")
    
    return dataset

In [15]:
# –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ - –æ–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω–∞—è –≤–µ—Ä—Å–∏—è –¥–ª—è —Ä–∞–±–æ—Ç—ã —Å –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω–æ–π –ø–∞–º—è—Ç—å—é
print("‚ö° –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –æ–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –¥–∞—Ç–∞—Å–µ—Ç NER —Å —É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ–º –ø–∞–º—è—Ç—å—é...")
full_dataset = cached_generate_dataset_memory_efficient(
    df, 
    PARAM_CONFIG, 
    n_samples=100_000,  # –ù–∞—á–∏–Ω–∞–µ–º —Å –º–µ–Ω—å—à–µ–≥–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞
    cache_file="polymer_ner_dataset_100k.pkl"
)

print(f"‚úÖ –°–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω–æ {len(full_dataset)} –ø—Ä–∏–º–µ—Ä–æ–≤")
print(full_dataset.head())

‚ö° –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –æ–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –¥–∞—Ç–∞—Å–µ—Ç NER —Å —É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ–º –ø–∞–º—è—Ç—å—é...


NameError: name 'cached_generate_dataset_memory_efficient' is not defined

In [68]:
full_dataset.to_csv('data/user_request_dataset/polymer_dataset_new_100–∫.csv', index=False, encoding='utf-8')

In [46]:
import ast
import json

def normalize_dataset(input_file, output_file):
    print(f"–ó–∞–≥—Ä—É–∑–∫–∞ {input_file}...")
    df = pd.read_csv(input_file)
    
    fixed_count = 0
    error_count = 0
    
    new_params_strs = []
    
    for index, row in df.iterrows():
        try:
            # 1. –ü–∞—Ä—Å–∏–º –∞–Ω–Ω–æ—Ç–∞—Ü–∏–∏
            # –ü—Ä–æ–±—É–µ–º ast (–¥–ª—è –ø–∏—Ç–æ–Ω–æ–≤—Å–∫–∏—Ö —Å—Ç—Ä–æ–∫) –∏–ª–∏ json
            try:
                annotations = ast.literal_eval(row['annotations'])
            except:
                try:
                    annotations = json.loads(row['annotations'])
                except:
                    # –ï—Å–ª–∏ —Å–æ–≤—Å–µ–º –±–∏—Ç–∞—è —Å—Ç—Ä–æ–∫–∞, –æ—Å—Ç–∞–≤–ª—è–µ–º –∫–∞–∫ –µ—Å—Ç—å –∏–ª–∏ –ø—É—Å—Ç—É—é
                    new_params_strs.append("")
                    error_count += 1
                    continue
            
            if not isinstance(annotations, list):
                new_params_strs.append("")
                continue

            # 2. –°–æ–±–∏—Ä–∞–µ–º –Ω–æ–≤—É—é —Å—Ç—Ä–æ–∫—É params_str
            # –ë–µ—Ä–µ–º —á–µ—Ç–∫–∏–π 'param' –∏–∑ –∞–Ω–Ω–æ—Ç–∞—Ü–∏–∏ –∏ 'label'
            # –§–æ—Ä–º–∞—Ç: param1:label1|param2:label2
            parts = []
            for ann in annotations:
                if isinstance(ann, dict) and 'param' in ann and 'label' in ann:
                    clean_param = ann['param'] # –ù–∞–ø—Ä–∏–º–µ—Ä "Tm" –≤–º–µ—Å—Ç–æ "–ø–ª–∞–≤–ª"
                    clean_label = ann['label'] # "—Å—Ä–µ–¥–Ω—è—è"
                    parts.append(f"{clean_param}:{clean_label}")
            
            new_str = "|".join(parts)
            new_params_strs.append(new_str)
            fixed_count += 1
            
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –≤ —Å—Ç—Ä–æ–∫–µ {index}: {e}")
            new_params_strs.append("")
            error_count += 1

    # 3. –ó–∞–º–µ–Ω—è–µ–º —Å—Ç–æ–ª–±–µ—Ü
    df['params_str'] = new_params_strs
    
    # 4. –°–æ—Ö—Ä–∞–Ω—è–µ–º
    df.to_csv(output_file, index=False)
    print(f"–ì–æ—Ç–æ–≤–æ! –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ —Å—Ç—Ä–æ–∫: {fixed_count}. –û—à–∏–±–æ–∫: {error_count}")
    print(f"–§–∞–π–ª —Å–æ—Ö—Ä–∞–Ω–µ–Ω –∫–∞–∫: {output_file}")
    
    # –ü–æ–∫–∞–∑—ã–≤–∞–µ–º –ø—Ä–∏–º–µ—Ä
    print("\n–ü—Ä–∏–º–µ—Ä –∏—Å–ø—Ä–∞–≤–ª–µ–Ω–∏—è:")
    print(df[['annotations', 'params_str']].head(1).values)

# –ó–∞–ø—É—Å–∫ –∏—Å–ø—Ä–∞–≤–ª–µ–Ω–∏—è
normalize_dataset(
    'data/user_request_dataset/polymer_dataset_new_100–∫.csv', 
    'data/user_request_dataset/polymer_dataset_clean.csv'
)

–ó–∞–≥—Ä—É–∑–∫–∞ data/user_request_dataset/polymer_dataset_new_100–∫.csv...
–ì–æ—Ç–æ–≤–æ! –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ —Å—Ç—Ä–æ–∫: 99800. –û—à–∏–±–æ–∫: 0
–§–∞–π–ª —Å–æ—Ö—Ä–∞–Ω–µ–Ω –∫–∞–∫: data/user_request_dataset/polymer_dataset_clean.csv

–ü—Ä–∏–º–µ—Ä –∏—Å–ø—Ä–∞–≤–ª–µ–Ω–∏—è:
[["[{'param': 'Tm', 'label': '—Å—Ä–µ–¥–Ω—è—è', 'phrase': '—Å —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω–∞—è —Ç–µ–º–ø–µ—Ä–∞—Ç—É—Ä–∞ —Ñ–∞–∑–æ–≤–æ–≥–æ –ø–µ—Ä–µ—Ö–æ–¥–∞', 'code': '–ø–ª–∞–≤–ª'}, {'param': 'epse_9.0', 'label': '–≤—ã—Å–æ–∫–∞—è', 'phrase': '—Å –≤—ã–¥–∞—é—â–∞—è—Å—è –¥–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–æ–π –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å—é –Ω–∞ 9 –ì–ì—Ü', 'code': '—ç–ø—Å_9'}]"
  'Tm:—Å—Ä–µ–¥–Ω—è—è|epse_9.0:–≤—ã—Å–æ–∫–∞—è']]


In [47]:
full_dataset = pd.read_csv('data/user_request_dataset/polymer_dataset_clean.csv')

In [20]:
def filter_overlaps(entities):
    """
    –§–∏–ª—å—Ç—Ä—É–µ—Ç –ø–µ—Ä–µ—Å–µ–∫–∞—é—â–∏–µ—Å—è —Å—É—â–Ω–æ—Å—Ç–∏, –æ—Å—Ç–∞–≤–ª—è—è —Å–∞–º—ã–µ –¥–ª–∏–Ω–Ω—ã–µ.
    entities: —Å–ø–∏—Å–æ–∫ –∫–æ—Ä—Ç–µ–∂–µ–π (start, end, label)
    """
    # –°–æ—Ä—Ç–∏—Ä—É–µ–º —Å—É—â–Ω–æ—Å—Ç–∏ –ø–æ –¥–ª–∏–Ω–µ (–æ—Ç –¥–ª–∏–Ω–Ω—ã—Ö –∫ –∫–æ—Ä–æ—Ç–∫–∏–º)
    # –ï—Å–ª–∏ –¥–ª–∏–Ω—ã —Ä–∞–≤–Ω—ã, —Å–æ—Ä—Ç–∏—Ä—É–µ–º –ø–æ –Ω–∞—á–∞–ª—å–Ω–æ–π –ø–æ–∑–∏—Ü–∏–∏
    sorted_entities = sorted(entities, key=lambda x: (x[1] - x[0], x[0]), reverse=True)
    
    filtered = []
    for candidate in sorted_entities:
        cand_start, cand_end, _ = candidate
        
        is_overlap = False
        for existing in filtered:
            ex_start, ex_end, _ = existing
            
            # –ü—Ä–æ–≤–µ—Ä–∫–∞ –ø–µ—Ä–µ—Å–µ—á–µ–Ω–∏—è –æ—Ç—Ä–µ–∑–∫–æ–≤:
            # max(start1, start2) < min(end1, end2) –æ–∑–Ω–∞—á–∞–µ—Ç –ø–µ—Ä–µ—Å–µ—á–µ–Ω–∏–µ
            if max(cand_start, ex_start) < min(cand_end, ex_end):
                is_overlap = True
                break
        
        if not is_overlap:
            filtered.append(candidate)
            
    return filtered

def prepare_spacy_training_data(dataset_file):
    """–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –æ–±—É—á–µ–Ω–∏—è spaCy —Å —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–µ–π –ø–µ—Ä–µ—Å–µ—á–µ–Ω–∏–π"""
    # –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç
    df = pd.read_csv(dataset_file)
    
    # –°–æ–∑–¥–∞–µ–º –æ–±—É—á–∞—é—â–∏–µ –¥–∞–Ω–Ω—ã–µ –≤ —Ñ–æ—Ä–º–∞—Ç–µ spaCy
    training_data = []
    
    for _, row in df.iterrows():
        text = row['text']
        raw_entities = [] # –°—é–¥–∞ —Å–æ–±–∏—Ä–∞–µ–º –≤—Å–µ—Ö –∫–∞–Ω–¥–∏–¥–∞—Ç–æ–≤
        
        # --- –ë–ª–æ–∫ –ø–∞—Ä—Å–∏–Ω–≥–∞ –∞–Ω–Ω–æ—Ç–∞—Ü–∏–π (—Ç–æ—Ç –∂–µ, —á—Ç–æ –∏ —É –≤–∞—Å) ---
        if 'annotations_str' in row and pd.notna(row['annotations_str']):
            try:
                annotations = ast.literal_eval(row['annotations_str'])
            except (ValueError, SyntaxError):
                try:
                    annotations = json.loads(row['annotations_str'])
                except (json.JSONDecodeError, TypeError):
                    print(f"–ù–µ —É–¥–∞–ª–æ—Å—å —Ä–∞—Å–ø–∞—Ä—Å–∏—Ç—å –∞–Ω–Ω–æ—Ç–∞—Ü–∏–∏: {row['annotations_str']}")
                    continue
        elif 'annotations' in row and pd.notna(row['annotations']):
            try:
                annotations = ast.literal_eval(row['annotations'])
            except (ValueError, SyntaxError):
                try:
                    annotations = json.loads(row['annotations'])
                except (json.JSONDecodeError, TypeError):
                    print(f"–ù–µ —É–¥–∞–ª–æ—Å—å —Ä–∞—Å–ø–∞—Ä—Å–∏—Ç—å –∞–Ω–Ω–æ—Ç–∞—Ü–∏–∏: {row['annotations']}")
                    continue
        else:
            # print("–ù–µ –Ω–∞–π–¥–µ–Ω—ã –∞–Ω–Ω–æ—Ç–∞—Ü–∏–∏ –≤ —Å—Ç—Ä–æ–∫–µ") # –ú–æ–∂–Ω–æ –∑–∞–∫–æ–º–º–µ–Ω—Ç–∏—Ä–æ–≤–∞—Ç—å, —á—Ç–æ–±—ã –Ω–µ —Å–ø–∞–º–∏–ª–æ
            continue
        
        if not isinstance(annotations, list):
            continue
        
        # –°–æ–∑–¥–∞–µ–º –∫–∞–Ω–¥–∏–¥–∞—Ç–æ–≤ –≤ –∞–Ω–Ω–æ—Ç–∞—Ü–∏–∏
        for ann in annotations:
            if not isinstance(ann, dict):
                continue
                
            if 'phrase' not in ann or 'param' not in ann or 'label' not in ann:
                continue
            
            phrase = str(ann['phrase'])
            if not phrase: continue
            
            # !–í–ê–ñ–ù–û: text.find –Ω–∞—Ö–æ–¥–∏—Ç —Ç–æ–ª—å–∫–æ –ü–ï–†–í–û–ï –≤—Ö–æ–∂–¥–µ–Ω–∏–µ. 
            # –ï—Å–ª–∏ –≤ —Ç–µ–∫—Å—Ç–µ —Ñ—Ä–∞–∑–∞ –≤—Å—Ç—Ä–µ—á–∞–µ—Ç—Å—è –¥–≤–∞–∂–¥—ã, –∞ —Ä–∞–∑–º–µ—á–µ–Ω–∞ –≤—Ç–æ—Ä–∞—è - —ç—Ç–æ –ø—Ä–æ–±–ª–µ–º–∞.
            # –î–ª—è –ø—Ä–æ—Å—Ç–æ–≥–æ —Ä–µ—à–µ–Ω–∏—è –ø–æ–∫–∞ –æ—Å—Ç–∞–≤–∏–º find, –Ω–æ –∏–º–µ–π—Ç–µ –≤ –≤–∏–¥—É —ç—Ç–æ—Ç —Ä–∏—Å–∫.
            start = text.find(phrase)
            
            if start != -1:
                end = start + len(phrase)
                label = f"{ann['param']}_{ann['label'].upper()}"
                raw_entities.append((start, end, label))
        
        # --- –ò–°–ü–†–ê–í–õ–ï–ù–ò–ï: –§–∏–ª—å—Ç—Ä—É–µ–º –ø–µ—Ä–µ—Å–µ—á–µ–Ω–∏—è –ø–µ—Ä–µ–¥ –¥–æ–±–∞–≤–ª–µ–Ω–∏–µ–º ---
        if raw_entities:
            clean_entities = filter_overlaps(raw_entities)
            training_data.append((text, {"entities": clean_entities}))
    
    # –°–æ–∑–¥–∞–µ–º –ø–∞–ø–∫—É, –µ—Å–ª–∏ –µ—ë –Ω–µ—Ç
    os.makedirs('data/user_request_dataset', exist_ok=True)
    
    # –°–æ—Ö—Ä–∞–Ω—è–µ–º –≤ JSONL —Ñ–æ—Ä–º–∞—Ç–µ
    with open('data/user_request_dataset/spacy_training_data.jsonl', 'w', encoding='utf-8') as f:
        for text, entities_dict in training_data:
            json.dump({"text": text, "entities": entities_dict["entities"]}, f, ensure_ascii=False)
            f.write('\n')
    
    print(f"–ü–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–æ {len(training_data)} –æ–±—É—á–∞—é—â–∏—Ö –ø—Ä–∏–º–µ—Ä–æ–≤")
    return training_data

In [14]:
# –û–±–Ω–æ–≤–ª–µ–Ω–Ω—ã–π –∫–ª–∞—Å—Å –¥–∞—Ç–∞—Å–µ—Ç–∞
class PropertyDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [15]:
class OptimizedPolymerNERTrainer:
    """–û–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –∫–ª–∞—Å—Å –¥–ª—è –æ–±—É—á–µ–Ω–∏—è NER-–º–æ–¥–µ–ª–∏"""
    
    def __init__(self, model_name="ru_core_news_sm"):
        """–ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –º–æ–¥–µ–ª–∏"""
        # –ü—Ä–æ–≤–µ—Ä—è–µ–º –¥–æ—Å—Ç—É–ø–Ω–æ—Å—Ç—å GPU
        self.use_gpu = torch.cuda.is_available()
        if self.use_gpu:
            print(f"–ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è GPU: {torch.cuda.get_device_name(0)}")
        else:
            print("GPU –Ω–µ –¥–æ—Å—Ç—É–ø–µ–Ω, –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è CPU")
        
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            print(f"–ú–æ–¥–µ–ª—å {model_name} –Ω–µ –Ω–∞–π–¥–µ–Ω–∞. –°–æ–∑–¥–∞–µ–º –±–∞–∑–æ–≤—É—é –º–æ–¥–µ–ª—å.")
            self.nlp = spacy.blank("ru")
        
        # –î–æ–±–∞–≤–ª—è–µ–º NER –∫–æ–º–ø–æ–Ω–µ–Ω—Ç, –µ—Å–ª–∏ –µ–≥–æ –Ω–µ—Ç
        if "ner" not in self.nlp.pipe_names:
            self.ner = self.nlp.add_pipe("ner")
        else:
            self.ner = self.nlp.get_pipe("ner")
    
    def add_labels_to_ner(self, training_data):
        """–î–æ–±–∞–≤–ª–µ–Ω–∏–µ –º–µ—Ç–æ–∫ –≤ NER –∫–æ–º–ø–æ–Ω–µ–Ω—Ç"""
        for _, annotations in training_data:
            for ent in annotations.get("entities", []):
                if len(ent) >= 3:  # –£–±–µ–¥–∏–º—Å—è, —á—Ç–æ –µ—Å—Ç—å –º–µ—Ç–∫–∞
                    self.ner.add_label(ent[2])  # ent[2] - —ç—Ç–æ –º–µ—Ç–∫–∞
    
    def train_model_fast(self, training_data, n_iter=20):
        """–ë—ã—Å—Ç—Ä–æ–µ –æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ (5 —ç–ø–æ—Ö –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è)"""
        print(f"–ù–∞—á–∏–Ω–∞–µ–º –±—ã—Å—Ç—Ä–æ–µ –æ–±—É—á–µ–Ω–∏–µ ({n_iter} —ç–ø–æ—Ö)...")
        
        # –î–æ–±–∞–≤–ª—è–µ–º –º–µ—Ç–∫–∏
        self.add_labels_to_ner(training_data)
        
        # –£–±–∏—Ä–∞–µ–º –¥—Ä—É–≥–∏–µ –∫–æ–º–ø–æ–Ω–µ–Ω—Ç—ã –≤–æ –≤—Ä–µ–º—è –æ–±—É—á–µ–Ω–∏—è
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        unaffected_pipes = [pipe for pipe in self.nlp.pipe_names if pipe not in pipe_exceptions]
        
        # –ë—ã—Å—Ç—Ä–æ–µ –æ–±—É—á–µ–Ω–∏–µ
        with self.nlp.disable_pipes(*unaffected_pipes):  # –¢–æ–ª—å–∫–æ –æ–±—É—á–∞–µ–º NER
            # –ü–æ–ª—É—á–∞–µ–º –æ–ø—Ç–∏–º–∏–∑–∞—Ç–æ—Ä
            optimizer = self.nlp.begin_training()
            
            # –£–≤–µ–ª–∏—á–∏–≤–∞–µ–º —Ä–∞–∑–º–µ—Ä –±–∞—Ç—á–∞ –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
            batch_size = 8  # –£–≤–µ–ª–∏—á–µ–Ω —Å 4 –¥–æ 8
            
            for i in range(n_iter):
                print(f"–≠–ø–æ—Ö–∞ {i+1}/{n_iter}")
                losses = {}
                random.shuffle(training_data)
                
                # –ò—Å–ø–æ–ª—å–∑—É–µ–º –±–æ–ª—å—à–∏–µ –±–∞—Ç—á–∏ –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
                batches = spacy.util.minibatch(
                    training_data, 
                    size=spacy.util.compounding(8.0, 64.0, 1.001)  # –£–≤–µ–ª–∏—á–µ–Ω –º–∏–Ω–∏–º–∞–ª—å–Ω—ã–π —Ä–∞–∑–º–µ—Ä
                )
                
                for batch in batches:
                    examples = []
                    for text, annotations in batch:
                        doc = self.nlp.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        examples.append(example)
                    
                    # –£–º–µ–Ω—å—à–∞–µ–º drop –¥–ª—è –±–æ–ª–µ–µ —Å—Ç–∞–±–∏–ª—å–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è
                    self.nlp.update(examples, drop=0.3, losses=losses, sgd=optimizer)
                
                print(f"–ü–æ—Ç–µ—Ä–∏: {losses}")
                
                # –†–∞–Ω–Ω–∏–π –≤—ã—Ö–æ–¥, –µ—Å–ª–∏ –ø–æ—Ç–µ—Ä–∏ –æ—á–µ–Ω—å –º–∞–ª–µ–Ω—å–∫–∏–µ
                if losses.get('ner', 0) < 0.01:
                    print("–î–æ—Å—Ç–∏–≥–Ω—É—Ç—ã –º–∏–Ω–∏–º–∞–ª—å–Ω—ã–µ –ø–æ—Ç–µ—Ä–∏, –æ—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ")
                    break
        
        return self.nlp
    
    def save_model(self, model_path):
        """–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏"""
        self.nlp.to_disk(model_path)
        print(f"–ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ {model_path}")

In [16]:
import torch
print("CUDA –¥–æ—Å—Ç—É–ø–Ω–∞:", torch.cuda.is_available())
print("torch.cuda.current_device():", torch.cuda.current_device())
print("torch.cuda.get_device_name(0):", torch.cuda.get_device_name(0))
print("torch.cuda.memory_allocated():", torch.cuda.memory_allocated())


CUDA –¥–æ—Å—Ç—É–ø–Ω–∞: True
torch.cuda.current_device(): 0
torch.cuda.get_device_name(0): NVIDIA GeForce RTX 3060 Laptop GPU
torch.cuda.memory_allocated(): 0


In [21]:
def train_with_progress_monitoring():
    """–û–±—É—á–µ–Ω–∏–µ —Å –º–æ–Ω–∏—Ç–æ—Ä–∏–Ω–≥–æ–º –ø—Ä–æ–≥—Ä–µ—Å—Å–∞"""
    start_time = time.time()
    
    try:
        # –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö
        training_data = prepare_spacy_training_data(full_dataset)
        
        if not training_data:
            print("–ù–µ—Ç –æ–±—É—á–∞—é—â–∏—Ö –¥–∞–Ω–Ω—ã—Ö!")
            return None
        
        print(f"–ù–∞—á–∏–Ω–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ –Ω–∞ {len(training_data)} –ø—Ä–∏–º–µ—Ä–∞—Ö...")
        print("–≠—Ç–æ –∑–∞–π–º–µ—Ç –Ω–µ—Å–∫–æ–ª—å–∫–æ –º–∏–Ω—É—Ç...")
        
        # –°–æ–∑–¥–∞–Ω–∏–µ –∏ –æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
        trainer = OptimizedPolymerNERTrainer()
        model = trainer.train_model_fast(training_data, n_iter=20)
        
        # –°–æ–∑–¥–∞–µ–º –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏—é –¥–ª—è –º–æ–¥–µ–ª–∏
        import os
        os.makedirs("models", exist_ok=True)
        
        # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
        trainer.save_model("models/nlp_model_fast")
        
        end_time = time.time()
        training_time = end_time - start_time
        print(f"–û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ –∑–∞ {training_time:.2f} —Å–µ–∫—É–Ω–¥!")
        
        return model
        
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—É—á–µ–Ω–∏–∏: {e}")
        import traceback
        traceback.print_exc()
        return None

# –ë—ã—Å—Ç—Ä—ã–π –∑–∞–ø—É—Å–∫
model = train_with_progress_monitoring()

–ü–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–æ 99800 –æ–±—É—á–∞—é—â–∏—Ö –ø—Ä–∏–º–µ—Ä–æ–≤
–ù–∞—á–∏–Ω–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ –Ω–∞ 99800 –ø—Ä–∏–º–µ—Ä–∞—Ö...
–≠—Ç–æ –∑–∞–π–º–µ—Ç –Ω–µ—Å–∫–æ–ª—å–∫–æ –º–∏–Ω—É—Ç...
–ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è GPU: NVIDIA GeForce RTX 3060 Laptop GPU
–ú–æ–¥–µ–ª—å ru_core_news_sm –Ω–µ –Ω–∞–π–¥–µ–Ω–∞. –°–æ–∑–¥–∞–µ–º –±–∞–∑–æ–≤—É—é –º–æ–¥–µ–ª—å.
–ù–∞—á–∏–Ω–∞–µ–º –±—ã—Å—Ç—Ä–æ–µ –æ–±—É—á–µ–Ω–∏–µ (20 —ç–ø–æ—Ö)...
–≠–ø–æ—Ö–∞ 1/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.float32(892923.06)}
–≠–ø–æ—Ö–∞ 2/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.float32(618089.4)}
–≠–ø–æ—Ö–∞ 3/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.float32(604502.06)}
–≠–ø–æ—Ö–∞ 4/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.float32(410091.25)}
–≠–ø–æ—Ö–∞ 5/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.float32(218867.47)}
–≠–ø–æ—Ö–∞ 6/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.float32(182427.06)}
–≠–ø–æ—Ö–∞ 7/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.float32(172708.69)}
–≠–ø–æ—Ö–∞ 8/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.float32(175065.67)}
–≠–ø–æ—Ö–∞ 9/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.float32(167125.36)}
–≠–ø–æ—Ö–∞ 10/20
–ü–æ—Ç–µ—Ä–∏: {'ner': np.

In [58]:
class PolymerParameterExtractor:
    """–ö–ª–∞—Å—Å –¥–ª—è –∏–∑–≤–ª–µ—á–µ–Ω–∏—è –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ –∏–∑ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏—Ö –∑–∞–ø—Ä–æ—Å–æ–≤"""
    
    def __init__(self, model_path="models/nlp_model_fast"):
        """–ó–∞–≥—Ä—É–∑–∫–∞ –æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏"""
        try:
            self.nlp = spacy.load(model_path)
        except OSError:
            print("–ú–æ–¥–µ–ª—å –Ω–µ –Ω–∞–π–¥–µ–Ω–∞. –ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –±–∞–∑–æ–≤–∞—è –º–æ–¥–µ–ª—å.")
            self.nlp = spacy.load("ru_core_news_sm")
    
    def extract_parameters(self, text):
        """–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ –∏–∑ —Ç–µ–∫—Å—Ç–∞"""
        doc = self.nlp(text)
        parameters = {}
        
        for ent in doc.ents:
            # –ü–∞—Ä—Å–∏–º –º–µ—Ç–∫—É —Ñ–æ—Ä–º–∞—Ç–∞ PARAM_CATEGORY
            if '_' in ent.label_:
                param, category = ent.label_.split('_', 1)
                parameters[param] = {
                    'category': category.lower(),
                    'value': ent.text,
                    'start': ent.start_char,
                    'end': ent.end_char
                }
        
        return parameters
    
    def get_parameter_info(self):
        """–ü–æ–ª—É—á–µ–Ω–∏–µ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –æ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞—Ö"""
        return {
            "Egc": "–≠–Ω–µ—Ä–≥–∏—è –≥—Ä—É–ø–ø–æ–≤–æ–≥–æ –≤–∫–ª–∞–¥–∞ (–∫–î–∂/–º–æ–ª—å)",
            "Egb": "–≠–Ω–µ—Ä–≥–∏—è –≤–∫–ª–∞–¥–∞ —Å–≤—è–∑–∏ (–∫–î–∂/–º–æ–ª—å)",
            "Eib": "–≠–Ω–µ—Ä–≥–∏—è –≤–Ω—É—Ç—Ä–µ–Ω–Ω–µ–π —Å–≤—è–∑–∏ (–∫–î–∂/–º–æ–ª—å)",
            "CED": "–ü–ª–æ—Ç–Ω–æ—Å—Ç—å –∫–æ–≥–µ–∑–∏–æ–Ω–Ω–æ–π —ç–Ω–µ—Ä–≥–∏–∏ (–î–∂/—Å–º¬≥)",
            "Ei": "–≠–Ω–µ—Ä–≥–∏—è –∏–æ–Ω–∏–∑–∞—Ü–∏–∏ (—ç–í)",
            "Eea": "–≠–ª–µ–∫—Ç—Ä–æ–Ω–Ω–æ–µ —Å—Ä–æ–¥—Å—Ç–≤–æ (—ç–í)",
            "Eat": "–≠–Ω–µ—Ä–≥–∏—è –∞—Ç–æ–º–∏–∑–∞—Ü–∏–∏ (–∫–î–∂/–º–æ–ª—å)",
            "nc": "–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –∞—Ç–æ–º–æ–≤ —É–≥–ª–µ—Ä–æ–¥–∞",
            "ne": "–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ª–µ–∫—Ç—Ä–æ–Ω–æ–≤",
            "Xc": "–°—Ç–µ–ø–µ–Ω—å –∫—Ä–∏—Å—Ç–∞–ª–ª–∏—á–Ω–æ—Å—Ç–∏ (%)",
            "Xe": "–ü–ª–æ—Ç–Ω–æ—Å—Ç—å —Å—à–∏–≤–∫–∏ (–º–æ–ª—å/–º¬≥)",
            "epse_6.0": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –ø—Ä–∏ 6.0 –ì–ì—Ü",
            "epsc": "–°—Ç–∞—Ç–∏—á–µ—Å–∫–∞—è –¥–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å (0 –ì—Ü)",
            "epse_3.0": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –ø—Ä–∏ 3.0 –ì–ì—Ü",
            "epse_1.78": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –ø—Ä–∏ 1.78 –ì–ì—Ü",
            "epse_15.0": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –ø—Ä–∏ 15.0 –ì–ì—Ü",
            "epse_4.0": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –ø—Ä–∏ 4.0 –ì–ì—Ü",
            "epse_5.0": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –ø—Ä–∏ 5.0 –ì–ì—Ü",
            "epse_2.0": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –ø—Ä–∏ 2.0 –ì–ì—Ü",
            "epse_9.0": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –ø—Ä–∏ 9.0 –ì–ì—Ü",
            "epse_7.0": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –ø—Ä–∏ 7.0 –ì–ì—Ü",
            "epsb": "–î–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∞—è –ø—Ä–æ—á–Ω–æ—Å—Ç—å –ø—Ä–æ–±–æ—è (–∫–í/–º–º)",
            "TSb": "–ü—Ä–æ—á–Ω–æ—Å—Ç—å –ø—Ä–∏ —Ä–∞–∑—Ä—ã–≤–µ (–ú–ü–∞)",
            "TSy": "–ü—Ä–æ—á–Ω–æ—Å—Ç—å –ø—Ä–∏ —Ç–µ–∫—É—á–µ—Å—Ç–∏ (–ú–ü–∞)",
            "YM": "–ú–æ–¥—É–ª—å –Æ–Ω–≥–∞ (–ú–ü–∞)",
            "permCH4": "–ü—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –¥–ª—è CH‚ÇÑ (–ë—ç—Ä—Ä)",
            "permCO2": "–ü—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –¥–ª—è CO‚ÇÇ (–ë—ç—Ä—Ä)",
            "permH2": "–ü—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –¥–ª—è H‚ÇÇ (–ë—ç—Ä—Ä)",
            "permO2": "–ü—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –¥–ª—è O‚ÇÇ (–ë—ç—Ä—Ä)",
            "permN2": "–ü—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –¥–ª—è N‚ÇÇ (–ë—ç—Ä—Ä)",
            "permHe": "–ü—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –¥–ª—è He (–ë—ç—Ä—Ä)",
            "Cp": "–£–¥–µ–ª—å–Ω–∞—è —Ç–µ–ø–ª–æ—ë–º–∫–æ—Å—Ç—å (–î–∂/(–≥¬∑–ö))",
            "Td": "–¢–µ–º–ø–µ—Ä–∞—Ç—É—Ä–∞ —Ç–µ—Ä–º–∏—á–µ—Å–∫–æ–≥–æ —Ä–∞–∑–ª–æ–∂–µ–Ω–∏—è (–ö)",
            "Tg": "–¢–µ–º–ø–µ—Ä–∞—Ç—É—Ä–∞ —Å—Ç–µ–∫–ª–æ–≤–∞–Ω–∏—è (–ö)",
            "Tm": "–¢–µ–º–ø–µ—Ä–∞—Ç—É—Ä–∞ –ø–ª–∞–≤–ª–µ–Ω–∏—è (–ö)",
            "rho": "–ü–ª–æ—Ç–Ω–æ—Å—Ç—å (–≥/—Å–º¬≥)",
            "LOI": "–ü–æ—Ä–æ–≥ –∫–∏—Å–ª–æ—Ä–æ–¥–Ω–æ–≥–æ –∏–Ω–¥–µ–∫—Å–∞ (%)"
        }

# –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ
extractor = PolymerParameterExtractor()

In [60]:
class ParameterRangeConverter:
    """–ö–ª–∞—Å—Å –¥–ª—è –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –∫–∞—Ç–µ–≥–æ—Ä–∏–π –≤ —á–∏—Å–ª–æ–≤—ã–µ –¥–∏–∞–ø–∞–∑–æ–Ω—ã"""
    
    def __init__(self, training_datasets):
        """–ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è —Å –∏—Å—Ö–æ–¥–Ω—ã–º–∏ –¥–∞—Ç–∞—Å–µ—Ç–∞–º–∏ –¥–ª—è –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è –¥–∏–∞–ø–∞–∑–æ–Ω–æ–≤"""
        self.thresholds = self._calculate_thresholds(training_datasets)
    
    def _calculate_thresholds(self, datasets):
        """–í—ã—á–∏—Å–ª–µ–Ω–∏–µ –ø–æ—Ä–æ–≥–æ–≤ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞"""
        thresholds = {}
        
        # –û–±—ä–µ–¥–∏–Ω—è–µ–º –≤—Å–µ –¥–∞—Ç–∞—Å–µ—Ç—ã –¥–ª—è –≤—ã—á–∏—Å–ª–µ–Ω–∏—è –≥–ª–æ–±–∞–ª—å–Ω—ã—Ö –ø–æ—Ä–æ–≥–æ–≤
        all_data = pd.concat(datasets, ignore_index=True)
        
        # –í—ã—á–∏—Å–ª—è–µ–º –ø–æ—Ä–æ–≥–∏ –¥–ª—è –≤—Å–µ—Ö –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤
        for param in set(common_columns):
            if param in all_data.columns:
                valid_values = all_data[param].dropna()
                if len(valid_values) > 0:
                    low = valid_values.quantile(0.33)
                    high = valid_values.quantile(0.67)
                    thresholds[param] = {
                        'low': low,
                        'high': high,
                        'min': valid_values.min(),
                        'max': valid_values.max(),
                        'mean': valid_values.mean(),
                        'std': valid_values.std()
                    }
        
        return thresholds
    
    def get_range_for_category(self, parameter, category):
        """–ü–æ–ª—É—á–µ–Ω–∏–µ —á–∏—Å–ª–æ–≤–æ–≥–æ –¥–∏–∞–ø–∞–∑–æ–Ω–∞ –¥–ª—è –ø–∞—Ä–∞–º–µ—Ç—Ä–∞ –∏ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏"""
        if parameter not in self.thresholds:
            return None
        
        param_info = self.thresholds[parameter]
        
        if category == '–Ω–∏–∑–∫–∞—è':
            return {
                'min': param_info['min'],
                'max': param_info['low'],
                'description': f"–ù–∏–∑–∫–∏–µ –∑–Ω–∞—á–µ–Ω–∏—è {parameter}"
            }
        elif category == '—Å—Ä–µ–¥–Ω—è—è':
            return {
                'min': param_info['low'],
                'max': param_info['high'],
                'description': f"–°—Ä–µ–¥–Ω–∏–µ –∑–Ω–∞—á–µ–Ω–∏—è {parameter}"
            }
        elif category == '–≤—ã—Å–æ–∫–∞—è':
            return {
                'min': param_info['high'],
                'max': param_info['max'],
                'description': f"–í—ã—Å–æ–∫–∏–µ –∑–Ω–∞—á–µ–Ω–∏—è {parameter}"
            }
        else:
            return None
    
    def get_detailed_recommendation(self, parameter, category):
        """–ü–æ–ª—É—á–µ–Ω–∏–µ –¥–µ—Ç–∞–ª—å–Ω–æ–π —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏"""
        if parameter not in self.thresholds:
            return "–ü–∞—Ä–∞–º–µ—Ç—Ä –Ω–µ –Ω–∞–π–¥–µ–Ω"
        
        param_info = self.thresholds[parameter]
        range_info = self.get_range_for_category(parameter, category)
        
        if not range_info:
            return "–ù–µ–≤–µ—Ä–Ω–∞—è –∫–∞—Ç–µ–≥–æ—Ä–∏—è"
        
        # –§–æ—Ä–º–∞—Ç–∏—Ä—É–µ–º –∑–Ω–∞—á–µ–Ω–∏—è –≤ –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏ –æ—Ç –ø–æ—Ä—è–¥–∫–∞ –≤–µ–ª–∏—á–∏–Ω—ã
        if abs(param_info['max'] - param_info['min']) > 100:
            min_val = f"{range_info['min']:.2f}"
            max_val = f"{range_info['max']:.2f}"
        elif abs(param_info['max'] - param_info['min']) > 1:
            min_val = f"{range_info['min']:.3f}"
            max_val = f"{range_info['max']:.3f}"
        else:
            min_val = f"{range_info['min']:.4f}"
            max_val = f"{range_info['max']:.4f}"
        
        return {
            'parameter': parameter,
            'category': category,
            'range': f"[{min_val}, {max_val}]",
            'min_value': float(range_info['min']),
            'max_value': float(range_info['max']),
            'description': range_info['description']
        }

# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –∫–æ–Ω–≤–µ—Ä—Ç–µ—Ä–∞
converter = ParameterRangeConverter([df])

In [61]:
class PolymerRecommendationSystem:
    """–û—Å–Ω–æ–≤–Ω–æ–π –∫–ª–∞—Å—Å –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏—Ö –∑–∞–ø—Ä–æ—Å–æ–≤"""
    
    def __init__(self, ner_model_path="models/nlp_model_fast", training_datasets=None):
        """–ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è —Å–∏—Å—Ç–µ–º—ã"""
        self.extractor = PolymerParameterExtractor(ner_model_path)
        self.converter = ParameterRangeConverter(training_datasets) if training_datasets else None
    
    def process_request(self, user_request):
        """–û–±—Ä–∞–±–æ—Ç–∫–∞ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–æ–≥–æ –∑–∞–ø—Ä–æ—Å–∞"""
        print(f"–û–±—Ä–∞–±–æ—Ç–∫–∞ –∑–∞–ø—Ä–æ—Å–∞: {user_request}")
        
        # 1. –ò–∑–≤–ª–µ–∫–∞–µ–º –ø–∞—Ä–∞–º–µ—Ç—Ä—ã
        parameters = self.extractor.extract_parameters(user_request)
        
        if not parameters:
            return {
                "error": "–ù–µ —É–¥–∞–ª–æ—Å—å –∏–∑–≤–ª–µ—á—å –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –∏–∑ –∑–∞–ø—Ä–æ—Å–∞",
                "extracted_text": user_request
            }
        
        # 2. –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º –≤ —á–∏—Å–ª–æ–≤—ã–µ –¥–∏–∞–ø–∞–∑–æ–Ω—ã
        recommendations = {}
        for param, info in parameters.items():
            category = info['category']
            if self.converter:
                recommendation = self.converter.get_detailed_recommendation(param, category)
                recommendations[param] = {
                    'category': category,
                    'extracted_phrase': info['value'],
                    'recommendation': recommendation
                }
            else:
                recommendations[param] = {
                    'category': category,
                    'extracted_phrase': info['value'],
                    'recommendation': f"–î–ª—è –ø–∞—Ä–∞–º–µ—Ç—Ä–∞ {param} —Å –∫–∞—Ç–µ–≥–æ—Ä–∏–µ–π {category}"
                }
        
        return {
            "original_request": user_request,
            "extracted_parameters": parameters,
            "recommendations": recommendations,
            "success": True
        }
    
    def format_output(self, result):
        """–§–æ—Ä–º–∞—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞ –¥–ª—è –≤—ã–≤–æ–¥–∞"""
        if not result.get('success', False):
            return f"–û—à–∏–±–∫–∞: {result.get('error', '–ù–µ–∏–∑–≤–µ—Å—Ç–Ω–∞—è –æ—à–∏–±–∫–∞')}"
        
        output = []
        output.append("=== –†–ï–ö–û–ú–ï–ù–î–ê–¶–ò–ò –ü–û –ü–û–õ–ò–ú–ï–†–£ ===")
        output.append(f"–ó–∞–ø—Ä–æ—Å: {result['original_request']}")
        output.append("")
        output.append("–ò–∑–≤–ª–µ—á–µ–Ω–Ω—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã:")
        
        for param, info in result['recommendations'].items():
            output.append(f"  {param}:")
            output.append(f"    –ö–∞—Ç–µ–≥–æ—Ä–∏—è: {info['category']}")
            output.append(f"    –ò–∑–≤–ª–µ—á–µ–Ω–æ: {info['extracted_phrase']}")
            if isinstance(info['recommendation'], dict):
                rec = info['recommendation']
                output.append(f"    –î–∏–∞–ø–∞–∑–æ–Ω –∑–Ω–∞—á–µ–Ω–∏–π: {rec['range']}")
                output.append(f"    –û–ø–∏—Å–∞–Ω–∏–µ: {rec['description']}")
            else:
                output.append(f"    –†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è: {info['recommendation']}")
            output.append("")
        
        return "\n".join(output)

In [26]:
def save_model_artifacts(model, converter, extractor, save_dir="models/nlp_model_fast/ner"):
    """
    –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –≤—Å–µ—Ö –∫–æ–º–ø–æ–Ω–µ–Ω—Ç–æ–≤ —Å–∏—Å—Ç–µ–º—ã (NLP –º–æ–¥–µ–ª—å + –≤—Å–ø–æ–º–æ–≥–∞—Ç–µ–ª—å–Ω—ã–µ –∫–ª–∞—Å—Å—ã)
    
    Args:
        model: –≠–∫–∑–µ–º–ø–ª—è—Ä OptimizedPolymerNERTrainer
        converter: –û–±—ä–µ–∫—Ç –∫–æ–Ω–≤–µ—Ä—Ç–µ—Ä–∞ (–Ω–∞–ø—Ä–∏–º–µ—Ä, –¥–ª—è –ø–µ—Ä–µ–≤–æ–¥–∞ –∑–Ω–∞—á–µ–Ω–∏–π)
        extractor: –û–±—ä–µ–∫—Ç —ç–∫—Å—Ç—Ä–∞–∫—Ç–æ—Ä–∞ (–ª–æ–≥–∏–∫–∞ –∏–∑–≤–ª–µ—á–µ–Ω–∏—è)
        save_dir: –ü—É—Ç—å –∫ –ø–∞–ø–∫–µ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è
    """
    # –°–æ–∑–¥–∞–µ–º –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏—é, –µ—Å–ª–∏ –µ—ë –Ω–µ—Ç
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        
    print(f"–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –∞—Ä—Ç–µ—Ñ–∞–∫—Ç–æ–≤ –≤: {save_dir}...")

    try:
        # 1. –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å spaCy
        nlp = model.nlp if hasattr(model, 'nlp') else model
        nlp_path = os.path.join(save_dir, "spacy_ner_model")
        nlp.to_disk(nlp_path)
        print(f"‚úì NLP –º–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ {nlp_path}")

        # 2. –°–æ—Ö—Ä–∞–Ω—è–µ–º Python –æ–±—ä–µ–∫—Ç—ã (Converter –∏ Extractor) —á–µ—Ä–µ–∑ pickle
        # –ú—ã —Å–æ—Ö—Ä–∞–Ω—è–µ–º –∏—Ö –≤ –æ–¥–∏–Ω —Ñ–∞–π–ª —Å–ª–æ–≤–∞—Ä–µ–º, —á—Ç–æ–±—ã –ø—Ä–æ—â–µ –±—ã–ª–æ –≥—Ä—É–∑–∏—Ç—å
        artifacts = {
            "converter": converter,
            "extractor": extractor,
            # –ú–æ–∂–Ω–æ –¥–æ–±–∞–≤–∏—Ç—å –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ –≤–µ—Ä—Å–∏–∏
            "version": "1.0",
            "model_type": "polymer_ner"
        }
        
        artifacts_path = os.path.join(save_dir, "artifacts.pkl")
        with open(artifacts_path, "wb") as f:
            pickle.dump(artifacts, f)
            
        print(f"‚úì –í—Å–ø–æ–º–æ–≥–∞—Ç–µ–ª—å–Ω—ã–µ –∫–æ–º–ø–æ–Ω–µ–Ω—Ç—ã (converter, extractor) —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ {artifacts_path}")
        print("–í—Å–µ –∫–æ–º–ø–æ–Ω–µ–Ω—Ç—ã —Å–∏—Å—Ç–µ–º—ã —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã.")
        
    except Exception as e:
        print(f"CRITICAL ERROR –ø—Ä–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–∏ –º–æ–¥–µ–ª–∏: {e}")
        raise e

In [66]:
# –°–æ–∑–¥–∞–µ–º —Å–∏—Å—Ç–µ–º—É
system = PolymerRecommendationSystem(
    ner_model_path="models/nlp_model_fast",
    training_datasets=[df]
)

# –û–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ–º –∑–∞–ø—Ä–æ—Å
request = "–ò–Ω—Ç–µ—Ä–µ—Å—É–µ—Ç –ø–æ–ª–∏–º–µ—Ä —Å —Å –º–∞–ª–µ–Ω—å–∫–∏–º –∏–Ω–¥–µ–∫—Å–æ–º –≥–æ—Ä—é—á–µ—Å—Ç–∏, —á—Ç–æ –≤–∞–∂–Ω–æ –¥–ª—è —Å —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω–æ–π —Å—Ç–∞—Ç–∏—á–µ—Å–∫–æ–π –¥–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–æ–π –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç–∏, –µ—â–µ , —á—Ç–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ –Ω–∏–∑–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –¥–ª—è –≥–µ–ª–∏—è, —á—Ç–æ –≤–∞–∂–Ω–æ —Å –≤—ã–¥–∞—é—â–∏–º—Å—è —ç–Ω–µ—Ä–≥–µ—Ç–∏—á–µ—Å–∫–∏–º –≤–∫–ª–∞–¥–æ–º —Å–≤—è–∑–µ–π, –∏ –Ω–∏–∑–∫–æ–π –ø–ª–æ—Ç–Ω–æ—Å—Ç—å—é, —á—Ç–æ–±—ã –≤—ã–¥–µ—Ä–∂–∏–≤–∞—Ç—å –≤—ã—Å–æ–∫–∏–µ –Ω–∞–≥—Ä—É–∑–∫–∏"
result = system.process_request(request)
print(system.format_output(result))

–û–±—Ä–∞–±–æ—Ç–∫–∞ –∑–∞–ø—Ä–æ—Å–∞: –ò–Ω—Ç–µ—Ä–µ—Å—É–µ—Ç –ø–æ–ª–∏–º–µ—Ä —Å —Å –º–∞–ª–µ–Ω—å–∫–∏–º –∏–Ω–¥–µ–∫—Å–æ–º –≥–æ—Ä—é—á–µ—Å—Ç–∏, —á—Ç–æ –≤–∞–∂–Ω–æ –¥–ª—è —Å —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω–æ–π —Å—Ç–∞—Ç–∏—á–µ—Å–∫–æ–π –¥–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–æ–π –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç–∏, –µ—â–µ , —á—Ç–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ –Ω–∏–∑–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –¥–ª—è –≥–µ–ª–∏—è, —á—Ç–æ –≤–∞–∂–Ω–æ —Å –≤—ã–¥–∞—é—â–∏–º—Å—è —ç–Ω–µ—Ä–≥–µ—Ç–∏—á–µ—Å–∫–∏–º –≤–∫–ª–∞–¥–æ–º —Å–≤—è–∑–µ–π, –∏ –Ω–∏–∑–∫–æ–π –ø–ª–æ—Ç–Ω–æ—Å—Ç—å—é, —á—Ç–æ–±—ã –≤—ã–¥–µ—Ä–∂–∏–≤–∞—Ç—å –≤—ã—Å–æ–∫–∏–µ –Ω–∞–≥—Ä—É–∑–∫–∏
=== –†–ï–ö–û–ú–ï–ù–î–ê–¶–ò–ò –ü–û –ü–û–õ–ò–ú–ï–†–£ ===
–ó–∞–ø—Ä–æ—Å: –ò–Ω—Ç–µ—Ä–µ—Å—É–µ—Ç –ø–æ–ª–∏–º–µ—Ä —Å —Å –º–∞–ª–µ–Ω—å–∫–∏–º –∏–Ω–¥–µ–∫—Å–æ–º –≥–æ—Ä—é—á–µ—Å—Ç–∏, —á—Ç–æ –≤–∞–∂–Ω–æ –¥–ª—è —Å —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω–æ–π —Å—Ç–∞—Ç–∏—á–µ—Å–∫–æ–π –¥–∏—ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–æ–π –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç–∏, –µ—â–µ , —á—Ç–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ –Ω–∏–∑–∫–∞—è –ø—Ä–æ–Ω–∏—Ü–∞–µ–º–æ—Å—Ç—å –¥–ª—è –≥–µ–ª–∏—è, —á—Ç–æ –≤–∞–∂–Ω–æ —Å –≤—ã–¥–∞—é—â–∏–º—