In [63]:
import pandas as pd
import os
import warnings

warnings.filterwarnings("ignore")
ROOT = os.getcwd().split('\\LLM')[0] + '\\LLM\\data\\'
supplier_df = pd.read_pickle(os.path.join(ROOT, "supplier_df.pkl"))
category_df = pd.read_pickle(os.path.join(ROOT, "category_df.pkl"))
mapping_df = pd.read_pickle(os.path.join(ROOT, "mapping_df.pkl"))

mapping_df = mapping_df[['Артикул', 'type_id', 'description_category_id']]
mapping_df.drop_duplicates(inplace=True)

vcdf = mapping_df['Артикул'].value_counts()
vcdf = vcdf[vcdf==1]
articuls = vcdf.index.tolist()

clean_mapping_df = mapping_df[mapping_df['Артикул'].isin(articuls)]

clean_supplier_df = supplier_df[supplier_df['Код артикула'].isin(clean_mapping_df['Артикул'])]
clean_supplier_df.loc[:, 'text'] = clean_supplier_df.apply(lambda row: '/'.join(row[['Название', 'Группа товаров', 'Раздел']].tolist()), axis=1)
clean_supplier_df.rename(columns={'Код артикула':'articul'}, inplace=True)

prepare_supplier_df = clean_supplier_df[['articul', 'text']]
if prepare_supplier_df.shape[0] != clean_supplier_df.shape[0]:
  print('ERROR: не все товары были найдены у поставщика')

def get_category_text(v) -> str | None:
    type_id = v['type_id']
    description_category_id = v['description_category_id']
    filtered_category_df = category_df[((category_df['2_type_id'] == type_id) & (category_df['1_description_category_id'] == description_category_id))]
    if filtered_category_df.empty:
      print(f'NOT FOUND ERROR: {type_id}')
      return None
    elif filtered_category_df.shape[0] > 1:
      print(f'NOT UNIQUE ERROR: {type_id}')
    else:
      return '/'.join(filtered_category_df.iloc[0][['0_category_name', '1_category_name', '2_type_name']].tolist())

data = pd.DataFrame({
    'text': clean_mapping_df.merge(right=prepare_supplier_df, left_on='Артикул', right_on='articul', how='left')['text'].tolist(),
    'label': clean_mapping_df[['description_category_id', 'type_id']].apply(get_category_text, axis=1).tolist()
})



In [86]:
from tqdm import tqdm
import re
all_text = ' '.join(data.text.tolist())
all_text = re.sub(',', ' ', all_text)
all_text = list(set(all_text.split(' ')))
all_text = [v for v in all_text if not re.search('\d', v)]

word_df = pd.DataFrame(columns = data.label.unique().tolist(),
                       index = all_text)

grouped_text = data.groupby('label')['text'].apply(lambda x: ' '.join(x)).to_dict()

for word in tqdm(word_df.index, total=len(word_df)):
    pattern = re.escape(word)
    regex = re.compile(pattern)
    for label, text in grouped_text.items():
        word_df.loc[word, label] = len(regex.findall(text))
word_df = word_df.iloc[1:]
word_df = word_df.apply(lambda row: row / row.sum() * 100, axis=1)

100%|██████████| 1909/1909 [00:18<00:00, 104.30it/s]


In [87]:
word_df

Unnamed: 0,Строительство и ремонт/Средства защиты и пожаротушения/Очки защитные,Строительство и ремонт/Средства защиты и пожаротушения/Щиток защитный,Строительство и ремонт/Оснастка для инструмента/Бита,Строительство и ремонт/Инструменты для ремонта и строительства/Бокорезы,Строительство и ремонт/Инструменты для ремонта и строительства/Захват,Строительство и ремонт/Инструменты для ремонта и строительства/Зубило,Строительство и ремонт/Инструменты для ремонта и строительства/Щипцы строительные,Строительство и ремонт/Инструменты для ремонта и строительства/Киянка,Строительство и ремонт/Инструменты для ремонта и строительства/Ключ баллонный,Строительство и ремонт/Инструменты для ремонта и строительства/Набор ключей,...,Дом и сад/Насосы для дачи/Скважинный насос,Строительство и ремонт/Инструменты для ремонта и строительства/Полосогиб/крюкогиб,Строительство и ремонт/Инструменты для ремонта и строительства/Болторез,Дом и сад/Садовая техника/Газонокосилка электрическая,Дом и сад/Садовая техника/Садовый триммер бензиновый,Дом и сад/Садовый инструмент/Поясной держатель для садовых инструментов,Дом и сад/Оснастка и запчасти к садовой технике/Леска для триммера,Дом и сад/Оснастка и запчасти к садовой технике/Катушка с леской,Дом и сад/Оснастка и запчасти к садовой технике/Нож для газонокосилки,Строительство и ремонт/Расходники для инструмента/Губка шлифовальная
имбусовые,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.727273,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
инструмент/Гладилки,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
РР,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
двухлапковая,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
профиль,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
никелированный,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
скидки/Режущий,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
канатно-веревочные/Шпагаты,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
каленый,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
data['label'].value_counts()

label
Строительство и ремонт/Расходники для инструмента/Сверло                    160
Строительство и ремонт/Инструменты для ремонта и строительства/Ключ         122
Дом и сад/Садовый инструмент/Лопата                                          92
Строительство и ремонт/Инструменты для ремонта и строительства/Шпатель       83
Строительство и ремонт/Средства защиты и пожаротушения/Перчатки защитные     79
                                                                           ... 
Строительство и ремонт/Инструменты для ремонта и строительства/Труборез       1
Строительство и ремонт/Ручки, замки и фурнитура/Проушина                      1
Строительство и ремонт/Расходники для инструмента/Пика                        1
Строительство и ремонт/Ручки, замки и фурнитура/Петля мебельная               1
Строительство и ремонт/Расходники для инструмента/Губка шлифовальная          1
Name: count, Length: 222, dtype: int64

In [89]:
word_df.loc[
    word_df['Строительство и ремонт/Расходники для инструмента/Сверло']==100,
].index

Index(['пылеудаление', 'улучшенное', 'нитридтитановых', 'коробке', '(через',
       'HSS', 'мм.)', 'центровочное', 'Форстнера', 'бокс', 'перовое',
       'быстрорежущая', 'НSS', 'перовых', 'мм.'],
      dtype='object')

In [115]:
def get_patter(category, cache):
    if category in cache:
        return cache[category]
    
    patterns = word_df.loc[word_df[category] == 100].index.tolist()
    cache[category] = patterns
    return patterns

import time
start = time.time()
pattern_cache = {}
columns = word_df.columns.tolist()
supplier_values = supplier_df.iloc[:10, 1].values  # Сохраняем нужные значения для 10 строк

for i, value in enumerate(supplier_values):
    print(value)
    variants = pd.DataFrame(index=columns, columns=['count'])

    for col in columns:
        paterns = get_patter(col, pattern_cache)  # Используем кэш
        if paterns:
            finds = [p in value for p in paterns]
            variants.loc[col, 'count'] = sum(finds) / len(paterns) * 100
    
    variants = variants[(variants['count'] != 0) & variants['count'].notna()]
    if variants.empty:
        print('NOT FOUND')
    else:
        for idx, row in variants.iterrows():
            print(row['count'], ' | ', idx)
    print()
print(f'TOTAL TIME: {time.time() - start}')

Комбинезон защитный СТАНДАРТ, 40 г/м2,  для малярных и строительных работ, 176 - 182 см Сибртех
NOT FOUND

Комбинезон защитный СТАНДАРТ, 40 г/м2,  для малярных и строительных работ, 182 - 188 см Сибртех
NOT FOUND

Комбинезон защитный ЛЮКС, 60 г/м2, для малярных и строит. работ, 176-182 см Сибртех
NOT FOUND

Комбинезон защитный ЛЮКС, 60 г/м2, для малярных и строит. работ, 182-188 см Сибртех
NOT FOUND

Очки защитные закрытого типа с непрямой вентиляцией, поликарбонат Россия Сибртех
28.57142857142857  |  Строительство и ремонт/Средства защиты и пожаротушения/Очки защитные

Очки защитные закрытого типа с прямой вентиляцией, поликарбонат Россия Сибртех
21.428571428571427  |  Строительство и ремонт/Средства защиты и пожаротушения/Очки защитные

Очки защитные закрытого типа, герметичные, поликарбонат Россия Сибртех
21.428571428571427  |  Строительство и ремонт/Средства защиты и пожаротушения/Очки защитные

Очки защитные "Панорама" с непрямой вентиляцией Сибртех
21.428571428571427  |  Строител

In [114]:
len(supplier_df)*(0.5161001682281494/10)/60

12.194586808284125