In [None]:
import pandas as pd
# Russian CSV headers -> English snake_case column names.
DATA_COLUMN_NAMES = {
    "ID": "id",
    "Название СТЕ": "STE",
    "Ссылка на изображение": "url_image",
    "ID конечной категории Портала": "id_final_category_portal",
    "Модель": "model",
    "Производитель": "producer",
}
CHARACTERISTIC_COLUMN_NAMES = {
    "ID характеристики": "id_characteristic",
    "Название характеристики": "name_characteristic",
    "ID конечной категории Портала": "id_final_category_portal",
    "Наименование конечной категории Портала": "name_final_category_portal",
    "Тип значения характеристики": "type_value_characteristic",
}

data_chunks = pd.read_csv('/kaggle/input/tender-hack/data.csv', chunksize=10000, sep=';')
characteristics_chunks = pd.read_csv('/kaggle/input/tender-hack/characteristic.csv', chunksize=10000, sep=';')

# Load the complete characteristics table before joining. Pairing the two
# readers with zip() would only join rows that happen to sit in the same
# chunk position of both files and would silently drop every other match
# (and everything past the end of the shorter file).
characteristics = pd.concat(characteristics_chunks, ignore_index=True).rename(
    columns=CHARACTERISTIC_COLUMN_NAMES
)

# Join each data chunk against the full characteristics table. The pieces are
# collected in a list and concatenated once, because concatenating inside the
# loop re-copies the accumulated frame on every iteration.
# NOTE(review): the join yields one row per (product, characteristic) pair, so
# the result can be much larger than data.csv -- watch memory on big inputs.
merged_chunks = []
for data_chunk in data_chunks:
    data_chunk = data_chunk.rename(columns=DATA_COLUMN_NAMES)
    merged_chunks.append(pd.merge(data_chunk, characteristics, on='id_final_category_portal'))

# pd.concat rejects an empty list, so fall back to an empty frame.
merged_data = pd.concat(merged_chunks, ignore_index=True) if merged_chunks else pd.DataFrame()

In [None]:
# Keep only the columns needed downstream, then drop repeated rows
# in a single chained expression.
selected_columns = ['STE', 'id', 'name_final_category_portal', 'id_final_category_portal']
result_data = merged_data[selected_columns].drop_duplicates()

# Preview the first rows.
result_data.head()


In [None]:
def get_category(df, id_category):
    """Return the category name stored for a portal category id.

    Args:
        df: DataFrame with ``id_final_category_portal`` and
            ``name_final_category_portal`` columns.
        id_category: Value compared against ``id_final_category_portal``.

    Returns:
        The ``name_final_category_portal`` value of the first matching row.

    Raises:
        IndexError: If no row matches ``id_category``.
    """
    matching_rows = df.query("id_final_category_portal == @id_category")
    first_name = matching_rows["name_final_category_portal"].iloc[:1]
    return first_name.values[0]
# Example: display the category name stored for portal category id 793373188.
get_category(result_data, 793373188)

In [None]:
import re
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

# Cleanup patterns applied by pp() in this order; each match is deleted.
# Compiled once here instead of on every call.
_PP_REMOVAL_PATTERNS = (
    re.compile(r'[^\w\s]'),     # anything that is neither a word character nor whitespace
    re.compile(r'\w*\d\w*'),    # word-character runs that contain a digit
    re.compile(r'\b\w{1}\b'),   # single-character tokens
    re.compile(r'\b[a-z]+\b'),  # tokens made only of a-z (text is lower-cased first)
)


def pp(text):
    """Normalize a product name for tokenization.

    The text is lower-cased, stripped with ``_PP_REMOVAL_PATTERNS``, split on
    whitespace, and reduced to the words for which ``morph.word_is_known``
    returns true.

    Args:
        text: Raw product name. Non-string values (e.g. NaN/None coming from
            an empty CSV cell) are treated as empty text.

    Returns:
        The kept words joined by single spaces; ``''`` when nothing is kept.
    """
    # Missing values reach this function as float NaN / None via Series.apply;
    # calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    for pattern in _PP_REMOVAL_PATTERNS:
        text = pattern.sub('', text)

    # str.split() already collapses whitespace runs and ignores leading and
    # trailing whitespace, so no separate normalization pass is needed.
    return ' '.join(word for word in text.split() if morph.word_is_known(word))

In [None]:
!pip install pymorphy2

In [None]:
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes

In [None]:
import torch
# Environment report, one value per line: the torch version, whether CUDA is
# available, and the value of torch.version.cuda.
print(torch.__version__, torch.cuda.is_available(), torch.version.cuda, sep="\n")

In [None]:
# Build the modelling frame: the portal category id as the label and the
# product name cleaned with pp() as the text feature.
category_ids = result_data['id_final_category_portal']
cleaned_names = result_data['STE'].apply(pp)
data = pd.DataFrame({'category_id': category_ids, 'ste_name': cleaned_names})

# Preview the first ten rows.
data.head(10)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, Dropout, Conv1D, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Encode the raw portal category ids as contiguous class indices 0..n-1.
# The raw ids are preserved in `category_id_raw` and the encoding is always
# derived from that column, so re-running this cell rebuilds the same
# raw-id -> index mapping instead of re-encoding already-encoded labels
# (which would lose the real category ids).
if 'category_id_raw' not in data.columns:
    data['category_id_raw'] = data['category_id']
category_mapping = {
    category_id: idx
    for idx, category_id in enumerate(sorted(data['category_id_raw'].unique()))
}
data['category_id'] = data['category_id_raw'].map(category_mapping)

# Split into training and test sets (80/20, fixed seed).
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenize and vectorize the text; the vocabulary is fitted on the training
# split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['ste_name'])
train_sequences = tokenizer.texts_to_sequences(train_data['ste_name'])
test_sequences = tokenizer.texts_to_sequences(test_data['ste_name'])

# Pad/truncate every sequence to a fixed length.
max_length = 50
train_padded = pad_sequences(train_sequences, maxlen=max_length)
test_padded = pad_sequences(test_sequences, maxlen=max_length)

# Model: embedding -> Conv1D -> bidirectional LSTM -> average pooling -> dense head.
model = Sequential([
    # +1 because the indices in tokenizer.word_index start at 1.
    Embedding(len(tokenizer.word_index) + 1, 100),
    Conv1D(64, 5, activation='relu'),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # One softmax output per encoded category.
    Dense(len(category_mapping), activation='softmax')
])

# Labels are integer class indices, hence the sparse categorical loss.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model.
# NOTE(review): the test split is also the validation set that drives early
# stopping, so the reported validation metrics are not an independent
# estimate -- consider a separate validation split.
model.fit(train_padded, train_data['category_id'],
          batch_size=128, epochs=50,
          validation_data=(test_padded, test_data['category_id']),
          callbacks=[early_stopping])

# Predict the five most probable categories for a product name.
def predict_top5_categories(ste_name):
    """Return the keys of ``category_mapping`` for the five highest-scoring classes.

    The name is tokenized with the module-level ``tokenizer``, padded to
    ``max_length`` and scored by ``model``.

    Args:
        ste_name: Product name text.

    Returns:
        List of ``category_mapping`` keys ordered from most to least probable
        (fewer than five when the model has fewer than five output classes).
    """
    # NOTE(review): the training texts were cleaned with pp() before
    # tokenization, while this input is tokenized as-is -- confirm that
    # callers pass comparable text.
    sequence = tokenizer.texts_to_sequences([ste_name])
    padded = pad_sequences(sequence, maxlen=max_length)
    probs = model.predict(padded)[0]

    # argsort is ascending: take the last five indices and reverse them.
    top5_ids = probs.argsort()[-5:][::-1]

    # Invert the key -> class-index mapping once instead of rebuilding and
    # scanning the key/value lists for every predicted index.
    index_to_category = {idx: category_id for category_id, idx in category_mapping.items()}
    top5_category_ids = [index_to_category[int(i)] for i in top5_ids]
    return top5_category_ids

# Usage example: print the top-5 predicted category ids for a sample product name.
ste_name = 'ноутбук черный'
top5_category_ids = predict_top5_categories(ste_name)
print(f"Top-5 категорий для товара '{ste_name}': {top5_category_ids}")