In [1]:
import multiprocessing
import pymorphy2
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

In [2]:
morph = pymorphy2.MorphAnalyzer()

# Загрузка стоп-слов заранее, чтобы не загружать их при каждом вызове функции
stop_words = set(stopwords.words('russian'))

def preprocess_text(text):
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9]', ' ', text)
    text = re.sub(r'<[^>]+>', '', text)
    # Приведение текста к нижнему регистру
    text = text.lower()
    
    # Токенизация текста
    tokens = word_tokenize(text)
    
    preprocessed_text = [morph.parse(word)[0].normal_form for word in tokens if word not in stop_words]
    
    return preprocessed_text

In [3]:
api_pool = {'Answer': 'Мужская обувь Nike', 'Company_id': 3982023139,	'Price': 30000,	'Name': 'Василий Александрович Пупкин'}

In [4]:
api_pool_text = preprocess_text(api_pool['Answer'])

In [5]:
api_pool_text

['мужской', 'обувь', 'nike']

In [6]:
model = Word2Vec.load('word2vec')
model.build_vocab([api_pool_text], update=True)
model.alpha = 0.01
model.train(api_pool_text, total_examples=model.corpus_count, epochs=1)

(5, 16)

In [8]:
data = pd.read_csv('data/final_dataset.csv', index_col='Unnamed: 0')

In [9]:
col = data.columns[0:24]

In [10]:
real_col = data.columns[24:-5]

In [11]:
if api_pool['Name'] not in data.index:
    new_dict = {api_pool['Company_id']: api_pool['Price']}
    new_row = pd.DataFrame({'PriceToCompany': [new_dict]}, index=[api_pool['Name']])
    new_row = new_row.reindex(columns=data.columns, fill_value=0)
    data = pd.concat([data, new_row])
else:
    if api_pool['Company_id'] not in data.loc[api_pool['Name']]['PriceToCompany']:
        data.loc[api_pool['Name']]['PriceToCompany'][api_pool['Company_id']] = api_pool['Price']
    else:
        data.loc[api_pool['Name']]['PriceToCompany'][api_pool['Company_id']] += api_pool['Price']

In [12]:
answer_vectors = [model.wv[word] for word in api_pool_text]
max_inds = col[np.argsort(-data.loc[api_pool['Name'], col])[:3]]
for cl in col:
    res = np.mean(np.maximum(model.wv.cosine_similarities(model.wv[cl], answer_vectors), 0))
    data.loc[api_pool['Name'], cl] += res
data.loc[api_pool['Name'], real_col] = 0
data.loc[api_pool['Name'],max_inds[0]+'_real'] = 1
data.loc[api_pool['Name'], 'max1_col'] = max_inds[0]
data.loc[api_pool['Name'], 'max2_col'] = max_inds[1]
data.loc[api_pool['Name'], 'max3_col'] = max_inds[2]

In [15]:
data.to_csv('data/final_dataset.csv')
model.save('word2vec')