# Загрузка библиотек:

In [1]:
%pip install pymorphy3 annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
import pandas as pd, numpy as np, re, nltk

import pymorphy3
morph = pymorphy3.MorphAnalyzer()

from functools import lru_cache
from annoy import AnnoyIndex
from tqdm import tqdm
from itertools import islice

from nltk.tokenize import word_tokenize
nltk.download('punkt')

from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords_russian = stopwords.words('russian')

from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer()

from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components = 100, random_state = 21)

from google.colab import drive
drive.mount("/content/drive")
data_path = "/content/drive/MyDrive/Recommendation system/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Получение данных:

In [3]:
df_products = pd.read_csv(data_path + "products.csv", delimiter = ";")
df_participants = pd.read_csv(data_path + "participants.csv", delimiter = ";")

## Удаление пустых значений:

In [4]:
df_products.dropna(inplace = True)
df_participants.dropna(inplace = True)

## Слияние таблиц:

In [5]:
df_merged = df_products.merge(df_participants, how = "inner", on = "pn_lot")

## Разбиение слитых данных на тренировочные (3 месяца) и тестируемые (последний месяц):

In [6]:
df_train = df_merged[df_merged["min_publish_date"] <= '2022-09-01']
df_test = df_merged[df_merged["min_publish_date"] >= '2022-09-01']

# Функции обработки атрибутов:

## Лемматизация и токенизация:

In [7]:
@lru_cache
def lemmatize_token(token: str) -> str:
    return morph.parse(token)[0].normal_form

def process_item_description(item_description: str) -> str:
    item_description = item_description.lower().strip()
    item_description = re.sub(r"[^\w\d\s]", "", item_description)
    tokens = [token for token in word_tokenize(item_description) if token not in stopwords_russian]
    lemmatized_tokens = [lemmatize_token(token) for token in tokens]
    return " ".join(lemmatized_tokens)

def process_item_descriptions(item_descriptions: list) -> list:
    return [process_item_description(item_description) for item_description in tqdm(item_descriptions)]

## Получение текстовых сущностей:

In [8]:
def obtaining_train_lsa_vectors(item_descriptions: list) -> list:
    train_tf_idf_vectors = tf_idf_vectorizer.fit_transform(item_descriptions)
    train_lsa_vectors = lsa.fit_transform(train_tf_idf_vectors)
    return train_lsa_vectors

def obtaining_test_lsa_vectors(item_descriptions: list) -> list:
    test_tf_idf_vectors = tf_idf_vectorizer.transform(item_descriptions)
    test_lsa_vectors = lsa.transform(test_tf_idf_vectors)
    return test_lsa_vectors

# Обработка наименований закупок для всех тренировочных данных, сгруппированных по поставщику:

In [9]:
grouped_item_descriptions = df_train.groupby("participant_inn_kpp")['item_descriptions'].agg('; '.join)

grouped_indexes = grouped_item_descriptions.index
grouped_item_descriptions = grouped_item_descriptions.to_numpy()

In [10]:
grouped_item_descriptions = process_item_descriptions(grouped_item_descriptions)

100%|██████████| 42910/42910 [11:45<00:00, 60.83it/s] 


In [13]:
grouped_item_descriptions_vectors = obtaining_train_lsa_vectors(grouped_item_descriptions)

In [14]:
grouped_participant_vectors = {grouped_index: grouped_item_descriptions_vectors for grouped_index, grouped_item_descriptions_vectors in zip(grouped_indexes, grouped_item_descriptions_vectors)}

# Обработка наименований закупок для всех тестовых данных:

In [15]:
test_indexes = df_test.index
test_item_descriptions = df_test["item_descriptions"].to_numpy()

In [16]:
processed_test_item_descriptions = process_item_descriptions(test_item_descriptions)

100%|██████████| 228977/228977 [03:19<00:00, 1147.08it/s]


In [17]:
test_lsa_vectors = obtaining_test_lsa_vectors(processed_test_item_descriptions)

# Создание хранилища LSA-векторов:

In [18]:
annoy_storage = AnnoyIndex(100, "angular")

annoy_storage.set_seed(21)

In [19]:
for index, vector in zip(test_indexes, test_lsa_vectors):
    annoy_storage.add_item(index, vector)

In [20]:
annoy_storage.build(21)

True

# Формирование рекомендаций:

In [56]:
def indexes_to_pn_lots(indexes: list) -> set:
    return set(df_test.loc[df_test.index == index, "pn_lot"].to_numpy()[0] for index in indexes)

def predict(grouped_participant_vectors: dict) -> dict:
    participant_predictions = dict()
    for participant in tqdm(grouped_participant_vectors.keys()):
        participant_okpd2_codes = df_train.loc[df_train["participant_inn_kpp"] == participant, "okpd2_code"].unique()
        participant_region_codes = df_train.loc[df_train["participant_inn_kpp"] == participant, "region_code"].unique()
        participant_df = df_test[(df_test["okpd2_code"].isin(participant_okpd2_codes)) & (df_test["region_code"].isin(participant_region_codes))]
        predictions = indexes_to_pn_lots(annoy_storage.get_nns_by_vector(grouped_participant_vectors[participant], 10))
        final_predictions = participant_df.loc[participant_df["pn_lot"].isin(predictions), "pn_lot"].unique()
        participant_predictions.update({participant: final_predictions})
    return participant_predictions

In [57]:
grouped_participant_vectors_sliced = dict(islice(grouped_participant_vectors.items(), 5_000))

In [58]:
participant_predictions = predict(grouped_participant_vectors_sliced)

100%|██████████| 5000/5000 [12:12<00:00,  6.83it/s]


# Оценка работы рекомендательной системы:

In [65]:
#def accuracy(y_true: list, y_pred: list) -> list:
#    return np.count_nonzero(np.isin(y_true, y_pred)) / len(y_pred)

def precision_at_k(y_true, y_pred, k):
    y_pred = y_pred[:k]
    intersection_length = len(set(y_true) & set(y_pred))
    return intersection_length / len(y_pred)

def evaluate_recsys(participant_predictions: list) -> None:
#    accuracy_list = []
    precision_at_k_list = []
    for participant in tqdm(participant_predictions.keys()):
        y_true = df_test.loc[df_test["participant_inn_kpp"] == participant, "pn_lot"].to_numpy()
        y_pred = participant_predictions[participant]
        if len(y_pred) == 0:
#            accuracy_list.append(0)
            precision_at_k_list.append(0)
        else:
#            accuracy_list.append(accuracy(y_true, y_pred))
            precision_at_k_list.append(precision_at_k(y_true, y_pred, 5))
    print(f"Precision@5 = {np.mean(precision_at_k_list) * 100}%")

In [60]:
evaluate_recsys(participant_predictions)

100%|██████████| 5000/5000 [01:55<00:00, 43.30it/s]

Precision@5 = 5.549333333333334%





# Игровая площадка.

In [61]:
participant = df_train["participant_inn_kpp"].sample(1).to_numpy()[0]
participant

'7839480449_780601001'

In [62]:
df_train[df_train["participant_inn_kpp"] == participant].head()

Unnamed: 0,pn_lot,region_code,okpd2_code,min_publish_date,purchase_name,item_descriptions,participant_inn_kpp,is_winner
56209,0372200026322000045_,78,71.2,2022-06-02,Выполнение работ по замерам сопротивлений изол...,Измерение полного сопротивления цепи фаза-нуль...,7839480449_780601001,1
68266,0372200101322000009_,78,43.2,2022-07-11,Выполнение работ по замеру сопротивления изоля...,Выполнение работ по замеру сопротивления изоля...,7839480449_780601001,1
86984,0372200041522000033_,78,71.2,2022-06-24,Выполнение работ по замеру сопротивления изоляции,Выполнение работ по замеру сопротивления изоля...,7839480449_780601001,0
120464,0372200146522000011_,78,71.2,2022-07-20,Оказание услуг по измерению сопротивления изол...,Замер полного сопротивления цепи фаза-нуль (то...,7839480449_780601001,1
125756,0372100011122000029_,78,71.2,2022-06-06,Оказание услуг по проведению замеров сопротивл...,Оказание услуг по проведению замеров сопротивл...,7839480449_780601001,0


In [63]:
predictions = indexes_to_pn_lots(annoy_storage.get_nns_by_vector(grouped_participant_vectors[participant], 5))
predictions

{'0301400000322000100_', '0356100015422000057_', '0373100030822000074_'}

In [64]:
df_test[df_test["pn_lot"].isin(predictions)].head()

Unnamed: 0,pn_lot,region_code,okpd2_code,min_publish_date,purchase_name,item_descriptions,participant_inn_kpp,is_winner
465554,0301400000322000100_,2,71.2,2022-09-22,Измерение сопротивления изоляции оборудования,Услуги по электротехническим испытаниям электр...,7730623392_772301001,0
465555,0301400000322000100_,2,71.2,2022-09-22,Измерение сопротивления изоляции оборудования,Услуги по электротехническим испытаниям электр...,637700579550_,0
465556,0301400000322000100_,2,71.2,2022-09-22,Измерение сопротивления изоляции оборудования,Услуги по электротехническим испытаниям электр...,0277123718_027701001,1
803394,0356100015422000057_,59,71.2,2022-09-02,Оказание услуг по измерению сопротивления изол...,Услуги по измерению сопротивления изоляции и з...,5911997452_591101001,1
872265,0373100030822000074_,77,71.2,2022-09-13,Оказание услуг по профилактическим испытаниям ...,Оказание услуг по профилактическим испытаниям ...,0274936176_027201001,0
