# Downloading libraries and importing libraries | Загрузка и импорт библиотек:



In [None]:
%pip install pymorphy3 annoy

Collecting pymorphy3
  Downloading pymorphy3-1.2.0-py3-none-any.whl (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dawg-python>=0.7.1 (from pymorphy3)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt>=0.6 (from pymorphy3)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 

In [None]:
# Standard library
import re
from functools import lru_cache
from itertools import chain

# Third-party
import nltk
import numpy as np
import pandas as pd
import pymorphy3
from annoy import AnnoyIndex
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Colab-only helper used to mount Google Drive, where the input CSVs live.
from google.colab import drive

# Morphological analyzer used by lemmatize_token.
morph = pymorphy3.MorphAnalyzer()

# NLTK resources downloaded for tokenization and the Russian stop-word list.
nltk.download("punkt")
nltk.download("stopwords")

# Kept as a set: it is only used for per-token membership checks, which are
# O(1) on a set instead of a linear scan over a list.
stopwords_russian = set(stopwords.words("russian"))

# Shared model instances: fitted on the training descriptions and then reused
# to transform the testing descriptions.
tf_idf_vectorizer = TfidfVectorizer()
lsa_model = TruncatedSVD(n_components=100, random_state=21)

drive.mount("/content/drive")
data_path = "/content/drive/MyDrive/Recommendation system/Data/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


# Acquiring and processing the data | Получение и обработка данных:


In [None]:
# Lots table; both code columns are forced to str
# (presumably to keep leading zeros — TODO confirm).
df_lots = pd.read_csv(
    data_path + "data.csv",
    sep=";",
    dtype={"region_code": str, "okpd2_code2": str},
)
# Participants table, joined with the lots later on "pn_lot_md5".
df_participants = pd.read_csv(data_path + "participants.csv", sep=";")

  df_lots = pd.read_csv(data_path + "data.csv", delimiter = ";", dtype = {"region_code": str, "okpd2_code2": str})


## Deleting ambiguous values | Удаление неоднозначных значений:

In [None]:
# OKPD2 codes treated as ambiguous; every lot carrying one of them is dropped.
ambiguous_okpd2_codes = [
    "2727", "2746", "2747", "2723", "2736", "2735", "2733",
    "2726", "2725", "2790", "2724", "2728", "2763", "2771",
]

# .copy() makes df_lots an independent frame, so the later modifications
# (filling missing values, re-mapping "pn_lot_md5") do not operate on a
# filtered slice of the originally loaded frame.
df_lots = df_lots[~df_lots["okpd2_code2"].isin(ambiguous_okpd2_codes)].copy()

## Filling missing values | Заполнение пустых значений:

In [None]:
# Replace every missing value in the lots table with an empty string.
df_lots = df_lots.fillna(value="")

## Merging the data | Объединение данных:



In [None]:
# Inner join: keep only lots that have at least one participant row.
df_merged = pd.merge(df_lots, df_participants, on="pn_lot_md5", how="inner")

## Replacing old pn_lot_md5 indexes with new ones to optimize further processes | Замена старых индексов pn_lot_md5 новыми для оптимизации дальнейших процессов:

In [None]:
# Map every distinct "pn_lot_md5" value to a compact integer id, numbered from
# 0 in the order the values first appear in df_lots.
pn_lot_md5_indexes_dict = {
    pn_lot_md5: index
    for index, pn_lot_md5 in enumerate(df_lots["pn_lot_md5"].unique())
}

In [None]:
# Swap the "pn_lot_md5" values for their integer ids in both tables.
for frame in (df_lots, df_merged):
    frame["pn_lot_md5"] = frame["pn_lot_md5"].map(pn_lot_md5_indexes_dict)

## Splitting the merged data into training data (3 years) and testing data (1 year) | Разбиение объединённых данных на обучающие (3 года) и тестовые (1 год):

In [None]:
# Rows published before the split date form the training period; rows
# published on or after it form the testing period. The training comparison
# is strict so that a row dated exactly on the split date cannot land in both
# sets (which would leak test data into training).
# NOTE(review): these are string comparisons — assumes "min_publish_date"
# holds ISO-formatted (YYYY-MM-DD...) text; confirm against the CSV.
split_date = "2022-01-01"

df_train = df_merged[df_merged["min_publish_date"] < split_date]

df_test_merged = df_merged[df_merged["min_publish_date"] >= split_date]
df_test_not_merged = df_lots[df_lots["min_publish_date"] >= split_date]

## Creating dictionaries of grouped data to speed up later steps | Создание словарей со сгруппированными данными для оптимизации дальнейших процессов:

In [None]:
# Creation of a dictionary with OKPD2 and region codes grouped by participants |
# Создание словаря кодов региона и ОКПД2, сгруппированных по поставщикам:
# For every participant, collect the distinct region codes, OKPD2 codes, FZ
# values and ETP values seen in the training data. The columns are selected
# with a list: indexing a groupby with a bare tuple of names is deprecated
# and is rejected by newer pandas versions.
df_grouped_train_participants = (
    df_train
    .groupby("participant_inn_kpp")[["region_code", "okpd2_code2", "fz", "etp"]]
    .agg(lambda x: x.unique())
)
# {participant: {"region_code": [...], "okpd2_code2": [...], "fz": [...], "etp": [...]}}
grouped_train_participants = df_grouped_train_participants.to_dict("index")

  df_grouped_train_participants = df_train.groupby("participant_inn_kpp")["region_code", "okpd2_code2", "fz", "etp"].agg(lambda x: x.unique())


In [None]:
# Dictionary of test lots grouped by OKPD2 code: {okpd2_code2: [lot ids]}.
df_grouped_test_okpd2_codes = df_test_not_merged.groupby("okpd2_code2")["pn_lot_md5"].agg(list)
grouped_test_okpd2_codes = df_grouped_test_okpd2_codes.to_dict()

# Dictionary of test lots grouped by region code: {region_code: [lot ids]}.
df_grouped_test_region_codes = df_test_not_merged.groupby("region_code")["pn_lot_md5"].agg(list)
grouped_test_region_codes = df_grouped_test_region_codes.to_dict()

# Dictionary of test lots grouped by FZ: {fz: [lot ids]}.
df_grouped_test_fz = df_test_not_merged.groupby("fz")["pn_lot_md5"].agg(list)
grouped_test_fz = df_grouped_test_fz.to_dict()

# Dictionary of test lots grouped by ETP (platform): {etp: [lot ids]}.
# Built from the ETP grouping itself (it was previously built from the FZ
# grouping, which made the ETP dictionary a duplicate of the FZ one).
df_grouped_test_etp = df_test_not_merged.groupby("etp")["pn_lot_md5"].agg(list)
grouped_test_etp = df_grouped_test_etp.to_dict()

In [None]:
# Creation of a dictionary with lots from the testing data grouped by participants |
# Создание словаря лотов из тестируемого набора данных, сгруппированных по поставщикам:
# Lot ids each participant appears with in the merged testing data:
# {participant_inn_kpp: [lot ids]}. Used as the ground truth in evaluation.
df_grouped_test_participants = (
    df_test_merged
    .groupby("participant_inn_kpp")["pn_lot_md5"]
    .agg(list)
)
grouped_test_participants = df_grouped_test_participants.to_dict()

# Preprocessing and obtaining TF-IDF and LSA vectors for the "item_descriptions" attribute | Предварительная обработка и получение TF-IDF и LSA векторов для атрибута "item_descriptions":

## Functions | Функции:

In [None]:
# Lemmatisation function | Функция лемматизации:
# A bare @lru_cache keeps only 128 entries, which a corpus vocabulary evicts
# almost immediately; a large bounded cache keeps the hit rate high without
# growing without limit.
@lru_cache(maxsize=2**18)
def lemmatize_token(token: str) -> str:
    """Return the normal form of *token* from the first pymorphy3 parse.

    Results are memoized, so each repeated token is parsed only once while it
    stays in the cache.
    """
    return morph.parse(token)[0].normal_form

# Text preprocessing function | Функция предобработки текста:
def preprocess_item_description(item_description: str) -> str:
    """Normalize one item description into a space-joined string of lemmas.

    Steps: lowercase and strip the text, delete every character that is
    neither a word character nor whitespace, tokenize, drop Russian stop
    words, lemmatize the remaining tokens and join them with single spaces.
    """
    cleaned_text = re.sub(r"[^\w\d\s]", "", item_description.lower().strip())
    lemmas = (
        lemmatize_token(word)
        for word in word_tokenize(cleaned_text)
        if word not in stopwords_russian
    )
    return " ".join(lemmas)

# Texts preprocessing function | Функция предобработки текстов:
def preprocess_item_descriptions(item_descriptions: list) -> list:
    """Preprocess every description in order, showing a tqdm progress bar.

    Returns a list with one preprocessed string per input description.
    """
    preprocessed = []
    for description in tqdm(item_descriptions):
        preprocessed.append(preprocess_item_description(description))
    return preprocessed

# Fit the TF-IDF and LSA models on the training data and transform it | Функция обучения моделей TF-IDF и LSA и преобразования обучающих данных:
def obtaining_train_lsa_vectors(item_descriptions: list) -> list:
    """Fit the shared TF-IDF and LSA models and return the training vectors.

    Both module-level models are fitted here as a side effect, so this must
    run before obtaining_test_lsa_vectors. The value returned is whatever
    ``lsa_model.fit_transform`` produces for the TF-IDF matrix.
    """
    return lsa_model.fit_transform(
        tf_idf_vectorizer.fit_transform(item_descriptions)
    )

# Transform the testing data with the fitted TF-IDF and LSA models | Функция преобразования тестовых данных обученными моделями TF-IDF и LSA:
def obtaining_test_lsa_vectors(item_descriptions: list) -> list:
    """Project descriptions through the already fitted TF-IDF and LSA models.

    Nothing is re-fitted: only ``transform`` is called on both module-level
    models. The value returned is whatever ``lsa_model.transform`` produces.
    """
    return lsa_model.transform(tf_idf_vectorizer.transform(item_descriptions))

## Application of the functions | Применение функций:

### Application of the functions to the training data grouped by participants | Применение функций на обучающих данных, сгруппированных по поставщикам:

In [None]:
# One document per participant: all of the participant's training item
# descriptions joined with single spaces.
train_item_descriptions = (
    df_train
    .groupby("participant_inn_kpp")["item_descriptions"]
    .agg(" ".join)
)
# Participant ids, in the same order as the documents above.
train_item_descriptions_indexes = train_item_descriptions.index

In [None]:
# Clean, tokenize and lemmatize the per-participant training documents.
preprocessed_train_item_descriptions = preprocess_item_descriptions(train_item_descriptions)

100%|██████████| 52920/52920 [09:21<00:00, 94.30it/s] 


In [None]:
# Fit TF-IDF and LSA on the training documents; yields one LSA vector per
# participant document.
train_item_descriptions_lsa_vectors = obtaining_train_lsa_vectors(preprocessed_train_item_descriptions)

In [None]:
# {participant id: LSA vector}; both sequences are in the same per-participant
# order, so zip pairs each id with its own vector.
train_lsa_vectors_dict = dict(
    zip(train_item_descriptions_indexes, train_item_descriptions_lsa_vectors)
)

### Application of the functions to the testing data | Применение функций на тестируемых данных:

In [None]:
# Raw descriptions of the test lots, as a NumPy array.
test_item_descriptions = df_test_not_merged.loc[:, "item_descriptions"].to_numpy()
# Integer lot ids of the same rows, in the same order.
test_indexes = df_test_not_merged.loc[:, "pn_lot_md5"]

In [None]:
# Clean, tokenize and lemmatize the test lot descriptions.
preprocessed_test_item_descriptions = preprocess_item_descriptions(test_item_descriptions)

100%|██████████| 41873/41873 [00:43<00:00, 968.78it/s]


In [None]:
# Project the test descriptions with the already fitted models.
# NOTE: despite the "_dict" suffix this holds the value returned by
# lsa_model.transform (a sequence of vectors, one per test lot), not a dict.
test_lsa_vectors_dict = obtaining_test_lsa_vectors(preprocessed_test_item_descriptions)

# Working with the "Annoy" library | Работа с библиотекой "Annoy":

In [None]:
# Create the Annoy index for the LSA vectors. The dimensionality is read from
# the vectors themselves, so it always matches the number of LSA components
# instead of duplicating that number as a literal.
storage = AnnoyIndex(test_lsa_vectors_dict.shape[1], "angular")

# Register each test lot's LSA vector under its integer lot id.
for lot_index, lot_vector in zip(test_indexes, test_lsa_vectors_dict):
    storage.add_item(lot_index, lot_vector)

# Build a forest of 25 trees.
storage.build(25)

True

# Getting recommendations for each participant | Получение рекомендаций для каждого поставщика:

In [None]:
# Getting 15 recommendations function | Функция получения 15 рекомендаций:
def _lots_matching(grouped_lots: dict, values) -> set:
    """Return the union of the lot ids stored in *grouped_lots* under *values*.

    Values that have no entry in *grouped_lots* (seen in training but absent
    from the test data) contribute nothing instead of raising KeyError.
    """
    return set(chain.from_iterable(grouped_lots.get(value, ()) for value in values))


def recommend(train_lsa_vectors_dict: dict, n_neighbors: int = 15) -> dict:
    """Recommend test lots for every participant of the training data.

    For each participant the ``n_neighbors`` nearest test lots are fetched
    from the Annoy ``storage`` by the participant's LSA vector. A neighbour is
    kept only if it also matches the participant's training history on all
    four attributes: OKPD2 code, region code, FZ and ETP.

    Args:
        train_lsa_vectors_dict: mapping participant -> LSA vector.
        n_neighbors: how many nearest lots to request before filtering.

    Returns:
        Mapping participant -> set of recommended lot ids; the set may be
        empty, and it does not preserve the nearest-first order.
    """
    recommendations_dict = {}
    for participant, participant_vector in tqdm(train_lsa_vectors_dict.items()):
        history = grouped_train_participants[participant]

        # Test lots compatible with the participant on every attribute.
        # The ETP filter uses the participant's ETP values (it previously
        # iterated over the FZ values by mistake).
        lots_for_participant = (
            _lots_matching(grouped_test_okpd2_codes, history["okpd2_code2"])
            & _lots_matching(grouped_test_region_codes, history["region_code"])
            & _lots_matching(grouped_test_fz, history["fz"])
            & _lots_matching(grouped_test_etp, history["etp"])
        )

        pre_recommendations = set(
            storage.get_nns_by_vector(participant_vector, n_neighbors)
        )
        recommendations_dict[participant] = pre_recommendations & lots_for_participant
    return recommendations_dict

In [None]:
# Build the recommendations for every participant that has a training vector.
recommendations_dict = recommend(train_lsa_vectors_dict)

100%|██████████| 52920/52920 [04:23<00:00, 200.95it/s]


# Evaluation of recommendation system | Оценка работы рекомендательной системы:

In [None]:
# Precision@k metric function | Функция, рассчитывающая метрику precision@k:
def precision_at_k(y_true: list, y_pred: list, k: int) -> float:
    """Return the share of the first *k* predictions that are relevant.

    The denominator is the number of predictions actually considered
    (``len(y_pred[:k])``), not *k* itself, so a short prediction list is not
    penalized.

    Args:
        y_true: relevant items.
        y_pred: predicted items.
        k: cut-off; only ``y_pred[:k]`` is considered.

    Returns:
        A value in [0, 1]; 0.0 when there is nothing to evaluate (empty
        predictions or ``k == 0``) instead of dividing by zero.
    """
    top_k = y_pred[:k]
    if not top_k:
        return 0.0
    hits = len(set(y_true) & set(top_k))
    return hits / len(top_k)

# Mean Average Recall at K (MAR@K) metric function | Функция, рассчитывающая метрику MAR@k:
def mean_average_recall_at_k(y_true_dict: dict, y_pred_dict: dict, k: int) -> float:
    """Return the MAR@k score averaged over the participants of *y_true_dict*.

    For each participant the first *k* predictions are scanned in order; every
    relevant prediction contributes ``hits_so_far / rank`` and the sum is
    divided by the number of relevant items. Participants without predictions
    or without relevant items score 0.

    NOTE(review): the per-hit term ``hits_so_far / rank`` is the precision at
    that rank — confirm this is the intended definition of the metric.

    Args:
        y_true_dict: mapping participant -> relevant items.
        y_pred_dict: mapping participant -> ordered predictions.
        k: cut-off; only the first *k* predictions are considered.

    Returns:
        The mean of the per-participant scores; 0.0 when *y_true_dict* is
        empty (instead of dividing by zero).
    """
    if not y_true_dict:
        return 0.0

    average_recall_list = []
    for participant, y_true in y_true_dict.items():
        if not y_true:
            average_recall_list.append(0)
            continue

        relevant_items = set(y_true)  # O(1) membership tests in the loop below
        y_pred = y_pred_dict.get(participant, [])
        relevant_count = 0
        score = 0.0
        for rank, item in enumerate(y_pred[:k], start=1):
            if item in relevant_items:
                relevant_count += 1
                score += relevant_count / rank
        average_recall_list.append(score / len(y_true))

    return sum(average_recall_list) / len(y_true_dict)

# Metric evaluation function | Функция оценки работы рекомендательной системы по метрикам:
def evaluation(recommendations_dict: dict, k: int = 5) -> None:
    """Print the mean Precision@k and MAR@k of the recommendations.

    Participants with no recommendations are skipped, so both averages are
    taken only over participants that received at least one recommendation.
    The ground truth for a participant is looked up in
    ``grouped_test_participants`` (empty when the participant is absent).

    NOTE(review): each participant's recommendations are converted with
    ``list(...)``; when they are stored as a set their order is arbitrary, so
    the "first k" is an arbitrary subset whenever there are more than k.

    Args:
        recommendations_dict: mapping participant -> recommended lot ids.
        k: cut-off used for both metrics.
    """
    precision_at_k_list = []
    mar_at_k_list = []
    for participant, recommendations in tqdm(recommendations_dict.items()):
        y_pred = list(recommendations)
        if not y_pred:
            continue
        y_true = grouped_test_participants.get(participant, [])
        precision_at_k_list.append(precision_at_k(y_true, y_pred, k))
        mar_at_k_list.append(
            mean_average_recall_at_k({participant: y_true}, {participant: y_pred}, k)
        )

    # np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead.
    mean_precision = np.mean(precision_at_k_list) if precision_at_k_list else float("nan")
    mean_mar = np.mean(mar_at_k_list) if mar_at_k_list else float("nan")
    print(f"\n\nPrecision@{k} = {round(mean_precision * 100, 2)}%\nMAR@{k} = {round(mean_mar * 100, 2)}%")

In [None]:
# Print Precision@5 and MAR@5 for the recommendations built above.
evaluation(recommendations_dict)

100%|██████████| 52920/52920 [00:00<00:00, 438857.21it/s]



Precision@5 = 1.5%
MAR@5 = 0.96%



