# Installing and importing libraries / Установка и импорт библиотек:



In [None]:
%pip install pymorphy3 annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy3
  Downloading pymorphy3-1.2.0-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting annoy
  Downloading annoy-1.17.2.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.4/647.4 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dawg-python>=0.7.1 (from pymorphy3)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt>=0.6 (from pymorphy3)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m74.

In [None]:
# Standard library
import re
from functools import lru_cache
from itertools import chain

# Third-party packages
import nltk
import numpy as np
import pandas as pd
import pymorphy3
from annoy import AnnoyIndex
from google.colab import drive
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Morphological analyser used by the lemmatisation helper
morph = pymorphy3.MorphAnalyzer()

# NLTK resources: the tokenizer model and the stop-word lists
nltk.download("punkt")
nltk.download("stopwords")
stopwords_russian = stopwords.words("russian")

# Text models: TF-IDF followed by a 100-component truncated SVD (LSA)
tf_idf_vectorizer = TfidfVectorizer()
lsa_model = TruncatedSVD(n_components = 100, random_state = 21)

# The input CSV files are read from the mounted Google Drive folder
drive.mount("/content/drive")
data_path = "/content/drive/MyDrive/Recommendation system/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Acquiring and processing the data / Получение и обработка данных:


In [None]:
# Lots table; the code columns are read as strings so that values such as "02" keep their leading zero.
# NOTE(review): the recorded run emitted a warning on this call, presumably pandas' mixed-type
# DtypeWarning; low_memory = False makes dtype inference look at whole columns instead of chunks.
df_lots = pd.read_csv(
    data_path + "data.csv",
    delimiter = ";",
    dtype = {"region_code": str, "okpd2_code2": str},
    low_memory = False,
)
# Participants table, joined to the lots later on "pn_lot_md5"
df_participants = pd.read_csv(data_path + "participants.csv", delimiter = ";")

  df_lots = pd.read_csv(data_path + "data.csv", delimiter = ";", dtype = {"region_code": str, "okpd2_code2": str})


## Deleting ambiguous values / Удаление неоднозначных значений:

In [None]:
# OKPD2 code values treated as ambiguous; lots carrying any of them are dropped
ambiguous_okpd2_codes = [
    "2727", "2746", "2747", "2723", "2736", "2735", "2733",
    "2726", "2725", "2790", "2724", "2728", "2763", "2771",
]

# Keep only the rows whose code is NOT in the list above
df_lots = df_lots.loc[~df_lots["okpd2_code2"].isin(ambiguous_okpd2_codes)]

## Filling missing values / Заполнение пустых значений:

In [None]:
# Reassign instead of mutating in place: df_lots is the result of a boolean filter at this point,
# and in-place edits on such a frame are what pandas' SettingWithCopy machinery warns about.
df_lots = df_lots.fillna(value = "")

## Merging the data / Объединение данных:



In [None]:
# Inner join: only lots that have at least one participant record survive
df_merged = pd.merge(df_lots, df_participants, on = "pn_lot_md5", how = "inner")

## Splitting the merged data into train data (3 years) and testing data (1 year) / Разбиение объединённых данных на обучающие (3 года) и тестируемые (1 год):

In [None]:
# First publication date that belongs to the testing period.
# Dates are compared as strings, which orders correctly for the YYYY-MM-DD format.
split_date = "2022-01-01"

# Train strictly before the split date, test from the split date on. The previous
# "<=" / ">=" pair placed lots published exactly on the split date in both sets.
df_train = df_merged[df_merged["min_publish_date"] < split_date]

df_test_merged = df_merged[df_merged["min_publish_date"] >= split_date]
df_test_not_merged = df_lots[df_lots["min_publish_date"] >= split_date]

## Creation of dictionaries with the grouped data to optimize further processes / Создание словарей со сгруппированными данными для оптимизации дальнейших процессов:

In [None]:
# Dictionary {participant: {"region_code": unique codes, "okpd2_code2": unique codes}}
# built from the training data.
df_grouped_train_participants = df_train.loc[:, ["okpd2_code2", "region_code", "participant_inn_kpp"]]
# Several columns must be selected with a list (double brackets): the bare
# `["region_code", "okpd2_code2"]` tuple form is deprecated and rejected by pandas 2.x.
df_grouped_train_participants = (
    df_grouped_train_participants
    .groupby("participant_inn_kpp")[["region_code", "okpd2_code2"]]
    .agg(lambda x: x.unique())
)
grouped_train_participants = df_grouped_train_participants.to_dict("index")

  df_grouped_train_participants = df_grouped_train_participants.groupby("participant_inn_kpp")["region_code", "okpd2_code2"].agg(lambda x: x.unique())


In [None]:
# Dictionary {okpd2 code: {"pn_lot_md5": [unique lot ids]}} built from the testing lots.
df_grouped_test_okpd2_codes = df_test_not_merged.loc[:, ["pn_lot_md5", "okpd2_code2"]]
# Collect the unique ids directly; joining them into one string and running a natural-language
# tokenizer over it only reproduced this list and depended on the ids containing no punctuation.
df_grouped_test_okpd2_codes = df_grouped_test_okpd2_codes.groupby("okpd2_code2").agg(lambda value: value.unique().tolist())
grouped_test_okpd2_codes = df_grouped_test_okpd2_codes.to_dict("index")

In [None]:
# Dictionary {region code: {"pn_lot_md5": [unique lot ids]}} built from the testing lots.
df_grouped_test_region_codes = df_test_not_merged.loc[:, ["pn_lot_md5", "region_code"]]
# Collect the unique ids directly; joining them into one string and running a natural-language
# tokenizer over it only reproduced this list and depended on the ids containing no punctuation.
df_grouped_test_region_codes = df_grouped_test_region_codes.groupby("region_code").agg(lambda value: value.unique().tolist())
grouped_test_region_codes = df_grouped_test_region_codes.to_dict("index")

In [None]:
# Dictionary {participant: {"pn_lot_md5": [unique lot ids]}} built from the merged testing data.
df_grouped_test_participants = df_test_merged.loc[:, ["pn_lot_md5", "participant_inn_kpp"]]
# Collect the unique ids directly; joining them into one string and running a natural-language
# tokenizer over it only reproduced this list and depended on the ids containing no punctuation.
df_grouped_test_participants = df_grouped_test_participants.groupby("participant_inn_kpp").agg(lambda value: value.unique().tolist())
grouped_test_participants = df_grouped_test_participants.to_dict("index")

# Preprocessing and obtaining TF-IDF and LSA vectors for the "item_descriptions" attribute / Предварительная обработка и получение TF-IDF и LSA векторов для атрибута "item_descriptions":

## Functions / Функции:

In [None]:
# Lemmatisation function
# A bare @lru_cache keeps only the 128 most recently used entries, which is far smaller than
# the vocabulary of a text corpus; maxsize = None caches every distinct token.
@lru_cache(maxsize = None)
def lemmatize_token(token: str) -> str:
    """Return the normal form of the first parse that `morph` produces for `token`.

    Results are memoised per token, so each distinct token is analysed once.
    """
    return morph.parse(token)[0].normal_form

# Text preprocessing function
def preprocess_item_description(item_description: str) -> str:
    """Normalise one item description into a space-separated string of lemmas.

    Steps: lower-case and strip, delete every character that is neither a word
    character nor whitespace, tokenize, drop Russian stop words, lemmatise.

    Args:
        item_description: raw description text.

    Returns:
        The lemmas of the remaining tokens joined by single spaces.
    """
    # Built once per call: set membership is constant-time, whereas the original
    # list was scanned for every single token.
    stop_words = set(stopwords_russian)
    item_description = item_description.lower().strip()
    item_description = re.sub(r"[^\w\d\s]", "", item_description)
    lemmatized_tokens = [
        lemmatize_token(token)
        for token in word_tokenize(item_description)
        if token not in stop_words
    ]
    return " ".join(lemmatized_tokens)

# Batch text preprocessing function
def preprocess_item_descriptions(item_descriptions: list) -> list:
    """Apply `preprocess_item_description` to every element, showing a tqdm progress bar.

    Returns the preprocessed strings as a list, in input order.
    """
    preprocessed = []
    for raw_description in tqdm(item_descriptions):
        preprocessed.append(preprocess_item_description(raw_description))
    return preprocessed

# Fit the TF-IDF and LSA models on the training texts and return their LSA vectors
def obtaining_train_lsa_vectors(item_descriptions: list) -> list:
    """Fit `tf_idf_vectorizer` and `lsa_model` on `item_descriptions` and transform them.

    Both module-level models are fitted as a side effect. The value returned is
    whatever `lsa_model.fit_transform` yields for the TF-IDF matrix.
    """
    return lsa_model.fit_transform(tf_idf_vectorizer.fit_transform(item_descriptions))

# Project the testing texts with the already fitted TF-IDF and LSA models
def obtaining_test_lsa_vectors(item_descriptions: list) -> list:
    """Transform `item_descriptions` with the module-level TF-IDF and LSA models.

    Nothing is re-fitted here, so `obtaining_train_lsa_vectors` has to run first.
    The value returned is whatever `lsa_model.transform` yields for the TF-IDF matrix.
    """
    return lsa_model.transform(tf_idf_vectorizer.transform(item_descriptions))

## Application of the functions / Применение функций:

### Application of the functions to the training data grouped by participants / Применение функций на обучающих данных, сгруппированных по поставщикам:

In [None]:
# One text per participant: all of its training item descriptions joined by spaces
train_item_descriptions = (
    df_train
    .groupby("participant_inn_kpp")["item_descriptions"]
    .agg(" ".join)
)
# Participant identifiers, in the same order as the texts above
train_item_descriptions_indexes = train_item_descriptions.index

In [None]:
# Clean, tokenize and lemmatise the per-participant training texts
preprocessed_train_item_descriptions = preprocess_item_descriptions(
    item_descriptions = train_item_descriptions
)

100%|██████████| 52920/52920 [05:20<00:00, 165.08it/s]


In [None]:
# Fits the TF-IDF and LSA models and returns one LSA vector per participant text
train_item_descriptions_lsa_vectors = obtaining_train_lsa_vectors(
    item_descriptions = preprocessed_train_item_descriptions
)

In [None]:
# Map every participant identifier to its LSA vector (both sequences share the same order)
train_lsa_vectors_dict = dict(zip(train_item_descriptions_indexes, train_item_descriptions_lsa_vectors))

### Application of the functions to the testing data / Применение функций на тестируемых данных:

In [None]:
# Row labels of the testing lots; they are used later as item ids in the Annoy index
test_indexes = df_test_not_merged.index
# Raw description texts of the testing lots, in the same row order
test_item_descriptions = df_test_not_merged.loc[:, "item_descriptions"].to_numpy()

In [None]:
# Clean, tokenize and lemmatise the testing lot descriptions
preprocessed_test_item_descriptions = preprocess_item_descriptions(
    item_descriptions = test_item_descriptions
)

100%|██████████| 41874/41874 [00:26<00:00, 1601.18it/s]


In [None]:
# Despite the "_dict" suffix this holds the value returned by `obtaining_test_lsa_vectors`
# (the output of `lsa_model.transform`), not a dictionary; it is iterated row by row later.
test_lsa_vectors_dict = obtaining_test_lsa_vectors(
    item_descriptions = preprocessed_test_item_descriptions
)

# Working with the "Annoy" library / Работа с библиотекой "Annoy":

In [None]:
# Creating a storage for LSA vectors. The dimension is read from the vectors themselves,
# so it cannot drift from the `n_components` setting of the LSA model.
storage = AnnoyIndex(test_lsa_vectors_dict.shape[1], "angular")

# Adding test indexes and LSA vectors to the storage; the item id is the row label of the lot
# in df_test_not_merged, which is how neighbours are mapped back to lots afterwards.
for index, vector in zip(test_indexes, test_lsa_vectors_dict):
    storage.add_item(index, vector)

# Building a forest of 25 trees
storage.build(25)

True

# Getting recommendations for each participant / Получение рекомендаций для каждого поставщика:

In [None]:
# Indexes to lots conversion function
def indexes_to_pn_lots_md5(indexes: list) -> set:
    """Return the set of "pn_lot_md5" values of the testing lots with the given row labels.

    Args:
        indexes: row labels of `df_test_not_merged` (the ids stored in the Annoy index).

    Returns:
        A set of lot identifiers; the order of `indexes` is therefore not preserved.

    Raises:
        KeyError: if a label is not present in `df_test_not_merged`.
    """
    # A single label-based lookup replaces the per-index `df.index == index` mask, which
    # compared every row of the frame once for each requested index.
    # Assumes the row labels are unique (the original took the first match only).
    return set(df_test_not_merged.loc[list(indexes), "pn_lot_md5"])

# Recommendation function: nearest testing lots filtered by the participant's known codes
def recommend(train_lsa_vectors_dict: dict, n_neighbors: int = 20) -> dict:
    """Build lot recommendations for every participant in `train_lsa_vectors_dict`.

    For each participant the `n_neighbors` nearest testing lots are fetched from
    `storage` and only those lots are kept whose OKPD2 code AND region code both
    occurred in the participant's training history.

    Args:
        train_lsa_vectors_dict: mapping participant -> LSA vector.
        n_neighbors: number of nearest lots requested from the Annoy index
            (defaults to 20, the value that used to be hard-coded).

    Returns:
        Mapping participant -> set of recommended "pn_lot_md5" values. Because the
        values are sets, the similarity order returned by Annoy is not preserved.
    """
    recommendations_dict = dict()
    for participant, participant_vector in tqdm(train_lsa_vectors_dict.items()):
        region_codes = grouped_train_participants[participant]["region_code"]
        okpd2_codes = grouped_train_participants[participant]["okpd2_code2"]

        # Codes seen in training may have no lots in the testing period, hence the empty default
        lots_okpd2_codes = set(chain.from_iterable(
            grouped_test_okpd2_codes.get(code, {'pn_lot_md5': []})["pn_lot_md5"] for code in okpd2_codes
        ))
        lots_region_codes = set(chain.from_iterable(
            grouped_test_region_codes.get(code, {'pn_lot_md5': []})["pn_lot_md5"] for code in region_codes
        ))

        lots_for_participant = lots_okpd2_codes & lots_region_codes

        pre_recommendations = indexes_to_pn_lots_md5(storage.get_nns_by_vector(participant_vector, n_neighbors))
        recommendations_dict[participant] = pre_recommendations & lots_for_participant
    return recommendations_dict

In [None]:
# Recommendations for every participant present in the training data
recommendations_dict = recommend(train_lsa_vectors_dict = train_lsa_vectors_dict)

100%|██████████| 52920/52920 [03:40<00:00, 239.91it/s]


# Evaluation of recommendation system / Оценка работы рекомендательной системы:

In [None]:
# Precision@k metric function
def precision_at_k(y_true: list, y_pred: list, k: int) -> float:
    """Share of the first `k` predictions that are present in `y_true`.

    The denominator is the number of predictions actually considered, i.e.
    `min(k, len(y_pred))`, not `k` itself.

    Args:
        y_true: relevant items.
        y_pred: predicted items; only the first `k` are considered.
        k: cut-off, must be positive.

    Returns:
        A value in [0, 1]; 0.0 when there are no predictions.

    Raises:
        ValueError: if `k` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    top_k = y_pred[:k]
    # No predictions means nothing was predicted correctly; this also avoids dividing by zero.
    if not top_k:
        return 0.0
    intersection_length = len(set(y_true) & set(top_k))
    return intersection_length / len(top_k)

# Metric evaluation function
def evaluation(recommendations_dict: dict, k: int = 5) -> None:
    """Print the mean precision@k of the recommendations over all participants.

    Participants with no lots in `grouped_test_participants` are counted as
    "not found" and evaluated against an empty ground truth; participants with no
    recommendations contribute a precision of 0.

    Args:
        recommendations_dict: mapping participant -> iterable of recommended lot ids.
        k: cut-off passed to `precision_at_k` (defaults to 5, the value that used
            to be hard-coded).
    """
    precision_at_k_list = []
    count = 0
    for participant, recommended in tqdm(recommendations_dict.items()):
        participant_lots = grouped_test_participants.get(participant)
        if participant_lots is None:
            y_true = []
            count += 1
        else:
            y_true = participant_lots["pn_lot_md5"]
        y_pred = list(recommended)
        if len(y_pred) == 0:
            precision_at_k_list.append(0)
        else:
            precision_at_k_list.append(precision_at_k(y_true, y_pred, k))
    # np.mean of an empty list is NaN (with a warning); report 0 for an empty input instead
    mean_precision = np.mean(precision_at_k_list) if precision_at_k_list else 0.0
    print(f"\n\nPrecision@{k} = {round(mean_precision * 100, 2)}%\nParticipants not found: {count} of {len(grouped_train_participants)}")

In [None]:
# Print the precision metric for the recommendations computed above
evaluation(recommendations_dict = recommendations_dict)

100%|██████████| 52920/52920 [00:00<00:00, 377769.40it/s]



Precision@5 = 0.48%
Participants not found: 44366 of 52920





# Playground / Игровая площадка:

In [None]:
# Pick one random participant from the training data
participant = df_train["participant_inn_kpp"].sample(1).to_numpy()[0]

# Show up to 5 of its training rows; asking for exactly 5 raises a ValueError
# whenever the participant has fewer rows than that.
participant_lots = df_train[df_train["participant_inn_kpp"] == participant]
participant_lots.sample(min(5, len(participant_lots)))

Unnamed: 0,fz,pn_lot_md5,region_code,etp,min_publish_date,purchase_name,lot_name,forsmallbiz,lot_price,customer_inn_kpp,okpd2_code2,item_descriptions,participant_inn_kpp,is_winner
191643,44fz,15ea18747a08f68a04bfadcd6cd73b58,40,roseltorg.ru,2019-04-10,Поставка садово-парковых светильников,,False,40863.25,4007005485_400701001,27.4,Светильник РТУ-08-250-001 Пушкинский (или экви...,5047147248_504701001,1
94999,44fz,ef4fdf351c81e673b3370cf8ecb5e726,51,sberbank-ast.ru,2019-07-31,Поставка электронного табло Бегущая строка,,True,34450.0,5191120200_519001001,27.9,Специальное световое интерактивное табло-инфор...,5047147248_504701001,0
60537,44fz,8407b73e3ffaebfd9fe122be4bf750fa,65,roseltorg.ru,2019-04-08,Поставка подводных светильников для фонтана,,True,450999.6,6501281770_650101001,27.4,Подводный светильник,5047147248_504701001,1
589210,223fz,57afe14c5c50d0a273a3db7901267eba,35,rts-tender.ru,2019-03-13,Поставка светильников светодиодных,Поставка светильников светодиодных,True,341086.5,3528055532_352801001,27.4,,5047147248_504701001,0
312747,44fz,e1672f34824e0ade6e3d1e0423baad83,78,rts-tender.ru,2019-07-31,Поставка табло для бассейна для Государственно...,,False,38313.33,7806059034_780601001,27.4,Табло для бассейна,5047147248_504701001,1


In [None]:
# Codes the selected participant worked with in the training period
region_codes = grouped_train_participants[participant]["region_code"]
okpd2_codes = grouped_train_participants[participant]["okpd2_code2"]
print(region_codes, okpd2_codes, "\n")

# A code from the training period may have no lots in the testing period; fall back to an
# empty list (as `recommend` does) instead of failing with a KeyError.
lots_okpd2_codes = set(chain.from_iterable(grouped_test_okpd2_codes.get(code, {"pn_lot_md5": []})["pn_lot_md5"] for code in okpd2_codes))
lots_region_codes = set(chain.from_iterable(grouped_test_region_codes.get(code, {"pn_lot_md5": []})["pn_lot_md5"] for code in region_codes))
lots_for_participant = lots_okpd2_codes & lots_region_codes

# 15 nearest testing lots, restricted to the participant's known region and OKPD2 codes
pre_recommendations = indexes_to_pn_lots_md5(storage.get_nns_by_vector(train_lsa_vectors_dict[participant], 15))
recommendations = pre_recommendations & lots_for_participant
recommendations

['24' '29' '30' '32' '34' '38' '42' '63' '46' '55' '61' '65' '68' '74'
 '78' '77' '86' '51' '75' '02' '13' '21' '23' '27' '33' '40' '48' '54'
 '56' '58' '60' '64' '71' '91' '16' '14' '50' '66' '72' '52' '22' '35'] ['27.4' '27.9' '27.3'] 



{'4a970df75f588b8679075110000bf337',
 '5a65f87312b261cab356fd7adea8f130',
 '65bb4b992ceebbae5b25fa799252a6fd',
 '74284b23fbd6eb2da57ff9be543db6c9',
 '78ee137d863fc5d0a51f13b5d4502fc9',
 'ad5301b820c39e020f602220884662a3',
 'b2df447d0e19d91009b1c7fd9315fb2f',
 'b9d488723c1e759ca98b0126294ce2a2',
 'cfa09a34f9696888e1c87308e64aeefe',
 'd09f288f65a4f147fd4924d464869aea',
 'da3f487ee7698e1ddee4257934662a9e',
 'fe2b664fa576c63e32c5743b2540fe62'}

In [None]:
# First rows of the testing lots that were recommended to the selected participant
df_test_not_merged.loc[df_test_not_merged["pn_lot_md5"].isin(recommendations)].head()

Unnamed: 0,fz,pn_lot_md5,region_code,etp,min_publish_date,purchase_name,lot_name,forsmallbiz,lot_price,customer_inn_kpp,okpd2_code2,item_descriptions
661,44fz,b2df447d0e19d91009b1c7fd9315fb2f,16,zakazrf.ru,2022-07-25,Поставка светодиодных светильников,,True,485984.5,1644022340_164401001,27.4,Светильник светодиодный внутреннего освещения ...
3225,44fz,b9d488723c1e759ca98b0126294ce2a2,24,tektorg.ru,2022-05-20,ЭА-№-5188/22 «Поставка светильников светодиодн...,,True,120449.7,2464008420_246401001,27.4,Светильник светодиодный аварийного освещения
38933,44fz,65bb4b992ceebbae5b25fa799252a6fd,13,roseltorg.ru,2022-11-28,Светильник (в рамках капитального ремонта),,True,60133.4,1308079686_130801001,27.4,Светильник светодиодный внутреннего освещения ...
106532,44fz,ad5301b820c39e020f602220884662a3,77,sberbank-ast.ru,2022-09-14,Поставка осветительного оборудования,,False,2246300.0,7734111035_773401001,27.4,Светильник || Светильник переносной светодиодн...
115341,44fz,da3f487ee7698e1ddee4257934662a9e,77,roseltorg.ru,2022-11-07,Поставка уличных светодиодных светильников для...,,True,190710.8,7719210793_771801001,27.4,Светильник наружного освещения светодиодный
