# Installing and importing libraries / Установка и импорт библиотек:



In [28]:
%pip install pymorphy3 annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [29]:
# Data handling, regular expressions and the NLTK toolkit.
import pandas as pd, numpy as np, re, nltk

# Shared morphological analyzer; lemmatize_token() calls morph.parse() on every token.
import pymorphy3
morph = pymorphy3.MorphAnalyzer()

from itertools import chain
from functools import lru_cache
from annoy import AnnoyIndex
from tqdm import tqdm

# Tokenizer plus the NLTK resource it is downloaded alongside.
from nltk.tokenize import word_tokenize
nltk.download("punkt")

# Russian stop-word list used to filter tokens during preprocessing.
from nltk.corpus import stopwords
nltk.download("stopwords")
stopwords_russian = stopwords.words("russian")

# One shared TF-IDF vectorizer: fitted on the training texts, reused to transform the test texts.
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer()

# One shared LSA model (100 components, fixed random_state); the Annoy index must use the same dimensionality.
from sklearn.decomposition import TruncatedSVD
lsa_model = TruncatedSVD(n_components = 100, random_state = 21)

# Colab-only: mount Google Drive; data_path is the folder the CSV files are read from.
from google.colab import drive
drive.mount("/content/drive")
data_path = "/content/drive/MyDrive/Recommendation system/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Acquiring and processing the data / Получение и обработка данных:


In [30]:
# Lots: region and OKPD2 codes are forced to str so they are not parsed as numbers.
# low_memory=False lets pandas infer every other column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns (and the warning pandas
# emits for them) in data.csv.
df_lots = pd.read_csv(
    data_path + "data.csv",
    delimiter = ";",
    dtype = {"region_code": str, "okpd2_code2": str},
    low_memory = False,
)
# Participants: one row per (lot, participant) pair, joined to the lots later via "pn_lot_md5".
df_participants = pd.read_csv(data_path + "participants.csv", delimiter = ";")

  df_lots = pd.read_csv(data_path + "data.csv", delimiter = ";", dtype = {"region_code": str, "okpd2_code2": str})


## Deleting ambiguous values / Удаление неоднозначных значений:

In [31]:
# OKPD2 code values considered ambiguous; every lot carrying one of them is dropped.
ambiguous_okpd2_codes = [
    "2727", "2746", "2747", "2723", "2736", "2735", "2733",
    "2726", "2725", "2790", "2724", "2728", "2763", "2771",
]

# Boolean mask of the rows to discard, then keep only its complement.
is_ambiguous_lot = df_lots["okpd2_code2"].isin(ambiguous_okpd2_codes)
df_lots = df_lots.loc[~is_ambiguous_lot]

## Filling missing values / Заполнение пустых значений:

In [32]:
# Replace every missing value with an empty string so text columns can be joined and
# tokenized later. Assignment is used instead of inplace=True because df_lots is the
# result of a boolean filter, and in-place mutation of such a frame can trigger
# SettingWithCopyWarning.
df_lots = df_lots.fillna(value = "")

## Merging the data / Объединение данных:



In [33]:
# Inner join: keep only lots that have at least one participant row (matched on the lot hash).
df_merged = pd.merge(df_lots, df_participants, on = "pn_lot_md5", how = "inner")

## Splitting the merged data into training data (3 years) and testing data (1 year) / Разбиение объединённых данных на обучающие (3 года) и тестируемые (1 год):

In [34]:
# First day of the test period. Dates are compared as strings, which orders them
# chronologically for the "YYYY-MM-DD" format used in min_publish_date.
TEST_START_DATE = "2022-01-01"

# Training data: strictly before the test period. The comparison must be "<" (not "<=");
# otherwise lots published on the boundary day land in both train and test.
df_train = df_merged[df_merged["min_publish_date"] < TEST_START_DATE]

# Test data: lots with participants (ground truth) and all lots (recommendation candidates).
df_test_merged = df_merged[df_merged["min_publish_date"] >= TEST_START_DATE]
df_test_not_merged = df_lots[df_lots["min_publish_date"] >= TEST_START_DATE]

## Creation of dictionaries with the grouped data to optimize further processes / Создание словарей со сгруппированными данными для оптимизации дальнейших процессов:

In [35]:
# Creation of a dictionary with OKPD2 and region codes grouped by participants /
# Создание словаря кодов региона и ОКПД2, сгруппированных по поставщикам:
df_grouped_train_participants = df_train.loc[:, ["okpd2_code2", "region_code", "participant_inn_kpp"]]
# Columns are selected with a list ([[...]]): selecting several columns from a groupby
# with a bare tuple of names is deprecated and emits a FutureWarning.
df_grouped_train_participants = df_grouped_train_participants.groupby("participant_inn_kpp")[["region_code", "okpd2_code2"]].agg(lambda x: x.unique())
# Result shape: {participant: {"region_code": array of codes, "okpd2_code2": array of codes}}.
grouped_train_participants = df_grouped_train_participants.to_dict("index")

  df_grouped_train_participants = df_grouped_train_participants.groupby("participant_inn_kpp")["region_code", "okpd2_code2"].agg(lambda x: x.unique())


In [36]:
# Creation of a dictionary with lots from the testing data grouped by OKPD2 code /
# Создание словаря лотов из тестируемого набора данных, сгруппированных по ОКПД2 коду:
df_grouped_test_okpd2_codes = df_test_not_merged.loc[:, ["pn_lot_md5", "okpd2_code2"]]
# Collect the unique lot hashes of each OKPD2 code directly as a list. Empty values are
# skipped, matching what the previous join-then-tokenize round trip produced, without
# running an NLP tokenizer over identifiers.
df_grouped_test_okpd2_codes = df_grouped_test_okpd2_codes.groupby("okpd2_code2").agg(lambda value: [lot_md5 for lot_md5 in value.unique() if lot_md5])
# Result shape: {okpd2_code: {"pn_lot_md5": [lot hashes]}}.
grouped_test_okpd2_codes = df_grouped_test_okpd2_codes.to_dict("index")

In [37]:
# Creation of a dictionary with lots from the testing data grouped by region code /
# Создание словаря лотов из тестируемого набора данных, сгруппированных по коду региона:
df_grouped_test_region_codes = df_test_not_merged.loc[:, ["pn_lot_md5", "region_code"]]
# Collect the unique lot hashes of each region directly as a list. Empty values are
# skipped, matching what the previous join-then-tokenize round trip produced, without
# running an NLP tokenizer over identifiers.
df_grouped_test_region_codes = df_grouped_test_region_codes.groupby("region_code").agg(lambda value: [lot_md5 for lot_md5 in value.unique() if lot_md5])
# Result shape: {region_code: {"pn_lot_md5": [lot hashes]}}.
grouped_test_region_codes = df_grouped_test_region_codes.to_dict("index")

In [38]:
# Creation of a dictionary with lots from the testing data grouped by participants /
# Создание словаря лотов из тестируемого набора данных, сгруппированных по поставщикам:
df_grouped_test_participants = df_test_merged.loc[:, ["pn_lot_md5", "participant_inn_kpp"]]
# Collect the unique lot hashes each participant took part in during the test period.
# Empty values are skipped, matching what the previous join-then-tokenize round trip
# produced, without running an NLP tokenizer over identifiers.
df_grouped_test_participants = df_grouped_test_participants.groupby("participant_inn_kpp").agg(lambda value: [lot_md5 for lot_md5 in value.unique() if lot_md5])
# Result shape: {participant: {"pn_lot_md5": [lot hashes]}}; used as ground truth in evaluation.
grouped_test_participants = df_grouped_test_participants.to_dict("index")

# Preprocessing the "item_descriptions" attribute and obtaining its TF-IDF and LSA vectors / Предварительная обработка атрибута "item_descriptions" и получение его TF-IDF и LSA векторов:

## Functions / Функции:

In [39]:
# Lemmatisation function / Функция лемматизации:
@lru_cache(maxsize = None)
def lemmatize_token(token: str) -> str:
    """Return the normal form of ``token`` according to the first pymorphy3 parse.

    Results are memoised without a size limit: the default ``lru_cache`` size of 128
    entries is far smaller than a corpus vocabulary, so most lookups would miss and
    re-run the morphological analysis.

    Args:
        token: a single word.

    Returns:
        ``normal_form`` of the first parse returned by ``morph.parse``.
    """
    return morph.parse(token)[0].normal_form

# Text preprocessing function / Функция предобработки текста:
def preprocess_item_description(item_description: str) -> str:
    """Normalise one description into a space-separated string of lemmas.

    Steps: lower-case and strip the text, delete every character that is not a word
    character, digit or whitespace, tokenize, drop Russian stop words, lemmatise.
    """
    normalized_text = item_description.lower().strip()
    cleaned_text = re.sub(r"[^\w\d\s]", "", normalized_text)

    lemmas = []
    for word in word_tokenize(cleaned_text):
        # Stop words are filtered on the surface form, before lemmatisation.
        if word in stopwords_russian:
            continue
        lemmas.append(lemmatize_token(word))
    return " ".join(lemmas)

# Texts preprocessing function / Функция предобработки текстов:
def preprocess_item_descriptions(item_descriptions: list) -> list:
    """Apply ``preprocess_item_description`` to every text, showing a tqdm progress bar.

    Returns a list with one preprocessed string per input text, in input order.
    """
    preprocessed_texts = []
    for raw_text in tqdm(item_descriptions):
        preprocessed_texts.append(preprocess_item_description(raw_text))
    return preprocessed_texts

# Fit and transform TF-IDF and LSA models to the training data function / Функция подгонки и преобразования моделей TF-IDF и LSA для обучающихся данных:
def obtaining_train_lsa_vectors(item_descriptions: list) -> list:
    """Fit the shared TF-IDF vectorizer and LSA model on the texts and return their LSA vectors.

    Both module-level models are (re)fitted as a side effect, so this must run before
    ``obtaining_test_lsa_vectors``.
    """
    return lsa_model.fit_transform(tf_idf_vectorizer.fit_transform(item_descriptions))

# Transform TF-IDF and LSA models to the testing data / Функция преобразования моделей TF-IDF и LSA для тестовых данных:
def obtaining_test_lsa_vectors(item_descriptions: list) -> list:
    """Project the texts with the already fitted TF-IDF vectorizer and LSA model.

    Nothing is fitted here; the models are only applied, so the result lives in the
    same vector space as the training vectors.
    """
    return lsa_model.transform(tf_idf_vectorizer.transform(item_descriptions))

## Application of the functions / Применение функций:

### Application of the functions to the training data grouped by participants / Применение функций на обучающих данных, сгруппированных по поставщикам:

In [40]:
# One document per participant: all of their training-period descriptions joined by spaces.
descriptions_by_participant = df_train.groupby("participant_inn_kpp")["item_descriptions"]
train_item_descriptions = descriptions_by_participant.agg(" ".join)
# Participant ids in the same order as the documents above.
train_item_descriptions_indexes = train_item_descriptions.index

In [41]:
# Clean and lemmatise every participant's concatenated description text (tqdm shows progress).
preprocessed_train_item_descriptions = preprocess_item_descriptions(train_item_descriptions)

100%|██████████| 52920/52920 [08:27<00:00, 104.29it/s]


In [42]:
# Fits the shared TF-IDF and LSA models on the training documents and returns one LSA vector per participant.
train_item_descriptions_lsa_vectors = obtaining_train_lsa_vectors(preprocessed_train_item_descriptions)

In [43]:
# Map each participant id to its LSA vector; ids and vectors are paired by position.
train_lsa_vectors_dict = dict(zip(train_item_descriptions_indexes, train_item_descriptions_lsa_vectors))

### Application of the functions to the testing data / Применение функций на тестируемых данных:

In [44]:
# Raw description text of every test-period lot, as a NumPy array.
test_item_descriptions = df_test_not_merged["item_descriptions"].to_numpy()
# Index labels of the same rows in the same order; used later as item ids in the Annoy index.
test_indexes = df_test_not_merged.index

In [45]:
# Clean and lemmatise the description of every test-period lot (tqdm shows progress).
preprocessed_test_item_descriptions = preprocess_item_descriptions(test_item_descriptions)

100%|██████████| 41873/41873 [00:44<00:00, 950.59it/s]


In [46]:
# NOTE: despite the "_dict" suffix this is not a dict: it holds the return value of
# obtaining_test_lsa_vectors (the output of lsa_model.transform). The Annoy cell pairs
# it positionally with test_indexes.
test_lsa_vectors_dict = obtaining_test_lsa_vectors(preprocessed_test_item_descriptions)

# Working with the "Annoy" library / Работа с библиотекой "Annoy":

In [47]:
# Creating a storage for LSA vectors / Создание хранилища для LSA векторов:
# The index dimensionality is read from the vectors themselves, so it cannot drift out
# of sync with the LSA model's n_components.
storage = AnnoyIndex(test_lsa_vectors_dict.shape[1], "angular")

# Add every test lot's LSA vector under its DataFrame index label; indexes_to_pn_lots_md5
# later maps these labels back to lot hashes.
for index, vector in zip(test_indexes, test_lsa_vectors_dict):
    storage.add_item(index, vector)

# Build a forest of 25 trees.
storage.build(25)

True

# Getting recommendations for each participant / Получение рекомендаций для каждого поставщика:

In [48]:
# Indexes to lots conversion function / Функция преобразования индексов в закупки:
def indexes_to_pn_lots_md5(indexes: list) -> set:
    """Translate index labels of ``df_test_not_merged`` into the set of their lot hashes.

    All labels are resolved with a single label-based lookup instead of one full
    boolean scan of the frame per label.

    Args:
        indexes: index labels of ``df_test_not_merged`` (e.g. item ids returned by Annoy).

    Returns:
        Set of the corresponding ``pn_lot_md5`` values; empty for empty input.

    Raises:
        KeyError: if a label is not present in the frame's index.
    """
    labels = list(indexes)
    if not labels:
        return set()
    # Relies on the frame's index labels being unique, as they are for a filtered
    # default index produced by read_csv.
    return set(df_test_not_merged.loc[labels, "pn_lot_md5"].to_numpy())

# Recommendation function: nearest test lots per participant, filtered by region and OKPD2 code.
def recommend(train_lsa_vectors_dict: dict, n_neighbors: int = 20) -> dict:
    """Recommend test-period lots to every participant.

    For each participant the ``n_neighbors`` nearest test lots are fetched from the
    Annoy index and only those are kept whose OKPD2 code AND region code were both
    seen for this participant in the training data.

    Args:
        train_lsa_vectors_dict: mapping participant id -> LSA vector.
        n_neighbors: number of nearest lots requested from the index before filtering
            (default 20, the value previously hard-coded).

    Returns:
        Mapping participant id -> set of recommended lot hashes (possibly empty).
        NOTE(review): the values are sets, so the similarity order returned by Annoy
        is not preserved.
    """
    # Fallback for codes that occur in training but have no lots in the test period.
    no_lots = {'pn_lot_md5': []}

    recommendations_dict = dict()
    for participant, participant_vector in tqdm(train_lsa_vectors_dict.items()):
        participant_codes = grouped_train_participants[participant]
        region_codes = participant_codes["region_code"]
        okpd2_codes = participant_codes["okpd2_code2"]

        lots_okpd2_codes = set(chain(*[grouped_test_okpd2_codes.get(code, no_lots)["pn_lot_md5"] for code in okpd2_codes]))
        lots_region_codes = set(chain(*[grouped_test_region_codes.get(code, no_lots)["pn_lot_md5"] for code in region_codes]))

        # A lot is eligible only if it matches both a known OKPD2 code and a known region.
        lots_for_participant = lots_okpd2_codes & lots_region_codes

        nearest_indexes = storage.get_nns_by_vector(participant_vector, n_neighbors)
        pre_recommendations = indexes_to_pn_lots_md5(nearest_indexes)

        recommendations_dict[participant] = pre_recommendations & lots_for_participant
    return recommendations_dict

In [49]:
# Build recommendations for every participant that has a training LSA vector.
recommendations_dict = recommend(train_lsa_vectors_dict)

100%|██████████| 52920/52920 [06:34<00:00, 134.07it/s]


# Evaluation of the recommendation system / Оценка работы рекомендательной системы:

In [50]:
# Precision@k metric function / Функция, рассчитывающая метрику precision@k:
def precision_at_k(y_true: list, y_pred: list, k: int) -> float:
    """Share of the first ``k`` predictions that are relevant.

    Args:
        y_true: relevant items.
        y_pred: predicted items; only the first ``k`` are considered.
        k: cut-off.

    Returns:
        ``|set(y_true) & set(y_pred[:k])| / len(y_pred[:k])``. The denominator is the
        number of predictions actually available, not ``k``. Returns 0.0 when there
        are no predictions to score instead of raising ZeroDivisionError.
    """
    top_k = y_pred[:k]
    if not top_k:
        return 0.0
    intersection_length = len(set(y_true) & set(top_k))
    return intersection_length / len(top_k)

# Metric evaluation function / Функция оценки работы рекомендательной системы по метрикам:
def evaluation(recommendations_dict: dict, k: int = 5) -> None:
    """Print the mean precision@k of the recommendations over all participants.

    Participants with no lots in the test period (absent from
    ``grouped_test_participants``) and participants with no recommendations both
    contribute a precision of 0 to the mean.

    Args:
        recommendations_dict: mapping participant id -> collection of recommended lot hashes.
        k: cut-off passed to ``precision_at_k`` (default 5, the value previously hard-coded).

    NOTE(review): when the recommendations are sets, ``list(...)`` yields them in
    arbitrary order, so "the first k" is not a similarity ranking.
    """
    precision_at_k_list = []
    count = 0
    for participant, recommended_lots in tqdm(recommendations_dict.items()):
        actual_lots = grouped_test_participants.get(participant)
        if actual_lots is None:
            # Participant took part in no test-period lot.
            y_true = []
            count += 1
        else:
            y_true = actual_lots["pn_lot_md5"]

        y_pred = list(recommended_lots)
        if len(y_pred) == 0:
            precision_at_k_list.append(0)
        else:
            precision_at_k_list.append(precision_at_k(y_true, y_pred, k))

    # np.mean of an empty list is NaN (with a warning); report 0 for empty input instead.
    mean_precision = np.mean(precision_at_k_list) if precision_at_k_list else 0.0
    print(f"\n\nPrecision@{k} = {round(mean_precision * 100, 2)}%\nParticipants not found: {count} of {len(grouped_train_participants)}")

In [51]:
# Prints precision@5 and how many participants have no lots in the test period.
evaluation(recommendations_dict)

100%|██████████| 52920/52920 [00:00<00:00, 227027.32it/s]



Precision@5 = 0.49%
Participants not found: 44366 of 52920





# Playground / Игровая площадка:

In [61]:
# Pick one random participant from the training data.
participant = df_train["participant_inn_kpp"].sample(1).to_numpy()[0]

# Show up to 5 random training rows of that participant. The sample size is capped at
# the number of available rows: sample(5) without replacement raises ValueError for
# participants with fewer than 5 rows.
participant_rows = df_train[df_train["participant_inn_kpp"] == participant]
participant_rows.sample(min(5, len(participant_rows)))

Unnamed: 0,fz,pn_lot_md5,region_code,etp,min_publish_date,purchase_name,lot_name,forsmallbiz,lot_price,customer_inn_kpp,okpd2_code2,item_descriptions,participant_inn_kpp,is_winner
110399,44fz,e78d9721321fe938693b95b33851a76f,10,sberbank-ast.ru,2019-05-30,Электродвигатель,,False,54669.27,1006004229_100601001,27.1,Электродвигатели переменного тока многофазные ...,1001333533_100101001,0
111010,44fz,9b5e112cabf41b27ec4ef338c092ffd2,10,sberbank-ast.ru,2021-08-10,Поставка электротехнической продукции,,False,9630.8,1013000200_101102003,27.4,Лампа ДНаТ,1001333533_100101001,0
415991,44fz,17473a9761b7b2c82bb91c45152f3543,50,rts-tender.ru,2020-07-20,Поставка расходных материалов для электромонта...,,True,286143.8,5075027714_507501001,27.3,Арматура кабельная || Контакторы электромагнит...,1001333533_100101001,0
16578,44fz,e2b7fcf892cc8edd50e6d3f514b2f1ef,29,sberbank-ast.ru,2020-10-14,Поставка элементов питания,,False,43209.0,2901005199_290101001,27.2,Элемент первичный и батарея первичных элементов,1001333533_100101001,1
110496,44fz,c18ee00785e4f035acaddcec8e7770b2,10,sberbank-ast.ru,2020-04-06,кабель,,True,8090.6,1006004282_100601001,27.3,Кабель,1001333533_100101001,1


In [62]:
# Region and OKPD2 codes this participant was seen with in the training data.
region_codes = grouped_train_participants[participant]["region_code"]
okpd2_codes = grouped_train_participants[participant]["okpd2_code2"]
print(region_codes, okpd2_codes, "\n")

# A code seen in training may have no lots in the test period. Fall back to an empty
# list, as recommend() does, instead of raising KeyError.
no_lots = {"pn_lot_md5": []}
lots_okpd2_codes = set(chain(*[grouped_test_okpd2_codes.get(code, no_lots)["pn_lot_md5"] for code in okpd2_codes]))
lots_region_codes = set(chain(*[grouped_test_region_codes.get(code, no_lots)["pn_lot_md5"] for code in region_codes]))
# Eligible lots must match both a known OKPD2 code and a known region.
lots_for_participant = lots_okpd2_codes & lots_region_codes

# 15 nearest test lots by LSA vector, then keep only the eligible ones.
pre_recommendations = indexes_to_pn_lots_md5(storage.get_nns_by_vector(train_lsa_vectors_dict[participant], 15))
recommendations = pre_recommendations & lots_for_participant
recommendations

['29' '33' '63' '78' '47' '51' '60' '68' '10' '11' '34' '35' '39' '53'
 '57' '61' '64' '71' '77' '16' '52' '50' '83'] ['27.4' '27.2' '27.3' '27.5' '27.1' '27.9'] 



{'091ed3a5a12d8bfc103d609c77708932',
 '1703cfdfd44034065ffc1df48e7023f3',
 '2124a58a68b981b8324ba55b5e8d5316',
 '8e162ec527f95c578eba72ce87a60e35',
 '961007709cd51713bae44f51302158ab',
 'a51a6e2eff0788d0177fc5c259d66f8e',
 'ac878486222707dc386c376260c01a80',
 'acfed0c3898951169e74bd71dc86da5d',
 'b2d9e04f270f971af1db734d848a04ed',
 'dcc5120fc1bf6321eb5b92a5a333fd5a',
 'f3cb93076927ffb5146dca9af4415426'}

In [63]:
# Display the first rows of the test lots that were recommended to the sampled participant.
is_recommended_lot = df_test_not_merged["pn_lot_md5"].isin(recommendations)
df_test_not_merged.loc[is_recommended_lot].head()

Unnamed: 0,fz,pn_lot_md5,region_code,etp,min_publish_date,purchase_name,lot_name,forsmallbiz,lot_price,customer_inn_kpp,okpd2_code2,item_descriptions
6309,44fz,961007709cd51713bae44f51302158ab,29,sberbank-ast.ru,2022-04-07,Закупка материалов для нужд МО Коношское,,True,643359.46,2912004704_291201001,27.4,"Автоматический вы-ключатель ВА47-29 2Р 6А 4,5 ..."
56465,44fz,dcc5120fc1bf6321eb5b92a5a333fd5a,34,sberbank-ast.ru,2022-07-04,Электротовары,,True,271882.75,3438003314_343801001,27.4,Выключатель одноклавишный наружный 250В 6А || ...
58674,44fz,ac878486222707dc386c376260c01a80,52,fabrikant.ru,2022-11-22,Электротовары,,True,64385.2,5201003280_520101001,27.4,Автоматический выключатель 1 ф 16 А || Автомат...
86019,44fz,8e162ec527f95c578eba72ce87a60e35,61,rts-tender.ru,2022-06-08,Поставка электротехнических товаров для нужд М...,,False,154661.85,6166008686_616601001,27.4,Лампа светодиодная E27 220-240 В 10 Вт груша н...
102255,44fz,f3cb93076927ffb5146dca9af4415426,78,sberbank-ast.ru,2022-03-18,Поставка электротоваров,,True,301322.0,7817004380_781701001,27.4,Автоматический выключатель однополюсный 10А ||...
