In [1]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm_notebook
from tqdm import tqdm

from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split

import os
import glob



Документация:

https://making.lyst.com/lightfm/docs/home.html

https://github.com/lyst/lightfm/tree/master/examples/dataset

In [2]:
# Load the input data

# Labelled interactions and the lots to recommend from (semicolon-separated files)
train_labels, test_data = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        "../data/external/train_labels.csv",
        "../data/external/test_data.csv",
    )
)

# Pre-computed per-lot feature tables (default comma separator)
ohe_fz, ohe_okpd2, ohe_region, ohe_date_scale_price = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/intermid/ohe_fz.csv",
        "../data/intermid/ohe_okpd2.csv",
        "../data/intermid/ohe_region.csv",
        "../data/intermid/ohe_date_log_scale_price.csv",
    )
)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Load the text-description vectors

# text_vectors = pd.read_csv("../data/intermid/svd_vectors_redused.csv")
# text_vectors = pd.read_csv("../data/intermid/svd_vectors_ngrams_redused.csv")
# text_vectors = pd.read_csv("../data/intermid/doc2Vec_vectors.csv")

path = r'../data/intermid/bert'
# glob returns matches in arbitrary order; sort so the concatenated row order
# is the same on every run and every machine.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # Fail with a clear message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No CSV files with text vectors found in {!r}".format(path))
# Lazily read each part and stack them into one frame with a fresh 0..n-1 index.
df_from_each_file = (pd.read_csv(f) for f in all_files)
text_vectors = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
# Convert procedure and participant identifiers to a numeric type for convenience

digits = re.compile("[0-9]+")


def digits_only(text):
    """Return the integer formed by all ASCII digits of ``text``, in order.

    Every run of digits found in ``text`` is concatenated before conversion,
    so ``"a1b22"`` becomes ``122``.  Raises ``ValueError`` when ``text``
    contains no digits at all.
    """
    digit_runs = (match.group(0) for match in digits.finditer(text))
    return int("".join(digit_runs))

# Replace the string identifiers with their numeric part, column by column.
# NOTE(review): this overwrites the columns in place, so the cell cannot be
# re-run on already converted (integer) values.
for frame, id_column in (
    (train_labels, 'pn_lot_anon'),
    (train_labels, 'participant_inn_kpp_anon'),
    (test_data, 'pn_lot_anon'),
):
    frame[id_column] = frame[id_column].apply(digits_only)

In [5]:
# Assemble the per-procedure feature table

# Every table is left-joined onto the previous result by the lot identifier.
lot_join = dict(how='left', left_on='pn_lot_id', right_on='pn_lot_id')

features = (
    ohe_fz
    .merge(ohe_okpd2, **lot_join)
    .merge(ohe_region, **lot_join)
    .merge(ohe_date_scale_price, **lot_join)
    .merge(text_vectors, **lot_join)
)

In [6]:
# Number of unique participants and of unique procedures

print(train_labels['participant_inn_kpp_anon'].nunique())
print(features['pn_lot_id'].nunique())

16499
862999


In [7]:
# User and procedure identifiers, plus the list of feature names

user_ids = train_labels['participant_inn_kpp_anon'].unique().tolist()
item_ids = features['pn_lot_id'].unique().tolist()
# Every column except the first one is treated as a feature
# (assumes 'pn_lot_id' is the first column of `features` — TODO confirm).
item_features_names = features.columns[1:].tolist()

In [8]:
# Create the LightFM dataset with its options and register the id / feature lists

# Identity features are switched off for users and on for items.
dataset = Dataset(
    user_identity_features=False,
    item_identity_features=True,
)

dataset.fit(
    users=user_ids,
    items=item_ids,
    item_features=item_features_names,
)

In [9]:
# Keep the 1st and 3rd mappings returned by the dataset (keyed by user id and by
# item id, see their use below); the 2nd and 4th are discarded.
all_mappings = dataset.mapping()
user_mappings1, _, item_mappings1, _ = all_mappings
n_users, n_items = len(user_mappings1), len(item_mappings1)
print(n_users, n_items)

16499 862999


In [10]:
# Increase 'is_winner' by 1 to tell the current procedures apart
# NOTE(review): the column is overwritten, so re-running this cell shifts it again.

train_labels['is_winner'] = train_labels['is_winner'].add(1)
train_labels

Unnamed: 0,pn_lot_anon,participant_inn_kpp_anon,is_winner,fz
0,1770702,3661899,2,44fz
1,1058704,3661899,2,44fz
2,4186044,3661899,2,44fz
3,6882961,3661899,2,44fz
4,5821609,3661899,2,44fz
...,...,...,...,...
928443,4113734,9228706,1,44fz
928444,1672363,9228706,2,223fz
928445,5319307,9228706,2,223fz
928446,7591446,2431264,2,223fz


In [11]:
%%time
# Build the interaction matrices

# One (user id, item id, weight) row per labelled participation.
interaction_columns = ['participant_inn_kpp_anon', 'pn_lot_anon', 'is_winner']
interaction_triples = np.array(train_labels[interaction_columns])
interactions = dataset.build_interactions(interaction_triples)
interactions

Wall time: 4.8 s


(<16499x862999 sparse matrix of type '<class 'numpy.int32'>'
 	with 928448 stored elements in COOrdinate format>,
 <16499x862999 sparse matrix of type '<class 'numpy.float32'>'
 	with 928448 stored elements in COOrdinate format>)

In [12]:
# Dimensions of the feature table: (number of rows, number of columns)

features.shape

(862999, 637)

In [13]:
%%time
# Build the item-feature matrix


def _iter_item_feature_rows(feature_frame, feature_names):
    """Yield ``(item_id, {feature_name: weight})`` pairs for ``build_item_features``.

    Column 0 of ``feature_frame`` is taken as the item id; the remaining
    columns are paired positionally with ``feature_names``.

    Zero weights are left out on purpose: the dataset builder stores one
    explicit matrix entry for every (item, feature) pair it is given, so
    passing the zeros of the one-hot columns turns the "sparse" matrix into
    an effectively dense one.  A zero weight adds nothing to the row sum, so
    the normalised non-zero values are unchanged.
    """
    for row in np.array(feature_frame):
        item_id = int(row[0])
        weights = row[1:].tolist()
        yield item_id, {
            name: weight
            for name, weight in zip(feature_names, weights)
            if weight != 0
        }


item_features = dataset.build_item_features(
    _iter_item_feature_rows(features, item_features_names),
    normalize=True,
)

Wall time: 17min 35s


In [14]:
# Dimensions reported by the dataset, printed one per line:
# interactions, item features, model dimensions, user features

for shape_getter in (
    dataset.interactions_shape,
    dataset.item_features_shape,
    dataset.model_dimensions,
    dataset.user_features_shape,
):
    print(shape_getter())

(16499, 862999)
(862999, 863635)
(0, 863635)
(16499, 0)


In [15]:
# # делим данные на обучающие и тестовые

# train, test = random_train_test_split(interactions[1], test_percentage=0.2, random_state=None)

In [16]:
# Model hyper-parameters

no_components = 100  # passed to LightFM(no_components=...)
epochs = 10          # passed to model.fit(epochs=...)

In [17]:
# Create the model

# Fix the seed: LightFM initialises and samples randomly, so without it a
# re-run of the notebook cannot reproduce the saved model or its recommendations.
RANDOM_SEED = 42

model = LightFM(
    no_components=no_components,
    loss='warp',
    item_alpha=1e-6,
    random_state=RANDOM_SEED,
)

In [18]:
# Train the model

# model.fit(train, user_features=None, item_features=item_features, sample_weight=train, epochs=10, verbose=True)
# `interactions` is the pair returned by build_interactions: the first matrix is
# used as the training interactions, the second as per-interaction sample weights.
interaction_matrix, weight_matrix = interactions
model.fit(
    interaction_matrix,
    user_features=None,
    item_features=item_features,
    sample_weight=weight_matrix,
    epochs=epochs,
    verbose=True,
)

Epoch: 100%|███████████████████████████████████████████████████████████████████████| 10/10 [4:24:32<00:00, 1587.26s/it]


<lightfm.lightfm.LightFM at 0x272093dfc70>

In [19]:
# Сохранение модели на диск

In [19]:
import pickle

# Destination of the pickled model; earlier experiment variants are kept for reference.
# path = '../models/lightfm_svd_full_data_100c_10e.model'
# path = '../models/lightfm_svd_full_data_50c_10e.model'
# path = '../models/lightfm_svd_full_data_75c_10e.model'
# path = '../models/lightfm_svd_full_data_100c_5e.model'
# path = '../models/lightfm_svd_full_data_100c_20e.model'
# path = '../models/lightfm_doc2vec_full_data_100c_10e.model'
# path = '../models/lightfm_svd_ngrams_full_data_100c_10e.model'
path = '../models/lightfm_bert_full_data_100c_10e.model'

# Training takes hours; make sure the target folder exists so the save
# cannot fail with FileNotFoundError and lose the fitted model.
os.makedirs(os.path.dirname(path), exist_ok=True)

with open(path, 'wb') as f:
    pickle.dump(model, f)

In [20]:
# Load the model back from disk
# NOTE: unpickling can execute arbitrary code — only load model files produced by this notebook.

with open(path, 'rb') as model_file:
    model = pickle.load(model_file)

In [21]:
# Sanity check: shape of the second array returned by get_item_representations
# (per the LightFM docs this is the item embedding matrix — one row per item).
model.get_item_representations(features=item_features)[1].shape

(862999, 100)

In [22]:
# Build the list of internal indices of the current procedures

# Inverted mappings: internal index -> original identifier
user_mappings = {index: ident for ident, index in user_mappings1.items()}
item_mappings = {index: lot_id for lot_id, index in item_mappings1.items()}

# Look up the internal index of every lot listed in test_data (label-based lookup)
lot_to_index = pd.Series(item_mappings1)
actual_item_mapping = lot_to_index[test_data['pn_lot_anon'].tolist()]
items = actual_item_mapping.tolist()

In [23]:
# Build the recommendations for every user

TOP_K = 35  # number of lots recommended per user

# internal item index -> lot id lookup. It does not change inside the loop, so
# build it once instead of re-creating the whole Series for every user.
index_to_lot = pd.Series(item_mappings)

res = []  # flat list of (user id, recommended lot id) pairs
for user_id, user_index in tqdm(user_mappings1.items()):
    # Score every candidate lot for this user
    scores = model.predict(user_index, items, item_features=item_features)
    # Internal indices of the TOP_K best-scored candidates, best first
    top_indices = pd.Series(scores, items).sort_values(ascending=False).keys()[:TOP_K]
    # Translate internal indices back to lot ids
    rec = index_to_lot[top_indices].tolist()
    res.extend((user_id, lot_id) for lot_id in rec)

100%|█████████████████████████████████████████████████████████████████████████| 16499/16499 [16:12:24<00:00,  3.54s/it]


In [24]:
# Формируем файл с рекомендациями

In [25]:
# One row per (user, recommended lot) pair. Naming the columns at construction
# (instead of renaming afterwards) also works when `res` is empty, where the
# rename used to fail with a length-mismatch error.
recommendations = pd.DataFrame(res, columns=['inn_kpp', 'actual_recommended_pn_lot'])
# Constant score for every recommendation; the model scores are not exported.
recommendations['similarity_score'] = 1

In [26]:
# Put the textual prefixes back in front of the numeric identifiers.
# NOTE(review): overwrites the columns, so re-running the cell prefixes them twice.
for column, prefix in (
    ('inn_kpp', 'inn_kpp_'),
    ('actual_recommended_pn_lot', 'pn_lot_'),
):
    recommendations[column] = prefix + recommendations[column].astype(str)

In [27]:
# Preview the final recommendations table
recommendations

Unnamed: 0,inn_kpp,actual_recommended_pn_lot,similarity_score
0,inn_kpp_3661899,pn_lot_440011,1
1,inn_kpp_3661899,pn_lot_7909031,1
2,inn_kpp_3661899,pn_lot_2708581,1
3,inn_kpp_3661899,pn_lot_2682191,1
4,inn_kpp_3661899,pn_lot_4863,1
...,...,...,...
577460,inn_kpp_8441221,pn_lot_2381004,1
577461,inn_kpp_8441221,pn_lot_7557363,1
577462,inn_kpp_8441221,pn_lot_3833386,1
577463,inn_kpp_8441221,pn_lot_3097125,1


In [28]:
# # 100c_10e_svd
# recommendations.to_csv("../data/processed/recommendations_svd.csv", index=False, sep=';')

# # 50c_10e_svd
# recommendations.to_csv("../data/processed/recommendations_svd_v2.csv", index=False, sep=';')

# # 75c_10e_svd
# recommendations.to_csv("../data/processed/recommendations_svd_v3.csv", index=False, sep=';')

# # 100c_5e_svd
# recommendations.to_csv("../data/processed/recommendations_svd_v4.csv", index=False, sep=';')

# # 100c_20e_svd
# recommendations.to_csv("../data/processed/recommendations_svd_v5.csv", index=False, sep=';')

# # 100c_10e_doc2vec
# recommendations.to_csv("../data/processed/recommendations_doc2vec.csv", index=False, sep=';')

# # 100c_10e_svd_ngrams
# recommendations.to_csv("../data/processed/recommendations_svd_ngrams.csv", index=False, sep=';')

# 100c_10e_bert
# Write the table as a semicolon-separated file without the index column.
recommendations_csv = "../data/processed/recommendations_bert.csv"
recommendations.to_csv(recommendations_csv, sep=';', index=False)