# Курсовой проект по курсу Рекомендательные системы

**Требования к проекту**

Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ)

Целевая метрика - money precision @ 5. Порог для уcпешной сдачи проекта money precision @ 5 > 20%

Бизнес ограничения в топ-5 товарах:
- Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)
- 2 новых товара (юзер никогда не покупал)
- 1 дорогой товар, > 7 долларов
- Все товары из разных категорий (категория - sub_commodity_desc)

Стоимость каждого рекомендованного товара > 1 доллара Будет public тестовый датасет, на котором вы сможете измерять метрику Также будет private тестовый датасет для измерения финального качества НЕ обязательно использовать 2-ух уровневые рекоммендательные системы в проекте Вы сдаете код проекта в виде github репозитория и .csv файл с рекомендациями. В .csv файле 2 столбца: user_id - (item_id1, item_id2, ..., item_id5)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
  
from src.utils import prefilter_items,postfilter_items
from src.recommenders import MainRecommender
from src.metrics import precision_at_k, recall_at_k,money_precision_at_k,money_recall_at_k

In [2]:
#Читаем данные из файла с продажами конкретных товаров для конкретных пользователей
data = pd.read_csv('data/retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

#Разбиваем данные на тестовые и тренировочные, берем последние 3 недели для тренировочных данных
data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
#Читаем данные характеристик товаров
item_features = pd.read_csv('data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
        
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [4]:
#Количество категорий (субкатегорий)
item_features.sub_commodity_desc.nunique()

2383

In [5]:
data_train=prefilter_items(data_train, take_n_popular=int(len(data_train)/100*5), item_features=item_features)

In [6]:
def myfunc(x):
    #result=np.log1p(x)
    result=x
    result=np.sum(result)
    return result

In [7]:
#создаем user_item_matrix матрицу
recommender = MainRecommender(data_train, item_features, data, n_factors=200, regularization=0.001, iterations=40, 
                              num_threads=16, weighting=True, use_item_prices=True,model_name='ALS',
                              fit_own_model=False,target_values='quantity',funcs=myfunc)

recommender.user_item_matrix.head(3)



HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




item_id,25671,26093,26190,26426,26601,26636,26691,26738,26889,27021,...,17328345,17328742,17329473,17329749,17330255,17330511,17382205,17383227,17827644,17829232
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
recommender.user_item_matrix

item_id,25671,26093,26190,26426,26601,26636,26691,26738,26889,27021,...,17328345,17328742,17329473,17329749,17330255,17330511,17382205,17383227,17827644,17829232
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Максимальное количество покупок
recommender.user_item_matrix.max().max()

527.0

In [10]:
#Тренируем алгоритм ALS  и получаем для пользователя рекомендации
recs=recommender.get_recomendations_per_user(2,recommender.model,N=5)

In [11]:
recs

[(67608, 0.8314369),
 (16796, 0.77426517),
 (53311, 0.754006),
 (74759, 0.74464583),
 (13685, 0.7425083)]

In [12]:
#Пример расчитанной рекомендаций 5 товаров для пользователя с userid=2
print('Список рекомендованных товаров с userid=2 - itemids:',[recommender.id_to_itemid[rec[0]] for rec in recs])

Список рекомендованных товаров с userid=2 - itemids: [13133763, 944568, 8357614, 17215077, 913983]


In [13]:
recommender.item_prices

Unnamed: 0,item_id,price
0,25671,3.49
1,26081,0.99
2,26093,1.59
3,26190,1.54
4,26355,0.99
...,...,...
89046,17991689,2.49
89047,17991691,2.49
89048,18000012,4.99
89049,18024155,3.99


In [14]:
recommender.item_prices[recommender.item_prices['item_id']==914190]['price'].values[0]

1.2613225549960219

In [15]:
#Тестовые актуальные данные
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

result.head(10)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
5,9,"[864335, 990865, 1029743, 9297474, 10457112, 8..."
6,13,"[6534178, 1104146, 829197, 840361, 862070, 884..."
7,14,"[840601, 867293, 933067, 951590, 952408, 96569..."
8,15,"[910439, 1082185, 959076, 1023958, 1082310, 13..."
9,16,"[1062973, 1082185, 13007710]"


In [16]:
result['user_id'].shape

(2042,)

In [17]:
result['als'] = result['user_id'].apply(lambda x: recommender.get_recommendations(x, model=recommender.model, N=5))
#result['als'] = result['user_id'].apply(lambda x: recommender.get_own_recommendations(x,  N=5)) #money_precision_at_k=0.147 - плохо 
result['item_prices']=result['als'].apply(lambda x: recommender.fill_prices(x))
result

Unnamed: 0,user_id,actual,als,item_prices
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[995242, 856942, 1075074, 8090541, 5577022]","[1.2125626712676238, 2.7830128205128197, 1.170..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1044078, 951590, 1092026, 1004906, 981760]","[3.120577578475343, 1.6106869054280881, 1.1153..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[878996, 981760, 1024306, 1127831, 845208]","[2.8601832620647536, 1.0302197064184617, 1.921..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[995242, 1086732, 7147142, 1122358, 909611]","[1.2125626712676238, 1.817735849056604, 5.8567..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[981760, 844179, 916122, 1004906, 1029743]","[1.0302197064184617, 3.710114824687299, 4.1319..."
...,...,...,...,...
2037,2496,[6534178],"[981760, 844179, 899624, 1004906, 916122]","[1.0302197064184617, 3.710114824687299, 3.4575..."
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[981760, 5569230, 844179, 899624, 995242]","[1.0302197064184617, 3.0792820945945047, 3.710..."
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[981760, 1004906, 961554, 909534, 844179]","[1.0302197064184617, 2.385178231292398, 1.5206..."
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[5568378, 1044078, 859075, 981760, 1053690]","[2.2642239382239673, 3.120577578475343, 1.5282..."


In [18]:
result.apply(lambda row: money_precision_at_k(row['als'], row['actual'],row['item_prices'],k=5), axis=1).mean()

0.21859116901086978

In [19]:
take_n_popular=500
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top = popularity.sort_values('n_sold', ascending=False).head(take_n_popular).item_id.tolist()
popular_items = data_train.loc[data['item_id'].isin(top)]['item_id'].tolist()

In [20]:
result.head()

Unnamed: 0,user_id,actual,als,item_prices
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[995242, 856942, 1075074, 8090541, 5577022]","[1.2125626712676238, 2.7830128205128197, 1.170..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1044078, 951590, 1092026, 1004906, 981760]","[3.120577578475343, 1.6106869054280881, 1.1153..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[878996, 981760, 1024306, 1127831, 845208]","[2.8601832620647536, 1.0302197064184617, 1.921..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[995242, 1086732, 7147142, 1122358, 909611]","[1.2125626712676238, 1.817735849056604, 5.8567..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[981760, 844179, 916122, 1004906, 1029743]","[1.0302197064184617, 3.710114824687299, 4.1319..."


In [21]:
result['final_als']=result.apply(lambda row: postfilter_items(row['als'],item_features, row['user_id'], data_test, recommender._item_prices,popular_items, N=5), axis=1)

In [22]:
result.apply(lambda row: money_precision_at_k(row['final_als'], row['actual'],row['item_prices'],k=5), axis=1).mean()

0.14495406591495533

In [23]:
final_result=result.loc[:, result.columns.isin(['user_id','final_als'])].copy()

In [24]:
final_result

Unnamed: 0,user_id,final_als
0,1,"[26601, 981760, 1075074, 995242, 5577022]"
1,3,"[26601, 1092026, 1044078, 981760, 1004906]"
2,6,"[26601, 1004906, 878996, 1024306, 845208]"
3,7,"[26601, 7147142, 995242, 909611, 1004906]"
4,8,"[26601, 916122, 981760, 1029743, 1004906]"
...,...,...
2037,2496,"[26601, 899624, 981760, 916122, 1048462]"
2038,2497,"[26601, 844179, 981760, 995242, 1004906]"
2039,2498,"[26601, 961554, 981760, 844179, 1004906]"
2040,2499,"[26601, 859075, 5568378, 1053690, 1004906]"


In [25]:
final_result.rename(columns={'final_als': 'item_id'}, inplace=True)

In [26]:
final_result

Unnamed: 0,user_id,item_id
0,1,"[26601, 981760, 1075074, 995242, 5577022]"
1,3,"[26601, 1092026, 1044078, 981760, 1004906]"
2,6,"[26601, 1004906, 878996, 1024306, 845208]"
3,7,"[26601, 7147142, 995242, 909611, 1004906]"
4,8,"[26601, 916122, 981760, 1029743, 1004906]"
...,...,...
2037,2496,"[26601, 899624, 981760, 916122, 1048462]"
2038,2497,"[26601, 844179, 981760, 995242, 1004906]"
2039,2498,"[26601, 961554, 981760, 844179, 1004906]"
2040,2499,"[26601, 859075, 5568378, 1053690, 1004906]"


In [27]:
final_result.to_csv('recommendations.csv',index=False)

**Были перепробованы основные модели - ALS, BPR,ItemItemRecommender, были перепробованы основные гиперпараметры модели -n_factors,n_iterations, были перепробованы все поля - quantity, sales_value, были перепробованы несколько агрегатных функций и в итоге получился максимально возможный результат. Время на 2-ух уровневую модель не хватило, поэтому можно сказать что добиться от 1-уровневой модели показателя после постфилтрации money_precision_at_k для к=5 очень сложно.
Получилось money_precision_at_k для к=5 до постфильтрации=21.8% и после постфильтрации=14.5%**