Финальный проект:
Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ)

Целевая метрика - money precision @ 5. Порог для уcпешной сдачи проекта money precision @ 5 > 20%

Бизнес ограничения в топ-5 товарах:
- Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)
- 2 новых товара (юзер никогда не покупал)
- 1 дорогой товар, > 7 долларов
- Все товары из разных категорий (категория - sub_commodity_desc)

- Стоимость каждого рекомендованного товара > 1 доллара
Будет public тестовый датасет, на котором вы сможете измерять метрику
Также будет private тестовый датасет для измерения финального качества
НЕ обязательно использовать 2-ух уровневые рекоммендательные системы в проекте
Вы сдаете код проекта в виде github репозитория и .csv файл с рекомендациями. В .csv файле 2 столбца: user_id - (item_id1, item_id2, ..., item_id5)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k, money_precision_at_k_1
from src.utils import prefilter_items
from src.recommenders import MainRecommender

from statistics import mean


In [2]:
from matplotlib import pyplot as plt

%matplotlib inline
%config InlineBackend.figure_formart = 'svg'
plt.style.use('ggplot')  # Красивые графики
plt.rcParams['figure.figsize'] = (15, 5)  # Размер картинок

In [3]:
data = pd.read_csv('../data/retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)

In [4]:
data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))

In [5]:
price_item = data.groupby('item_id')['price'].mean()
price_item.columns=['item_id', 'price']
price_item.head(2)

item_id
25671    3.49
26081    0.99
Name: price, dtype: float64

In [6]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1.39
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,0.82


In [7]:
item_features = pd.read_csv('../data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [8]:
data_train['item_id'].nunique()

86865

In [9]:
data_train = prefilter_items(data_train, item_features=item_features, take_n_popular=1000)
data_train['item_id'].nunique()

1001

In [10]:
recommender = MainRecommender(data_train)

100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:01<00:00, 14.42it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1001/1001 [00:00<00:00, 15409.82it/s]


In [11]:
train_users = data_train['user_id'].unique()

In [12]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [13]:
result['own'] = 0
result['own'] = result['own'].astype(object)

In [14]:
# Обработка новых клиентов
for i in range(len(result)):
    if result['user_id'][i] in train_users:
        result.at[i, 'own'] = recommender.get_own_recommendations(result['user_id'][i], 5)
    else:
        result.at[i, 'own'] = recommender.get_similar_items_recommendation(result['user_id'][i], 5)


In [15]:
#Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)
for i in range(len(result)):
    recommender._extend_with_top_popular(result['own'][i], N=5)

In [16]:
#2 новых товара (юзер никогда не покупал)
N =5
for i in range(len(result)):
    for j in range(len(result['own'][i])):
        len_result_own_i = 0
        if result['own'][i][j] in result['actual'][i]:
            len_result_own_i +=1
    if len_result_own_i == N:
        result['own'][i][N]= recommender._get_similar_item(result['own'][i][N])
        result['own'][i][N-1] = recommender._get_similar_item(result['own'][i][N-1])
    elif len_result_own_i == N-1:
        result['own'][i][N]= recommender._get_similar_item(result['own'][i][N])

In [17]:
#1 дорогой товар, > 7 долларов
N =5
for i in range(len(result)):
    a = data['item_id']
    for i in range(len(a)):
        if price_item[a[i]] > 7:
            item_id_7 = a[i]
            break    
    for j in range(len(result['own'][i])):
        len_result_own_i = 0
        if price_item[result['own'][i][j]] < 7:
            len_result_own_i +=1
    if len_result_own_i == N:
        result['own'][i][N]= item_id_7

In [18]:
result.head(2)

Unnamed: 0,user_id,actual,own
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 940947, 934369, 9527290, 8090541]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[998206, 1092937, 1053690, 1092026, 885697]"


In [19]:
# precision @ 5
precision_at_5_own = result.apply(lambda x:precision_at_k(x['own'], x['actual']),axis=1).mean()
precision_at_5_own

0.2434867776689496

In [20]:
# # money precision @ 5
a = []
for i in range(len(result['own'])):
    x = money_precision_at_k_1(result['own'][i] , result['actual'][i], price_item, k=5)
    a.append(x)
money_precision_at_5_own = mean(a)
money_precision_at_5_own   

0.23518313146388614

In [21]:
retail_test1 = pd.read_csv('../data/retail_test1.csv')

retail_test1.columns = [col.lower() for col in retail_test1.columns]
retail_test1.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)
retail_test1.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [22]:
result = retail_test1.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [23]:
result['own'] = 0
result['own'] = result['own'].astype(object)

In [24]:
# Обработка новых клиентов
for i in range(len(result)):
    if result['user_id'][i] in train_users:
        result.at[i, 'own'] = recommender.get_own_recommendations(result['user_id'][i], 5)
    else:
        result.at[i, 'own'] = recommender.get_similar_items_recommendation(result['user_id'][i], 5)

In [25]:
#Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)
for i in range(len(result)):
    recommender._extend_with_top_popular(result['own'][i], N=5)

In [26]:
#2 новых товара (юзер никогда не покупал)
N =5
for i in range(len(result)):
    for j in range(len(result['own'][i])):
        len_result_own_i = 0
        if result['own'][i][j] in result['actual'][i]:
            len_result_own_i +=1
    if len_result_own_i == N:
        result['own'][i][N]= recommender._get_similar_item(result['own'][i][N])
        result['own'][i][N-1] = recommender._get_similar_item(result['own'][i][N-1])
    elif len_result_own_i == N-1:
        result['own'][i][N]= recommender._get_similar_item(result['own'][i][N])

In [27]:
#1 дорогой товар, > 7 долларов
N =5
for i in range(len(result)):
    a = data['item_id']
    for i in range(len(a)):
        if price_item[a[i]] > 7:
            item_id_7 = a[i]
            break    
    for j in range(len(result['own'][i])):
        len_result_own_i = 0
        if price_item[result['own'][i][j]] < 7:
            len_result_own_i +=1
    if len_result_own_i == N:
        result['own'][i][N]= item_id_7

In [28]:
result.head(2)

Unnamed: 0,user_id,actual,own
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[856942, 940947, 934369, 9527290, 8090541]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1075368, 1040807, 8090521, 904236, 940947]"


In [29]:
# precision @ 5
precision_at_5_own = result.apply(lambda x:precision_at_k(x['own'], x['actual']),axis=1).mean()
precision_at_5_own

0.20891246684349926

In [30]:
# money precision @ 5
a = []
for i in range(len(result['own'])):
    x = money_precision_at_k_1(result['own'][i] , result['actual'][i], price_item, k=5)
    a.append(x)
money_precision_at_5_own = mean(a)
money_precision_at_5_own   

0.20076802788208423

In [31]:
result.to_csv('../predictions/predictions_retail_test1.csv', index=False)