# Course project Recommendation systems


## **Задачи**
- Выдать рекомендации товаров пользователям из retail_test1.csv
- Получить метрику precision@5 > 0.235


# Import libs

In [509]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

## Read data

In [510]:
PATH_DATA = "../../data"

In [511]:
data = pd.read_csv(os.path.join(PATH_DATA,'retail_train.csv'))
data_test = pd.read_csv(os.path.join(PATH_DATA,'retail_test1.csv'))
item_features = pd.read_csv(os.path.join(PATH_DATA,'product.csv'))
user_features = pd.read_csv(os.path.join(PATH_DATA,'hh_demographic.csv'))

# Set global const

In [512]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'
N_CANDIDATES = 200
TOPK_PRECISION = 5
# N = Neighbors

# Process features dataset

In [513]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [514]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)


VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [515]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [516]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [517]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [518]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


Выше видим разброс по пользователям и товарам и дальше мы перейдем к warm-start (только известные пользователи)

In [519]:
data_val_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


# Prefilter items

Отберем товары из модуля utils.py

In [520]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 5001


Функции для оценки метрик recall@k и precision@5

In [521]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [522]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

# Make cold-start to warm-start

In [523]:
# # ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (861404, 13) Users: 2495 Items: 5001
val_matcher
Shape: (169615, 12) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 12) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


# Init/train recommender

In [524]:
recommender = MainRecommender(data_train_matcher)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

Подготовим тестовый датасет

In [525]:
result_test = data_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


Есть ли пользователи которых нет на трейне

In [526]:

result_test.loc[~result_test[USER_COL].isin(data_train_matcher[USER_COL].unique())]

Unnamed: 0,user_id,actual
1708,2259,"[822346, 825289, 830750, 831536, 859010, 86181..."
1762,2325,"[849274, 863885, 872137, 877913, 883932, 96520..."


In [527]:
# delete
result_test = result_test.loc[result_test[USER_COL].isin(data_train_matcher[USER_COL].unique())]

За базовое решение примем генерацию кандидатов из самых
популярных во всём тестовом датасете. У каждого пользователя
будут одинаковые рекомендации:

In [528]:
result_test['top_popular'] = result_test[USER_COL].apply(lambda x: recommender.overall_top_purchases[:N_CANDIDATES])
result_test.head(3)

Unnamed: 0,user_id,actual,top_popular
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[1029743, 1106523, 5569230, 916122, 844179, 10..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1029743, 1106523, 5569230, 916122, 844179, 10..."
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[1029743, 1106523, 5569230, 916122, 844179, 10..."


In [529]:
baseline_metric = tuple(*calc_precision(result_test, 5))[1]
baseline_metric

0.08390865639936272

Подготовим датасет для оценки качкства кандидатов

In [530]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


Подберем кандидатов и сравненим их качество

In [531]:
def make_recommendations(df_result, rec_name_model, N=50):
    rec_name = rec_name_model[0]
    rec_model = rec_name_model[1]
    df_result[rec_name] = df_result[USER_COL].apply(lambda x: rec_model(x, N=N))

In [532]:
own_rec = ('own_rec', recommender.get_own_recommendations)
als_rec = ('als_rec', recommender.get_als_recommendations)
sim_user_rec = ('sim_user_rec', recommender.get_similar_users_recommendation)
sim_item_rec = ('sim_item_rec', recommender.get_similar_items_recommendation)

In [533]:
%%time

for rec in (own_rec, als_rec, sim_user_rec, sim_item_rec):
    make_recommendations(result_eval_matcher, rec, N=N_CANDIDATES)

CPU times: user 32min 7s, sys: 25min 37s, total: 57min 45s
Wall time: 23min 37s


In [534]:
def rec_list_concat(*lists, res_len=N_CANDIDATES):
    res = []
    for i in range(len(lists[0])):
        for j in range(len(lists)):
            res.append(lists[j][i])
    return res[:res_len]


result_eval_matcher['als_and_own_rec'] = list(map(rec_list_concat, result_eval_matcher['als_rec'], result_eval_matcher['own_rec']))
result_eval_matcher['own_and_als_rec'] = list(map(rec_list_concat, result_eval_matcher['own_rec'], result_eval_matcher['als_rec']))
result_eval_matcher['own_and_sim_item_rec'] = list(map(rec_list_concat, result_eval_matcher['own_rec'],result_eval_matcher['sim_user_rec']))
result_eval_matcher['als_and_sim_item_rec'] = list(map(rec_list_concat, result_eval_matcher['own_rec'], result_eval_matcher['als_rec']))
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual,own_rec,als_rec,sim_user_rec,sim_item_rec,als_and_own_rec,own_and_als_rec,own_and_sim_item_rec,als_and_sim_item_rec
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[856942, 9297615, 5577022, 877391, 9655212, 88...","[1037332, 1094924, 6533936, 885290, 856942, 10...","[974336, 12487356, 8090539, 1135983, 1134633, ...","[824758, 1007512, 9297615, 5577022, 990762, 98...","[1037332, 856942, 1094924, 9297615, 6533936, 5...","[856942, 1037332, 9297615, 1094924, 5577022, 6...","[856942, 974336, 9297615, 12487356, 5577022, 8...","[856942, 1037332, 9297615, 1094924, 5577022, 6..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[911974, 1076580, 1103898, 5567582, 1056620, 9...","[5569230, 916122, 7166861, 7168026, 1029743, 1...","[1054402, 9419422, 7168057, 830202, 9392953, 9...","[8090537, 5569845, 917816, 985999, 9419563, 81...","[5569230, 911974, 916122, 1076580, 7166861, 11...","[911974, 5569230, 1076580, 916122, 1103898, 71...","[911974, 1054402, 1076580, 9419422, 1103898, 7...","[911974, 5569230, 1076580, 916122, 1103898, 71..."


Recall@k кандидатов полученный разным способом

In [535]:
sorted(calc_recall(result_eval_matcher, N_CANDIDATES), key=lambda x: x[1],reverse=True)

[('own_rec', 0.13537278412833254),
 ('als_and_own_rec', 0.12451373188722584),
 ('own_and_als_rec', 0.12451373188722584),
 ('als_and_sim_item_rec', 0.12451373188722584),
 ('own_and_sim_item_rec', 0.09894715106931025),
 ('als_rec', 0.09782101941819307),
 ('sim_item_rec', 0.08450748678374158),
 ('sim_user_rec', 0.016321758561222668)]

Целевая метрика модели

In [536]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec', 0.17712691771269176),
 ('own_and_als_rec', 0.17154811715481172),
 ('als_and_sim_item_rec', 0.17154811715481172),
 ('als_and_own_rec', 0.15946071594607159),
 ('own_and_sim_item_rec', 0.12859135285913528),
 ('als_rec', 0.11817759181775918),
 ('sim_item_rec', 0.05597396559739656),
 ('sim_user_rec', 0.012273361227336126)]

Лучший recall@k показали кандидаты, созданные по принципу own_recommendations.

# Ranking part

## Подготовка данных для трейна

In [593]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [594]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_CANDIDATES))

In [595]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."


In [596]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = ITEM_COL

In [597]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [598]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1097350
0,2070,879194
0,2070,948640


### Check warm start

In [599]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (430200, 2) Users: 2151 Items: 4598


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [689]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1


In [690]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

# дополняем нулями таргет
df_ranker_train['target'].fillna(0, inplace= True)

In [691]:
# проверим дубликаты пар
df_agg = df_ranker_train.groupby(by=['user_id', 'item_id']).count()
df_agg[df_agg.target > 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,target
user_id,item_id,Unnamed: 2_level_1


In [692]:
# проверка на баланс классов
df_ranker_train.target.value_counts()

0.0    397661
1.0     17984
Name: target, dtype: int64

In [693]:
df_ranker_train

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0
2,2070,879194,0.0
3,2070,948640,0.0
4,2070,928263,0.0
...,...,...,...
437597,1745,849202,0.0
437598,1745,944137,0.0
437599,1745,1051211,0.0
437600,1745,831628,0.0


Доля положительного класса

In [694]:
df_ranker_train['target'].mean()

0.04326769238171998

## Подготавливаем фичи для обучения модели

### Описательные фичи

In [695]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [696]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [697]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


### Поведенческие фичи

##### Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

In [698]:
df_join_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [699]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=[USER_COL,ITEM_COL]).agg('quantity').sum().rename('user_item_total_quantity_value'), how='left',on=[USER_COL,ITEM_COL])

# df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=[USER_COL,ITEM_COL]).agg('sales_value').sum().rename('total_user_item_sales_value'), how='left',on=[USER_COL,ITEM_COL])
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=[USER_COL,ITEM_COL]).agg('quantity').sum().rename('user_item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=[USER_COL,ITEM_COL])
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=[USER_COL,ITEM_COL]).agg('quantity').sum().rename('user_item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=[USER_COL,ITEM_COL])

In [700]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,user_item_total_quantity_value,user_item_quantity_per_week,user_item_quantity_per_basket
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,5754.86,1.241758,1218.32967,0.000461,0.452137,0.000404,0.00814,7.0,0.076923,2.9e-05
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,5754.86,0.593407,1218.32967,0.00022,0.452137,0.000208,0.00814,,,
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,...,5754.86,0.593407,1218.32967,0.00022,0.452137,0.000188,0.00814,3.0,0.032967,1.2e-05
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,...,5754.86,0.538462,1218.32967,0.0002,0.452137,0.000179,0.00814,,,
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,5754.86,0.648352,1218.32967,0.000241,0.452137,0.000216,0.00814,3.0,0.032967,1.2e-05


In [701]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [702]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

## Обучение модели ранжирования

In [703]:
%%time
lgb = LGBMClassifier(objective='binary',
                     max_depth=10,
                     n_estimators=200,
                     learning_rate=0.1,
                     categorical_column=cat_feats,
                     n_jobs=-1)

lgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


CPU times: user 48.6 s, sys: 23.7 s, total: 1min 12s
Wall time: 18.5 s


LGBMClassifier(categorical_column=['manufacturer', 'department', 'brand',
                                   'commodity_desc', 'sub_commodity_desc',
                                   'curr_size_of_product', 'age_desc',
                                   'marital_status_code', 'income_desc',
                                   'homeowner_desc', 'hh_comp_desc',
                                   'household_size_desc', 'kid_category_desc',
                                   'total_item_sales_value',
                                   'total_quantity_value', 'item_freq',
                                   'user_freq', 'total_user_sales_value',
                                   'item_quantity_per_week',
                                   'user_quantity_per_week',
                                   'item_quantity_per_basket',
                                   'user_quantity_per_baskter',
                                   'item_freq_per_basket',
                                   'user_fre

In [704]:
train_preds = lgb.predict_proba(X_train)

In [705]:
df_ranker_predict = df_ranker_train.copy()

In [706]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

## Подведем итоги

    Мы обучили модель ранжирования на покупках из сета data_train_ranker и на кандитатах от own_recommendations, что является тренировочным сетом, и теперь наша задача предсказать и оценить именно на тестовом сете.

# Evaluation on test dataset

In [707]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Eval matching on test dataset

In [708]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_CANDIDATES))

CPU times: user 6.38 s, sys: 0 ns, total: 6.38 s
Wall time: 6.39 s


In [709]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.14441176470588235)]

## Eval re-ranked matched result on test dataset
    
    

In [710]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [711]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [712]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.2959791122715405)
('own_rec', 0.14441176470588235)


  return flags.sum() / len(recommended_list)


## Оценка на тесте

In [713]:
data_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


Повторим процедуру инициализации тестового датасета, как делали
на базовом решении:

In [714]:
result_test = data_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [715]:
result_test = result_test.loc[result_test[USER_COL].isin(data_train_matcher[USER_COL].unique())]

In [716]:
#Чистим холодный старт, чтобы необученных юзеров не было в тестовом датасете
common_users = data_train_matcher.user_id.values

result_test = result_test[result_test.user_id.isin(common_users)]

In [717]:
#Строим базовые решения
result_test['own_precision'] = result_test[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_CANDIDATES))

In [718]:
result_test['reranked_own_precision'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

Оценим метрику:

In [719]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_precision', 0.2373873873873874)
('own_precision', 0.12278279341476367)


  return flags.sum() / len(recommended_list)


In [722]:
result_test.head(2)

Unnamed: 0,user_id,actual,own_precision,reranked_own_precision
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[856942, 9297615, 5577022, 877391, 9655212, 88...","[856942, 9655212, 940947, 877391, 1082269]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[911974, 1076580, 1103898, 5567582, 1056620, 9...","[1106523, 899624, 5569230, 885023, 916122]"


ИТОГ: 

('reranked_own_precision', 0.2373873873873874). Сделал в 3 шага: 1) Сделал MVP, почистил холодный старт, построил матчинг 2) Надобавлял фичей. В процессе тестирования модели часть фичей пришлось удалить (в том числе и парных), они отрицательно сказывались на результате.  3) Увеличил гиперпараметр N-Neihbours до 200, и сделал предварительную настройку LGBMClassifier.