In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 100)

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from catboost import CatBoost, CatBoostClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

os.environ['MKL_NUM_THREADS'] = '1'

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
import src.features as ft
from src.recommenders import MainRecommender

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('./data/retail_train.csv')
item_features = pd.read_csv('./data/product.csv')
user_features = pd.read_csv('./data/hh_demographic.csv')

In [3]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 10 

# Подготовка фичей датасета

In [4]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [5]:
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [6]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [7]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [8]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [9]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [10]:
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


# Prefilter items

In [11]:
# Определю количество наиболее популярных товаров, которые составляют 90% продаж
items_by_poprularity = data_train_matcher.groupby(by='item_id')['basket_id'].nunique(). \
    reset_index().sort_values(by='basket_id', ascending=False)
items_by_poprularity.rename(columns={'basket_id': 'n_purchases'}, inplace=True)
items_by_poprularity.head()

Unnamed: 0,item_id,n_purchases
34192,1082185,24318
54389,6534178,16233
28450,1029743,11661
24657,995242,10226
36790,1106523,8011


In [12]:
purchses_sum = items_by_poprularity.n_purchases.sum()
top_90_percent_items_list = []
purchses_commul_sum = 0
for item, n_purchases in zip(items_by_poprularity.item_id, items_by_poprularity.n_purchases):
    purchses_commul_sum += n_purchases
    if (purchses_commul_sum / purchses_sum) < 0.9:
        top_90_percent_items_list.append(item)
    else:
        break

n_popular = len(top_90_percent_items_list)
n_popular

18714

In [13]:
# Оставляю в датасете только 18714 товаров. Id остальных заменю на 999999
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, 
                                     take_n_popular=n_popular)

n_items_after = data_train_matcher['item_id'].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 83685 to 18715


# Make cold-start to warm-start

In [14]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (861404, 13) Users: 2495 Items: 18715
val_matcher
Shape: (169615, 12) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 12) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


# Grid search

In [15]:
# from itertools import product

In [16]:
# weightings_list = ['bm25', 'tfidf']
# model_type_list = ['als', 'bpr']
# user_item_matrix_values = ['binary', 'sales_value', 'purchase_sum']
# own_recommender_type_list = ['item-item', 'cosine', 'tfidf']
# recs_type_list = ['own', 'rec', 'itm', 'usr']

In [17]:
# result_dict = {
#     'weighting': [],
#     'model_type': [],
#     'own_recommender_type': [],
#     'user_item_matrix_values': [],
#     'own_recall': [],
#     'rec_recall': [],
#     'itm_recall': [],
#     'usr_recall': []
# }

In [18]:
# %%time
# for ui_value, weighting, model_type, own_recommender in product(
#             user_item_matrix_values, weightings_list, model_type_list, own_recommender_type_list):
#     base_recommender = MainRecommender(data_train_matcher, weighting=weighting, 
#                                        model_type=model_type, own_recommender_type=own_recommender, 
#                                        user_item_matrix_values=ui_value)
#     result_dict['weighting'].append(weighting)
#     result_dict['model_type'].append(model_type)
#     result_dict['own_recommender_type'].append(own_recommender)
#     result_dict['user_item_matrix_values'].append(ui_value)
    
#     for el in recs_type_list:
#         res = base_recommender.evalMetrics(metric_type='recall', df_result=data_val_matcher, 
#                         target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=N_PREDICT)
#         result_dict[el + '_recall'].append(res)

In [19]:
# result_df = pd.DataFrame(result_dict)
# result_df

In [20]:
# for col in result_df.columns.to_list()[4:]:
#     print(f'Best {col}:\n{result_df.loc[np.argmax(result_df[col]), :]}')
#     print('*' * 30)

# Init/train recommender

In [21]:
recommender = MainRecommender(data_train_matcher, weighting='tfidf',
                             model_type='als', own_recommender_type='cosine',
                             user_item_matrix_values='binary')

### Recall@50 of matching

In [22]:
recs_type_list = ['own', 'rec', 'itm', 'usr']

In [23]:
TOPK_RECALL = 10

In [24]:
# for el in recs_type_list:
#     res = recommender.evalMetrics(metric_type='recall', df_result=data_val_matcher, 
#                     target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=TOPK_RECALL)
#     print(f'{el} recall: {res}')

### Precision@5 of matching

In [25]:
TOPK_PRECISION = 5

In [26]:
# for el in recs_type_list:
#     res = recommender.evalMetrics(metric_type='precision', df_result=data_val_matcher, 
#                     target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=TOPK_PRECISION)
#     print(f'{el} precision: {res}')

# Ranking part

## Подготовка данных для трейна

In [27]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [28]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(
    lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [29]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1029743, 913210, 1105426, 933067, 838186, 109..."
1,2021,"[1119454, 950935, 1041390, 844179, 1013928, 65..."


In [30]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [31]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [32]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1029743
0,2070,913210
0,2070,1105426
0,2070,933067


### Check warm start

In [33]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (21510, 2) Users: 2151 Items: 6322


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [34]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

In [35]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1


#### Не хватает нулей в датасете, поэтому добавляем наших кандитатов в качество нулей

In [36]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

In [37]:
df_ranker_train.target.value_counts()

0.0    15950
1.0     5556
Name: target, dtype: int64

In [38]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1029743,0.0
1,2070,913210,1.0


## Подготавливаем фичи для обучения модели

In [39]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [40]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [41]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


### Добавляю новые фичи в датасет

Заказ товара в последних 5 покупках в виде последовательности бит (категориальная).

In [42]:
last_5_purchases_as_bit_array = ft.get_last_5_purchases_as_bit_array(data_train_ranker, item_features)
df_ranker_train = df_ranker_train.merge(last_5_purchases_as_bit_array, on=['item_id'], how='left')

#### user_id

In [43]:
# Средняя сумма покупки 1 товара в каждой категории
mean_purchase_by_department= ft.get_mean_purchase_per_item_by_department(data_train_ranker, 
                                                                      item_features)
df_ranker_train = df_ranker_train.merge(mean_purchase_by_department[['user_id', 
                        'department', 'mean_purchase']], on=['user_id', 'department'], how='left')

# Кол-во покупок в каждой категории
num_purchases_by_department = ft.get_num_purchases_per_department(data_train_ranker, item_features)
df_ranker_train = df_ranker_train.merge(num_purchases_by_department[['user_id', 'department', 
                                        'num_purchases']], on=['user_id', 'department'], how='left')

# Доля покупок утром/днем/вечером
user_trans_df = ft.get_proportion_of_purchases_by_times_of_day(data_train_ranker)
df_ranker_train = df_ranker_train.merge(user_trans_df, on='user_id', how='left')

#### item_id

In [44]:
# Кол-во покупок в неделю
week_purchases_df = ft.get_num_purchases_per_week(data_train_ranker)
df_ranker_train = df_ranker_train.merge(week_purchases_df[['item_id', 'n_purchases_per_week']], 
                                        on=['item_id'], how='left')

# Среднее кол-во покупок 1 товара в категории в неделю
mean_n_purchases_per_week = ft.get_mean_num_purchases_per_item_dept_week(data_train_ranker, 
                                                                      item_features)
df_ranker_train = df_ranker_train.merge(mean_n_purchases_per_week[['department', 
                                        'mean_n_purchases_per_week']], on=['department'], how='left')

# (Кол-во покупок в неделю) / (Среднее кол-во покупок 1 товара в категории в неделю)
df_ranker_train['n_purchases_div_by_mean'] = \
    df_ranker_train.n_purchases_per_week / df_ranker_train.mean_n_purchases_per_week

# Цена
item_price_df = ft.get_price(data_train_ranker)
df_ranker_train = df_ranker_train.merge(item_price_df[['item_id', 'price']], 
                                        on=['item_id'], how='left')

# Цена / Средняя цена товара в категории
df_ranker_train.loc[(df_ranker_train.department == 'PRODUCE') &
                    (df_ranker_train.price == float('Inf')), 'price'] = 0
mean_price_by_department = ft.get_mean_price_by_department(df_ranker_train)
df_ranker_train = df_ranker_train.merge(mean_price_by_department[['department', 'mean_price']], 
                                        on=['department'], how='left')
df_ranker_train['price_div_by_mean_dept_price'] = df_ranker_train.price / df_ranker_train.mean_price
df_ranker_train.drop('mean_price', axis=1, inplace=True)

### user_id - item_id

In [45]:
# (Кол-во покупок юзером конкретной категории в неделю) - (Среднее 
# кол-во покупок всеми юзерами конкретной категории в неделю)
n_purchases_sub_by_mean = ft.get_num_purchases_sub_by_mean(data_train_ranker, item_features)
df_ranker_train = df_ranker_train.merge(n_purchases_sub_by_mean[['user_id', 'department',
        'n_purchases_sub_by_mean']], on=['user_id', 'department'], how='left')

# (Кол-во покупок юзером конкретной категории в неделю) / (Среднее 
# кол-во покупок всеми юзерами конкретной категории в неделю)
n_purchases_div_by_mean = ft.get_num_purchases_div_by_mean_all_users(data_train_ranker, item_features)
df_ranker_train = df_ranker_train.merge(n_purchases_div_by_mean[['user_id', 'department',
        'n_purchases_div_by_mean_all_users']], on=['user_id', 'department'], how='left')

# (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
sales_values_by_dept = ft.get_mean_sales_value_per_item_by_department(data_train_ranker, item_features)
df_ranker_train = df_ranker_train.merge(sales_values_by_dept[['department', 'mean_sale_sum_per_item']], 
                                        on=['department'], how='left')
df_ranker_train['mean_sale_sum_per_item_sub_price'] = \
    df_ranker_train.mean_sale_sum_per_item - df_ranker_train.price
df_ranker_train.drop('mean_sale_sum_per_item', axis=1, inplace=True)

Поведенческие фичи

In [46]:
# Общая сумма покупок каждого товара
total_item_sales_value = ft.get_total_item_sales_value(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(total_item_sales_value, how='left', on='item_id')

# Общее количество по каждому товару
total_quantity_value = ft.get_total_quantity_value(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(total_quantity_value, how='left', on='item_id')

# Количество покупателей по каждому товару
item_freq = ft.get_item_freq(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(item_freq, how='left', on='item_id')

# Частота пользователей
user_freq = ft.get_user_freq(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(user_freq, how='left', on=USER_COL)

# Общее количество покупок по каждому пользователю
total_user_sales_value = ft.get_total_user_sales_value(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(total_user_sales_value, how='left', on='user_id')

# Среднее количество покупок товара в неделю
item_quantity_per_week = ft.get_item_quantity_per_week(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(item_quantity_per_week, how='left', on='item_id')

# Среднее количество купленного товара пользователем в неделю
user_quantity_per_week = ft.get_user_quantity_per_week(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(user_quantity_per_week, how='left', on='user_id')

# Среднее количество товара за 1 покупку
item_quantity_per_basket = ft.get_item_quantity_per_basket(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(item_quantity_per_basket, how='left', on='item_id')

# Среднее количество товара у польователя за 1 покупку
user_quantity_per_basket = ft.get_user_quantity_per_basket(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(user_quantity_per_basket, how='left', on='user_id')

# Средняя частота товара в карзине
item_freq_per_basket = ft.get_item_freq_per_basket(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(item_freq_per_basket, how='left', on='item_id')

# Средняя частота пользователей купивших товар
user_freq_per_basket = ft.get_user_freq_per_basket(df_join_train_matcher)
df_ranker_train = df_ranker_train.merge(user_freq_per_basket, how='left', on='user_id')

Факторы товаров из модели матричной факторизации

In [47]:
item_factors = recommender.get_item_factors()
df_ranker_train = df_ranker_train.merge(item_factors, on=['item_id'], how='left')

Факторы пользователей из модели матричной факторизации

In [48]:
user_factors = recommender.get_user_factors()
df_ranker_train = df_ranker_train.merge(user_factors, on=['user_id'], how='left')

### Разбиваю на X и y

In [54]:
df_ranker_train.fillna(0, inplace=True)

In [55]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean,price,price_div_by_mean_dept_price,n_purchases_sub_by_mean,n_purchases_div_by_mean_all_users,mean_sale_sum_per_item_sub_price,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_basket,item_freq_per_basket,user_freq_per_basket,item_factor_0,item_factor_1,item_factor_2,item_factor_3,item_factor_4,item_factor_5,item_factor_6,item_factor_7,item_factor_8,item_factor_9,item_factor_10,item_factor_11,item_factor_12,item_factor_13,item_factor_14,item_factor_15,item_factor_16,item_factor_17,item_factor_18,item_factor_19,user_factor_0,user_factor_1,user_factor_2,user_factor_3,user_factor_4,user_factor_5,user_factor_6,user_factor_7,user_factor_8,user_factor_9,user_factor_10,user_factor_11,user_factor_12,user_factor_13,user_factor_14,user_factor_15,user_factor_16,user_factor_17,user_factor_18,user_factor_19
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333,2.499367,71.75151,2.682058,0.655522,6.192347,21.127701,-0.150089,35764.66,15015,12737,1996,5754.86,165.0,1218.32967,0.061233,0.452137,0.051943,0.00814,0.066465,0.026564,0.115785,0.020399,0.052085,0.018378,0.034077,0.020123,-0.018639,0.025482,0.034647,0.064597,0.019448,0.002459,0.062425,0.027316,0.030092,0.101295,0.121174,0.086379,1.86111,1.653347,3.124222,2.104717,-0.488921,0.495965,1.286621,2.274672,1.392407,-0.224331,-1.356235,0.561147,0.983569,-1.059754,-1.361881,-2.61805,3.619438,2.754422,2.089347,0.12518
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667,2.499367,7.068457,3.99,0.975196,6.192347,21.127701,-1.458031,5406.18,1364,1175,1996,5754.86,14.989011,1218.32967,0.005563,0.452137,0.004792,0.00814,0.059623,-0.034862,0.048245,0.014096,0.035486,0.040294,0.002642,0.050328,0.058283,0.071104,-0.011521,0.041084,0.064835,0.006787,0.035533,0.02106,0.018043,0.030404,0.093669,0.034801,1.86111,1.653347,3.124222,2.104717,-0.488921,0.495965,1.286621,2.274672,1.392407,-0.224331,-1.356235,0.561147,0.983569,-1.059754,-1.361881,-2.61805,3.619438,2.754422,2.089347,0.12518
2,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,3.596667,2.0,0.714286,0.0,0.285714,1.0,2.260583,0.442364,3.99,0.861,0.808016,5.20876,0.537463,442.9,113,99,1996,5754.86,1.241758,1218.32967,0.000461,0.452137,0.000404,0.00814,0.011821,0.02022,0.022124,-0.003266,0.007878,0.032496,-0.011329,-0.002333,0.026204,0.008198,-0.025307,0.014758,0.004584,0.020256,0.006443,-0.007712,-0.005453,0.003935,0.005894,0.020193,1.86111,1.653347,3.124222,2.104717,-0.488921,0.495965,1.286621,2.274672,1.392407,-0.224331,-1.356235,0.561147,0.983569,-1.059754,-1.361881,-2.61805,3.619438,2.754422,2.089347,0.12518
3,2070,933067,1.0,1425,MEAT-PCKGD,National,BACON,FLAVORED/OTHER,16 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,3.426364,3.0,0.714286,0.0,0.285714,7.333333,2.36945,3.094952,3.99,0.84042,0.80253,5.064053,-0.155712,1974.07,711,520,1996,5754.86,7.813187,1218.32967,0.0029,0.452137,0.002121,0.00814,0.031612,0.04928,0.019087,0.029102,0.030467,-0.006045,0.042394,0.001145,0.028372,0.011762,0.065151,0.072213,0.044012,0.009182,-0.007959,0.010895,-0.005781,0.090651,0.01095,0.005597,1.86111,1.653347,3.124222,2.104717,-0.488921,0.495965,1.286621,2.274672,1.392407,-0.224331,-1.356235,0.561147,0.983569,-1.059754,-1.361881,-2.61805,3.619438,2.754422,2.089347,0.12518
4,2070,838186,1.0,1790,GROCERY,National,BAKED SWEET GOODS,SW GDS:DONUTS,18.2 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,7.5,2.499367,3.00076,3.99,0.975196,6.192347,21.127701,-1.458031,2848.27,752,694,1996,5754.86,8.263736,1218.32967,0.003067,0.452137,0.00283,0.00814,0.05606,0.006627,0.062418,0.010776,0.058622,0.046359,0.054751,0.059989,0.012659,0.044626,0.008342,0.029874,0.041143,-0.011917,0.070987,0.009419,-0.016196,0.05837,0.03178,0.028939,1.86111,1.653347,3.124222,2.104717,-0.488921,0.495965,1.286621,2.274672,1.392407,-0.224331,-1.356235,0.561147,0.983569,-1.059754,-1.361881,-2.61805,3.619438,2.754422,2.089347,0.12518


In [56]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [57]:
cat_feats = X_train.columns[2:16].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

## Обучение модели ранжирования

In [53]:
cb = CatBoost()

In [58]:
cb.fit(X_train, y_train, cat_features=cat_feats)

train_preds = cb.predict_proba(X_train)

Learning rate set to 0.06739
0:	learn: 0.4316849	total: 210ms	remaining: 3m 29s
1:	learn: 0.4264022	total: 254ms	remaining: 2m 6s
2:	learn: 0.4214993	total: 298ms	remaining: 1m 38s
3:	learn: 0.4172681	total: 341ms	remaining: 1m 24s
4:	learn: 0.4135997	total: 383ms	remaining: 1m 16s
5:	learn: 0.4106147	total: 424ms	remaining: 1m 10s
6:	learn: 0.4073980	total: 470ms	remaining: 1m 6s
7:	learn: 0.4044920	total: 512ms	remaining: 1m 3s
8:	learn: 0.4021349	total: 556ms	remaining: 1m 1s
9:	learn: 0.3999185	total: 597ms	remaining: 59.1s
10:	learn: 0.3979629	total: 640ms	remaining: 57.5s
11:	learn: 0.3960279	total: 681ms	remaining: 56.1s
12:	learn: 0.3942621	total: 722ms	remaining: 54.9s
13:	learn: 0.3927993	total: 764ms	remaining: 53.8s
14:	learn: 0.3911522	total: 806ms	remaining: 52.9s
15:	learn: 0.3900394	total: 848ms	remaining: 52.2s
16:	learn: 0.3887222	total: 890ms	remaining: 51.4s
17:	learn: 0.3876865	total: 934ms	remaining: 51s
18:	learn: 0.3866631	total: 976ms	remaining: 50.4s
19:	learn

161:	learn: 0.3582428	total: 7.14s	remaining: 36.9s
162:	learn: 0.3581782	total: 7.18s	remaining: 36.9s
163:	learn: 0.3580550	total: 7.27s	remaining: 37.1s
164:	learn: 0.3579695	total: 7.32s	remaining: 37s
165:	learn: 0.3578200	total: 7.37s	remaining: 37s
166:	learn: 0.3577081	total: 7.42s	remaining: 37s
167:	learn: 0.3575903	total: 7.46s	remaining: 36.9s
168:	learn: 0.3574294	total: 7.54s	remaining: 37.1s
169:	learn: 0.3572879	total: 7.58s	remaining: 37s
170:	learn: 0.3572075	total: 7.62s	remaining: 36.9s
171:	learn: 0.3570276	total: 7.66s	remaining: 36.9s
172:	learn: 0.3569624	total: 7.74s	remaining: 37s
173:	learn: 0.3568262	total: 7.78s	remaining: 36.9s
174:	learn: 0.3567063	total: 7.82s	remaining: 36.9s
175:	learn: 0.3565262	total: 7.86s	remaining: 36.8s
176:	learn: 0.3563435	total: 7.9s	remaining: 36.7s
177:	learn: 0.3562123	total: 7.94s	remaining: 36.7s
178:	learn: 0.3560555	total: 7.99s	remaining: 36.7s
179:	learn: 0.3559867	total: 8.03s	remaining: 36.6s
180:	learn: 0.3559510	t

320:	learn: 0.3408951	total: 14.1s	remaining: 29.8s
321:	learn: 0.3408122	total: 14.2s	remaining: 29.8s
322:	learn: 0.3406590	total: 14.2s	remaining: 29.7s
323:	learn: 0.3405422	total: 14.2s	remaining: 29.7s
324:	learn: 0.3404525	total: 14.3s	remaining: 29.7s
325:	learn: 0.3403985	total: 14.3s	remaining: 29.6s
326:	learn: 0.3403244	total: 14.4s	remaining: 29.6s
327:	learn: 0.3402357	total: 14.4s	remaining: 29.5s
328:	learn: 0.3402300	total: 14.5s	remaining: 29.5s
329:	learn: 0.3402027	total: 14.5s	remaining: 29.4s
330:	learn: 0.3401654	total: 14.5s	remaining: 29.4s
331:	learn: 0.3400365	total: 14.6s	remaining: 29.3s
332:	learn: 0.3399347	total: 14.6s	remaining: 29.3s
333:	learn: 0.3398279	total: 14.7s	remaining: 29.3s
334:	learn: 0.3397791	total: 14.7s	remaining: 29.2s
335:	learn: 0.3396314	total: 14.8s	remaining: 29.2s
336:	learn: 0.3395275	total: 14.8s	remaining: 29.1s
337:	learn: 0.3394654	total: 14.9s	remaining: 29.1s
338:	learn: 0.3394649	total: 14.9s	remaining: 29s
339:	learn: 0.

479:	learn: 0.3283894	total: 20.9s	remaining: 22.7s
480:	learn: 0.3283072	total: 21s	remaining: 22.6s
481:	learn: 0.3282629	total: 21s	remaining: 22.6s
482:	learn: 0.3282130	total: 21.1s	remaining: 22.5s
483:	learn: 0.3281517	total: 21.1s	remaining: 22.5s
484:	learn: 0.3281158	total: 21.1s	remaining: 22.4s
485:	learn: 0.3280819	total: 21.2s	remaining: 22.4s
486:	learn: 0.3279875	total: 21.2s	remaining: 22.4s
487:	learn: 0.3278748	total: 21.3s	remaining: 22.3s
488:	learn: 0.3278218	total: 21.3s	remaining: 22.3s
489:	learn: 0.3277539	total: 21.3s	remaining: 22.2s
490:	learn: 0.3276331	total: 21.4s	remaining: 22.2s
491:	learn: 0.3275247	total: 21.4s	remaining: 22.1s
492:	learn: 0.3274999	total: 21.5s	remaining: 22.1s
493:	learn: 0.3274306	total: 21.5s	remaining: 22s
494:	learn: 0.3273851	total: 21.5s	remaining: 22s
495:	learn: 0.3273688	total: 21.6s	remaining: 21.9s
496:	learn: 0.3273080	total: 21.6s	remaining: 21.9s
497:	learn: 0.3272626	total: 21.7s	remaining: 21.8s
498:	learn: 0.327151

642:	learn: 0.3177195	total: 27.6s	remaining: 15.3s
643:	learn: 0.3176413	total: 27.6s	remaining: 15.3s
644:	learn: 0.3175135	total: 27.7s	remaining: 15.2s
645:	learn: 0.3174475	total: 27.7s	remaining: 15.2s
646:	learn: 0.3173920	total: 27.8s	remaining: 15.1s
647:	learn: 0.3173118	total: 27.8s	remaining: 15.1s
648:	learn: 0.3172616	total: 27.8s	remaining: 15.1s
649:	learn: 0.3171223	total: 27.9s	remaining: 15s
650:	learn: 0.3170726	total: 27.9s	remaining: 15s
651:	learn: 0.3169683	total: 28s	remaining: 14.9s
652:	learn: 0.3168822	total: 28s	remaining: 14.9s
653:	learn: 0.3167593	total: 28s	remaining: 14.8s
654:	learn: 0.3166766	total: 28.1s	remaining: 14.8s
655:	learn: 0.3165951	total: 28.1s	remaining: 14.7s
656:	learn: 0.3164957	total: 28.2s	remaining: 14.7s
657:	learn: 0.3164577	total: 28.2s	remaining: 14.7s
658:	learn: 0.3163920	total: 28.2s	remaining: 14.6s
659:	learn: 0.3163592	total: 28.3s	remaining: 14.6s
660:	learn: 0.3162482	total: 28.3s	remaining: 14.5s
661:	learn: 0.3160974	

803:	learn: 0.3059397	total: 34.3s	remaining: 8.36s
804:	learn: 0.3058591	total: 34.4s	remaining: 8.32s
805:	learn: 0.3058011	total: 34.4s	remaining: 8.28s
806:	learn: 0.3057010	total: 34.4s	remaining: 8.23s
807:	learn: 0.3056492	total: 34.5s	remaining: 8.19s
808:	learn: 0.3055973	total: 34.5s	remaining: 8.15s
809:	learn: 0.3055232	total: 34.6s	remaining: 8.1s
810:	learn: 0.3054810	total: 34.6s	remaining: 8.06s
811:	learn: 0.3053936	total: 34.6s	remaining: 8.02s
812:	learn: 0.3053317	total: 34.7s	remaining: 7.97s
813:	learn: 0.3052616	total: 34.7s	remaining: 7.93s
814:	learn: 0.3051793	total: 34.8s	remaining: 7.89s
815:	learn: 0.3051700	total: 34.8s	remaining: 7.84s
816:	learn: 0.3051132	total: 34.8s	remaining: 7.8s
817:	learn: 0.3050522	total: 34.9s	remaining: 7.76s
818:	learn: 0.3049980	total: 34.9s	remaining: 7.72s
819:	learn: 0.3049049	total: 35s	remaining: 7.67s
820:	learn: 0.3048814	total: 35s	remaining: 7.63s
821:	learn: 0.3047944	total: 35s	remaining: 7.59s
822:	learn: 0.304778

964:	learn: 0.2956113	total: 40.8s	remaining: 1.48s
965:	learn: 0.2955429	total: 40.8s	remaining: 1.44s
966:	learn: 0.2954684	total: 40.9s	remaining: 1.4s
967:	learn: 0.2954193	total: 40.9s	remaining: 1.35s
968:	learn: 0.2953459	total: 41s	remaining: 1.31s
969:	learn: 0.2952608	total: 41s	remaining: 1.27s
970:	learn: 0.2952163	total: 41.1s	remaining: 1.23s
971:	learn: 0.2951476	total: 41.1s	remaining: 1.18s
972:	learn: 0.2950535	total: 41.1s	remaining: 1.14s
973:	learn: 0.2950090	total: 41.2s	remaining: 1.1s
974:	learn: 0.2949796	total: 41.2s	remaining: 1.06s
975:	learn: 0.2948959	total: 41.3s	remaining: 1.01s
976:	learn: 0.2948362	total: 41.3s	remaining: 972ms
977:	learn: 0.2947515	total: 41.3s	remaining: 930ms
978:	learn: 0.2947060	total: 41.4s	remaining: 887ms
979:	learn: 0.2947039	total: 41.4s	remaining: 845ms
980:	learn: 0.2946601	total: 41.5s	remaining: 803ms
981:	learn: 0.2945776	total: 41.5s	remaining: 761ms
982:	learn: 0.2944837	total: 41.5s	remaining: 718ms
983:	learn: 0.2944

NameError: name 'lgb' is not defined

In [None]:
df_ranker_predict = df_ranker_train.copy()

In [None]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [None]:
df_ranker_predict.head()

# Evaluation on test dataset

In [None]:
print('Matcher\'s precision:')
for el in recs_type_list:
    res = recommender.evalMetrics(metric_type='precision', df_result=data_val_matcher, 
                    target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=TOPK_PRECISION)
    print(f'{el} precision: {res}')

In [None]:
print('Ranker\'s precision:')
for el in recs_type_list:
    res = recommender.evalMetrics(metric_type='precision', df_result=data_val_ranker, 
                                    target_col_name=USER_COL, recommend_model_type=el, 
                                    N_PREDICT=TOPK_PRECISION)
    print(f'{el} precision: {res}')

## Eval re-ranked matched result on test dataset

In [None]:
print('Re-ranked precision:')
recommender.reranked_metrics(metric_type='precision', df_result=data_val_ranker, 
                             df_predict=df_ranker_predict, target_col_name=USER_COL, 
                             recommend_model_type='rec', N_PREDICT=TOPK_PRECISION)

# Оценка на тесте для выполнения курсового проекта

In [None]:
df_test = pd.read_csv('./data/retail_test1.csv')
df_test.head()

In [None]:
# warm start
df_test = df_test[df_test.user_id.isin(common_users)]
print_stats_data(df_test,'df_test')

In [None]:
print('Test precision:')
recommender.evalMetrics(metric_type='precision', df_result=df_test, 
                          target_col_name=USER_COL, recommend_model_type='own', 
                          N_PREDICT=TOPK_PRECISION)

In [None]:
print('Test re-ranked precision:')
recommender.reranked_metrics(metric_type='precision', df_result=df_test, 
                             df_predict=df_ranker_predict, target_col_name=USER_COL, 
                             recommend_model_type='own', N_PREDICT=TOPK_PRECISION)