# Черновой ноутбук с базовым решением

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 100)

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

# Make the parent of the current working directory importable so that the
# project-local `src` package (imported further down) can be resolved.
import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# NOTE(review): this is set after numpy and implicit have already been
# imported above — confirm the setting is still picked up at this point.
os.environ['MKL_NUM_THREADS'] = '1'

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
import src.features as ft
from src.recommenders import MainRecommender

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Raw inputs: transactions, item catalogue and household demographics.
data, item_features, user_features = map(
    pd.read_csv,
    (
        './data/retail_train.csv',
        './data/product.csv',
        './data/hh_demographic.csv',
    ),
)

In [3]:
# Key column names used across the transaction, item and user tables
ITEM_COL = 'item_id'
USER_COL = 'user_id'
# Column holding the items a user actually bought (used when scoring recommendations)
ACTUAL_COL = 'actual'

# Number of candidate items requested per user from the first-level recommender
N_PREDICT = 10 

## Подготовка фичей датасета

In [4]:
# Lower-case every column name, then rename the key columns of both lookup
# tables so they match the keys used in the transaction data.
item_features.columns = item_features.columns.str.lower()
user_features.columns = user_features.columns.str.lower()

item_features = item_features.rename(columns={'product_id': ITEM_COL})
user_features = user_features.rename(columns={'household_key': USER_COL})

## Разбиваю датасет на train, eval, test

In [5]:
# Offset (in weeks) from the last week_no that separates the matcher data from
# the ranker data: rows with week_no >= max(week_no) - RANKER_WEEKS go to the ranker.
RANKER_WEEKS = 6

In [6]:
# First week_no of the ranking window; the boundary week itself belongs to
# the ranker, everything strictly before it to the matcher.
ranker_first_week = data['week_no'].max() - RANKER_WEEKS
is_matcher_period = data['week_no'] < ranker_first_week
is_ranker_period = data['week_no'] >= ranker_first_week

# data used to fit the matching (first-level) model
data_matcher = data[is_matcher_period]

# data used to fit the ranking (second-level) model
data_ranker = data[is_ranker_period]

In [7]:
def print_stats_data(df_data, name_df):
    """Print a short summary of an interactions frame.

    Prints ``name_df`` on one line, followed by the frame's shape and the
    number of distinct values in its ``USER_COL`` and ``ITEM_COL`` columns.

    Args:
        df_data: DataFrame that contains the ``USER_COL`` and ``ITEM_COL`` columns.
        name_df: Label printed before the statistics.
    """
    print(name_df)
    n_users = df_data[USER_COL].nunique()
    n_items = df_data[ITEM_COL].nunique()
    print(f"Shape: {df_data.shape} Users: {n_users} Items: {n_items}")

In [8]:
# Size of both splits right after the time-based cut.
for frame, label in ((data_matcher, 'matcher'), (data_ranker, 'ranker')):
    print_stats_data(frame, label)

matcher
Shape: (2193515, 12) Users: 2499 Items: 85334
ranker
Shape: (203289, 12) Users: 2197 Items: 30040


In [9]:
# Preview of the raw transaction columns available to the matcher
data_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


## Предфильтрация товаров

In [10]:
# Rank items by the number of distinct baskets they appear in; this ranking
# is used to find how many top items account for 90% of purchases.
items_by_poprularity = (
    data_matcher
    .groupby(by='item_id')['basket_id']
    .nunique()
    .reset_index()
    .rename(columns={'basket_id': 'n_purchases'})
    .sort_values(by='n_purchases', ascending=False)
)
items_by_poprularity.head()

Unnamed: 0,item_id,n_purchases
34457,1082185,25221
54964,6534178,16889
28682,1029743,12323
24866,995242,10448
37082,1106523,8420


In [11]:
# Items are already sorted from most to least popular. Keep the leading items
# whose cumulative share of all purchases is still strictly below 90%.
# The cumulative sum never decreases, so the boolean mask selects exactly the
# prefix that the former row-by-row loop collected before its `break`.
purchses_sum = items_by_poprularity.n_purchases.sum()
cumulative_share = items_by_poprularity.n_purchases.cumsum() / purchses_sum
top_90_percent_items_list = items_by_poprularity.item_id[cumulative_share < 0.9].tolist()

n_popular = len(top_90_percent_items_list)
print(f'90% top items: {n_popular}')

90% top items: 18851


In [12]:
# Keep only the `n_popular` most popular items found above. According to the
# original note, ids of all remaining items are replaced with 999999 inside
# prefilter_items (defined in src.utils — verify there).
n_items_before = data_matcher['item_id'].nunique()

data_matcher = prefilter_items(
    data_matcher,
    take_n_popular=n_popular,
    item_features=item_features,
)

n_items_after = data_matcher['item_id'].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 85334 to 18852


## Убираю холодных пользователей

In [13]:
# Users known to the matcher. Only the distinct ids are kept: the membership
# tests below (and on the test set later) need the set of users, not one
# entry per transaction row.
common_users = data_matcher.user_id.unique()

# Drop "cold" users from the ranker data, i.e. users the matcher never saw.
data_ranker = data_ranker[data_ranker.user_id.isin(common_users)]

print_stats_data(data_matcher,'matcher')
print_stats_data(data_ranker,'ranker')

matcher
Shape: (896610, 13) Users: 2496 Items: 18852
ranker
Shape: (203235, 12) Users: 2194 Items: 30037


GRID SEARCH

In [14]:
# factors = [20, 30, 40,]
# regularization = [0.1, 0.01, 0.001]
# iterations = [10, 15, 20]

In [15]:
# from itertools import product

In [16]:
# res_dict = {
#     'recall': [],
#     'precision': [],
#     'factors': [], 
#     'regularization': [], 
#     'iterations': []
# }

In [17]:
# %%time
# res_list = []
# for factor, reg, iteration in product(factors, regularization, iterations):
#     recommender_params = {'factors': factor, 'regularization': reg, 'iterations': iteration, 
#                           'num_threads': 6, 'random_state': 0}
#     base_recommender = MainRecommender(data_matcher, weighting='tfidf',
#                              model_type='als', own_recommender_type='cosine', 
#                              user_item_matrix_values='binary', recommender_params=recommender_params)
#     recall = base_recommender.evalMetrics(metric_type='recall', df_result=data_ranker, 
#                     target_col_name=USER_COL, recommend_model_type='rec', N_PREDICT=10)
#     precision = base_recommender.evalMetrics(metric_type='precision', df_result=data_ranker, 
#                     target_col_name=USER_COL, recommend_model_type='rec', N_PREDICT=5)
#     res_list.append({'recall': recall, 'precision': precision, 'factors': factor, 
#                      'regularization': reg, 'iterations': iteration})

In [18]:
# res_df = pd.DataFrame(res_list)
# res_df.head()

In [19]:
# for col in res_df.columns.to_list()[:2]:
#     print(f'Best {col}:\n{res_df.loc[np.argmax(res_df[col]), :]}')
#     print('*' * 30)

# Обучаю рекомендательную модель

In [20]:
# Settings forwarded to MainRecommender as `recommender_params`.
recommender_params = dict(
    factors=40,
    regularization=0.001,
    iterations=20,
    num_threads=6,
    random_state=0,
)

In [21]:
# First-level (matching) recommender fitted on the matcher transactions.
recommender = MainRecommender(
    data_matcher,
    model_type='als',
    weighting='tfidf',
    own_recommender_type='cosine',
    user_item_matrix_values='binary',
    recommender_params=recommender_params,
)

### Recall@10 of matching

In [22]:
# Values passed as `recommend_model_type` to recommender.evalMetrics, one per
# recommendation strategy being compared
recs_type_list = ['own', 'rec', 'itm', 'usr']

In [23]:
# k used for recall@k when comparing the first-level recommenders
TOPK_RECALL = 10

In [24]:
%%time
for el in recs_type_list:
    res = recommender.evalMetrics(metric_type='recall', df_result=data_matcher, 
                    target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=TOPK_RECALL)
    print(f'{el} recall: {res}')

own recall: 0.14155149828984645
rec recall: 0.04517868744189889
itm recall: 0.025547770045570732
usr recall: 0.01717979404827861
Wall time: 2min 33s


Для дальнейшего ранжирования буду использовать own_recommender, т.к. он показал наибольшее значение $recall@k$

# Ранжирование

## Подготовка данных для трейна

In [25]:
# One row per user present in the ranker data; candidates are attached next.
df_match_candidates = pd.DataFrame({USER_COL: data_ranker[USER_COL].unique()})

In [26]:
# Collect first-level (matcher) candidates: N_PREDICT items per user.
df_match_candidates['candidates'] = df_match_candidates[USER_COL].map(
    lambda user: recommender.get_own_recommendations(user, N=N_PREDICT)
)

In [27]:
# Each row holds one user and the list of candidate items
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,84,"[903529, 829722, 920025, 901061, 987518, 55693..."
1,1753,"[1116028, 1106523, 890183, 912704, 967041, 110..."


In [28]:
# Unpack every candidate list into one value per row: each list becomes a row
# of a wide frame, stack() turns that into a long Series, and dropping the
# inner index level leaves only the original row index. The index-based join
# in the next cell relies on that index.
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1
                                    ).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [29]:
# Replace the list column with one (user, item) row per candidate; the join
# matches on the row index shared with df_items.
df_match_candidates = df_match_candidates.drop(columns='candidates').join(df_items)

In [30]:
# After unpacking, the index repeats for every candidate of the same user
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,84,903529
0,84,829722
0,84,920025
0,84,901061


### Проверка "теплого" старта

In [31]:
# Warm-start check: number of users and distinct items among the candidates
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (21940, 2) Users: 2194 Items: 6377


### Создаю трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [32]:
# Every (user, item) row in the ranker window is an actual purchase, so all
# of them are labelled as positives.
df_ranker_train = data_ranker[[USER_COL, ITEM_COL]].assign(target=1)

In [33]:
# At this point the frame contains purchases only (target == 1 everywhere)
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2191387,84,862732,1
2191388,84,920025,1
2191389,84,984283,1
2191390,84,1096226,1
2191391,84,1120258,1


#### В датасете не хватает нулей, поэтому добавляем наших кандидатов в качестве нулей

In [34]:
# The matcher candidates become the rows of the training set:
#   * left-merge with the purchases, so a candidate gets target == 1 only if
#     the user really bought it in the ranker window;
#   * keep one row per (user, item) pair, since repeated purchases of the same
#     item duplicate the candidate row after the merge;
#   * candidates without a matching purchase are the negatives (target == 0).
# fillna is applied to the frame and the result is re-bound. The previous
# `df['target'].fillna(0, inplace=True)` modified a column selection in place,
# which is deprecated and has no effect under pandas Copy-on-Write.
df_ranker_train = (
    df_match_candidates
    .merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')
    .drop_duplicates(subset=[USER_COL, ITEM_COL])
    .fillna({'target': 0})
)

In [35]:
# Class balance: 0 = candidate was not bought, 1 = candidate was bought
df_ranker_train.target.value_counts()

0.0    15619
1.0     6316
Name: target, dtype: int64

In [36]:
# The index has gaps because duplicated (user, item) rows were dropped
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,84,903529,1.0
2,84,829722,0.0


## Подготавливаем фичи для обучения модели

In [37]:
# Item attributes that are joined to the training set below
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [38]:
# Household attributes that are joined to the training set below
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [39]:
# Attach the static item and household attributes. Left joins keep every
# candidate row even when an item or a user has no attribute record.
df_ranker_train = (
    df_ranker_train
    .merge(item_features, how='left', on='item_id')
    .merge(user_features, how='left', on='user_id')
)

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,84,903529,1.0,539,DRUG GM,National,CIGARETTES,CIGARETTES,972976 PK,,,,,,,
1,84,829722,0.0,70,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,32 LOAD,,,,,,,


### Добавляю новые фичи в датасет
Заказ товара в последних 5 транзакциях в виде последовательности бит (категориальная).

In [40]:
# Presence of the item in the last 5 transactions, encoded as a bit sequence
# (the surrounding notes describe it as a categorical feature).
# NOTE(review): the feature is joined on item_id only, so every user gets the
# same value for a given item — confirm this is intended.
last_5_purchases_as_bit_array = ft.get_last_5_purchases_as_bit_array(data_ranker, item_features)
df_ranker_train = df_ranker_train.merge(last_5_purchases_as_bit_array, on=['item_id'], how='left')

#### user_id


In [41]:
# NOTE(review): all aggregates in this cell are computed from data_ranker, the
# same transactions that define the positive targets — check for target leakage.

# Mean purchase amount per item in each department (joined per user and department)
mean_purchase_by_department= ft.get_mean_purchase_per_item_by_department(data_ranker, 
                                                                      item_features)
df_ranker_train = df_ranker_train.merge(mean_purchase_by_department[['user_id', 
                        'department', 'mean_purchase']], on=['user_id', 'department'], how='left')

# Number of purchases in each department (joined per user and department)
num_purchases_by_department = ft.get_num_purchases_per_department(data_ranker, item_features)
df_ranker_train = df_ranker_train.merge(num_purchases_by_department[['user_id', 'department', 
                                        'num_purchases']], on=['user_id', 'department'], how='left')

# Share of purchases made in the morning / daytime / evening (per user)
user_trans_df = ft.get_proportion_of_purchases_by_times_of_day(data_ranker)
df_ranker_train = df_ranker_train.merge(user_trans_df, on='user_id', how='left')

# Number of stores in which the item was sold (per item)
n_stores = ft.get_n_stores(data_ranker)
df_ranker_train = df_ranker_train.merge(n_stores, on=['item_id'], how='left')

#### item_id

In [42]:
# NOTE(review): the features below are also derived from data_ranker (the
# target window); items that were not bought in that window get NaN here.

# Number of purchases per week (per item)
week_purchases_df = ft.get_num_purchases_per_week(data_ranker)
df_ranker_train = df_ranker_train.merge(week_purchases_df[['item_id', 'n_purchases_per_week']], 
                                        on=['item_id'], how='left')

# Mean weekly number of purchases of a single item within the department
mean_n_purchases_per_week = ft.get_mean_num_purchases_per_item_dept_week(data_ranker, 
                                                                      item_features)
df_ranker_train = df_ranker_train.merge(mean_n_purchases_per_week[['department', 
                                        'mean_n_purchases_per_week']], on=['department'], how='left')

# (purchases per week) / (mean weekly purchases of a single item in the department)
df_ranker_train['n_purchases_div_by_mean'] = \
    df_ranker_train.n_purchases_per_week / df_ranker_train.mean_n_purchases_per_week

# Price
item_price_df = ft.get_price(data_ranker)
df_ranker_train = df_ranker_train.merge(item_price_df[['item_id', 'price']], 
                                        on=['item_id'], how='left')

# Price / mean item price in the department.
# Infinite prices of PRODUCE items are zeroed first so they do not distort the
# department mean (presumably they come from a zero quantity — verify in ft.get_price).
df_ranker_train.loc[(df_ranker_train.department == 'PRODUCE') &
                    (df_ranker_train.price == float('Inf')), 'price'] = 0
mean_price_by_department = ft.get_mean_price_by_department(df_ranker_train)
df_ranker_train = df_ranker_train.merge(mean_price_by_department[['department', 'mean_price']], 
                                        on=['department'], how='left')
df_ranker_train['price_div_by_mean_dept_price'] = df_ranker_train.price / df_ranker_train.mean_price
# mean_price was only needed to build the ratio above
df_ranker_train.drop('mean_price', axis=1, inplace=True)

# Number of transactions of the user
n_transactions = ft.get_n_transactions(data_ranker)
df_ranker_train = df_ranker_train.merge(n_transactions, on=['user_id'], how='left')

# mean / max / std of the number of unique items in the user's baskets
unique_items = ft.get_unique_items_in_basket(data_ranker)
df_ranker_train = df_ranker_train.merge(unique_items, on=['user_id'], how='left')

# mean / max / std of the number of unique departments in the user's baskets
nunique_departments = ft.get_unique_departments_in_basket(data_ranker, item_features)
df_ranker_train = df_ranker_train.merge(nunique_departments, on=['user_id'], how='left')

#### user_id - item_id

In [43]:
# (user's weekly number of purchases in the department) - (mean weekly number
# of purchases in that department over all users)
n_purchases_sub_by_mean = ft.get_num_purchases_sub_by_mean(data_ranker, item_features)
df_ranker_train = df_ranker_train.merge(n_purchases_sub_by_mean[['user_id', 'department',
        'n_purchases_sub_by_mean']], on=['user_id', 'department'], how='left')

# (user's weekly number of purchases in the department) / (mean weekly number
# of purchases in that department over all users)
n_purchases_div_by_mean = ft.get_num_purchases_div_by_mean_all_users(data_ranker, item_features)
df_ranker_train = df_ranker_train.merge(n_purchases_div_by_mean[['user_id', 'department',
        'n_purchases_div_by_mean_all_users']], on=['user_id', 'department'], how='left')

# (mean purchase amount per item in the item's department) - (price of the item)
sales_values_by_dept = ft.get_mean_sales_value_per_item_by_department(data_ranker, item_features)
df_ranker_train = df_ranker_train.merge(sales_values_by_dept[['department', 'mean_sale_sum_per_item']], 
                                        on=['department'], how='left')
df_ranker_train['mean_sale_sum_per_item_sub_price'] = \
    df_ranker_train.mean_sale_sum_per_item - df_ranker_train.price
# the department mean was only needed to build the difference above
df_ranker_train.drop('mean_sale_sum_per_item', axis=1, inplace=True)

Поведенческие фичи

In [44]:
# Behavioural aggregates. Unlike the cells above, these are computed from
# data_matcher, i.e. from the weeks that precede the ranker window.

# Total sales value of each item
total_item_sales_value = ft.get_total_item_sales_value(data_matcher)
df_ranker_train = df_ranker_train.merge(total_item_sales_value, how='left', on='item_id')

# Total quantity of each item
total_quantity_value = ft.get_total_quantity_value(data_matcher)
df_ranker_train = df_ranker_train.merge(total_quantity_value, how='left', on='item_id')

# Number of buyers of each item
item_freq = ft.get_item_freq(data_matcher)
df_ranker_train = df_ranker_train.merge(item_freq, how='left', on='item_id')

# User frequency
user_freq = ft.get_user_freq(data_matcher)
df_ranker_train = df_ranker_train.merge(user_freq, how='left', on=USER_COL)

# Total amount of purchases of each user
total_user_sales_value = ft.get_total_user_sales_value(data_matcher)
df_ranker_train = df_ranker_train.merge(total_user_sales_value, how='left', on='user_id')

# Mean quantity of the item purchased per week
item_quantity_per_week = ft.get_item_quantity_per_week(data_matcher)
df_ranker_train = df_ranker_train.merge(item_quantity_per_week, how='left', on='item_id')

# Mean quantity purchased by the user per week
user_quantity_per_week = ft.get_user_quantity_per_week(data_matcher)
df_ranker_train = df_ranker_train.merge(user_quantity_per_week, how='left', on='user_id')

# Mean quantity of the item per purchase
item_quantity_per_basket = ft.get_item_quantity_per_basket(data_matcher)
df_ranker_train = df_ranker_train.merge(item_quantity_per_basket, how='left', on='item_id')

# Mean quantity per purchase for the user
user_quantity_per_basket = ft.get_user_quantity_per_basket(data_matcher)
df_ranker_train = df_ranker_train.merge(user_quantity_per_basket, how='left', on='user_id')

# Mean frequency of the item per basket
item_freq_per_basket = ft.get_item_freq_per_basket(data_matcher)
df_ranker_train = df_ranker_train.merge(item_freq_per_basket, how='left', on='item_id')

# Mean frequency of users who bought the item
user_freq_per_basket = ft.get_user_freq_per_basket(data_matcher)
df_ranker_train = df_ranker_train.merge(user_freq_per_basket, how='left', on='user_id')

Факторы товаров из модели матричной факторизации

In [45]:
# Item factors taken from the fitted recommender, joined by item.
item_factors = recommender.get_item_factors()
df_ranker_train = pd.merge(df_ranker_train, item_factors, how='left', on='item_id')

Факторы пользователей из модели матричной факторизации

In [46]:
# User factors taken from the fitted recommender, joined by user.
user_factors = recommender.get_user_factors()
df_ranker_train = pd.merge(df_ranker_train, user_factors, how='left', on='user_id')

### Разбиваю на X и y

In [47]:
# Inspect the assembled feature table before splitting it into X and y
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_stores,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean,price,price_div_by_mean_dept_price,n_transactions,mean_unique_items,max_unique_items,std_unique_items,mean_unique_departments,max_unique_departments,std_unique_departments,n_purchases_sub_by_mean,n_purchases_div_by_mean_all_users,mean_sale_sum_per_item_sub_price,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_basket,item_freq_per_basket,user_freq_per_basket,item_factor_0,...,item_factor_30,item_factor_31,item_factor_32,item_factor_33,item_factor_34,item_factor_35,item_factor_36,item_factor_37,item_factor_38,item_factor_39,user_factor_0,user_factor_1,user_factor_2,user_factor_3,user_factor_4,user_factor_5,user_factor_6,user_factor_7,user_factor_8,user_factor_9,user_factor_10,user_factor_11,user_factor_12,user_factor_13,user_factor_14,user_factor_15,user_factor_16,user_factor_17,user_factor_18,user_factor_19,user_factor_20,user_factor_21,user_factor_22,user_factor_23,user_factor_24,user_factor_25,user_factor_26,user_factor_27,user_factor_28,user_factor_29,user_factor_30,user_factor_31,user_factor_32,user_factor_33,user_factor_34,user_factor_35,user_factor_36,user_factor_37,user_factor_38,user_factor_39
0,84,903529,1.0,539,DRUG GM,National,CIGARETTES,CIGARETTES,972976 PK,,,,,,,,0.0,2.047667,6.0,1.0,0.0,0.0,6.0,1.5,1.492914,1.004746,3.495556,0.565679,6,7.666667,19,6.314006,2.333333,4,1.032796,0.978761,5.42399,0.495253,201.95,60,42,163,693.28,0.681818,1.965909,0.000345,0.000994,0.000241,0.000937,-0.003743,...,-0.014371,-0.005028,0.005625,-0.00032,0.000704,-0.010622,0.008231,0.007779,-0.003941,0.008322,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973
1,84,829722,0.0,70,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,32 LOAD,,,,,,,,0.0,2.02129,5.0,1.0,0.0,0.0,10.0,2.166667,2.530571,0.856197,3.848462,0.939951,6,7.666667,19,6.314006,2.333333,4,1.032796,0.735284,3.777633,-1.302478,599.29,150,134,163,693.28,1.704545,1.965909,0.000862,0.000994,0.00077,0.000937,0.010809,...,-0.017181,0.020641,0.026315,0.011182,0.040963,0.030335,0.02495,0.004742,0.018431,0.017588,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973
2,84,920025,1.0,764,GROCERY,National,LAUNDRY ADDITIVES,FABRIC SOFTENER LIQUID,60 LOAD,,,,,,,,0.0,2.02129,5.0,1.0,0.0,0.0,4.0,1.333333,2.530571,0.52689,5.99,1.463002,6,7.666667,19,6.314006,2.333333,4,1.032796,0.735284,3.777633,-3.444016,427.78,77,74,163,693.28,0.875,1.965909,0.000442,0.000994,0.000425,0.000937,0.007223,...,-0.01216,0.004036,0.020169,0.003333,0.00347,0.019078,0.016412,0.018258,0.005658,0.003187,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973
3,84,901061,0.0,608,GROCERY,National,FRZN MEAT/MEAT DINNERS,FRZN BREADED PREPARED CHICK,28.8 OZ,,,,,,,,0.0,2.02129,5.0,1.0,0.0,0.0,4.0,1.333333,2.530571,0.52689,5.99,1.463002,6,7.666667,19,6.314006,2.333333,4,1.032796,0.735284,3.777633,-3.444016,417.17,83,81,163,693.28,0.943182,1.965909,0.000477,0.000994,0.000465,0.000937,0.030921,...,-0.007308,0.02536,0.018805,0.005701,-0.008073,0.011066,0.014936,0.007071,0.011444,0.016641,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973
4,84,987518,0.0,415,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,100 OZ,,,,,,,,,2.02129,5.0,1.0,0.0,0.0,,,2.530571,,,,6,7.666667,19,6.314006,2.333333,4,1.032796,0.735284,3.777633,,187.88,59,58,163,693.28,0.670455,1.965909,0.000339,0.000994,0.000333,0.000937,-0.008966,...,0.020374,0.010791,0.015696,0.004568,0.012818,0.006943,0.012276,-0.005066,0.005226,0.001444,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973


In [48]:
# Label as a single-column frame, features as everything except the label.
y_train = df_ranker_train[['target']]
X_train = df_ranker_train.drop(columns='target')

In [49]:
# Categorical features, listed by name. These are exactly the columns that the
# former positional slice `X_train.columns[2:16]` selected, but naming them
# keeps the selection correct if the order of the feature merges ever changes,
# and raises a KeyError instead of silently casting the wrong columns.
cat_feats = [
    # item attributes
    'manufacturer', 'department', 'brand', 'commodity_desc',
    'sub_commodity_desc', 'curr_size_of_product',
    # household attributes
    'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
    'hh_comp_desc', 'household_size_desc', 'kid_category_desc',
    # bit-sequence feature, treated as a category
    'item_in_last_5_transactions',
]
X_train[cat_feats] = X_train[cat_feats].astype('category')

## Обучение модели ранжирования

GRID SEARCH

In [50]:
# max_depth_list = [5, 10, 15, 20, 30]
# n_estimators_list = [400, 500, 600, 700]
# learning_rate_list = [0.1, 0.05, 0.01]
# n_iters = len(max_depth_list) * len(n_estimators_list) * len(learning_rate_list)

In [51]:
# from itertools import product

In [52]:
# %%time
# # NOTE(review): this disabled grid search evaluates on `data_val_ranker`,
# # which is not defined anywhere in this notebook; point it at an existing
# # hold-out frame before re-enabling the cell.
# i = 0
# res_list = []
# for max_depth, n_estimators, learning_rate in product(max_depth_list, 
#                                                       n_estimators_list, learning_rate_list):
#     i += 1
#     base_lgb = LGBMClassifier(objective='binary',
#                              max_depth=max_depth,
#                              n_estimators=n_estimators,
#                              learning_rate=learning_rate,
#                              categorical_column=cat_feats,
#                              random_state=0)
#     base_lgb.fit(X_train, y_train)
#     base_train_preds = base_lgb.predict_proba(X_train)
#     base_df_ranker_predict = df_ranker_train.copy()
#     base_df_ranker_predict['proba_item_purchase'] = base_train_preds[:,1]
#     ranker_precision = recommender.evalMetrics(metric_type='precision', df_result=data_val_ranker, 
#                     target_col_name=USER_COL, recommend_model_type='own', N_PREDICT=5)
#     reranked_precision = recommender.reranked_metrics(metric_type='precision', 
#                             df_result=data_val_ranker, df_predict=base_df_ranker_predict,
#                             target_col_name=USER_COL, recommend_model_type='own', N_PREDICT=5)
#     res_list.append({'ranker_precision': ranker_precision, 'reranked_precision': reranked_precision,
#                     'max_depth': max_depth, 'n_estimators': n_estimators,
#                     'learning_rate': learning_rate})
#     print(f'{i}/{n_iters}: ranker_precision: {ranker_precision}, reranked_precision: {reranked_precision}, max_depth: {max_depth}, n_estimators: {n_estimators}, learning_rate: {learning_rate}')

In [53]:
# res_df = pd.DataFrame(res_list)
# res_df.iloc[np.argmax(res_df['reranked_precision']), :]

Обучаю модель

In [54]:
# Hyper-parameters of the second-level (ranking) classifier.
lgb_params = {
    'objective': 'binary',
    'max_depth': 10,
    'n_estimators': 700,
    'learning_rate': 0.05,
    'categorical_column': cat_feats,
    'random_state': 0,
}
lgb = LGBMClassifier(**lgb_params)

In [55]:
# Fit the ranker and score the same rows it was trained on; fit() returns the
# fitted estimator, so the two steps can be chained.
train_preds = lgb.fit(X_train, y_train).predict_proba(X_train)

In [56]:
# Work on a copy so that the prediction column added next does not end up in
# the training frame
df_ranker_predict = df_ranker_train.copy()

In [57]:
# Column 1 of predict_proba is used as the purchase probability — presumably
# the positive class (target == 1); verify against lgb.classes_
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [58]:
# The last column now holds the predicted purchase probability
df_ranker_predict.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_stores,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean,price,price_div_by_mean_dept_price,n_transactions,mean_unique_items,max_unique_items,std_unique_items,mean_unique_departments,max_unique_departments,std_unique_departments,n_purchases_sub_by_mean,n_purchases_div_by_mean_all_users,mean_sale_sum_per_item_sub_price,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_basket,item_freq_per_basket,user_freq_per_basket,item_factor_0,...,item_factor_31,item_factor_32,item_factor_33,item_factor_34,item_factor_35,item_factor_36,item_factor_37,item_factor_38,item_factor_39,user_factor_0,user_factor_1,user_factor_2,user_factor_3,user_factor_4,user_factor_5,user_factor_6,user_factor_7,user_factor_8,user_factor_9,user_factor_10,user_factor_11,user_factor_12,user_factor_13,user_factor_14,user_factor_15,user_factor_16,user_factor_17,user_factor_18,user_factor_19,user_factor_20,user_factor_21,user_factor_22,user_factor_23,user_factor_24,user_factor_25,user_factor_26,user_factor_27,user_factor_28,user_factor_29,user_factor_30,user_factor_31,user_factor_32,user_factor_33,user_factor_34,user_factor_35,user_factor_36,user_factor_37,user_factor_38,user_factor_39,proba_item_purchase
0,84,903529,1.0,539,DRUG GM,National,CIGARETTES,CIGARETTES,972976 PK,,,,,,,,0.0,2.047667,6.0,1.0,0.0,0.0,6.0,1.5,1.492914,1.004746,3.495556,0.565679,6,7.666667,19,6.314006,2.333333,4,1.032796,0.978761,5.42399,0.495253,201.95,60,42,163,693.28,0.681818,1.965909,0.000345,0.000994,0.000241,0.000937,-0.003743,...,-0.005028,0.005625,-0.00032,0.000704,-0.010622,0.008231,0.007779,-0.003941,0.008322,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973,0.657019
1,84,829722,0.0,70,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,32 LOAD,,,,,,,,0.0,2.02129,5.0,1.0,0.0,0.0,10.0,2.166667,2.530571,0.856197,3.848462,0.939951,6,7.666667,19,6.314006,2.333333,4,1.032796,0.735284,3.777633,-1.302478,599.29,150,134,163,693.28,1.704545,1.965909,0.000862,0.000994,0.00077,0.000937,0.010809,...,0.020641,0.026315,0.011182,0.040963,0.030335,0.02495,0.004742,0.018431,0.017588,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973,0.13391
2,84,920025,1.0,764,GROCERY,National,LAUNDRY ADDITIVES,FABRIC SOFTENER LIQUID,60 LOAD,,,,,,,,0.0,2.02129,5.0,1.0,0.0,0.0,4.0,1.333333,2.530571,0.52689,5.99,1.463002,6,7.666667,19,6.314006,2.333333,4,1.032796,0.735284,3.777633,-3.444016,427.78,77,74,163,693.28,0.875,1.965909,0.000442,0.000994,0.000425,0.000937,0.007223,...,0.004036,0.020169,0.003333,0.00347,0.019078,0.016412,0.018258,0.005658,0.003187,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973,0.788844
3,84,901061,0.0,608,GROCERY,National,FRZN MEAT/MEAT DINNERS,FRZN BREADED PREPARED CHICK,28.8 OZ,,,,,,,,0.0,2.02129,5.0,1.0,0.0,0.0,4.0,1.333333,2.530571,0.52689,5.99,1.463002,6,7.666667,19,6.314006,2.333333,4,1.032796,0.735284,3.777633,-3.444016,417.17,83,81,163,693.28,0.943182,1.965909,0.000477,0.000994,0.000465,0.000937,0.030921,...,0.02536,0.018805,0.005701,-0.008073,0.011066,0.014936,0.007071,0.011444,0.016641,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973,0.137591
4,84,987518,0.0,415,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,100 OZ,,,,,,,,,2.02129,5.0,1.0,0.0,0.0,,,2.530571,,,,6,7.666667,19,6.314006,2.333333,4,1.032796,0.735284,3.777633,,187.88,59,58,163,693.28,0.670455,1.965909,0.000339,0.000994,0.000333,0.000937,-0.008966,...,0.010791,0.015696,0.004568,0.012818,0.006943,0.012276,-0.005066,0.005226,0.001444,0.830292,-0.142959,-1.159415,1.116186,0.112815,-0.429911,2.07463,0.728005,-1.343736,-0.790817,-0.920649,0.169293,0.341664,0.550525,0.295602,-0.90044,0.647953,1.454529,-1.731251,2.114842,1.429279,0.199569,0.981325,-1.412788,0.122323,-0.965338,0.472458,1.429912,0.032275,-0.249054,-2.373923,-1.085,2.20889,-0.492749,-0.868033,-0.059651,1.62429,0.164655,0.097899,1.799973,0.000146


# Оценка на тесте для выполнения курсового проекта

In [59]:
# Test transactions; the preview shows the same columns as the training data
df_test = pd.read_csv('./data/retail_test1.csv')
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [60]:
# Warm start: score only the users that are present in the matcher data.
is_known_user = df_test[USER_COL].isin(common_users)
df_test = df_test[is_known_user]
print_stats_data(df_test,'df_test')

df_test
Shape: (88665, 12) Users: 1883 Items: 20492


In [61]:
# k used for precision@k on the test set
TOPK_PRECISION = 5

In [62]:
# Baseline: precision@k of the first-level 'own' recommendations on the test set.
print('Test precision:')
recommender.evalMetrics(
    df_result=df_test,
    metric_type='precision',
    recommend_model_type='own',
    target_col_name=USER_COL,
    N_PREDICT=TOPK_PRECISION,
)

Test precision:


0.2351566648964398

In [63]:
# Same metric after re-ordering the candidates by the ranker's probabilities.
print('Test re-ranked precision:')
recommender.reranked_metrics(
    df_result=df_test,
    df_predict=df_ranker_predict,
    metric_type='precision',
    recommend_model_type='own',
    target_col_name=USER_COL,
    N_PREDICT=TOPK_PRECISION,
)

Test re-ranked precision:


0.2608070757324467

### Перепроверяю получившийся вариант отдельной функцией

In [64]:
def calc_precision(df_data, top_k):
    """Lazily compute the mean precision@k of every recommendation column.

    Every column from the third one onwards is treated as a column of
    recommendation lists and is scored row by row against ``ACTUAL_COL``
    with ``precision_at_k``.

    Args:
        df_data: DataFrame whose columns at positions 2 and later hold the
            recommendations and which also contains ``ACTUAL_COL``.
        top_k: Value passed to ``precision_at_k`` as ``k``.

    Yields:
        ``(column name, mean precision over all rows)`` for each scored column.
    """
    def _precision_for_row(row, rec_col):
        return precision_at_k(row[rec_col], row[ACTUAL_COL], k=top_k)

    for rec_col in df_data.columns[2:]:
        per_row = df_data.apply(_precision_for_row, axis=1, args=(rec_col,))
        yield rec_col, per_row.mean()

In [65]:
def rerank(user_id, top_k=5):
    """Return a user's candidates ordered by predicted purchase probability.

    Looks up the user's rows in ``df_ranker_predict``, sorts them by
    ``proba_item_purchase`` in descending order and keeps the leading items.

    Args:
        user_id: Value of ``USER_COL`` identifying the user.
        top_k: How many items to return. Defaults to 5, the value that used
            to be hard-coded, so existing single-argument calls are unchanged.

    Returns:
        List of at most ``top_k`` item ids; an empty list when the user has
        no rows in ``df_ranker_predict``.
    """
    user_rows = df_ranker_predict[df_ranker_predict[USER_COL] == user_id]
    ranked = user_rows.sort_values('proba_item_purchase', ascending=False)
    return ranked.head(top_k).item_id.tolist()

In [66]:
# One row per test user with the array of distinct items actually bought.
result_test = (
    df_test
    .groupby(USER_COL)[ITEM_COL]
    .unique()
    .rename(ACTUAL_COL)
    .reset_index()
)

In [67]:
# First-level recommendations for every test user.
result_test['own_rec'] = result_test[USER_COL].map(
    lambda user: recommender.get_own_recommendations(user, N=N_PREDICT)
)

In [68]:
# Candidates re-ordered by the ranker for every test user.
result_test['reranked_own_rec'] = result_test[USER_COL].apply(rerank)

In [69]:
# Actual purchases next to the baseline and the re-ranked recommendations
result_test.head(2)

Unnamed: 0,user_id,actual,own_rec,reranked_own_rec
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[856942, 940947, 5577022, 9297615, 9655212, 11...","[877391, 9297615, 940947, 5577022, 9655212]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[8090521, 1106523, 5569230, 916122, 1040807, 1...","[916122, 1106523, 1075368, 5569230, 1076580]"


In [70]:
# Mean precision@k of each recommendation column, best first, one per line.
precision_by_column = sorted(
    calc_precision(result_test, TOPK_PRECISION),
    key=lambda pair: pair[1],
    reverse=True,
)
print('\n'.join(map(str, precision_by_column)))

('reranked_own_rec', 0.2608070757324467)
('own_rec', 0.23536909187466604)
