### Практическое задание к уроку 6
**Задание 1.**

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_matcher: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?

**Задание 2.**

Обучите модель 2-ого уровня, при этом:

- Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар

- Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_ranker

- Вырос ли precision@5 при использовании двухуровневой модели?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 100)

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

os.environ['MKL_NUM_THREADS'] = '1'

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

import warnings
warnings.filterwarnings('ignore')



In [2]:
data = pd.read_csv('./data/retail_train.csv')
item_features = pd.read_csv('./data/product.csv')
user_features = pd.read_csv('./data/hh_demographic.csv')

In [3]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 10

# Process features dataset

In [4]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [5]:
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [6]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [7]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [8]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [9]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [10]:
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


# Prefilter items

In [11]:
# Определю количество наиболее популярных товаров, которые составляют 90% продаж
items_by_poprularity = data_train_matcher.groupby(by='item_id')['basket_id'].nunique(). \
    reset_index().sort_values(by='basket_id', ascending=False)
items_by_poprularity.rename(columns={'basket_id': 'n_purchases'}, inplace=True)
items_by_poprularity.head()

Unnamed: 0,item_id,n_purchases
34192,1082185,24318
54389,6534178,16233
28450,1029743,11661
24657,995242,10226
36790,1106523,8011


In [12]:
purchses_sum = items_by_poprularity.n_purchases.sum()
top_90_percent_items_list = []
purchses_commul_sum = 0
for item, n_purchases in zip(items_by_poprularity.item_id, items_by_poprularity.n_purchases):
    purchses_commul_sum += n_purchases
    if (purchses_commul_sum / purchses_sum) < 0.9:
        top_90_percent_items_list.append(item)
    else:
        break

n_popular = len(top_90_percent_items_list)
n_popular

18714

In [13]:
# Оставляю в датасете только 18714 товаров. Id остальных заменю на 999999
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, 
                                     take_n_popular=n_popular)

n_items_after = data_train_matcher['item_id'].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 83685 to 18715


# Make cold-start to warm-start

In [14]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (861404, 13) Users: 2495 Items: 18715
val_matcher
Shape: (169615, 12) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 12) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


# Grid search

In [15]:
# from itertools import product

In [16]:
# weightings_list = ['bm25', 'tfidf']
# model_type_list = ['als', 'bpr']
# user_item_matrix_values = ['binary', 'sales_value', 'purchase_sum']
# own_recommender_type_list = ['item-item', 'cosine', 'tfidf']
# recs_type_list = ['own', 'rec', 'itm', 'usr']

In [17]:
# result_dict = {
#     'weighting': [],
#     'model_type': [],
#     'own_recommender_type': [],
#     'user_item_matrix_values': [],
#     'own_recall': [],
#     'rec_recall': [],
#     'itm_recall': [],
#     'usr_recall': []
# }

In [18]:
# %%time
# for ui_value, weighting, model_type, own_recommender in product(
#             user_item_matrix_values, weightings_list, model_type_list, own_recommender_type_list):
#     base_recommender = MainRecommender(data_train_matcher, weighting=weighting, 
#                                        model_type=model_type, own_recommender_type=own_recommender, 
#                                        user_item_matrix_values=ui_value)
#     result_dict['weighting'].append(weighting)
#     result_dict['model_type'].append(model_type)
#     result_dict['own_recommender_type'].append(own_recommender)
#     result_dict['user_item_matrix_values'].append(ui_value)
    
#     for el in recs_type_list:
#         res = base_recommender.evalMetrics(metric_type='recall', df_result=data_val_matcher, 
#                         target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=N_PREDICT)
#         result_dict[el + '_recall'].append(res)

In [19]:
# result_df = pd.DataFrame(result_dict)
# result_df

In [20]:
# for col in result_df.columns.to_list()[4:]:
#     print(f'Best {col}:\n{result_df.loc[np.argmax(result_df[col]), :]}')
#     print('*' * 30)

# Init/train recommender

In [21]:
recommender = MainRecommender(data_train_matcher, weighting='tfidf',
                                 model_type='als', own_recommender_type='cosine')

### Recall@50 of matching

In [22]:
recs_type_list = ['own', 'rec', 'itm', 'usr']

In [23]:
TOPK_RECALL = 10

In [24]:
# for el in recs_type_list:
#     res = recommender.evalMetrics(metric_type='recall', df_result=data_val_matcher, 
#                     target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=TOPK_RECALL)
#     print(f'{el} recall: {res}')

### Precision@5 of matching

In [25]:
TOPK_PRECISION = 5

In [26]:
# for el in recs_type_list:
#     res = recommender.evalMetrics(metric_type='precision', df_result=data_val_matcher, 
#                     target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=TOPK_PRECISION)
#     print(f'{el} precision: {res}')

# Ranking part

## Подготовка данных для трейна

In [27]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [28]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [29]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1029743, 913210, 1105426, 933067, 838186, 109..."
1,2021,"[1119454, 950935, 1041390, 844179, 1013928, 65..."


In [30]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [31]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [32]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1029743
0,2070,913210
0,2070,1105426
0,2070,933067


### Check warm start

In [33]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (21510, 2) Users: 2151 Items: 6322


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [34]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

In [35]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1


#### Не хватает нулей в датасете, поэтому добавляем наших кандитатов в качество нулей

In [36]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

In [37]:
df_ranker_train.target.value_counts()

0.0    15950
1.0     5556
Name: target, dtype: int64

In [38]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1029743,0.0
1,2070,913210,1.0


## Подготавливаем фичи для обучения модели

In [39]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [40]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [41]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


Заказ товара в последних 5 транзакциях в виде последовательности бит (категориальная).

In [42]:
a = data_train_ranker.merge(item_features, on='item_id', 
                how='left').groupby(by=['day', 'trans_time', 'basket_id', 'item_id']
                )['quantity'].count().reset_index().sort_values(by=['day', 'trans_time'], ascending=False)

Unnamed: 0,day,trans_time,basket_id,item_id,quantity
169609,635,2341,41260424015,853119,1
169610,635,2341,41260424015,911311,1
169611,635,2341,41260424015,926692,1
169612,635,2341,41260424015,1079987,1
169613,635,2341,41260424015,1100273,1


In [43]:
trans_dict = {}
trans_no = 0

for day, trans_time, basket_id, item_id, quantity in a.values:
    if basket_id not in trans_dict.keys():
        trans_dict[basket_id] = {'trans_no': trans_no, 'item_id': []}
        trans_no += 1
    trans_dict[basket_id]['item_id'].append(item_id)
    if len(trans_dict) >= 5:
        break

In [44]:
trans_list = []

for value in trans_dict.values():
    trans_list.insert(value['trans_no'], value['item_id'])

In [45]:
result_list = []

for item in df_ranker_train.item_id.unique():
    item_trans = ''
    for trans in trans_list:
        item_trans += '1' if item in trans else '0'
    result_list.append({'item_id': item, 'item_in_last_5_transactions': item_trans})

In [46]:
df_ranker_train = df_ranker_train.merge(pd.DataFrame(result_list), on=['item_id'], how='left')

### user_id
1. Средняя сумма покупки 1 товара в каждой категории

In [47]:
def get_mean_purchase_per_item_by_department(data_train_ranker, item_features):
    sales_value_by_department = data_train_ranker.merge(item_features, on='item_id', 
            how='left').groupby(by=['user_id', 'department'])['sales_value'].sum().reset_index()
    quantity_by_department = data_train_ranker.merge(item_features, on='item_id', 
            how='left').groupby(by=['user_id', 'department'])['quantity'].sum().reset_index()
    mean_purchase_by_department = sales_value_by_department.merge(quantity_by_department, 
                                                    on=['user_id', 'department'], how='left')
    mean_purchase_by_department.drop(0, axis=0, inplace=True)
    mean_purchase_by_department.reset_index(inplace=True)
    mean_purchase_by_department.drop('index', axis=1, inplace=True)
    mean_purchase_by_department['mean_purchase'] = \
            mean_purchase_by_department['sales_value'] / mean_purchase_by_department['quantity']
    return mean_purchase_by_department

In [48]:
mean_purchase_by_department= get_mean_purchase_per_item_by_department(data_train_ranker, 
                                                                      item_features)
df_ranker_train = df_ranker_train.merge(mean_purchase_by_department[['user_id', 
                        'department', 'mean_purchase']], on=['user_id', 'department'], how='left')
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549
2,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,3.596667
3,2070,933067,1.0,1425,MEAT-PCKGD,National,BACON,FLAVORED/OTHER,16 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,3.426364
4,2070,838186,1.0,1790,GROCERY,National,BAKED SWEET GOODS,SW GDS:DONUTS,18.2 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549


2. Кол-во покупок в каждой категории

In [49]:
def get_num_purchases_per_department(data_train_ranker, item_features):
    num_purchases_by_department = data_train_ranker.merge(item_features, on='item_id', 
            how='left').groupby(by=['user_id', 'department'])['basket_id'].nunique().reset_index()
    num_purchases_by_department.rename(columns={'basket_id': 'num_purchases'}, inplace=True)
    num_purchases_by_department.drop(0, axis=0, inplace=True)
    num_purchases_by_department.reset_index(inplace=True)
    num_purchases_by_department.drop('index', axis=1, inplace=True)
    return num_purchases_by_department

In [50]:
num_purchases_by_department = get_num_purchases_per_department(data_train_ranker, item_features)
df_ranker_train = df_ranker_train.merge(num_purchases_by_department[['user_id', 'department', 
                                        'num_purchases']], on=['user_id', 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0


3. Доля покупок утром/днем/вечером

In [51]:
def get_proportion_of_purchases_by_times_of_day(data_train_ranker):
    users_transactions = data_train_ranker[['user_id', 'trans_time']].drop_duplicates(
                        subset=['trans_time']).reset_index().drop('index', axis=1)
    users_list = users_transactions.user_id.unique().tolist()
    
    user_trans_dict = {
        'user_id': [],
        'morning_trans': [],
        'day_trans': [],
        'evening_trans': []
    }
    for user in users_list:
        num_trans = users_transactions.loc[users_transactions.user_id == user, 
                                           'trans_time'].count()
        morning_trans = users_transactions[(users_transactions.user_id == user) & 
                                           (users_transactions.trans_time <= 900)].trans_time.count()
        day_trans = users_transactions[(users_transactions.user_id == user) & 
                                       (users_transactions.trans_time > 900) & 
                                       (users_transactions.trans_time < 1800)].trans_time.count()
        evening_trans = users_transactions[(users_transactions.user_id == user) & 
                                           (users_transactions.trans_time >= 1800)].trans_time.count()
        user_trans_dict['user_id'].append(user)
        user_trans_dict['morning_trans'].append(morning_trans / num_trans)
        user_trans_dict['day_trans'].append(day_trans / num_trans)
        user_trans_dict['evening_trans'].append(evening_trans / num_trans)
        
    user_trans_df = pd.DataFrame(user_trans_dict)
    return user_trans_df

In [52]:
user_trans_df = get_proportion_of_purchases_by_times_of_day(data_train_ranker)
df_ranker_train = df_ranker_train.merge(user_trans_df, on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714


### item_id
1. Кол-во покупок в неделю

In [53]:
def get_num_purchses_per_week(data_train_ranker):
    num_purchases_by_week = data_train_ranker.groupby(by=['item_id', 
                                        'week_no'])['basket_id'].nunique().reset_index()
    num_purchases_by_week.rename(columns={'basket_id': 'week_num_purchases'}, inplace=True)
    week_purchases_df = num_purchases_by_week.groupby(by='item_id').agg({'week_no': 'count', 
                                                 'week_num_purchases': 'sum'}).reset_index()
    week_purchases_df['n_purchases_per_week'] = \
                week_purchases_df.week_num_purchases / week_purchases_df.week_no
    return week_purchases_df

In [54]:
week_purchases_df = get_num_purchses_per_week(data_train_ranker)
df_ranker_train = df_ranker_train.merge(week_purchases_df[['item_id', 'n_purchases_per_week']], 
                                        on=['item_id'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667


2. Среднее кол-во покупок 1 товара в категории в неделю

In [55]:
def get_mean_num_purchases_per_item_dept_week(data_train_ranker, item_features):
    n_purhases_by_dept = data_train_ranker.merge(item_features, on='item_id', how='left').groupby(
        by=['department', 'week_no', 'item_id'])['basket_id'].nunique().reset_index()
    n_purhases_by_dept.rename(columns={'basket_id': 'n_purchases'}, inplace=True)
    n_items_per_week = n_purhases_by_dept.groupby(by=['department', 'week_no'])[
        'item_id'].count().reset_index()
    n_items_per_week.rename(columns={'item_id': 'n_items'}, inplace=True)
    n_purchases_per_week = n_purhases_by_dept.groupby(by=['department', 'week_no'])[
        'n_purchases'].sum().reset_index()
    mean_purchases_per_week = n_items_per_week.merge(n_purchases_per_week, 
                                                     on=['department', 'week_no'], how='left')
    mean_purchases_per_week['mean_purchases_per_week'] = round(
        mean_purchases_per_week.n_purchases / mean_purchases_per_week.n_items, 4)
    mean_purchases_per_week.drop([0, 1, 2, 3, 4, 5], axis=0, inplace=True)
    mean_purchases_per_week.reset_index(inplace=True)
    mean_purchases_per_week.drop('index', axis=1, inplace=True)
    
    mean_n_purchases_per_week = mean_purchases_per_week.groupby(by='department').agg({
        'week_no': 'count', 'mean_purchases_per_week': 'sum'}).reset_index()
    mean_n_purchases_per_week['mean_n_purchases_per_week'] = \
        mean_n_purchases_per_week.mean_purchases_per_week / mean_n_purchases_per_week.week_no
    return mean_n_purchases_per_week

In [56]:
mean_n_purchases_per_week = get_mean_num_purchases_per_item_dept_week(data_train_ranker, 
                                                                      item_features)
df_ranker_train = df_ranker_train.merge(mean_n_purchases_per_week[['department', 
                                        'mean_n_purchases_per_week']], on=['department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week,mean_n_purchases_per_week
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333,2.499367
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667,2.499367


3. (Кол-во покупок в неделю) / (Среднее кол-во покупок 1 товара в категории в неделю)

In [57]:
df_ranker_train['n_purchases_div_by_mean'] = \
    df_ranker_train.n_purchases_per_week / df_ranker_train.mean_n_purchases_per_week
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333,2.499367,71.75151
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667,2.499367,7.068457


4. Цена

In [58]:
def get_price(data_train_ranker):
    item_price_df = data_train_ranker[['item_id', 'quantity', 'sales_value', 'retail_disc']].copy()
    item_price_df['price'] = (item_price_df.sales_value - 
                              item_price_df.retail_disc) / item_price_df.quantity
    item_price_df = item_price_df.groupby(by=['item_id'])['price'].mean().reset_index()
    return item_price_df

In [59]:
item_price_df = get_price(data_train_ranker)
df_ranker_train = df_ranker_train.merge(item_price_df[['item_id', 'price']], 
                                        on=['item_id'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean,price
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333,2.499367,71.75151,2.682058
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667,2.499367,7.068457,3.99


Цена / Средняя цена товара в категории

In [60]:
def get_mean_price_by_department(df_ranker_train):
    mean_price_by_department = df_ranker_train[['department', 
                            'price']].groupby('department')['price'].mean().reset_index()
    mean_price_by_department.rename(columns={'price': 'mean_price'}, inplace=True)
    return mean_price_by_department

In [61]:
df_ranker_train[['department', 'price']].loc[(df_ranker_train.department == 'PRODUCE') &
                                             (df_ranker_train.price > 7), 'price']

32        9.950000
44        9.950000
378      13.221429
6995     14.990000
7214     14.990000
8163     13.221429
10887    13.221429
12631    14.990000
20872          inf
Name: price, dtype: float64

In [62]:
df_ranker_train.loc[(df_ranker_train.department == 'PRODUCE') &
                    (df_ranker_train.price > 7), 'price'] = 0

In [63]:
mean_price_by_department = get_mean_price_by_department(df_ranker_train)
df_ranker_train = df_ranker_train.merge(mean_price_by_department[['department', 'mean_price']], 
                                        on=['department'], how='left')
df_ranker_train['price_div_by_mean_dept_price'] = df_ranker_train.price / df_ranker_train.mean_price
df_ranker_train.drop('mean_price', axis=1, inplace=True)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean,price,price_div_by_mean_dept_price
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333,2.499367,71.75151,2.682058,0.655522
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667,2.499367,7.068457,3.99,0.975196


### user_id - item_id
1. (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

In [64]:
def get_user_nun_purchases_per_week(data_train_ranker, item_features):
    purchases_by_usr = data_train_ranker.merge(item_features, on='item_id', how='left'
            ).groupby(by=['user_id', 'department', 'week_no'])['basket_id'].nunique().reset_index()
    usr_purchases_by_dept = purchases_by_usr.groupby(by=['user_id', 'department']).agg({
                                            'week_no': 'count', 'basket_id': 'sum'}).reset_index()
    usr_purchases_by_dept.rename(columns={'week_no': 'n_weeks', 
                                          'basket_id': 'n_purchases'}, inplace=True)
    usr_purchases_by_dept.drop(0, axis=0, inplace=True)
    usr_purchases_by_dept.reset_index(inplace=True)
    usr_purchases_by_dept.drop('index', axis=1, inplace=True)
    usr_purchases_by_dept['usr_purchases_by_dept_per_week'] = \
            usr_purchases_by_dept.n_purchases / usr_purchases_by_dept.n_weeks
    return usr_purchases_by_dept

In [65]:
def get_mean_purchases_all_users_by_department_per_week(data_train_ranker, item_features):
    all_users_purchases = data_train_ranker.merge(item_features, on='item_id', how='left'
            ).groupby(by=['department', 'week_no', 'user_id'])['basket_id'].nunique().reset_index()
    all_users_purchases_by_dept = all_users_purchases.groupby(by='department').agg({
                           'week_no': 'nunique', 'user_id': 'count', 'basket_id': 'sum'}).reset_index()
    all_users_purchases_by_dept.rename(columns={'user_id': 'n_users', 'week_no': 'n_weeks', 
                                                'basket_id': 'n_purchases'}, inplace=True)
    all_users_purchases_by_dept.drop(0, axis=0, inplace=True)
    all_users_purchases_by_dept.reset_index(inplace=True)
    all_users_purchases_by_dept.drop('index', axis=1, inplace=True)
    all_users_purchases_by_dept['mean_purchses_all_usrs_per_week'] = \
        all_users_purchases_by_dept.n_purchases / all_users_purchases_by_dept.n_users \
        / all_users_purchases_by_dept.n_weeks
    return all_users_purchases_by_dept

In [66]:
def get_num_purchases_sub_by_mean(data_train_ranker, item_features):
    usr_purchases_by_dept = get_user_nun_purchases_per_week(data_train_ranker, item_features)
    all_users_purchases_by_dept = get_mean_purchases_all_users_by_department_per_week(
                                                            data_train_ranker, item_features)
    
    n_purchases_sub_by_mean = usr_purchases_by_dept[['user_id', 'department', 
        'usr_purchases_by_dept_per_week']].merge(all_users_purchases_by_dept[['department',
        'mean_purchses_all_usrs_per_week']], on='department', how='left')
    n_purchases_sub_by_mean['n_purchases_sub_by_mean'] = \
        n_purchases_sub_by_mean.usr_purchases_by_dept_per_week - \
        n_purchases_sub_by_mean.mean_purchses_all_usrs_per_week
    return n_purchases_sub_by_mean

In [67]:
n_purchases_sub_by_mean = get_num_purchases_sub_by_mean(data_train_ranker, item_features)
df_ranker_train = df_ranker_train.merge(n_purchases_sub_by_mean[['user_id', 'department',
        'n_purchases_sub_by_mean']], on=['user_id', 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean,price,price_div_by_mean_dept_price,n_purchases_sub_by_mean
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333,2.499367,71.75151,2.682058,0.655522,6.192347
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667,2.499367,7.068457,3.99,0.975196,6.192347


2. (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)  

In [68]:
def get_num_purchases_div_by_mean_all_users(data_train_ranker, item_features):
    usr_purchases_by_dept = get_user_nun_purchases_per_week(data_train_ranker, item_features)
    all_users_purchases_by_dept = get_mean_purchases_all_users_by_department_per_week(
                                                            data_train_ranker, item_features)
    
    n_purchases_div_by_mean = usr_purchases_by_dept[['user_id', 'department', 
        'usr_purchases_by_dept_per_week']].merge(all_users_purchases_by_dept[['department',
        'mean_purchses_all_usrs_per_week']], on='department', how='left')
    n_purchases_div_by_mean['n_purchases_div_by_mean_all_users'] = \
        n_purchases_sub_by_mean.usr_purchases_by_dept_per_week / \
        n_purchases_sub_by_mean.mean_purchses_all_usrs_per_week
    return n_purchases_div_by_mean

In [69]:
n_purchases_div_by_mean = get_num_purchases_div_by_mean_all_users(data_train_ranker, item_features)
df_ranker_train = df_ranker_train.merge(n_purchases_div_by_mean[['user_id', 'department',
        'n_purchases_div_by_mean_all_users']], on=['user_id', 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean,price,price_div_by_mean_dept_price,n_purchases_sub_by_mean,n_purchases_div_by_mean_all_users
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333,2.499367,71.75151,2.682058,0.655522,6.192347,21.127701
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667,2.499367,7.068457,3.99,0.975196,6.192347,21.127701


(Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)

In [70]:
def get_mean_sales_value_per_item_by_department(data_train_ranker, item_features):
    sales_values_by_dept = data_train_ranker.merge(item_features, on='item_id', 
                how='left').groupby(by=['department']).agg({'item_id': 'count', 
                'sales_value': 'sum'}).reset_index()
    sales_values_by_dept.drop(0, axis=0, inplace=True)
    sales_values_by_dept.reset_index(inplace=True)
    sales_values_by_dept.drop('index', axis=1, inplace=True)
    sales_values_by_dept.rename(columns={'item_id': 'n_items', 'sales_value': 'sale_sum'}, 
                                inplace=True)
    sales_values_by_dept['mean_sale_sum_per_item'] = \
                    sales_values_by_dept.sale_sum / sales_values_by_dept.n_items
    return sales_values_by_dept

In [71]:
sales_values_by_dept = get_mean_sales_value_per_item_by_department(data_train_ranker, item_features)
df_ranker_train = df_ranker_train.merge(sales_values_by_dept[['department', 'mean_sale_sum_per_item']], 
                                        on=['department'], how='left')
df_ranker_train['mean_sale_sum_per_item_sub_price'] = \
    df_ranker_train.mean_sale_sum_per_item - df_ranker_train.price
df_ranker_train.drop('mean_sale_sum_per_item', axis=1, inplace=True)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean,price,price_div_by_mean_dept_price,n_purchases_sub_by_mean,n_purchases_div_by_mean_all_users,mean_sale_sum_per_item_sub_price
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333,2.499367,71.75151,2.682058,0.655522,6.192347,21.127701,-0.150089
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667,2.499367,7.068457,3.99,0.975196,6.192347,21.127701,-1.458031


Поведенческие фичи

In [72]:
# Общая сумма покупок каждого товара
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL). \
                  agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

# Общее количество по каждому товару
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL). \
                  agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

# Количество покупателей по каждому товару
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL). \
                  agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

# Частота пользователей
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL). \
                  agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

# Общее количество покупок по каждому пользователю
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL). \
                  agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

# Среднее количество покупок товара в неделю
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL). \
                  agg('quantity').sum().rename('item_quantity_per_week') / \
                  df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

# Среднее количество купленного товара пользователем в неделю
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL). \
                  agg('quantity').sum().rename('user_quantity_per_week') / \
                  df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)

# Среднее количество товара за 1 покупку
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL). \
                  agg('quantity').sum().rename('item_quantity_per_basket') / \
                  df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

# Среднее количество товара у польователя за 1 покупку
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL). \
                  agg('quantity').sum().rename('user_quantity_per_baskter') / \
                  df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

# Средняя частота товара в карзине
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL). \
                  agg(USER_COL).count().rename('item_freq_per_basket') / 
                  df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

# Средняя частота пользователей купивших товар
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL). \
                  agg(USER_COL).count().rename('user_freq_per_basket') / \
                  df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

Факторы товаров из модели матричной факторизации

In [73]:
item_factors = pd.DataFrame(recommender.model.item_factors)
item_factors.columns = [f'item_factor_{i}' for i in range(len(item_factors.columns))]
item_ids = [recommender.id_to_itemid[itm_id] for itm_id in range(item_factors.shape[0])]
item_factors = pd.concat([pd.DataFrame(item_ids), item_factors], axis=1)
item_factors.rename(columns={0: 'item_id'}, inplace=True)
item_factors.head()

Unnamed: 0,item_id,item_factor_0,item_factor_1,item_factor_2,item_factor_3,item_factor_4,item_factor_5,item_factor_6,item_factor_7,item_factor_8,item_factor_9,item_factor_10,item_factor_11,item_factor_12,item_factor_13,item_factor_14,item_factor_15,item_factor_16,item_factor_17,item_factor_18,item_factor_19
0,27978,0.002045,-0.007561,0.000279,0.000944,-0.000209,0.002397,-0.00633,-0.000978,-0.004241,-0.000347,-0.006964,0.000454,-0.003426,-0.002238,0.003494,0.002723,-0.00028,-0.001372,0.005459,-0.006276
1,32124,0.005415,-0.003266,-0.000481,-0.000976,0.007541,0.003742,0.002792,0.000209,-0.002771,0.003472,-0.004921,0.001991,-0.000318,0.001992,-0.002208,1.4e-05,0.001905,-0.001813,0.0005,-0.000441
2,32456,-0.004606,-0.00012,-0.000826,0.002398,-0.000835,-0.001451,-0.002063,-0.004084,-0.000858,-0.000781,0.000346,-0.00161,0.000446,-0.001984,0.005341,0.005533,-0.000918,-0.006004,0.003223,-0.000676
3,36406,-0.002113,-0.011673,-0.002595,0.001025,-0.002037,0.006117,-0.001108,-0.005131,-0.004899,0.002098,-0.002922,0.001686,-0.005944,-0.0028,0.000759,0.005592,0.000305,-0.001401,0.006621,-0.005896
4,39354,-0.002194,-0.012124,-0.002695,0.001064,-0.002115,0.006353,-0.001151,-0.005329,-0.005088,0.002178,-0.003034,0.001752,-0.006174,-0.002908,0.000788,0.005808,0.000316,-0.001456,0.006876,-0.006123


In [74]:
df_ranker_train = df_ranker_train.merge(item_factors, on=['item_id'], how='left')

Факторы пользователей из модели матричной факторизации

In [75]:
user_factors = pd.DataFrame(recommender.model.user_factors)
user_factors.columns = [f'user_factor_{i}' for i in range(len(user_factors.columns))]
user_ids = [recommender.id_to_userid[usr_id] for usr_id in range(user_factors.shape[0])]
user_factors = pd.concat([pd.DataFrame(user_ids), user_factors], axis=1)
user_factors.rename(columns={0: 'user_id'}, inplace=True)
user_factors.head()

Unnamed: 0,user_id,user_factor_0,user_factor_1,user_factor_2,user_factor_3,user_factor_4,user_factor_5,user_factor_6,user_factor_7,user_factor_8,user_factor_9,user_factor_10,user_factor_11,user_factor_12,user_factor_13,user_factor_14,user_factor_15,user_factor_16,user_factor_17,user_factor_18,user_factor_19
0,1,1.570063,1.793087,-1.25538,1.698795,-1.28298,-2.205495,-3.916419,-2.491551,-1.006178,-0.264562,4.419894,2.907531,2.451324,-0.603816,3.260381,-1.260603,2.381678,3.891313,3.070684,0.474495
1,2,-1.276078,-1.082382,-0.320335,-0.098362,3.759017,0.369653,0.31704,0.987781,1.299852,-0.149853,0.354893,2.815346,-0.252961,0.673437,-0.358584,2.137165,1.624511,0.414853,1.54132,0.014379
2,3,2.290083,-0.895145,0.305767,-1.774877,1.856111,1.918499,-1.665775,1.87405,0.991352,-0.207166,-0.829525,2.455991,-1.265147,0.998726,1.623868,0.585217,3.067434,-1.474437,1.222035,0.020641
3,4,0.85694,-1.13715,0.067816,0.349576,0.527766,2.01615,-2.667477,-0.647076,0.102796,-0.424586,2.219666,0.702985,-0.060151,-1.166902,0.41149,2.304061,3.712423,-0.278785,0.543687,-0.557503
4,5,0.893489,-1.498381,-2.274161,0.732509,1.763704,2.521795,0.151056,-0.232668,0.687653,0.744377,0.918676,1.178571,-0.072062,-0.428846,0.357306,1.21068,0.152592,-2.145304,1.670744,-0.492934


In [76]:
df_ranker_train = df_ranker_train.merge(user_factors, on=['user_id'], how='left')

### Разбиваю на X и y

In [77]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,item_in_last_5_transactions,mean_purchase,num_purchases,morning_trans,day_trans,evening_trans,n_purchases_per_week,mean_n_purchases_per_week,n_purchases_div_by_mean,price,price_div_by_mean_dept_price,n_purchases_sub_by_mean,n_purchases_div_by_mean_all_users,mean_sale_sum_per_item_sub_price,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,item_factor_0,item_factor_1,item_factor_2,item_factor_3,item_factor_4,item_factor_5,item_factor_6,item_factor_7,item_factor_8,item_factor_9,item_factor_10,item_factor_11,item_factor_12,item_factor_13,item_factor_14,item_factor_15,item_factor_16,item_factor_17,item_factor_18,item_factor_19,user_factor_0,user_factor_1,user_factor_2,user_factor_3,user_factor_4,user_factor_5,user_factor_6,user_factor_7,user_factor_8,user_factor_9,user_factor_10,user_factor_11,user_factor_12,user_factor_13,user_factor_14,user_factor_15,user_factor_16,user_factor_17,user_factor_18,user_factor_19
0,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,179.333333,2.499367,71.75151,2.682058,0.655522,6.192347,21.127701,-0.150089,35764.66,15015,12737,1996,5754.86,165.0,1218.32967,0.061233,0.452137,0.051943,0.00814,0.013658,-0.004418,-0.029481,0.003334,0.087631,0.069758,-0.031747,0.055653,0.032106,0.037101,0.043613,0.100045,0.058864,0.006548,0.086371,0.026479,0.079501,-0.023664,0.039154,0.019993,-0.456368,-0.128948,1.702129,-0.604117,2.183773,-0.490204,-0.482594,-0.812225,0.540166,3.529608,-0.725454,1.370062,-0.187917,-2.211823,3.916788,2.26327,3.728155,0.543653,3.323811,0.074283
1,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,17.666667,2.499367,7.068457,3.99,0.975196,6.192347,21.127701,-1.458031,5406.18,1364,1175,1996,5754.86,14.989011,1218.32967,0.005563,0.452137,0.004792,0.00814,-0.017873,0.023815,-0.027981,0.025102,0.039161,0.035273,0.054501,0.021679,0.058979,0.0723,0.011073,0.052961,0.017503,0.033285,0.027476,0.00172,0.086734,0.019655,0.02754,0.036289,-0.456368,-0.128948,1.702129,-0.604117,2.183773,-0.490204,-0.482594,-0.812225,0.540166,3.529608,-0.725454,1.370062,-0.187917,-2.211823,3.916788,2.26327,3.728155,0.543653,3.323811,0.074283
2,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,3.596667,2.0,0.714286,0.0,0.285714,1.0,2.260583,0.442364,3.99,0.861,0.808016,5.20876,0.537463,442.9,113,99,1996,5754.86,1.241758,1218.32967,0.000461,0.452137,0.000404,0.00814,-0.001072,0.019785,0.017605,0.014162,0.010852,0.001114,-0.007499,-0.001613,-0.016147,0.035185,0.014363,-0.002664,0.004424,0.011221,-0.017329,-0.005015,0.031844,0.000549,0.008234,0.004858,-0.456368,-0.128948,1.702129,-0.604117,2.183773,-0.490204,-0.482594,-0.812225,0.540166,3.529608,-0.725454,1.370062,-0.187917,-2.211823,3.916788,2.26327,3.728155,0.543653,3.323811,0.074283
3,2070,933067,1.0,1425,MEAT-PCKGD,National,BACON,FLAVORED/OTHER,16 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,3.426364,3.0,0.714286,0.0,0.285714,7.333333,2.36945,3.094952,3.99,0.84042,0.80253,5.064053,-0.155712,1974.07,711,520,1996,5754.86,7.813187,1218.32967,0.0029,0.452137,0.002121,0.00814,0.045936,0.017835,0.057833,-0.030293,-0.005777,0.025867,0.008961,0.050592,0.034489,0.034195,-0.017803,0.075467,0.051759,0.032369,0.054374,0.046651,-0.004463,-0.028231,0.008649,0.030161,-0.456368,-0.128948,1.702129,-0.604117,2.183773,-0.490204,-0.482594,-0.812225,0.540166,3.529608,-0.725454,1.370062,-0.187917,-2.211823,3.916788,2.26327,3.728155,0.543653,3.323811,0.074283
4,2070,838186,1.0,1790,GROCERY,National,BAKED SWEET GOODS,SW GDS:DONUTS,18.2 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0,1.461549,39.0,0.714286,0.0,0.285714,7.5,2.499367,3.00076,3.99,0.975196,6.192347,21.127701,-1.458031,2848.27,752,694,1996,5754.86,8.263736,1218.32967,0.003067,0.452137,0.00283,0.00814,0.008211,0.052232,0.018282,-0.013562,0.051826,0.077358,0.024086,0.01897,0.026714,0.023735,0.011206,0.041815,0.028882,0.020833,0.052984,0.0016,0.071893,0.035956,0.000946,0.003848,-0.456368,-0.128948,1.702129,-0.604117,2.183773,-0.490204,-0.482594,-0.812225,0.540166,3.529608,-0.725454,1.370062,-0.187917,-2.211823,3.916788,2.26327,3.728155,0.543653,3.323811,0.074283


In [78]:
df_ranker_train.fillna(0, inplace=True)

In [79]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [80]:
cat_feats = X_train.columns[2:16].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'item_in_last_5_transactions']

## Обучение модели ранжирования

In [None]:
# lgb = LGBMClassifier(objective='binary',
#                      max_depth=10,
#                      n_estimators=500,
#                      learning_rate=0.1,
#                      categorical_column=cat_feats)

In [None]:
# lgb.fit(X_train, y_train)

# train_preds = lgb.predict_proba(X_train)

In [81]:
from catboost import CatBoostClassifier

In [90]:
cbclf = CatBoostClassifier(custom_metric='Precision', cat_features=cat_feats, silent=True, random_state=0)

In [91]:
%%time
cbclf.fit(X_train, y_train, plot=True)

train_preds = cbclf.predict_proba(X_train)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Wall time: 47.6 s


In [92]:
df_ranker_predict = df_ranker_train.copy()

In [93]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [None]:
# df_ranker_predict.head()

# Evaluation on test dataset

In [None]:
# print('Matcher\'s precision:')
# for el in recs_type_list:
#     res = recommender.evalMetrics(metric_type='precision', df_result=data_val_matcher, 
#                     target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=TOPK_PRECISION)
#     print(f'{el} precision: {res}')

In [None]:
# print('Ranker\'s precision:')
# for el in recs_type_list:
#     res = recommender.evalMetrics(metric_type='precision', df_result=data_val_ranker, 
#                     target_col_name=USER_COL, recommend_model_type=el, N_PREDICT=TOPK_PRECISION)
#     print(f'{el} precision: {res}')

## Eval re-ranked matched result on test dataset

In [94]:
print('Re-ranked precision:')
recommender.reranked_metrics(metric_type='precision', df_result=data_val_ranker, 
                                  df_predict=df_ranker_predict, target_col_name=USER_COL, 
                                  recommend_model_type='own', N_PREDICT=TOPK_PRECISION)

Re-ranked precision:


0.3033942558746728

# Оценка на тесте для выполнения курсового проекта

In [95]:
df_test = pd.read_csv('./data/retail_test1.csv')
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [96]:
# warm start
df_test = df_test[df_test.user_id.isin(common_users)]
print_stats_data(df_test,'df_test')

df_test
Shape: (88665, 12) Users: 1883 Items: 20492


In [None]:
# print('Test precision:')
# recommender.evalMetrics(metric_type='precision', df_result=df_test, 
#                 target_col_name=USER_COL, recommend_model_type='own', N_PREDICT=TOPK_PRECISION)

In [97]:
print('Test re-ranked precision:')
recommender.reranked_metrics(metric_type='precision', df_result=df_test, 
                                  df_predict=df_ranker_predict, target_col_name=USER_COL, 
                                  recommend_model_type='own', N_PREDICT=TOPK_PRECISION)

Test re-ranked precision:


0.24831081081080872