# Домашнее задание

In [3]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.2.1-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.1


In [5]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.25.1-cp38-none-win_amd64.whl (66.9 MB)
Collecting graphviz
  Downloading graphviz-0.16-py2.py3-none-any.whl (19 kB)
Collecting plotly
  Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11434 sha256=809748b3b43cd134252a4eef545a0201d6b0cd74451bd9795d89a7eb8664de14
  Stored in directory: c:\users\виктор\appdata\local\pip\cache\wheels\c4\a7\48\0a434133f6d56e878ca511c0e6c38326907c0792f67b476e56
Successfully built retrying
Installing collected packages: graphviz, retrying, plotly, catboost
Successfully installed catboost-0.25.1 graphviz-0.16 plotly-4.14.3 retrying-1.3.3


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

Чтение данных

In [2]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

Генерация факторов

In [3]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'

In [4]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

Разделение на тренировочные и тестовые данные

In [5]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)


VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [6]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [7]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [8]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [9]:
# выше видим разброс по пользователям и товарам
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Префильтрация товаров

In [10]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 5001


Перевод с холодного на теплый старт

In [11]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (784420, 13) Users: 1915 Items: 4999
val_matcher
Shape: (163261, 12) Users: 1915 Items: 27118
train_ranker
Shape: (163261, 12) Users: 1915 Items: 27118
val_ranker
Shape: (115989, 12) Users: 1915 Items: 24042


Тренировка рекомаендательной системы

In [12]:
recommender = MainRecommender(data_train_matcher)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4999.0), HTML(value='')))




In [13]:
# Берем тестового юзера 2375
recommender.get_als_recommendations(2375, N=5)

[899624, 1044078, 1106523, 871756, 1046545]

In [14]:
recommender.get_own_recommendations(2375, N=5)

[948640, 918046, 847962, 907099, 873980]

In [15]:
recommender.get_similar_items_recommendation(2375, N=5)

[1046545, 917816, 937292, 10355376, 15778319]

In [16]:
recommender.get_similar_users_recommendation(2375, N=5)

[940090, 935578, 1117824, 865026, 967994]

Анализ отклика для мачинга

### Измеряем recall@k

**Задание 1.**

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_matcher: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?

In [17]:
ACTUAL_COL = 'actual'

In [18]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [19]:
# # сырой и простой пример как можно обернуть в функцию
def evalRecall(df_result, target_col_name, recommend_model):
    result_col_name = 'result'
    df_result[result_col_name] = df_result[target_col_name].apply(lambda x: recommend_model(x, N=25))
    return df_result.apply(lambda row: recall_at_k(row[result_col_name], row[ACTUAL_COL], k=N_PREDICT), axis=1).mean()

In [20]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [21]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [22]:
# N = Neighbors
N_PREDICT = 50 

In [23]:
list_ = []
for n_pred in [5, 10, 20, 50, 100, 120, 150, 200, 500]:
    for top_k_recall in [5, 10, 20, 50, 100, 120, 150, 200, 500]:
        result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=n_pred))
        result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=n_pred))
        result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=n_pred))
        srt_recall = sorted(calc_recall(result_eval_matcher, top_k_recall), key=lambda x: x[1],reverse=True)
        str_precision = sorted(calc_precision(result_eval_matcher, top_k_recall), key=lambda x: x[1],reverse=True)
        f_recall = "recall_" + "\r\nrecall_".join([f"{e[0]} = {e[1]}" for e in srt_recall])
        f_precision = "precision_" + "\r\nprecision_".join([f"{e[0]} = {e[1]}" for e in str_precision])
        print(f'При N_пред = {n_pred}, k_отклик = {top_k_recall}:\r\n{f_recall}\r\n{f_precision}')
        print()
        list_.append([n_pred, top_k_recall, srt_recall, str_precision])

При N_пред = 5, k_отклик = 5:
recall_own_rec = 0.01743355178020794
recall_als_rec = 0.012917458612426087
recall_sim_item_rec = 0.005151557009862349
precision_own_rec = 0.18872062663185182
precision_als_rec = 0.12793733681462044
precision_sim_item_rec = 0.06276762402088817

При N_пред = 5, k_отклик = 10:
recall_own_rec = 0.01743355178020794
recall_als_rec = 0.012917458612426087
recall_sim_item_rec = 0.005151557009862349
precision_own_rec = 0.18872062663185182
precision_als_rec = 0.12793733681462044
precision_sim_item_rec = 0.06276762402088817

При N_пред = 5, k_отклик = 20:
recall_own_rec = 0.01743355178020794
recall_als_rec = 0.012917458612426087
recall_sim_item_rec = 0.005151557009862349
precision_own_rec = 0.18872062663185182
precision_als_rec = 0.12793733681462044
precision_sim_item_rec = 0.06276762402088817

При N_пред = 5, k_отклик = 50:
recall_own_rec = 0.01743355178020794
recall_als_rec = 0.012917458612426087
recall_sim_item_rec = 0.005151557009862349
precision_own_rec = 0.18872

При N_пред = 50, k_отклик = 50:
recall_own_rec = 0.061684201353290766
recall_als_rec = 0.04767006867253098
recall_sim_item_rec = 0.03169056098830312
precision_own_rec = 0.07660574412532668
precision_als_rec = 0.060177545691906205
precision_sim_item_rec = 0.04001044386422989

При N_пред = 50, k_отклик = 100:
recall_own_rec = 0.061684201353290766
recall_als_rec = 0.04767006867253098
recall_sim_item_rec = 0.03169056098830312
precision_own_rec = 0.07660574412532668
precision_als_rec = 0.060177545691906205
precision_sim_item_rec = 0.04001044386422989

При N_пред = 50, k_отклик = 120:
recall_own_rec = 0.061684201353290766
recall_als_rec = 0.04767006867253098
recall_sim_item_rec = 0.03169056098830312
precision_own_rec = 0.07660574412532668
precision_als_rec = 0.060177545691906205
precision_sim_item_rec = 0.04001044386422989

При N_пред = 50, k_отклик = 150:
recall_own_rec = 0.061684201353290766
recall_als_rec = 0.04767006867253098
recall_sim_item_rec = 0.03169056098830312
precision_own_rec = 

При N_пред = 150, k_отклик = 150:
recall_own_rec = 0.1150617146322639
recall_als_rec = 0.08485118686398235
recall_sim_item_rec = 0.06762062903832493
precision_own_rec = 0.049796344647519655
precision_als_rec = 0.03834290687554408
precision_sim_item_rec = 0.0280800696257617

При N_пред = 150, k_отклик = 200:
recall_own_rec = 0.1150617146322639
recall_als_rec = 0.08485118686398235
recall_sim_item_rec = 0.06762062903832493
precision_own_rec = 0.049796344647519655
precision_als_rec = 0.03834290687554408
precision_sim_item_rec = 0.0280800696257617

При N_пред = 150, k_отклик = 500:
recall_own_rec = 0.1150617146322639
recall_als_rec = 0.08485118686398235
recall_sim_item_rec = 0.06762062903832493
precision_own_rec = 0.049796344647519655
precision_als_rec = 0.03834290687554408
precision_sim_item_rec = 0.0280800696257617

При N_пред = 200, k_отклик = 5:
recall_own_rec = 0.01743355178020794
recall_als_rec = 0.012917458612426087
recall_sim_item_rec = 0.005151557009862349
precision_own_rec = 0.188

**<h3>Вывод</h3>**
Судя по recall и precision при увеличении количества соседей происходит смещение распределения, а также увеличение recall в большую сторону. Судя по превиденным выше данным разумно найти оптимальный график для k зависящий от количества соседей и от приоритета определенной метрики (recall или precision).

**Задание 2.**

Обучите модель 2-ого уровня, при этом:

- Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар

- Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_ranker

- Вырос ли precision@5 при использовании двухуровневой модели?

In [24]:
TOPK_PRECISION = 5

In [25]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [26]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [27]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1097398, 101..."


In [28]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [29]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [30]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1097350
0,2070,879194
0,2070,948640


In [31]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (95750, 2) Users: 1915 Items: 4437


In [32]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)

In [33]:
df_ranker_train.target.value_counts()

0.0    88346
1.0    11053
Name: target, dtype: int64

In [34]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0


In [35]:
df_ranker_train['target'].mean()

0.11119830179378062

In [36]:
data.head(40000)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
39995,1367,27534669186,47,6533362,1,3.99,384,0.00,1751,7,0.0,0.0
39996,1284,27534677454,47,835300,3,11.24,339,-3.41,1727,7,0.0,0.0
39997,1284,27534677454,47,1058930,1,1.19,339,0.00,1727,7,0.0,0.0
39998,1284,27534677476,47,1131974,2,2.58,339,-0.20,1733,7,0.0,0.0


In [37]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [38]:
data['month_no'] = ((data['week_no'] - 1) / 4).astype('int16')
data['full_sale'] = data['quantity'] * data['sales_value']

In [39]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


**Фичи user_id:**
    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории
    - Кол-во покупок в каждой категории
    - Частотность покупок раз/месяц
    - Долю покупок в выходные
    - Долю покупок утром/днем/вечером

**Фичи item_id**:
    - Кол-во покупок в неделю
    - Среднее ол-во покупок 1 товара в категории в неделю
    - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv)
    - Цена / Средняя цена товара в категории
    
**Фичи пары user_id - item_id**
    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
    - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

In [40]:
item_features['price'] = item_features['item_id'].apply(lambda x: data[data['item_id'] == x]['sales_value'].mean())

weeks = len(data['week_no'].unique())
item_features['sum_sales_week'] = item_features['item_id'].apply(lambda x: data[data['item_id'] == x]['quantity'].sum() / weeks)

In [41]:
d = data.merge(item_features, left_on='item_id', right_on='item_id')
for cat in d['department'].unique():
    user_features[f'cat_{cat}'] = d[d['department'] == cat]['sales_value'].mean()
    user_features[f'cat_{cat}_full_sale'] = d[d['department'] == cat]['sales_value'].sum()
    user_features[f'cat_{cat}_quantity'] = d[d['department'] == cat]['quantity'].sum()
    item_features[f'avg_sales_one_item_category_{cat}_week'] = d[d['item_id'] == cat]['quantity'].sum() / len(data['week_no'].unique())
    item_features[f'sum_sale_div_avg_sales_category_{cat}'] = item_features['sum_sales_week'] / item_features[f'avg_sales_one_item_category_{cat}_week']
    item_features[f'avg_price_category_{cat}'] = d[d['department'] == cat]['price'].mean()

user_features['avg_sale'] = user_features['user_id'].apply(lambda x: data[data['user_id'] == x]['full_sale'].mean())
user_features['freq_sales_month'] = user_features['user_id'].apply(lambda x: data[data['user_id'] == x]['quantity'].sum() / len(data[data['user_id'] == x]['month_no'].unique()))
user_features['freq_sales_week'] = user_features['user_id'].apply(lambda x: data[data['user_id'] == x]['quantity'].sum() / len(data[data['user_id'] == x]['week_no'].unique()))

user_features['freq_sales_morning'] = user_features['user_id'].apply(lambda x: data[(400 <= data['trans_time']) & (data['trans_time'] < 1200) & (data['user_id'] == x)]['full_sale'].sum())
user_features['freq_sales_afternoon'] = user_features['user_id'].apply(lambda x: data[(1200 <= data['trans_time']) & (data['trans_time'] < 1800) & (data['user_id'] == x)]['full_sale'].sum())
user_features['freq_sales_evening'] = user_features['user_id'].apply(lambda x: data[(1800 <= data['trans_time']) & (data['trans_time'] <= 2359) & (data['user_id'] == x)]['full_sale'].sum())
user_features['freq_sales_night'] = user_features['user_id'].apply(lambda x: data[(0 <= data['trans_time']) & (data['trans_time'] < 400) & (data['user_id'] == x)]['full_sale'].sum())

In [42]:
#item_features['price_div_avg_price'] = item_features['item_id'].apply(lambda x: item_features[item_features['item_id'] == x]['price'] / item_features[item_features['item_id'] == x][f'avg_price_category_{item_features[item_features["item_id"] == x]["department"].values[0]}'])

In [43]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,cat_PRODUCE,cat_PRODUCE_full_sale,...,cat_VIDEO,cat_VIDEO_full_sale,cat_VIDEO_quantity,avg_sale,freq_sales_month,freq_sales_week,freq_sales_morning,freq_sales_afternoon,freq_sales_evening,freq_sales_night
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,2.159994,515212.52,...,12.79,12.79,1,3.101441,80.173913,28.8125,1458.39,2963.13,506.67,0.0
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,2.159994,515212.52,...,12.79,12.79,1,3.283759,62.095238,29.636364,23.08,2202.07,1295.04,0.0


In [44]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,...,cat_VIDEO,cat_VIDEO_full_sale,cat_VIDEO_quantity,avg_sale,freq_sales_month,freq_sales_week,freq_sales_morning,freq_sales_afternoon,freq_sales_evening,freq_sales_night
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,4.474369,...,12.79,12.79,1.0,1105.636389,5798.333333,1484.939024,1027.23,766151.87,1900567.15,2365.63
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,12.146275,...,12.79,12.79,1.0,1105.636389,5798.333333,1484.939024,1027.23,766151.87,1900567.15,2365.63


In [45]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,...,cat_VIDEO,cat_VIDEO_full_sale,cat_VIDEO_quantity,avg_sale,freq_sales_month,freq_sales_week,freq_sales_morning,freq_sales_afternoon,freq_sales_evening,freq_sales_night
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,4.474369,...,12.79,12.79,1.0,1105.636389,5798.333333,1484.939024,1027.23,766151.87,1900567.15,2365.63
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,12.146275,...,12.79,12.79,1.0,1105.636389,5798.333333,1484.939024,1027.23,766151.87,1900567.15,2365.63
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,8.49587,...,12.79,12.79,1.0,1105.636389,5798.333333,1484.939024,1027.23,766151.87,1900567.15,2365.63
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,7.345682,...,12.79,12.79,1.0,1105.636389,5798.333333,1484.939024,1027.23,766151.87,1900567.15,2365.63
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,8.465091,...,12.79,12.79,1.0,1105.636389,5798.333333,1484.939024,1027.23,766151.87,1900567.15,2365.63


In [46]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [47]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99399 entries, 0 to 99398
Columns: 288 entries, user_id to freq_sales_night
dtypes: float64(273), int64(3), object(12)
memory usage: 219.2+ MB


In [58]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'price',
 'sum_sales_week',
 'avg_sales_one_item_category_PRODUCE_week',
 'sum_sale_div_avg_sales_category_PRODUCE',
 'avg_price_category_PRODUCE',
 'avg_sales_one_item_category_GROCERY_week',
 'sum_sale_div_avg_sales_category_GROCERY',
 'avg_price_category_GROCERY',
 'avg_sales_one_item_category_DRUG GM_week',
 'sum_sale_div_avg_sales_category_DRUG GM',
 'avg_price_category_DRUG GM',
 'avg_sales_one_item_category_MEAT_week',
 'sum_sale_div_avg_sales_category_MEAT',
 'avg_price_category_MEAT',
 'avg_sales_one_item_category_MEAT-PCKGD_week',
 'sum_sale_div_avg_sales_category_MEAT-PCKGD',
 'avg_price_category_MEAT-PCKGD',
 'avg_sales_one_item_category_DELI_week',
 'sum_sale_div_avg_sales_category_DELI',
 'avg_price_category_DELI',
 'avg_sales_one_item_category_SEAFOOD-PCKGD_week',
 'sum_sale_div_avg_sales_category_SEAFOOD-PCKGD',
 'avg_price_category_SEAFOOD-PCKGD',
 'avg_sales_o

In [230]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=10,
                     n_estimators=1000,
                     learning_rate=0.8,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(**kwargs)


In [231]:
df_ranker_predict = df_ranker_train.copy()

In [232]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [233]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


In [234]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 5.31 s


In [235]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.1462140992167092)]

In [236]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [237]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [238]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.15258485639686525)
('own_rec', 0.1462140992167092)
