# Курсовой проект по курсу "Рекомендательные системы"

- Дедлайн - 27 декабря 23:59
- Целевая метрика precision@5
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. В решении должны быть отчетливо видна метрика на новом тестовом сете из файла retail_test1.csv, то есть вам нужно для всех юзеров из этого файла выдать выши рекомендации, и посчитать на actual покупках precision@5. 

**Важно:** Мы не рассматриваем холодный старт для пользователя, все наши пользователя одинаковы во всех сетах, поэтому нужно позаботиться об их исключении из теста.

In [159]:
#Импорт необходимых библиотек и модулей
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Модель второго уровня
from lightgbm import LGBMClassifier

# Функции из 1-ого вебинара
import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items, popularity_recommendation, postfilter_items
from recommenders import MainRecommender

import statistics

# План работы

[0. Загрузка данных](#part0) 

[1. Разбиение датасета retail_train на тренировочный, валидационный и тестовый](#part1) 

[2. Обучение модели первого уровня (ALS)](#part2) 

[3. Обучение модели второго уровня (LGBMClassifier)](#part3) 

[4. Оценка модели на датасете retail_test1](#part4) 

## 0. Загрузка данных  <a class="anchor" id="part0"></a><center>

Датасет с покупками

In [160]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Датасеты с признаками товаров и покупателей

In [161]:
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

#преобразование столбцов
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [162]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [163]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


## 1. Разбиение датасета retail_train на тренировочный, валидационный и тестовый  <a class="anchor" id="part1"></a><center>

Схема обучения и валидации:

 -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

In [164]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

#данные для обучения первой модели
data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]

#данные для валидации первой модели
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

#данные для обучения второй модели
data_train_lvl_2 = data_val_lvl_1.copy() 

#данные для теста второй модели
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]


Предобработка данных

In [165]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=13000)

n_items_after = data_train_lvl_1['item_id'].nunique()

print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 13001


Преобразование холодного cтарта в теплый

In [166]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data['user_id'].nunique()} Items: {df_data['item_id'].nunique()}")

In [167]:
# общие пользователи
common_users = list(set(data_train_lvl_1.user_id.values)&(set(data_val_lvl_1.user_id.values))&set(data_val_lvl_2.user_id.values))

data_train_lvl_1 = data_train_lvl_1[data_train_lvl_1.user_id.isin(common_users)]
data_val_lvl_1 = data_val_lvl_1[data_val_lvl_1.user_id.isin(common_users)]
data_train_lvl_2 = data_train_lvl_2[data_train_lvl_2.user_id.isin(common_users)]
data_val_lvl_2 = data_val_lvl_2[data_val_lvl_2.user_id.isin(common_users)]

print_stats_data(data_train_lvl_1,'train_lvl_1')
print_stats_data(data_val_lvl_1,'val_lvl_1')
print_stats_data(data_train_lvl_2,'train_lvl_2')
print_stats_data(data_val_lvl_2,'val_lvl_2')

train_lvl_1
Shape: (998901, 13) Users: 1915 Items: 12994
val_lvl_1
Shape: (163261, 12) Users: 1915 Items: 27118
train_lvl_2
Shape: (163261, 12) Users: 1915 Items: 27118
val_lvl_2
Shape: (115989, 12) Users: 1915 Items: 24042


**Вывод:** датасеты загружены, холодный старт преобразован в теплый.

## 2. Обучение модели первого уровня (ALS)  <a class="anchor" id="part2"></a><center>

Создание модели

In [168]:
recommender = MainRecommender(data_train_lvl_1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12994.0), HTML(value='')))




Выбор кандидатов для модели второго уровня (если модель рекомендует < N товаров, то рекомендации дополняются топ-популярными товарами до N)

In [169]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [170]:
result_lvl_1['als_recommendations'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=50))
result_lvl_1['own_recommendations'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
result_lvl_1['similar_items_recommendation'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=50))
result_lvl_1['similar_users_recommendation'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_users_recommendation(x, N=50))
result_lvl_1['popular_recommendation'] = result_lvl_1['user_id'].apply(lambda x: popularity_recommendation(data_train_lvl_1, n=5))

result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als_recommendations,own_recommendations,similar_items_recommendation,similar_users_recommendation,popular_recommendation
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[887428, 6602270, 1082212, 6904459, 7406474, 1...","[9297615, 856942, 852662, 1049998, 1087895, 90...","[916758, 8291322, 1007512, 1080267, 9831882, 5...","[925350, 906631, 9831557, 1071343, 847962, 965...","[999999, 1070702, 933835, 1065538, 863447]"
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1042616, 910439, 968449, 869500, 823862, 9830...","[837076, 849618, 970736, 6773110, 13003092, 99...","[880150, 1097458, 1085846, 8203720, 1051041, 8...","[970160, 927080, 6979393, 823031, 970152, 6533...","[999999, 1070702, 933835, 1065538, 863447]"


Подсчет recall@50 для полученных рекомендаций

In [171]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean()

In [172]:
sorted(calc_recall(result_lvl_1, 50), key=lambda x: x[1],reverse=True)

[('own_recommendations', 0.06563921352231059),
 ('als_recommendations', 0.02846531255696768),
 ('similar_items_recommendation', 0.019047191044164215),
 ('similar_users_recommendation', 0.0036165942564726303),
 ('popular_recommendation', 0.0019022192085052041)]

Подсчет pricision@5 для полученных рекомендаций

In [173]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean()

In [174]:
sorted(calc_precision(result_lvl_1, 5), key=lambda x: x[1],reverse=True)

[('own_recommendations', 0.1927937336814603),
 ('als_recommendations', 0.048041775456919344),
 ('similar_items_recommendation', 0.043968668407310924),
 ('popular_recommendation', 0.028302872062663277),
 ('similar_users_recommendation', 0.006266318537859004)]

**Вывод:** лучшие значения метрик recall@50 и pricision@5 у own_recommendations, поэтому для генерации кандидатов для обучения модели второго уровня будем использовать get_own_recommendations.

## 3. Обучение модели второго уровня (LGBMClassifier) <a class="anchor" id="part3"></a><center>

**Подготавливаем выбранных кандидатов для обучения**: сгенерируем топ-50 кадидиатов через own_recommendations

In [175]:
#берем пользователей из data_train_lvl_2 для ранжирования
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

#подбираем для каждого пользователя кандидатов в рекомендации
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

In [176]:
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[928263, 879194, 6979317, 1082955, 1105661, 11..."
1,2021,"[885655, 1041390, 950935, 1019142, 835578, 111..."


In [177]:
#"развернем" таблицу, чтобы каждый кандидат был один в строке (чтобы к нему добавть таргет)
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id
0,2070,928263
0,2070,879194
0,2070,6979317
0,2070,1082955


In [178]:
#проверяем теплый старт (количество уникальных пользователей не изменилось)
users_lvl_2['user_id'].nunique() 

1915

In [179]:
#создаем датасет для ранжирования (для обучения модели второго уровня)
#на каждого юзера по 50 кандидатов
#Обучаем модель 2-ого уровня на data_train_lvl_2 и *только* на выбранных кандидатах!!!

targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()

targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)

targets_lvl_2.head(4)

Unnamed: 0,user_id,item_id,target
0,2070,928263,0.0
1,2070,879194,0.0
2,2070,6979317,0.0
3,2070,1082955,1.0


In [180]:
#дисбаланс классов
targets_lvl_2.target.value_counts()

0.0    87784
1.0    12201
Name: target, dtype: int64

In [181]:
#доля купленных товаров среди кандидатов
targets_lvl_2['target'].mean()

0.12202830424563685

**Подготовка описательных фичей для обучения модели**

In [182]:
#генерируем новые признаки из датасета data_train_lvl_2
data_train_lvl_2.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0
2107469,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,0.0,0.0


In [183]:
user_features.head(3)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8


In [184]:
item_features.head(3)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,


In [185]:
#Фичи для user_id
def new_user_features(data, user_features, item_features):
    
    new_user_features = user_features.merge(data, on='user_id', how='left')
    new_item_features = data.merge(item_features, on='item_id', how='left')
    
    #Средний чек
    average_check = new_user_features.groupby(['user_id'])['sales_value'].mean().reset_index()
    average_check.rename(columns={'sales_value': 'average_sales_value'}, inplace=True)
    user_features = user_features.merge(average_check)

     
    #Количество уникальных категорий покупателя
    num_unique_department = new_item_features.groupby(['user_id'])['department'].nunique().reset_index()
    num_unique_department.rename(columns={'department': 'num_unique_department'}, inplace=True)
    user_features = user_features.merge(num_unique_department, on='user_id', how='left')
    
   
    #Количество покупок в каждой категории
    num_sales_in_category = new_item_features[['user_id', 'quantity', 'department']].groupby(['user_id', 'department']).sum().reset_index()
    num_sales_in_category.rename(columns={'quantity': 'num_quantity'}, inplace=True) 
    num_sales_in_category.drop(labels=[0], axis=0,inplace=True)

    users_num_cat_sales_dict = {}
    for user_id in num_sales_in_category['user_id'].unique():
        users_num_cat_sales_dict[user_id] = dict(list(zip(num_sales_in_category[num_sales_in_category['user_id']==user_id]['department'].values, \
                                                      num_sales_in_category[num_sales_in_category['user_id']==user_id]['num_quantity'].values)))

    for cat in item_features['department'].unique():
        new_item_features[f'num_sales_in_{cat}'] = 0

    for user_id in new_item_features['user_id'].unique():
        for cat in users_num_cat_sales_dict[user_id].keys():
            new_item_features.loc[(new_item_features['user_id']==user_id) & (new_item_features['department']==cat), f'num_sales_in_{cat}']=\
            users_num_cat_sales_dict[user_id][cat]

    feat_to_merge = ['user_id'] + new_item_features.columns.tolist()[18:]
    user_num_sales_in_cat = new_item_features[feat_to_merge]
    user_num_sales_in_cat = user_num_sales_in_cat.groupby('user_id').max().reset_index()
    user_features = user_features.merge(user_num_sales_in_cat, on='user_id', how='left')
    
    user_features = user_features.drop('num_sales_in_ ', axis=1)
    
    #Среднее время покупки пользователем
    new_user_features['hour'] = new_user_features['trans_time'] // 100
    hour = new_user_features.groupby('user_id')['hour'].median().reset_index()
    hour.columns = ['user_id', 'median_sales_hour_for_user']
    user_features = user_features.merge(hour, on='user_id', how='left')
    
    #Популярный день недели покупки юзера
    new_user_features['weekday'] = new_user_features['day'] % 7
    week_day = new_user_features.groupby('user_id')['weekday'].apply(statistics.mode).reset_index()
    week_day.columns = ['user_id', 'mode_sales_day_for_user']
    user_features = user_features.merge(week_day, on='user_id', how='left')
    
    # средний чек корзины покупателя
    average_check_bascket = new_user_features.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
    average_check_bascket = average_check_bascket.groupby('user_id')['sales_value'].mean().reset_index()
    average_check_bascket.columns = ['user_id', 'mean_basket_check']
    user_features = user_features.merge(average_check_bascket, on='user_id', how='left')
    
    # кол-во уникальных товаров, купленных клиентом
    item_unique = new_user_features.groupby(['user_id'])['item_id'].nunique().reset_index()
    item_unique.columns = ['user_id', 'n_items']
    user_features = user_features.merge(item_unique, on='user_id', how='left')
    
    user_features = user_features.replace(np.nan, 0)
    
    return user_features

In [186]:
%%time

user_features_new = new_user_features(data_train_lvl_2, user_features, item_features)
user_features_new.head(3)

Wall time: 2min 1s


Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,average_sales_value,num_unique_department,...,num_sales_in_CNTRL/STORE SUP,num_sales_in_HOUSEWARES,num_sales_in_POSTAL CENTER,num_sales_in_PHOTO,num_sales_in_VIDEO,num_sales_in_PHARMACY SUPPLY,median_sales_hour_for_user,mode_sales_day_for_user,mean_basket_check,n_items
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,2.569774,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,2.0,48.825714,109
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,2.085,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,16.0,6.0,37.53,86
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8,2.472683,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,20.0,2.0,50.69,116


In [187]:
#Фичи для item_id
def new_item_features(data, user_features, item_features):
    
    new_item_features = item_features.merge(data, on='item_id', how='left')
    new_user_features = user_features.merge(data, on='user_id', how='left')
    
    # Цена
    price = new_item_features.groupby('item_id')['sales_value'].sum() \
                            / new_item_features.groupby('item_id')['quantity'].sum()
    price = price.groupby('item_id').mean().reset_index()
    price.columns = ['item_id', 'price']
    price['price'].fillna(0, inplace= True)
    item_features = item_features.merge(price)

    #Среднее число покупок товара в неделю
    num_purchase_week = data.groupby(['item_id']).agg({'week_no': 'nunique', 'quantity': 'sum'}).reset_index()
    num_purchase_week['average_num_purchases_week'] = num_purchase_week['quantity'] / num_purchase_week['week_no']
    average_num_purchases_week = num_purchase_week[['item_id', 'average_num_purchases_week']]
    item_features = item_features.merge(average_num_purchases_week, on='item_id', how='left')
    item_features['average_num_purchases_week'].fillna(0, inplace= True)
    
    #Средняя цена товара в категории
    average_price_in_cat = item_features.groupby(['department'])['price'].mean().reset_index()
    average_price_in_cat.rename(columns={'price': 'average_price_in_dep'}, inplace=True)
    average_price_in_cat.drop(labels=[0], axis=0,inplace=True)
    item_features = item_features.merge(average_price_in_cat)
    
    #Цена товара/средняя цена товара в категории
    item_features['price/average_price_in_dep'] = item_features['price'] / item_features['average_price_in_dep']
    
    #Средняя цена товара в категории - Цена товара
    item_features['average_price_in_dep - item_price'] = item_features['average_price_in_dep'] - item_features['price']
    
    #Среднее время покупки товара
    new_item_features['hour'] = new_item_features['trans_time'] // 100
    hour = new_item_features.groupby('item_id')['hour'].median().reset_index()
    hour.columns = ['item_id', 'median_sales_hour_for_item']
    item_features = item_features.merge(hour, on='item_id', how='left')
    
    #Популярный день недели покупки товара
    new_item_features['weekday'] = new_item_features['day'] % 7
    week_day = new_item_features.groupby('item_id')['weekday'].apply(statistics.mode).reset_index()
    week_day.columns = ['item_id', 'mode_sales_day_for_item']
    item_features = item_features.merge(week_day, on='item_id', how='left')
    
    #Количество магазинов, в которых продается товар
    shops = new_item_features.groupby('item_id')['store_id'].nunique().reset_index()
    shops.columns = ['item_id', 'n_stores']
    item_features = item_features.merge(shops, on='item_id', how='left')
    
    item_features = item_features.replace(np.nan, 0)
    
    return item_features

In [188]:
%%time

item_features_new = new_item_features(data_train_lvl_2, user_features, item_features)
item_features_new.head(3)

Wall time: 2.55 s


Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,average_num_purchases_week,average_price_in_dep,price/average_price_in_dep,average_price_in_dep - item_price,median_sales_hour_for_item,mode_sales_day_for_item,n_stores
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,0.0,0.0,1.043754,0.0,1.043754,0.0,0.0,0
1,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,0.0,0.0,1.043754,0.0,1.043754,0.0,0.0,0
2,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ,0.0,0.0,1.043754,0.0,1.043754,0.0,0.0,0


In [189]:
targets_lvl_2 = targets_lvl_2.merge(item_features_new, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features_new, on='user_id', how='left')

targets_lvl_2.head(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,...,num_sales_in_CNTRL/STORE SUP,num_sales_in_HOUSEWARES,num_sales_in_POSTAL CENTER,num_sales_in_PHOTO,num_sales_in_VIDEO,num_sales_in_PHARMACY SUPPLY,median_sales_hour_for_user,mode_sales_day_for_user,mean_basket_check,n_items
0,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,7.99,...,0.0,0.0,0.0,0.0,0.0,0.0,18.0,3.0,14.355581,156.0
1,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,18.0,3.0,14.355581,156.0
2,2070,6979317,0.0,5457,DRUG GM,National,DOMESTIC GOODS,SWIN/TOYS,30X60 IN,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,18.0,3.0,14.355581,156.0


**Подготовка поведенческих фичей для обучения модели**

In [190]:
#данные для модели первого уровня
df_join_train_lvl_1 = pd.concat([data_train_lvl_1, data_val_lvl_1])
df_join_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0,1.5
6,2375,26984851516,1,1043142,1,1.57,364,-0.68,1642,1,0.0,0.0,1.57


In [191]:
targets_lvl_2 = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='item_id').agg('sales_value').sum().rename('total_item_sales_value'), how='left',on='item_id')

targets_lvl_2  = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='item_id').agg('quantity').sum().rename('total_quantity_value'), how='left',on='item_id')

targets_lvl_2  = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='item_id').agg('user_id').count().rename('item_freq'), how='left',on='item_id')

targets_lvl_2  = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='user_id').agg('user_id').count().rename('user_freq'), how='left',on='user_id')

targets_lvl_2  = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='user_id').agg('sales_value').sum().rename('total_user_sales_value'), how='left',on='user_id')

targets_lvl_2  = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_lvl_1.week_no.nunique(), how='left',on='item_id')

targets_lvl_2  = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_lvl_1.week_no.nunique(), how='left',on='user_id')

targets_lvl_2 = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_lvl_1.basket_id.nunique(), how='left',on='item_id')

targets_lvl_2 = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_lvl_1.basket_id.nunique(), how='left',on='user_id')

targets_lvl_2 = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='item_id').agg('user_id').count().rename('item_freq_per_basket')/df_join_train_lvl_1.basket_id.nunique(), how='left',on='item_id')

targets_lvl_2  = targets_lvl_2.merge(df_join_train_lvl_1.groupby(by='user_id').agg('user_id').count().rename('user_freq_per_basket')/df_join_train_lvl_1.basket_id.nunique(), how='left',on='user_id')

targets_lvl_2.head(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,...,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket
0,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,7.99,...,59,53,955,3599.87,0.648352,203.461538,0.000337,0.105787,0.000303,0.005456
1,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,0.0,...,54,45,955,3599.87,0.593407,203.461538,0.000309,0.105787,0.000257,0.005456
2,2070,6979317,0.0,5457,DRUG GM,National,DOMESTIC GOODS,SWIN/TOYS,30X60 IN,0.0,...,39,16,955,3599.87,0.428571,203.461538,0.000223,0.105787,9.1e-05,0.005456


**Обучение модели ранжирования**

In [192]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]
X_train.shape, y_train.shape

((99985, 83), (99985, 1))

In [193]:
cat_feats = list(X_train.select_dtypes(include=['object']).columns)
X_train[cat_feats] = X_train[cat_feats].astype('category')
cat_feats 

['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [194]:
%%time

#lgb = LGBMClassifier(objective='binary',
#                     max_depth=10,
#                     num_boost_round = 10000,
#                     n_estimators=5000,
#                     learning_rate=0.1,
#                     n_jobs = 8,
#                     force_row_wise = True,
#                     seed = 24,
#                     categorical_column=cat_feats)

lgb = LGBMClassifier(objective='binary',
                     num_boost_round = 10000,
                     n_jobs = 8,
                     force_row_wise = True,
                     seed = 24,
                     categorical_column=cat_feats,
                     verbose_eval=1000)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(*args, **kwargs)


Wall time: 1min 58s


______

In [195]:
df_lvl_1_predict = targets_lvl_2.copy()

df_lvl_1_predict['proba_item_purchase'] = train_preds[:,1]

df_lvl_1_predict.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,...,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,proba_item_purchase
0,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,7.99,...,53,955,3599.87,0.648352,203.461538,0.000337,0.105787,0.000303,0.005456,6.120766e-05
1,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,0.0,...,45,955,3599.87,0.593407,203.461538,0.000309,0.105787,0.000257,0.005456,1.794581e-13


**Оценка модели первого уровня на валидационном наборе**

In [196]:
result_eval_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_eval_lvl_2.columns=['user_id', 'actual']
result_eval_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


In [197]:
%%time
result_eval_lvl_2['own_rec'] = result_eval_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

Wall time: 9.71 s


In [198]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_lvl_2, 5), key=lambda x: x[1], reverse=True)

[('own_rec', 0.14391644908616083)]

**Оценка модели второго уровня на валидационном наборе**

In [199]:
def rerank(user_id):
    return df_lvl_1_predict[df_lvl_1_predict['user_id']==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [200]:
result_eval_lvl_2['reranked_own_rec'] = result_eval_lvl_2['user_id'].apply(lambda user_id: rerank(user_id))

In [201]:
print(*sorted(calc_precision(result_eval_lvl_2, 5), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.27550913838120006)
('own_rec', 0.14391644908616083)


**Вывод:** построена двухуровненая модель с метрикой precision@5 = 0.27550913838120006 на валидационном датасете

## 4. Оценка модели на датасете retail_test1  <a class="anchor" id="part4"></a><center>

In [202]:
data_test = pd.read_csv('retail_test1.csv')

data_test = data_test[data_test['user_id'].isin(common_users)]

print_stats_data(data_test,'data_test')

data_test.head(2)

data_test
Shape: (83656, 12) Users: 1663 Items: 19981


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0


In [203]:
#все, что было до data_test
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [204]:
result_test = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_test.columns=['user_id', 'actual']

result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,6,"[956902, 960791, 1037863, 1119051, 1137688, 84..."


In [205]:
%%time

result_test['own_rec'] = result_test['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_test, 5), key=lambda x: x[1], reverse=True)

Wall time: 8.02 s


[('own_rec', 0.12194828622970476)]

In [206]:
result_test['reranked_own_rec'] = result_test['user_id'].apply(lambda user_id: rerank(user_id))

In [207]:
print(*sorted(calc_precision(result_test, 5), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.2514732411304859)
('own_rec', 0.12194828622970476)


**Вывод:** на тестовом датасете метрика модели precision@5 = 0.2514732411304859