# Курсовой проект

**Основное**
- Дедлайн - 21 июня 23:59
- На retail_test1.csv целевая метрика precision@5 > 0.235
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. На github должен быть файл recommendations.csv (user_id | [rec_1, rec_2, ...] с рекомендациями. rec_i - реальные id item-ов (из retail_train.csv)

**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матирцы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
data = pd.read_csv('C:/Users/flath/Desktop/Rec.sis/retail_train.csv')
item_features = pd.read_csv('C:/Users/flath/Desktop/Rec.sis/product.csv')
user_features = pd.read_csv('C:/Users/flath/Desktop/Rec.sis/hh_demographic.csv')


In [3]:
user_features.head()

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [4]:
item_features.head(10)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
5,26426,69,GROCERY,Private,SPICES & EXTRACTS,SPICES & SEASONINGS,2.5 OZ
6,26540,69,GROCERY,Private,COOKIES/CONES,TRAY PACK/CHOC CHIP COOKIES,16 OZ
7,26601,69,DRUG GM,Private,VITAMINS,VITAMIN - MINERALS,300CT(1)
8,26636,69,PASTRY,Private,BREAKFAST SWEETS,SW GDS: SW ROLLS/DAN,
9,26691,16,GROCERY,Private,PNT BTR/JELLY/JAMS,HONEY,12 OZ


In [5]:
data.head(10)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0
5,2375,26984851516,1,826249,2,1.98,364,-0.6,1642,1,0.0,0.0
6,2375,26984851516,1,1043142,1,1.57,364,-0.68,1642,1,0.0,0.0
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0
8,2375,26984851516,1,1102651,1,1.89,364,0.0,1642,1,0.0,0.0
9,2375,26984851516,1,6423775,1,2.0,364,-0.79,1642,1,0.0,0.0


In [6]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns =[col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


In [7]:
val_matcher_weeks =6
val_ranker_weeks =3

In [8]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (val_matcher_weeks + val_ranker_weeks)] 

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (val_matcher_weeks + val_ranker_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_ranker_weeks))]
# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy() 
# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - val_ranker_weeks]


In [9]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

## Prefilter items

In [10]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [11]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]


In [12]:
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [13]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean()

In [14]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row['actual'], k=top_k), axis=1).mean()

In [15]:
TOPK_PRECISION = 5

In [16]:
N_PREDICT = 500

In [17]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker['user_id'].unique())
df_match_candidates.columns = ['user_id']

In [18]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."


In [19]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [20]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,1097350
0,2070,879194
0,2070,948640


In [21]:
df_ranker_train = data_train_ranker[['user_id', 'item_id']].copy()
df_ranker_train['target'] = 1  # тут только покупки 
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1


In [22]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=['user_id', 'item_id'], how='left')

In [23]:
df_ranker_train 

Unnamed: 0,user_id,item_id,target
0,2070,1105426,
1,2070,1097350,
2,2070,879194,
3,2070,948640,
4,2070,928263,
...,...,...,...
1087521,1745,829852,
1087522,1745,997796,
1087523,1745,894439,
1087524,1745,1058404,


In [24]:
# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=['user_id', 'item_id'])

In [25]:
# дополняем нулями таргет
df_ranker_train['target'].fillna(0, inplace= True)
df_ranker_train.target.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


0.0    989501
1.0     26185
Name: target, dtype: int64

In [26]:
df_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [27]:
item_features.head(2)


Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [28]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [29]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [30]:
df_train_matcher['day'].nunique()

635

In [31]:
df_train_matcher['week_day'] = df_train_matcher['day'] % 7
df_train_matcher = pd.get_dummies(df_train_matcher, columns=['week_day'])
df_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,week_day_0,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,0,1,0,0,0,0,0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,0,1,0,0,0,0,0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0,0,1,0,0,0,0,0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0,0,1,0,0,0,0,0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0,0,1,0,0,0,0,0


In [32]:
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('sales_value').sum().rename('total_item_sales_value'), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('quantity').sum().rename('total_quantity_value'), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('user_id').count().rename('item_freq'), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='user_id').agg('user_id').count().rename('user_freq'), how='left',on='user_id')

df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='user_id').agg('sales_value').sum().rename('total_user_sales_value'), how='left',on='user_id')

df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_week')/df_train_matcher.week_no.nunique(), how='left',on='item_id')


df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_week')/df_train_matcher.week_no.nunique(), how='left',on='user_id')


df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('quantity').sum().rename('item_quantity_per_basket')/df_train_matcher.basket_id.nunique(), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='user_id').agg('quantity').sum().rename('user_quantity_per_baskter')/df_train_matcher.basket_id.nunique(), how='left',on='user_id')


df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('user_id').count().rename('item_freq_per_basket')/df_train_matcher.basket_id.nunique(), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='user_id').agg('user_id').count().rename('user_freq_per_basket')/df_train_matcher.basket_id.nunique(), how='left',on='user_id')
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by=['user_id','item_id']).agg('quantity').sum().rename('user_item_total_quantity'), how='left',on=['user_id','item_id'])

df_ranker_train = df_ranker_train.merge(df_ranker_train.groupby(by=['user_id']).agg('department').count().rename('user_department'), how='left',on=['user_id'])
df_ranker_train = df_ranker_train.merge(df_ranker_train.groupby(by=['user_id']).agg('manufacturer').count().rename('user_manufacturer'), how='left',on=['user_id'])

df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('week_day_0').sum().rename('item_quantity_per_week_day_0'), how='left',on='item_id')
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('week_day_1').sum().rename('item_quantity_per_week_day_1'), how='left',on='item_id')
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('week_day_2').sum().rename('item_quantity_per_week_day_2'), how='left',on='item_id')
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('week_day_3').sum().rename('item_quantity_per_week_day_3'), how='left',on='item_id')
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('week_day_4').sum().rename('item_quantity_per_week_day_4'), how='left',on='item_id')
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('week_day_5').sum().rename('item_quantity_per_week_day_5'), how='left',on='item_id')
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by='item_id').agg('week_day_6').sum().rename('item_quantity_per_week_day_6'), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by=['user_id','item_id']).agg('week_day_0').sum().rename('user_item_week_day_0_quantity'), how='left',on=['user_id','item_id'])
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by=['user_id','item_id']).agg('week_day_1').sum().rename('user_item_week_day_1_quantity'), how='left',on=['user_id','item_id'])
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by=['user_id','item_id']).agg('week_day_2').sum().rename('user_item_week_day_2_quantity'), how='left',on=['user_id','item_id'])
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by=['user_id','item_id']).agg('week_day_3').sum().rename('user_item_week_day_3_quantity'), how='left',on=['user_id','item_id'])
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by=['user_id','item_id']).agg('week_day_4').sum().rename('user_item_week_day_4_quantity'), how='left',on=['user_id','item_id'])
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by=['user_id','item_id']).agg('week_day_5').sum().rename('user_item_week_day_5_quantity'), how='left',on=['user_id','item_id'])
df_ranker_train = df_ranker_train.merge(df_train_matcher.groupby(by=['user_id','item_id']).agg('week_day_6').sum().rename('user_item_week_day_6_quantity'), how='left',on=['user_id','item_id'])


In [33]:

# день недели совершения транзакции
df_train_matcher['weekday'] = df_train_matcher['day'] % 7
df = df_train_matcher.groupby(['user_id', 'item_id'])['weekday'].median().reset_index()
df.columns = ['user_id', 'item_id', 'median_weekday']
df_ranker_train = df_ranker_train.merge(df, how='left', on=['user_id', 'item_id'])
    

In [34]:
# средний чек корзины клиента
df = df_train_matcher.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
df = df_train_matcher.groupby('user_id')['sales_value'].mean().reset_index()
df.columns = ['user_id', 'mean_check']
df_ranker_train = df_ranker_train.merge(df, how='left', on=['user_id'])
    

In [35]:
# кол-во уникальных товаров, купленных клиентом
df = df_train_matcher.groupby(['user_id'])['item_id'].nunique().reset_index()
df.columns = ['user_id', 'n_items']
df_ranker_train = df_ranker_train.merge(df, how='left', on=['user_id'])

In [36]:
# кол-во транзакций клиента
df = df_train_matcher.groupby(['user_id'])['item_id'].count().reset_index()
df.columns = ['user_id', 'n_transactions']
df_ranker_train = df_ranker_train.merge(df, how='left', on=['user_id'])

In [37]:
 # mean / max / std кол-ва уникальных товаров в корзине клиента
df = df_train_matcher.groupby(['user_id', 'basket_id'])['item_id'].nunique().reset_index()
df1 = df.groupby('user_id')['item_id'].mean().reset_index()
df1.columns = ['user_id', 'mean_n_items_basket']
df_ranker_train = df_ranker_train.merge(df1, how='left', on=['user_id'])

df2 = df.groupby('user_id')['item_id'].max().reset_index()
df2.columns = ['user_id', 'max_n_items_basket']
df_ranker_train = df_ranker_train.merge(df2, how='left', on=['user_id'])

df3 = df.groupby('user_id')['item_id'].std().reset_index()
df3.columns = ['user_id', 'std_n_items_basket']
df_ranker_train = df_ranker_train.merge(df3, how='left', on=['user_id'])
    

In [38]:
df_train_matcher

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,week_day_0,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6,weekday
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0,0,1,0,0,0,0,0,1
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0,0,1,0,0,0,0,0,1
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0,0,1,0,0,0,0,0,1
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0,0,1,0,0,0,0,0,1
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2282320,222,41297772783,635,1120741,1,0.59,304,0.00,1716,91,0.0,0.0,0,0,0,0,0,1,0,5
2282321,462,41297773713,635,993339,1,1.99,304,0.00,2040,91,0.0,0.0,0,0,0,0,0,1,0,5
2282322,462,41297773713,635,995242,1,1.00,304,-0.89,2040,91,0.0,0.0,0,0,0,0,0,1,0,5
2282323,462,41297773713,635,10180324,1,3.00,304,-0.29,2040,91,0.0,0.0,0,0,0,0,0,1,0,5


In [39]:
df_ranker_train

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_item_week_day_4_quantity,user_item_week_day_5_quantity,user_item_week_day_6_quantity,median_weekday,mean_check,n_items,n_transactions,mean_n_items_basket,max_n_items_basket,std_n_items_basket
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,3.0,0.0,1.0,4.0,2.883196,1068,1996,4.098563,70,9.101765
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,,,,,2.883196,1068,1996,4.098563,70,9.101765
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,...,0.0,1.0,0.0,3.5,2.883196,1068,1996,4.098563,70,9.101765
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,...,,,,,2.883196,1068,1996,4.098563,70,9.101765
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,...,0.0,0.0,1.0,3.5,2.883196,1068,1996,4.098563,70,9.101765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015681,1745,829852,0.0,69,GROCERY,Private,CHEESE,SHREDDED CHEESE,6 OZ,45-54,...,,,,,2.672520,401,897,7.601695,31,5.739561
1015682,1745,997796,0.0,1089,MEAT-PCKGD,National,LUNCHMEAT,HAM,9 OZ,45-54,...,,,,,2.672520,401,897,7.601695,31,5.739561
1015683,1745,894439,0.0,35,DRUG GM,National,MAGAZINE,TABLOIDS-MAGAZINE,,45-54,...,,,,,2.672520,401,897,7.601695,31,5.739561
1015684,1745,1058404,0.0,1126,GROCERY,National,CHEESE,STRING CHEESE,12 OZ,45-54,...,,,,,2.672520,401,897,7.601695,31,5.739561


In [40]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [41]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1015686 entries, 0 to 1015685
Data columns (total 50 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   user_id                        1015686 non-null  int64  
 1   item_id                        1015686 non-null  int64  
 2   manufacturer                   1015686 non-null  int64  
 3   department                     1015686 non-null  object 
 4   brand                          1015686 non-null  object 
 5   commodity_desc                 1015686 non-null  object 
 6   sub_commodity_desc             1015686 non-null  object 
 7   curr_size_of_product           1015686 non-null  object 
 8   age_desc                       367791 non-null   object 
 9   marital_status_code            367791 non-null   object 
 10  income_desc                    367791 non-null   object 
 11  homeowner_desc                 367791 non-null   object 
 12  hh_comp_desc  

In [42]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [43]:
%%time
lgb = LGBMClassifier(objective='binary',
                     max_depth=10,
                     n_estimators=200,
                     learning_rate=0.1,
                     categorical_column=cat_feats,
                     n_jobs=100)

lgb.fit(X_train, y_train)

  return f(*args, **kwargs)


Wall time: 27.1 s


LGBMClassifier(categorical_column=['manufacturer', 'department', 'brand',
                                   'commodity_desc', 'sub_commodity_desc',
                                   'curr_size_of_product', 'age_desc',
                                   'marital_status_code', 'income_desc',
                                   'homeowner_desc', 'hh_comp_desc',
                                   'household_size_desc', 'kid_category_desc',
                                   'total_item_sales_value',
                                   'total_quantity_value', 'item_freq',
                                   'user_freq', 'total_user_sales_value',
                                   'item_quantity_per_week',
                                   'user_quantity_per_week',
                                   'item_quantity_per_basket',
                                   'user_quantity_per_baskter',
                                   'item_freq_per_basket',
                                   'user_fre

In [44]:
train_preds = lgb.predict_proba(X_train)

In [45]:
df_ranker_predict = df_ranker_train.copy()

In [46]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [47]:
df_ranker_predict.shape

(1015686, 52)

In [48]:
result_eval_ranker = data_val_ranker.groupby('user_id')['item_id'].unique().reset_index()
result_eval_ranker.columns=['user_id', 'actual']
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [49]:
result_eval_ranker.shape

(2040, 2)

In [50]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 8.34 s


In [51]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict['user_id']==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [52]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker['user_id'].apply(lambda user_id: rerank(user_id))

In [53]:
# смотрим на метрики выше и сравниваем что с ранжированием и без, добавляем фичи и то же смотрим
# в первом приближении метрики должны расти с использованием второго этапа

print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.331279373368146)
('own_rec', 0.1444117647058813)


  return flags.sum() / len(recommended_list)


In [85]:
df_test = pd.read_csv('C:/Users/flath/Desktop/Rec.sis/retail_test1.csv')

In [86]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [87]:
df_test = df_test[df_test.user_id.isin(common_users)]
df_test = df_test[df_test.user_id.isin(common_users)]


In [88]:
result_test = df_test.groupby('user_id')['item_id'].unique().reset_index()
result_test.columns=['user_id', 'actual']
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [89]:
result_test['your_prediction'] = result_test['user_id'].apply(lambda user_id: rerank(user_id))

In [90]:
result_test

Unnamed: 0,user_id,actual,your_prediction
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[1082185, 10149640, 856942, 9655212, 940947]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1106523, 1082185, 916122, 901062, 839656]"
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...",[]
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[1082185, 1098844, 845208, 900802, 12757544]"
4,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[1122358, 1082185, 1106523, 9338009, 1126899]"
...,...,...,...
1878,2496,"[829291, 862139, 912704, 933067, 933835, 95537...","[899624, 1056509, 1106523, 916122, 995876]"
1879,2497,[6534178],"[1120361, 1135834, 1029743, 900802, 870515]"
1880,2498,"[1053690, 1076875, 12386123, 858303, 920109, 1...","[1100379, 907993, 896074, 1106523, 1070820]"
1881,2499,"[826249, 895327, 9858944, 820321, 829291, 8323...","[1070820, 899624, 5568378, 5569327, 1082185]"


In [91]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('your_prediction', 0.27060810810810565)


  return flags.sum() / len(recommended_list)


In [110]:
df = result_test[['user_id', 'your_prediction']].copy()
df.rename(columns={'user_id':'UserId', 'your_prediction':'Predicted'}, inplace=True)
df.to_csv('C:/Users/flath/Desktop/Rec.sis/ recommendations3.csv', index=False)
df.head()

Unnamed: 0,UserId,Predicted
0,1,"[1082185, 10149640, 856942, 9655212, 940947]"
1,2,"[1106523, 1082185, 916122, 901062, 839656]"
2,3,[]
3,6,"[1082185, 1098844, 845208, 900802, 12757544]"
4,7,"[1122358, 1082185, 1106523, 9338009, 1126899]"
