In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

# import os, sys
# module_path = os.path.abspath(os.path.join(os.pardir))
# if module_path not in sys.path:
#     sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
# Load the three raw CSV tables (paths are relative to the notebook's working directory):
# purchase transactions, the product catalogue and household demographics.
data = pd.read_csv('02_Data/retail_train.csv')
item_features = pd.read_csv('02_Data/product.csv')
user_features = pd.read_csv('02_Data/hh_demographic.csv')

In [3]:
# Normalise the feature-table headers: lower-case every column name, then rename
# the key columns to the `item_id` / `user_id` names used by the transaction table.
item_features.columns = list(map(str.lower, item_features.columns))
user_features.columns = list(map(str.lower, user_features.columns))

item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})

In [4]:
# The train/validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# The size of the 2nd window (6 weeks) should be tuned via a learning curve (dependence of recall@k on the dataset size)

VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [5]:
# Week boundaries of the time-based split, anchored on the last recorded week.
last_week = data['week_no'].max()
matcher_val_start = last_week - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)
ranker_val_start = last_week - VAL_RANKER_WEEKS

weeks = data['week_no']

# training data for the matching model: everything before both validation windows
data_train_matcher = data[weeks < matcher_val_start]

# validation data for the matching model
data_val_matcher = data[(weeks >= matcher_val_start) & (weeks < ranker_val_start)]


# training data for the ranking model
data_train_ranker = data_val_matcher.copy()  # Separate copy for clarity; the two frames diverge later

# test data for both the ranking and the matching model
# NOTE(review): the lower bound is inclusive, so this window covers week numbers
# ranker_val_start..last_week, i.e. VAL_RANKER_WEEKS + 1 distinct values if week_no is a
# consecutive integer — confirm that this is intended.
data_val_ranker = data[weeks >= ranker_val_start]

In [6]:
def print_stats_data(df_data, name_df):
    """Print a two-line summary of an interactions frame.

    The first line is the label ``name_df``; the second shows the frame's shape
    and the number of distinct values in its ``user_id`` and ``item_id`` columns.
    """
    print(name_df)
    n_users = df_data['user_id'].nunique()
    n_items = df_data['item_id'].nunique()
    print(f"Shape: {df_data.shape} Users: {n_users} Items: {n_items}")

In [7]:
# Report shape / user / item counts for each of the four splits.
for split_name, split_df in (
    ('train_matcher', data_train_matcher),
    ('val_matcher', data_val_matcher),
    ('train_ranker', data_train_ranker),
    ('val_ranker', data_val_ranker),
):
    print_stats_data(split_df, split_name)

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [8]:
# Peek at the first two transactions of the matcher training split.
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [9]:
# Number of distinct items before filtering, kept for the report printed below.
n_items_before = data_train_matcher['item_id'].nunique()

# Apply the project's prefilter to a copy of the matcher training data.
# NOTE(review): the recorded output shows 10001 items for take_n_popular=10000 while the row
# count stays unchanged — presumably prefilter_items maps all other items to one extra
# placeholder id instead of dropping rows; confirm in src.utils.
data_train_matcher = prefilter_items(data_train_matcher.copy(), item_features=item_features, take_n_popular=10000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 10001


In [10]:
# Keep only users that are present in the matcher training data, so that later stages
# never face a user cold start.
# unique() yields one entry per user rather than one per transaction row, which is what
# the name promises and keeps the isin() lookups below cheap.
common_users = data_train_matcher['user_id'].unique()

data_val_matcher = data_val_matcher[data_val_matcher['user_id'].isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker['user_id'].isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker['user_id'].isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 10001
val_matcher
Shape: (169707, 12) Users: 2153 Items: 27649
train_ranker
Shape: (169707, 12) Users: 2153 Items: 27649
val_ranker
Shape: (118303, 12) Users: 2041 Items: 24326


# Init/train recommender

In [11]:
# Build the first-level (matching) recommender from the prefiltered matcher training data.
# NOTE(review): presumably MainRecommender fits its models inside __init__ — confirm in src.recommenders.
recommender = MainRecommender(data_train_matcher)



In [12]:
# Take test user 16 and check that each recommendation method returns the requested number of items

In [13]:
# ALS recommendations for user 16: expect exactly N=50 items.
len(recommender.get_als_recommendations(16, N=50))

50

In [14]:
# "Own" recommendations for user 16: expect exactly N=50 items.
len(recommender.get_own_recommendations(16, N=50))

50

In [15]:
# Similar-items recommendations for user 16: expect exactly N=50 items.
len(recommender.get_similar_items_recommendation(16, N=50))

50

In [16]:
# Similar-users recommendations for user 16: expect exactly N=50 items.
len(recommender.get_similar_users_recommendation(16, N=50))

50

# Eval recall of matching

In [17]:
# Ground truth for the matcher: one row per user with the array of distinct items
# bought in the matcher validation window, stored in the 'actual' column.
result_eval_matcher = (
    data_val_matcher
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [18]:
%%time
N_PREDICT = 100
result_eval_matcher['own_rec'] = result_eval_matcher['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['als_rec'] = result_eval_matcher['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
# lst_k = [20, 50, 100, 200, 500]
# for k in lst_k:
#     result_eval_matcher[f'als_rec_{k}'] = result_eval_matcher['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=k))

CPU times: user 5min 58s, sys: 5min 48s, total: 11min 47s
Wall time: 1min 1s


In [19]:
%%time
# Similar-users candidates are generated in their own timed cell, separately from the other three methods.
result_eval_matcher['sim_user_rec'] = result_eval_matcher['user_id'].apply(lambda x: recommender.get_similar_users_recommendation(x, N=N_PREDICT))

CPU times: user 6min 51s, sys: 7min 20s, total: 14min 12s
Wall time: 1min 14s


### Recall@k of matching

In [20]:
# Disabled helper kept for reference: scores the i-th recommendation column with the i-th k from lst_k.
# def calc_recall_at_k(df_data, lst_k):
#     i = 0
#     for col_name in df_data.columns[2:]:
#         yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row['actual'], k=lst_k[i]), axis=1).mean()
#         i += 1

In [21]:
# Disabled call of the k sweep together with the output it produced (recall@k of ALS for each k):
# sorted(calc_recall_at_k(result_eval_matcher, lst_k), key=lambda x: x[1],reverse=True)
# [('als_rec_500', 0.3060136690863936),
#  ('als_rec_200', 0.2069766934661942),
#  ('als_rec_100', 0.14587668380786858),
#  ('als_rec_50', 0.09767023124135532),
#  ('als_rec_20', 0.05440960654494676)]

In [22]:
def calc_recall(df_data, top_k):
    """Yield ``(column name, mean recall@top_k)`` for each recommendation column.

    Every column after the first two of ``df_data`` is treated as a column of
    recommendation lists and is scored row by row against the ``'actual'`` column.
    """
    rec_columns = df_data.columns[2:]
    for col_name in rec_columns:
        per_user_recall = df_data.apply(
            lambda row: recall_at_k(row[col_name], row['actual'], k=top_k),
            axis=1,
        )
        yield col_name, per_user_recall.mean()

In [23]:
# Recall is evaluated at the same depth as the number of generated candidates.
TOPK_RECALL = N_PREDICT

In [24]:
# Matching methods ordered by mean recall@TOPK_RECALL, best first.
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('als_rec', 0.14675353796706392),
 ('own_rec', 0.13621600309332213),
 ('sim_item_rec', 0.1072117536443937),
 ('sim_user_rec', 0.09941717815582282)]

### Precision@5 of matching

In [25]:
def calc_precision(df_data, top_k):
    """Yield ``(column name, mean precision@top_k)`` for each recommendation column.

    Every column after the first two of ``df_data`` is treated as a column of
    recommendation lists and is scored row by row against the ``'actual'`` column.
    """
    rec_columns = df_data.columns[2:]
    for col_name in rec_columns:
        per_user_precision = df_data.apply(
            lambda row: precision_at_k(row[col_name], row['actual'], k=top_k),
            axis=1,
        )
        yield col_name, per_user_precision.mean()

In [26]:
# Precision is evaluated on the top 5 recommendations.
TOPK_PRECISION = 5

In [27]:
# Matching methods ordered by mean precision@TOPK_PRECISION, best first.
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec', 0.22359498374361358),
 ('als_rec', 0.17547607988852765),
 ('sim_user_rec', 0.1508592661402694),
 ('sim_item_rec', 0.14156990246168139)]

# Ranking part

In [67]:
# take the distinct users of the ranker training split; candidates are generated for each of them
df_match_candidates = pd.DataFrame({'user_id': data_train_ranker['user_id'].unique()})
df_match_candidates.head(2)

Unnamed: 0,user_id
0,2070
1,2021


In [68]:
# Generate N_PREDICT ALS candidates per user; each cell of 'candidates' holds a list of item ids.
df_match_candidates['candidates'] = df_match_candidates['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1082185, 956609, 1080414, 935546, 6464126, 93..."
1,2021,"[950935, 1041390, 1019142, 1119454, 896938, 96..."


In [71]:
# Flatten the candidate lists into one (user_id, item_id) row per candidate.
# The 'candidates' column is consumed here, so this cell cannot be re-run on its own
# without re-running the cell that creates that column.
df_match_candidates = (
    df_match_candidates
    .explode('candidates', ignore_index=True)
    .rename(columns={'candidates': 'item_id'})
    .astype({'item_id': 'int64'})
)
df_match_candidates

Unnamed: 0,user_id,item_id
0,2070,1082185
1,2070,956609
2,2070,1080414
3,2070,935546
4,2070,6464126
...,...,...
215295,1745,1077643
215296,1745,1015247
215297,1745,1046055
215298,1745,1038746


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [106]:
# check that there are no "cold" users: ranker-train users missing from the matcher training data
ranker_train_user_ids = set(data_train_ranker['user_id'].unique().tolist())
matcher_train_user_ids = set(data_train_matcher['user_id'].unique().tolist())
len(ranker_train_user_ids.difference(matcher_train_user_ids))

0

In [74]:
# Positive examples: distinct (user_id, item_id) pairs bought in the ranker training window.
df_ranker_train = (
    data_train_ranker[['user_id', 'item_id']]
    .drop_duplicates()
    .assign(target=1)  # purchases only
)
df_ranker_train

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1
...,...,...,...
2282319,222,926804,1
2282320,222,1120741,1
2282321,462,993339,1
2282323,462,10180324,1


In [75]:
# Left-join the candidates with the observed purchases: a candidate that was bought keeps
# target=1, every other candidate gets a missing target after the join.
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=['user_id', 'item_id'], how='left')
# Fill the missing targets with 0 and assign the column back. Calling fillna(inplace=True)
# on a column selection is chained assignment: it is deprecated in recent pandas and does
# not modify the frame under Copy-on-Write.
df_ranker_train['target'] = df_ranker_train['target'].fillna(0)
df_ranker_train

Unnamed: 0,user_id,item_id,target
0,2070,1082185,1.0
1,2070,956609,0.0
2,2070,1080414,1.0
3,2070,935546,0.0
4,2070,6464126,0.0
...,...,...,...
215295,1745,1077643,0.0
215296,1745,1015247,0.0
215297,1745,1046055,0.0
215298,1745,1038746,0.0


In [77]:
# Class balance of the ranker training target (absolute counts).
df_ranker_train.target.value_counts()

0.0    196282
1.0     19018
Name: target, dtype: int64

In [78]:
# Share of positive examples among the candidates.
df_ranker_train['target'].mean()

0.08833255921969345

In [79]:
# Peek at the item feature table that is merged into the ranker dataset below.
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [80]:
# Peek at the user feature table that is merged into the ranker dataset below.
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [81]:
# Attach item and user attributes to every (user, candidate) row; left joins keep
# candidates whose item or user has no feature record.
df_ranker_train = (
    df_ranker_train
    .merge(item_features, on='item_id', how='left')
    .merge(user_features, on='user_id', how='left')
)

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1082185,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,956609,0.0,1276,GROCERY,National,BEANS - CANNED GLASS & MW,PREPARED BEANS - BAKED W/PORK,15 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [87]:
# Separate the label from the feature matrix (user_id and item_id stay in X_train).
y_train = df_ranker_train['target']
X_train = df_ranker_train.drop(columns='target')

In [88]:
# Every column after the first two (user_id, item_id) is treated as categorical.
# This is positional, so it relies on the column order produced by the merges above.
cat_feats = list(X_train.columns[2:])
X_train = X_train.astype({feat: 'category' for feat in cat_feats})

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

## Обучение модели ранжирования

In [92]:
# Hyper-parameters of the second-level (ranking) model.
# NOTE(review): 'categorical_column' is forwarded to LightGBM as an extra parameter —
# presumably resolved as an alias of 'categorical_feature'; confirm against the LightGBM docs.
lgb_params = dict(
    objective='binary',
    num_leaves=255,
    max_depth=8,
    n_estimators=300,
    learning_rate=0.05,
    categorical_column=cat_feats,
)
lgb = LGBMClassifier(**lgb_params)

lgb.fit(X_train, y_train)

# Class probabilities for the same rows the model was fitted on (in-sample predictions).
train_preds = lgb.predict_proba(X_train)



In [93]:
# Copy of the ranker dataset with the predicted probability of the second class
# (column 1 of predict_proba) attached as 'proba_item_purchase'.
df_ranker_predict = df_ranker_train.assign(proba_item_purchase=train_preds[:, 1])
df_ranker_predict.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,proba_item_purchase
0,2070,1082185,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.697811
1,2070,956609,0.0,1276,GROCERY,National,BEANS - CANNED GLASS & MW,PREPARED BEANS - BAKED W/PORK,15 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.134799


# Evaluation on test dataset

In [95]:
# Ground truth for the final evaluation: one row per user with the array of distinct
# items bought in the ranker validation window, stored in the 'actual' column.
result_eval_ranker = (
    data_val_ranker
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Eval matching on test dataset

In [96]:
%%time
# Baseline for the test window: raw ALS candidates, without re-ranking.
result_eval_ranker['als_rec'] = result_eval_ranker['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

CPU times: user 3.61 s, sys: 3.76 s, total: 7.37 s
Wall time: 685 ms


In [97]:
# measure precision of the matching model alone, to see how much ranking changes the metrics
sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('als_rec', 0.14149926506614402)]

In [104]:
# cold-user check: users of the ranker validation split that are absent from the ranker training split
val_ranker_user_ids = set(data_val_ranker['user_id'].unique().tolist())
train_ranker_user_ids = set(data_train_ranker['user_id'].unique().tolist())
len(val_ranker_user_ids.difference(train_ranker_user_ids))

126

## Eval re-ranked matched result on test dataset    

In [107]:
def rerank(user_id, k=5, predictions=None):
    """Return the ids of the user's ``k`` highest-scored candidate items.

    Parameters
    ----------
    user_id
        Value matched against the ``'user_id'`` column.
    k : int, default 5
        Number of items to return; fewer are returned if the user has fewer rows.
    predictions : pandas.DataFrame, optional
        Frame with ``'user_id'``, ``'item_id'`` and ``'proba_item_purchase'`` columns.
        Defaults to the notebook-level ``df_ranker_predict``.

    Returns
    -------
    list
        Item ids ordered by descending ``'proba_item_purchase'``; an empty list
        when the user has no rows in ``predictions``.
    """
    if predictions is None:
        # Fall back to the scored candidates built earlier in the notebook.
        predictions = df_ranker_predict
    user_rows = predictions[predictions['user_id'] == user_id]
    top_rows = user_rows.sort_values('proba_item_purchase', ascending=False).head(k)
    return top_rows['item_id'].tolist()

In [108]:
# Re-ranked top items per test user. Despite the 'own' in the column name, the
# candidates being re-ranked were generated with get_als_recommendations.
result_eval_ranker['reranked_own_rec'] = result_eval_ranker['user_id'].apply(rerank)

In [111]:
# Compare precision@TOPK_PRECISION of the raw ALS candidates and of the re-ranked list, best first.
# NOTE(review): users without rows in df_ranker_predict get an empty re-ranked list; how they
# affect the mean depends on precision_at_k — confirm in src.metrics.
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.2634986945169713)
('als_rec', 0.14149926506614402)


### Итоги:
Бизнес-задачи вошли в противоречие с целью домашней работы № 6 — максимизацией метрик recall@k для модели первого уровня и
precision@5 для модели ранжирования второго уровня. Поэтому функция префильтрации оставляет только те товары, которые продавались в последние 52 недели (при уменьшении
этого периода до 40 недель метрика recall@k для модели первого уровня немного снижалась), и из них только 10 000 самых популярных.

Перебрав различные k для модели первого уровня, я остановился на k=100 (recall@100=0,1458 для als_recommender при переборе k; в итоговом запуске выше — 0,1468). При k={200, ..., 500} метрика полноты
растёт с увеличением k, но в то же время баланс классов target в обучающем датасете для модели второго уровня ухудшается:
            
для k=100: 8,8% (1); 91,2% (0)

для k=200: 6,6% (1); 93,4% (0)

для k=300: 5,4% (1); 94,6% (0)

для k=400: 4,7% (1); 95,3% (0)

для k=500: 4,1% (1); 95,9% (0)

При формировании датасета для модели второго уровня дополнительные фичи не добавлял (оставил для курсового проекта, т.к. целевая
метрика и так оказалась неплохой). Также для курсового проекта осталась задача обработки "холодных" пользователей: в валидационном
датасете для модели второго уровня их оказалось 126.

После ранжирования рекомендаций моделью второго уровня метрика precision@5=0,26.