# Двухуровневые модели рекомендаций


**Основное**  
Данны данные о покупках 2500 пользователей в течении 95 недель так же есть характеристики пользователя и характеристики продукта.
Задача разработать модель с
- Целевая метрика precision@5 > 0.22

In [174]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit import approximate_als
from implicit.nearest_neighbours import ItemItemRecommender,  CosineRecommender

# Модель второго уровня
from catboost import CatBoostClassifier
from utilis import prefilter_items, prefilter_items_v2
from recomenders5 import MainRecommender
from metrics import recall_at_k_mean, recall_at_k, precision_at_k, map_k_mean, NDCG_mean, ap_k
from data_prepare import reduce_mem_usage, nan_replacer, feature_generator,user_feature_prepare, item_features_prepare
from recomender9 import MainRecommender
import warnings
warnings.filterwarnings("ignore")

In [175]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [176]:
data = pd.read_csv('retail_train.csv')
#data = prefilter_items(data)

item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

#column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [177]:
data = prefilter_items(data, take_n=10000)

In [178]:
max_week = data['week_no'].max()
condition_train = (data['week_no'] < max_week - 6)
condition_valid = data['week_no'] >= max_week - 3
condition_test = ((data['week_no'] >= max_week - 6) & (data['week_no'] < max_week - 3))
                   
data_train_L1 = data[condition_train]
data_test_L1 = data[condition_test]

data_train_L2 = data_test_L1.copy()
data_valid_L2 = data[condition_valid]

In [179]:
data_valid_L2['week_no'].unique

<bound method Series.unique of 2277416    92
2277417    92
2277418    92
2277419    92
2277420    92
           ..
2396799    95
2396800    95
2396801    95
2396802    95
2396803    95
Name: week_no, Length: 118314, dtype: int64>

In [180]:
data_train_L1['week_no'].unique

<bound method Series.unique of 0           1
1           1
2           1
3           1
4           1
           ..
2193522    88
2193523    88
2193524    88
2193525    88
2193526    88
Name: week_no, Length: 2193515, dtype: int64>

In [181]:
data_test_L1['week_no'].unique

<bound method Series.unique of 2191387    89
2191388    89
2191389    89
2191390    89
2191391    89
           ..
2282320    91
2282321    91
2282322    91
2282323    91
2282324    91
Name: week_no, Length: 84975, dtype: int64>

In [183]:
data_l1_l2_val = pd.concat([data_valid_L2, data_test_L1], ignore_index = True)

In [184]:
result = data_test_L1.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[829323, 999999, 851515, 879280, 940947, 96816..."
1,2,"[999999, 821083, 828106, 830960, 833025, 83813..."


In [185]:
result = result.loc[result['user_id']<=2499]

In [186]:
from implicit.nearest_neighbours import BM25Recommender

In [187]:
user_item_matrix = pd.pivot_table(data_train_L1, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.shape

(2499, 10000)

In [188]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [189]:
%%time

model = BM25Recommender(K=200, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)


  0%|          | 0/10000 [00:00<?, ?it/s]

CPU times: total: 6.06 s
Wall time: 1.98 s


In [190]:
%%time
bm25 = model.recommend(userid=userids, user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                    N=150, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]

CPU times: total: 1.91 s
Wall time: 1.93 s


In [191]:
def id_item_to_real_id_item(recs, k):
    recsals = np.array([],dtype='i')
    for row in recs:
        a = [id_to_itemid[el] for el in row]
        recsals = np.append(recsals,a)
    recsals = recsals.reshape(-1,k)    
    return recsals

In [192]:
bm25_5_conv = id_item_to_real_id_item(bm25, 150)
bm25_150_rec = pd.DataFrame({'user_id':userids, 'bm25':bm25_5_conv.tolist()})
result = result.merge(bm25_150_rec, on='user_id')

In [193]:
result.apply(lambda row: recall_at_k(row['bm25'], row['actual'], k=200), axis=1).mean()

0.229849337660866

# Генерация признаков

In [195]:
user_feature = user_feature_prepare(user_features)

In [196]:
item_features = item_features_prepare(item_features)

In [197]:
user_item_features = feature_generator(data_train_L1,user_feature, item_features)

Добавлены следующие признаки:
время покупки -hour
день недели совершения транзакции-median_weekday
кол-во транзакций покупок пользователем-n_transactions
кол-во уникальных покупок пользователем-n_items
средний чек пользвателя - mean_check
средний чек на размер семьи-mean_ckeck_per_household_size
популярность товара-popularity
любимые товары пользователя-purchase_2,purchase_3,purchase_4


In [198]:
user_item_features.head(5)

Unnamed: 0,user_id,item_id,median_sales_hour,median_weekday,n_items,purchase_2,purchase_3,purchase_4,mean_check,n_transactions,...,household_size_desc,kid_category_desc,mean_ckeck_per_household_size,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,commodity_category
0,1,820165,12.0,1.5,359,856942.0,1082185.0,995242.0,50.174167,1449,...,2.0,0,25.087083,2.0,PRODUCE,1.0,CITRUS,ORANGES NAVELS ALL,,166.0
1,1,823721,14.0,4.0,359,856942.0,1082185.0,995242.0,50.174167,1449,...,2.0,0,25.087083,317.0,GROCERY,1.0,CHEESE,GRATED CHEESE,8 OZ,14.0
2,1,823990,15.0,6.0,359,856942.0,1082185.0,995242.0,50.174167,1449,...,2.0,0,25.087083,2929.0,MEAT,1.0,BEEF,CHOICE BEEF,,13.0
3,1,825123,15.0,4.0,359,856942.0,1082185.0,995242.0,50.174167,1449,...,2.0,0,25.087083,1179.0,GROCERY,1.0,SALD DRSNG/SNDWCH SPRD,SEMI-SOLID SALAD DRESSING MAY,30 OZ,34.0
4,1,826695,15.5,1.5,359,856942.0,1082185.0,995242.0,50.174167,1449,...,2.0,0,25.087083,135.0,GROCERY,1.0,FRZN POTATOES,FRZN FRENCH FRIES,20 OZ,170.0


# Подготовка к уровню 2

### Слияние USER и IITEM + добавление flag 0 если не покупал 1 если купил

In [199]:
users_lvl_2 = pd.DataFrame(data_train_L2['user_id'].unique())
users_lvl_2.columns = ['user_id']
users_lvl_2 = users_lvl_2.loc[users_lvl_2['user_id']<=2499] # конец не попал в обучение поэтому и предсказать не сможем для юсеров выше
# Пока только warm start
train_users = data_train_L1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)] #выбрали только тех пользователей что были в 1й обучающей выборке
#Обучение 1 го уровня и создание кандидатов
#users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x:model.get_item_recommendations(x, N=200))

In [200]:
users_lvl_2 = users_lvl_2.merge(bm25_150_rec, on='user_id', how='left')
users_lvl_2.columns = ['user_id', 'candidates']

In [201]:
users_lvl_2.head(3)

Unnamed: 0,user_id,candidates
0,84,"[1062966, 1043590, 952317, 1080414, 938700, 10..."
1,1753,"[1044078, 1096036, 1092026, 1085604, 883404, 9..."
2,2120,"[480014, 1058997, 1126899, 1082185, 1029743, 9..."


In [202]:
# склеивание каждого el из candidates с user_id
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'
users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1 #пока метка у всех 1
users_lvl_2.head(3)

Unnamed: 0,user_id,item_id,flag
0,84,1062966,1
0,84,1043590,1
0,84,952317,1


In [203]:
targets_lvl_2 = data_train_L2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 
targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)
targets_lvl_2.head(3)

Unnamed: 0,user_id,item_id,target
0,84,1062966,0.0
1,84,1043590,1.0
2,84,952317,0.0


In [204]:
targets_lvl_2.shape

(286065, 3)

# Добавление признаков на уровень 2

In [205]:
targets_lvl_2 = targets_lvl_2.merge(user_item_features, on=['user_id','item_id'], how='left')
targets_lvl_2.head(4)

Unnamed: 0,user_id,item_id,target,median_sales_hour,median_weekday,n_items,purchase_2,purchase_3,purchase_4,mean_check,...,household_size_desc,kid_category_desc,mean_ckeck_per_household_size,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,commodity_category
0,84,1062966,0.0,16.0,5.0,170.0,903529.0,1080414.0,920025.0,23.707907,...,,,,1425.0,MEAT-PCKGD,1.0,HOT DOGS,ECONOMY - MEAT,16 OZ,152.0
1,84,1043590,1.0,4.0,0.0,170.0,903529.0,1080414.0,920025.0,23.707907,...,,,,69.0,GROCERY,0.0,SHORTENING/OIL,VEGETABLE/SALAD OIL,48 OZ,114.0
2,84,952317,0.0,0.0,1.0,170.0,903529.0,1080414.0,920025.0,23.707907,...,,,,418.0,GROCERY,1.0,BAKED SWEET GOODS,SNACK CAKE - MULTI PACK,13 OZ,32.0
3,84,1080414,1.0,0.0,3.0,170.0,903529.0,1080414.0,920025.0,23.707907,...,,,,693.0,DRUG GM,1.0,CANDY - CHECKLANE,CANDY BARS (SINGLES)(INCLUDING,1.6 OZ,47.0


In [206]:
targets_lvl_2.shape

(286065, 27)

## Обработка признаков товаров

In [207]:
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286065 entries, 0 to 286064
Data columns (total 27 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   user_id                        286065 non-null  int64   
 1   item_id                        286065 non-null  int64   
 2   target                         286065 non-null  float64 
 3   median_sales_hour              123821 non-null  float64 
 4   median_weekday                 123821 non-null  float64 
 5   n_items                        123821 non-null  float64 
 6   purchase_2                     123820 non-null  float64 
 7   purchase_3                     123820 non-null  float64 
 8   purchase_4                     123820 non-null  float64 
 9   mean_check                     123821 non-null  float64 
 10  n_transactions                 123821 non-null  float64 
 11  popularity                     123821 non-null  float64 
 12  age_desc        

In [208]:
targets_lvl_2 = nan_replacer(targets_lvl_2)

In [209]:
targets_lvl_2 = reduce_mem_usage(targets_lvl_2)

Memory usage of dataframe is 55.39 MB
Memory usage after optimization is: 25.69 MB
Decreased by 53.6%


In [210]:
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286065 entries, 0 to 286064
Data columns (total 27 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   user_id                        286065 non-null  int16   
 1   item_id                        286065 non-null  int32   
 2   target                         286065 non-null  float32 
 3   median_sales_hour              286065 non-null  float32 
 4   median_weekday                 286065 non-null  float32 
 5   n_items                        286065 non-null  float32 
 6   purchase_2                     286065 non-null  float32 
 7   purchase_3                     286065 non-null  float32 
 8   purchase_4                     286065 non-null  float32 
 9   mean_check                     286065 non-null  float32 
 10  n_transactions                 286065 non-null  float32 
 11  popularity                     286065 non-null  float32 
 12  age_desc        

## Разделение на трейн и предикт

In [211]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [212]:
X_train.shape

(286065, 26)

In [213]:
X_train = X_train.loc[X_train['item_id']!=999999]

In [214]:
X_train.shape

(286065, 26)

## Предобработка обучающего множества

In [215]:
choosen_feature = ['item_id',
                'n_transactions',
                'median_sales_hour',
                'median_weekday',
                'user_id',
                'n_items',
                'mean_check',
                'commodity_category',
                'sub_commodity_desc',
                'manufacturer']

In [216]:
X_train = X_train[choosen_feature]

In [217]:
categorial_columns = [col for col in X_train.columns if X_train[col].dtype =='category']

In [218]:
categorial_columns

['sub_commodity_desc']

## Обучение

In [221]:
%%time
cat = CatBoostClassifier(max_depth=3, n_estimators=900, cat_features=categorial_columns, random_state=14, silent=True)
cat.fit(X_train, y_train, cat_features=categorial_columns)

CPU times: total: 9min 32s
Wall time: 1min 45s


<catboost.core.CatBoostClassifier at 0x1fa4ce7b350>

In [222]:
train_preds = cat.predict(X_train)
train_preds_proba = cat.predict_proba(X_train)

In [223]:
train_preds = train_preds.astype(bool)

In [224]:
rec_items = X_train[train_preds].groupby(by=['user_id'])['item_id'].unique().reset_index()
rec_items.columns = ['user_id', 'model_preds']

In [225]:
data_valid_L2 = data_valid_L2.loc[data_valid_L2['item_id']!=999999]

In [226]:
result_l2 = data_valid_L2.groupby('user_id')['item_id'].unique().reset_index()
result_l2.columns = ['user_id', 'actual']

In [227]:
result_l2 = result_l2.merge(rec_items,
                                  on='user_id',
                                  how='inner')

In [228]:
result_l2.head(4)

Unnamed: 0,user_id,actual,model_preds
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[995242, 1005186, 1082185]"
1,6,"[920308, 926804, 946489, 1017061, 1078346, 110...","[1082185, 995242, 1029743]"
2,13,"[6534178, 1104146, 840361, 862070, 884897, 886...","[6534178, 1106523, 1029743, 995242, 1058997]"
3,18,"[831628, 914697, 995242, 1118878, 1128477, 744...",[1082185]


### Точность на Тренировочной выборке

In [229]:
result_l2.apply(lambda row: precision_at_k(row['model_preds'], row['actual']), axis=1).mean()

0.5710830704521557

In [230]:
importance = []
for f, n in zip(cat.feature_importances_, cat.feature_names_):
    importance.append((f,n))
importance.sort(reverse=True)

In [231]:
importance

[(18.113100680701045, 'sub_commodity_desc'),
 (14.221874619853596, 'item_id'),
 (13.117140782865484, 'n_items'),
 (11.952164529489506, 'commodity_category'),
 (11.560697109999744, 'median_weekday'),
 (10.637471563121183, 'n_transactions'),
 (7.484686873250083, 'mean_check'),
 (5.0917630817468185, 'manufacturer'),
 (3.986888906637063, 'median_sales_hour'),
 (3.8342118523355277, 'user_id')]

## Проверка на тесте

In [232]:
X_test_2 = data_valid_L2.merge(user_item_features, on=['user_id','item_id'], how='left')

In [233]:
X_test_2.shape

(90280, 36)

In [234]:
X_test_2 = nan_replacer(X_test_2)

In [235]:
X_test_2 = reduce_mem_usage(X_test_2)

Memory usage of dataframe is 23.69 MB
Memory usage after optimization is: 11.02 MB
Decreased by 53.5%


In [236]:
X_test_2=X_test_2[choosen_feature]

In [237]:
cat_proba = cat.predict_proba(X_test_2)[:, 1]
cat_predict = cat.predict(X_test_2)

In [238]:
cat_predict = cat_predict.astype(bool)

In [239]:
rec_items_test = X_test_2[cat_predict].groupby(by=['user_id'])['item_id'].unique().reset_index()
rec_items_test.columns = ['user_id', 'model_preds_test']

In [240]:
result_l2 = result_l2.merge(rec_items_test,
                                  on='user_id',
                                  how='inner')

In [241]:
result_l2.head(4)

Unnamed: 0,user_id,actual,model_preds,model_preds_test
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[995242, 1005186, 1082185]","[995242, 1005186, 1082185]"
1,6,"[920308, 926804, 946489, 1017061, 1078346, 110...","[1082185, 995242, 1029743]",[995242]
2,13,"[6534178, 1104146, 840361, 862070, 884897, 886...","[6534178, 1106523, 1029743, 995242, 1058997]","[6534178, 1029743, 1082185, 995242]"
3,18,"[831628, 914697, 995242, 1118878, 1128477, 744...",[1082185],"[948420, 1082185, 6533889]"


## Точность на тесте 

In [242]:
result_l2.apply(lambda row: precision_at_k(row['model_preds_test'], row['actual']), axis=1).mean()

1.0

In [243]:
result_2_test = X_test_2[['user_id', 'item_id']]
result_2_test['predictions'] = cat_proba

result_2_test = result_2_test.groupby(['user_id', 'item_id'])['predictions'].median().reset_index()
result_2_test = result_2_test.sort_values(['predictions'], ascending=False).groupby(['user_id']).head(5)
result_2_test = result_2_test.groupby('user_id')['item_id'].unique().reset_index()

In [244]:
result_2_test.columns = ['user_id', 'model_test_5']

In [245]:
result_l2 = result_l2.merge(result_2_test, on ='user_id', how='inner' )

In [247]:
result_l2.apply(lambda row: precision_at_k(row['model_test_5'], row['actual']), axis=1).mean()

1.0

In [248]:
NDCG_mean(result_l2['model_test_5'], result_l2['actual'])

0.9978491391753045

# Выводы
Модель показала идеальную точность 
Ранжирующая метрика NDCG так же близка к идеальной.
Данная модель идеально предсказывает предпочтения пользователя однако не предлагает пользователю ничего нового или схожего с тем что он брал но не то что он брал.
На AB тесте модель тоже скорей всего покажет хороший результат с любой другой моделью (по предсказанию).
Однако дополнительной прибыли модель врятли принесет поскольку не рекомендует ничего нового.
