# Двухуровневая модели рекомендаций для данных X5 retail


**Основное**  
Данны данные о покупках 2500 пользователей в течении 95 недель так же есть характеристики пользователя и характеристики продукта.
Задача разработать модель с
- Целевая метрика precision@5 > 0.22

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit import approximate_als
from implicit.nearest_neighbours import ItemItemRecommender,  CosineRecommender

# Модель второго уровня
from catboost import CatBoostClassifier
from utilis import prefilter_items, prefilter_items_v2
from recomenders5 import MainRecommender
from metrics import recall_at_k_mean, recall_at_k, precision_at_k, map_k_mean, NDCG_mean, ap_k
from data_prepare import reduce_mem_usage, nan_replacer, feature_generator,user_feature_prepare, item_features_prepare
from recomender_10 import MainRecommender
import warnings
warnings.filterwarnings("ignore")

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
data = pd.read_csv('retail_train.csv')
#data = prefilter_items(data)

item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

#column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [4]:
max_week = data['week_no'].max()
condition_train = (data['week_no'] < max_week - 9)
condition_valid = data['week_no'] >= max_week - 6
condition_test = ((data['week_no'] >= max_week - 9) & (data['week_no'] < max_week - 6))
                   
data_train_L1 = data[condition_train]
data_test_L1 = data[condition_test]

data_train_L2 = data_test_L1.copy()
data_valid_L2 = data[condition_valid]

In [5]:
data_train_L1 = prefilter_items(data_train_L1, take_n=10000-1)

In [6]:
users_train_L1 = data_train_L1.user_id.unique()
users_test_L1 = data_train_L1.user_id.unique()
users_valid_L2 = data_valid_L2.user_id.unique()

new_users_test_to_train = list(set(users_test_L1) - set(users_train_L1))

new_users_L1_to_valid = list(set(users_valid_L2) - (set(users_train_L1) | set(users_test_L1)))

add_to_L2 = list(set(users_valid_L2) - (set(users_test_L1)))

new_users_test_to_train, new_users_L1_to_valid, len(add_to_L2)

([], [1984], 1)

In [7]:
data_l1_l2_val = pd.concat([data_valid_L2, data_test_L1], ignore_index = True)

In [8]:
def get_lvl_1(data_train_L1, data_train_L2, N, add_to_l2):
    recommender = MainRecommender(data_train_L1)

    users_l1 = data_train_L1['user_id'].unique()
    users_l2 = data_train_L2['user_id'].unique().tolist()
    if add_to_l2:
        users_l2 += add_to_lvl_2

    current_users = list(set(users_l2) & set(users_l1))    
    new_users = list(set(users_l2) - set(users_l1))

    df = pd.DataFrame(users_l2, columns=['user_id'])
    cond_1 = df['user_id'].isin(current_users)
    df.loc[cond_1, 'candidates'] = df.loc[cond_1, 'user_id'].apply(
        lambda x: recommender.get_item_recommendations(x, N))

    if new_users:
        cond_2 = df['user_id'].isin(new_users)
        df.loc[cond_2, 'candidates'] = df.loc[cond_2, 'user_id'].apply(
            lambda x: recommender.overall_top_purchases[:N])
        
    return df



In [9]:
def get_targets_l2(data_train_L1, data_train_L2, user_item_features, N, add_to_l2=None):
    
    users_L2 = get_lvl_1(data_train_L1, 
                                 data_train_L2, 
                                 N, 
                                 add_to_l2)
    
    df = pd.DataFrame({'user_id': users_L2['user_id'].values.repeat(N),
                       'item_id': np.concatenate(users_L2['candidates'].values)})

    targets_l2 = data_train_L2[['user_id', 'item_id']].copy()
    targets_l2['target'] = 1 

    targets_l2 = df.merge(targets_l2, on=['user_id', 'item_id'], how='left')
    targets_l2['target'].fillna(0, inplace= True)
    
    targets_l2 = targets_l2.merge(user_item_features, on=['user_id', 'item_id'], how='left')
    
    return targets_l2



# Генерация признаков

In [10]:
item_features = item_features_prepare(item_features)

In [11]:
user_feature = user_feature_prepare(user_features)

In [12]:
user_item_features = feature_generator(data_train_L1, user_feature, item_features)

Добавлены следующие признаки:
время покупки -hour
день недели совершения транзакции-median_weekday
кол-во транзакций покупок пользователем-n_transactions
кол-во уникальных покупок пользователем-n_items
средний чек пользвателя - mean_check
средний чек на размер семьи-mean_ckeck_per_household_size
популярность товара-popularity
любимые товары пользователя-purchase_2,purchase_3,purchase_4


In [13]:
user_item_features.head(4)

Unnamed: 0,user_id,item_id,median_sales_hour,median_weekday,n_items,purchase_2,purchase_3,purchase_4,mean_check,n_transactions,...,household_size_desc,kid_category_desc,mean_ckeck_per_household_size,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,commodity_category
0,1,820165,13.0,2.0,345,856942.0,1082185.0,995242.0,50.264853,1374,...,2.0,0,25.132426,2.0,PRODUCE,1.0,CITRUS,ORANGES NAVELS ALL,,166.0
1,1,823721,14.0,4.0,345,856942.0,1082185.0,995242.0,50.264853,1374,...,2.0,0,25.132426,317.0,GROCERY,1.0,CHEESE,GRATED CHEESE,8 OZ,14.0
2,1,823990,15.0,6.0,345,856942.0,1082185.0,995242.0,50.264853,1374,...,2.0,0,25.132426,2929.0,MEAT,1.0,BEEF,CHOICE BEEF,,13.0
3,1,825123,15.0,4.0,345,856942.0,1082185.0,995242.0,50.264853,1374,...,2.0,0,25.132426,1179.0,GROCERY,1.0,SALD DRSNG/SNDWCH SPRD,SEMI-SOLID SALAD DRESSING MAY,30 OZ,34.0


In [14]:
%%time

N = 1000

targets_l2 = get_targets_l2(data_train_L1, data_train_L2, user_item_features, N, add_to_l2=None)

print(f'Число пользователей: {targets_l2.user_id.nunique()}')

targets_l2.head()



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Число пользователей: 1935
CPU times: total: 40.7 s
Wall time: 14.2 s


Unnamed: 0,user_id,item_id,target,median_sales_hour,median_weekday,n_items,purchase_2,purchase_3,purchase_4,mean_check,...,household_size_desc,kid_category_desc,mean_ckeck_per_household_size,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,commodity_category
0,2070,1082185,0.0,18.0,2.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,2.0,PRODUCE,1.0,TROPICAL FRUIT,BANANAS,40 LB,216.0
1,2070,981760,1.0,22.0,4.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,69.0,GROCERY,0.0,EGGS,EGGS - X-LARGE,1 DZ,184.0
2,2070,1098066,0.0,21.0,4.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,69.0,GROCERY,0.0,BAKED BREAD/BUNS/ROLLS,HOT DOG BUNS,11 OZ,10.0
3,2070,1127831,0.0,19.0,4.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,5937.0,PRODUCE,1.0,BERRIES,STRAWBERRIES,16 OZ,200.0
4,2070,995242,0.0,21.5,4.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,69.0,GROCERY,0.0,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,,62.0


In [15]:
targets_l2 = nan_replacer(targets_l2)

In [16]:
targets_l2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1944011 entries, 0 to 1944010
Data columns (total 27 columns):
 #   Column                         Dtype   
---  ------                         -----   
 0   user_id                        int64   
 1   item_id                        int64   
 2   target                         float64 
 3   median_sales_hour              float64 
 4   median_weekday                 float64 
 5   n_items                        float64 
 6   purchase_2                     float64 
 7   purchase_3                     float64 
 8   purchase_4                     float64 
 9   mean_check                     float64 
 10  n_transactions                 float64 
 11  popularity                     float64 
 12  age_desc                       float64 
 13  marital_status_code            float64 
 14  income_desc                    float64 
 15  homeowner_desc                 float64 
 16  hh_comp_desc                   float64 
 17  household_size_desc        

In [17]:
targets_l2.head(5)

Unnamed: 0,user_id,item_id,target,median_sales_hour,median_weekday,n_items,purchase_2,purchase_3,purchase_4,mean_check,...,household_size_desc,kid_category_desc,mean_ckeck_per_household_size,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,commodity_category
0,2070,1082185,0.0,18.0,2.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,2.0,PRODUCE,1.0,TROPICAL FRUIT,BANANAS,40 LB,216.0
1,2070,981760,1.0,22.0,4.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,69.0,GROCERY,0.0,EGGS,EGGS - X-LARGE,1 DZ,184.0
2,2070,1098066,0.0,21.0,4.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,69.0,GROCERY,0.0,BAKED BREAD/BUNS/ROLLS,HOT DOG BUNS,11 OZ,10.0
3,2070,1127831,0.0,19.0,4.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,5937.0,PRODUCE,1.0,BERRIES,STRAWBERRIES,16 OZ,200.0
4,2070,995242,0.0,21.5,4.0,721.0,1085604.0,834103.0,1055863.0,11.571104,...,1.0,0,11.571104,69.0,GROCERY,0.0,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,,62.0


# Уровень 2

## Разделение на трейн и предикт

In [18]:
X_train = targets_l2.drop('target', axis=1)
y_train = targets_l2[['target']]

In [19]:
X_train['user_id'].nunique()

1935

In [20]:
X_train.shape

(1944011, 26)

In [21]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1944011 entries, 0 to 1944010
Data columns (total 26 columns):
 #   Column                         Dtype   
---  ------                         -----   
 0   user_id                        int64   
 1   item_id                        int64   
 2   median_sales_hour              float64 
 3   median_weekday                 float64 
 4   n_items                        float64 
 5   purchase_2                     float64 
 6   purchase_3                     float64 
 7   purchase_4                     float64 
 8   mean_check                     float64 
 9   n_transactions                 float64 
 10  popularity                     float64 
 11  age_desc                       float64 
 12  marital_status_code            float64 
 13  income_desc                    float64 
 14  homeowner_desc                 float64 
 15  hh_comp_desc                   float64 
 16  household_size_desc            float64 
 17  kid_category_desc          

## Предобработка обучающего множества

In [22]:
choosen_feature = [ 'purchase_2',
                  'n_transactions',
                  'popularity',
                  'item_id',
                  'commodity_category',
                  'sub_commodity_desc',
                  'n_items',
                  'mean_check',
                  'commodity_desc',
                  'median_weekday',
                  'median_sales_hour',
                  'user_id']

In [23]:
X_train = reduce_mem_usage(X_train)

Memory usage of dataframe is 361.53 MB
Memory usage after optimization is: 166.95 MB
Decreased by 53.8%


In [25]:
X_train = X_train[choosen_feature]

In [26]:
categorial_columns = [col for col in X_train.columns if X_train[col].dtype =='category']
#categorial_columns.append('commodity_category')

In [27]:
categorial_columns

['sub_commodity_desc', 'commodity_desc']

In [28]:
X_train.columns

Index(['purchase_2', 'n_transactions', 'popularity', 'item_id',
       'commodity_category', 'sub_commodity_desc', 'n_items', 'mean_check',
       'commodity_desc', 'median_weekday', 'median_sales_hour', 'user_id'],
      dtype='object')

In [29]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1944011 entries, 0 to 1944010
Data columns (total 12 columns):
 #   Column              Dtype   
---  ------              -----   
 0   purchase_2          float32 
 1   n_transactions      float32 
 2   popularity          float32 
 3   item_id             int32   
 4   commodity_category  float32 
 5   sub_commodity_desc  category
 6   n_items             float32 
 7   mean_check          float32 
 8   commodity_desc      category
 9   median_weekday      float32 
 10  median_sales_hour   float32 
 11  user_id             int16   
dtypes: category(2), float32(8), int16(1), int32(1)
memory usage: 77.9 MB


## Обучение

In [30]:
%%time
cat = CatBoostClassifier(max_depth=5, n_estimators=1000, cat_features=categorial_columns, random_state=14, silent=True)
cat.fit(X_train, y_train, cat_features=categorial_columns)

CPU times: total: 1h 32min 35s
Wall time: 15min 7s


<catboost.core.CatBoostClassifier at 0x1d000e92250>

In [31]:
train_preds_proba = cat.predict_proba(X_train)

In [32]:
valid_l2 = data_valid_L2.groupby('user_id')['item_id'].unique().reset_index()
valid_l2.columns = ['user_id', 'actual']

In [33]:
valid_l2.head(5)

Unnamed: 0,user_id,actual
0,1,"[829323, 835108, 836423, 851515, 875240, 87737..."
1,2,"[895388, 8357614, 12301772, 821083, 828106, 83..."
2,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
3,4,"[831063, 883932, 891423, 908283, 954966, 96222..."
4,6,"[850102, 897088, 940806, 1062782, 1078346, 103..."


In [34]:
result_2_train = X_train[['user_id', 'item_id']]
result_2_train['predictions'] = train_preds_proba[:,1]
result_2_train = result_2_train.groupby(['user_id', 'item_id'])['predictions'].median().reset_index()
result_2_train = result_2_train.sort_values(['predictions'], ascending=False).groupby(['user_id']).head(25)
result_2_train = result_2_train.groupby('user_id')['item_id'].unique().reset_index()
result_2_train.columns = ['user_id','model_train']

In [35]:
valid_l2.shape, result_2_train.shape

((2197, 2), (1935, 2))

In [36]:
result_2_train = result_2_train.merge(valid_l2, on='user_id')

### Точность на Тренировочной выборке

In [37]:
result_2_train.apply(lambda row: precision_at_k(row['model_train'], row['actual'], k=5), axis=1).mean()

0.397732181425486

In [38]:
importance = []
for f, n in zip(cat.feature_importances_, cat.feature_names_):
    importance.append((f,n))
importance.sort(reverse=True)

In [39]:
importance

[(15.866178229818274, 'n_items'),
 (14.155511484960105, 'n_transactions'),
 (12.417155459812689, 'purchase_2'),
 (11.386377463211138, 'mean_check'),
 (10.973645487756261, 'popularity'),
 (6.661196531686121, 'item_id'),
 (6.07303618517003, 'user_id'),
 (5.456213210494308, 'median_weekday'),
 (4.835632122111148, 'commodity_category'),
 (4.538214409490302, 'sub_commodity_desc'),
 (4.439713493388671, 'median_sales_hour'),
 (3.1971259221008768, 'commodity_desc')]

## Проверка на тесте

In [40]:
X_test_2 = data_valid_L2.merge(user_item_features, on=['user_id','item_id'], how='outer')

In [41]:
X_test_2.shape

(975479, 36)

In [42]:
X_test_2 = nan_replacer(X_test_2)

In [43]:
X_test_2 = reduce_mem_usage(X_test_2)

Memory usage of dataframe is 255.84 MB
Memory usage after optimization is: 121.03 MB
Decreased by 52.7%


In [44]:
X_test_2=X_test_2[X_train.columns]

In [45]:
X_test_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 975479 entries, 0 to 975478
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   purchase_2          975479 non-null  float32 
 1   n_transactions      975479 non-null  float32 
 2   popularity          975479 non-null  float32 
 3   item_id             975479 non-null  int32   
 4   commodity_category  975479 non-null  float32 
 5   sub_commodity_desc  975479 non-null  category
 6   n_items             975479 non-null  float32 
 7   mean_check          975479 non-null  float32 
 8   commodity_desc      975479 non-null  category
 9   median_weekday      975479 non-null  float32 
 10  median_sales_hour   975479 non-null  float32 
 11  user_id             975479 non-null  int16   
dtypes: category(2), float32(8), int16(1), int32(1)
memory usage: 39.1 MB


In [46]:
test_pred_proba = cat.predict_proba(X_test_2)[:, 1]

In [47]:
result_2_test = X_test_2[['user_id', 'item_id']]
result_2_test['predictions'] = test_pred_proba

result_2_test = result_2_test.groupby(['user_id', 'item_id'])['predictions'].median().reset_index()
result_2_test = result_2_test.sort_values(['predictions'], ascending=False).groupby(['user_id']).head(25)
result_2_test = result_2_test.groupby('user_id')['item_id'].unique().reset_index()
result_2_test.columns = ['user_id', 'model_test']

In [48]:
result_2_test.merge(valid_l2, on='user_id')

Unnamed: 0,user_id,model_test,actual
0,1,"[1082185, 995242, 840361, 1005186, 865456, 820...","[829323, 835108, 836423, 851515, 875240, 87737..."
1,2,"[1082185, 1106523, 12263788, 994928, 990797, 9...","[895388, 8357614, 12301772, 821083, 828106, 83..."
2,3,"[1082185, 1005186, 1106523, 983584, 899624, 10...","[835476, 851057, 872021, 878302, 879948, 90963..."
3,4,"[883932, 990797, 1029743, 6773204, 6391541, 10...","[831063, 883932, 891423, 908283, 954966, 96222..."
4,6,"[1082185, 1024306, 981760, 962568, 845208, 995...","[850102, 897088, 940806, 1062782, 1078346, 103..."
...,...,...,...
2192,2496,"[842783, 1082185, 1106523, 981760, 840361, 883...","[824915, 854920, 861603, 881964, 883404, 89962..."
2193,2497,"[1029743, 1070820, 840361, 981760, 1082185, 83...","[857503, 861792, 896613, 1024306, 5592455, 820..."
2194,2498,"[1106523, 1070820, 1126899, 6533889, 13841744,...","[839243, 846652, 847302, 914190, 951197, 10086..."
2195,2499,"[1070820, 867188, 849843, 6534178, 1106523, 99...","[853354, 882308, 1115160, 826249, 830887, 8334..."


In [49]:
result_2_test = result_2_test.merge(valid_l2, on='user_id')

## Точность на тесте 

In [50]:
result_2_test.apply(lambda row: precision_at_k(row['model_test'], row['actual'],k=5), axis=1).mean()

0.3569412835685025

# Выводы
Модель показала точность 35 %. Получить такую точность в основом помогло добавление признаков.
Наиболее влиятельными из которых оказались 
* n_items - количество уникальных покупок
* n_transactions - количество транзакций пользователя
* purchase_2 - любимый товар пользователя
* mean_check - средний чек
* popularity - популярность товара

Добавление признаков позволило улучшить точность модели на 12-15 % в сравнение с моделью без генерации признаков.
Так же данные признаки улучшили эталонный результат 
https://github.com/alex-coch/GeekBrains-AI-faculty/blob/main/4term/Recommended%20systems/08-Course%20project/Course_project.ipynb  на 8 % 28 для ансабля из 2 бустингов и 35 % для 1 модели.

Однако модель в большей мере предлагает пользователю что он и так возьмет (если обучить на всех item то точность на 5 рекомендациях будет порядка 100 %.
Хорошей идей будет дополнительная модель которая определит для пользователя любимую категорию или item и будет искать  категории или item наиболее схожих обьектов ранжированых по полпулярности. Такая модель врятли покажет хорошую точность но в дополнении к 2х уровневой будет предлагать более оригинальные продукты 
