# Рекомендательные системы

## 1. Импорт библиотек

In [1]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

## 2. Загрузка данных

In [2]:
data = pd.read_csv('./data/retail_train.csv')
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [3]:
test = pd.read_csv('./data/retail_test1.csv')
test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [4]:
item_features = pd.read_csv('./data/product.csv')
item_features.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [5]:
user_features = pd.read_csv('./data/hh_demographic.csv')
user_features.head()

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [6]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

## 3. Разбивка на train и val
-- давние покупки -- | -- 6 недель -- | -- 3 недель --

In [7]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

## 4. Первый уровень

### 4.1 Попробуем разные варианты параметров

Рекомендовать будем на основе own_recommendations. Если пользователя небыло в обучающей выборке, то наиболее популярные

In [8]:
def recommend_lvl_1(user, model, train_users, N=50):
    if user in train_users:
        return model.get_own_recommendations(user, N)

    return model.overall_top_purchases[:N]

In [9]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']

for n_popular in (3000, 5000, 7000):
    print(f'n_popular = {n_popular}')
    prefiltered_data = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=n_popular)
    recommender = MainRecommender(prefiltered_data)
    train_users = prefiltered_data['user_id'].unique().tolist()

    for rec_N in (30, 50, 70):
        result_lvl_1[f'res_{n_popular}_{rec_N}'] = result_lvl_1['user_id'].apply(
            lambda x: recommend_lvl_1(user=x, model=recommender, train_users=train_users, N=rec_N)
        )
        
        print(f'\tN = {rec_N}')
    
    print('\tDone!\n')

n_popular = 3000


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/3001 [00:00<?, ?it/s]

	N = 30
	N = 50
	N = 70
	Done!

n_popular = 5000


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

	N = 30
	N = 50
	N = 70
	Done!

n_popular = 7000


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/7001 [00:00<?, ?it/s]

	N = 30
	N = 50
	N = 70
	Done!



In [10]:
for n_popular in (3000, 5000, 7000):
    for k in (30, 50, 70):
        recall = result_lvl_1.apply(lambda row: recall_at_k(row[f'res_{n_popular}_{k}'], row['actual'], k), axis=1).mean()
        print(f'n_popular_{n_popular} N_{k} = {recall}')

n_popular_3000 N_30 = 0.05434735147610842
n_popular_3000 N_50 = 0.07123213389042464
n_popular_3000 N_70 = 0.08520164154185238
n_popular_5000 N_30 = 0.049133892330838744
n_popular_5000 N_50 = 0.06519221251051327
n_popular_5000 N_70 = 0.0787176285925517
n_popular_7000 N_30 = 0.04550136300150642
n_popular_7000 N_50 = 0.061241354347907305
n_popular_7000 N_70 = 0.07343896031222348


## 5. Второй уровень
Сразу сделаем класс, который будет рекомендовать товары

In [11]:
class Recommender:
    def __init__(self):
        self.lvl_1_n_popular = 3000
        self.lvl_1_n_recommend = 250
        
        # Переменные, в которых храним пользователей из обцучающих выборок
        # self.lvl_1_train_users
        # self.lvl_2_train_users
        # self.lvl_2_train_users_data
        
        self.lvl_2_item_feat = [
            'brand',
            'manufacturer',
            'commodity_category'
        ]
        
        self.lvl_2_feat_user = [
            'age_desc',
            'marital_status_code',
            'income_desc',
            'homeowner_desc',
            'hh_comp_desc',
            'household_size_desc',
            'kid_category_desc'
        ]

        self.lvl_2_feat_train = [
            'mean_visits_interval',
            'mean_check',
            'n_transactions',
            'mean_n_items_basket',
            'max_n_items_basket',
            'std_n_items_basket',
            'mean_n_item_categories_basket',
            'max_n_item_categories_basket',
            'std_n_item_categories_basket'
        ]

        self.categorical = [
            'marital_status_code',
            'homeowner_desc',
            'hh_comp_desc',
            'manufacturer',
            'commodity_category',
            'household_size_desc',
            'kid_category_desc'
        ]

    def fit(self, retail, item_features, user_features):
        self.lvl_1_train = retail.copy()
        self.lvl_2_train = retail.copy()
        self.item_features = item_features.copy()
        self.user_features = user_features.copy()

        self._fit_lvl_1()
        
        self._prepare_user_feat()
        self._prepare_item_feat()
        self._prepare_item_user_feat()
        self._fit_lvl_2()
        
        return self
    
    def _fit_lvl_1(self):
        self.lvl_1_train = prefilter_items(
            self.lvl_1_train,
            item_features=self.item_features,
            take_n_popular=self.lvl_1_n_popular
        )
        
        self.lvl_1_model = MainRecommender(self.lvl_1_train)
        self.lvl_1_train_users = self.lvl_1_train['user_id'].unique().tolist()

    def _fit_lvl_2(self):
        self._prepare_lvl_2_train_set()
        
        params_lgb = {
            "boosting_type": "gbdt",
            "objective": "binary", 
            "metric": "auc",
            "learning_rate": 0.01,
            "max_depth": 7,
            "n_estimators": 1500,
            "n_jobs": 3,
            "seed": 12,
            "categorical_column": self.categorical
        } 
        
        # Модель для тех, у кого есть данные пользователя
        self._lvl_2_model_users = LGBMClassifier(**params_lgb)
        data_train = self.targets_lvl_2[self.targets_lvl_2['user_id'].isin(self.lvl_2_train_users_data)]
        columns = self.lvl_2_feat_train + self.lvl_2_item_feat + self.lvl_2_feat_user
        self._lvl_2_model_users.fit(data_train[columns], data_train['target'])
     
        params_lgb = {
            "boosting_type": "gbdt",
            "objective": "binary", 
            "metric": "auc",
            "learning_rate": 0.01,
            "max_depth": 7,
            "n_estimators": 1500,
            "n_jobs": 3,
            "seed": 12,
            "categorical_column": self.categorical
        }

        # Модель для тех у кого есть история покупок
        self._lvl_2_model_no_users = LGBMClassifier(**params_lgb)
        columns = self.lvl_2_feat_train + self.lvl_2_item_feat
        self._lvl_2_model_no_users.fit(self.targets_lvl_2[columns], self.targets_lvl_2['target'])

    def _prepare_lvl_2_train_set(self):
        # потом будем проверять пользователя на то, что он был в обучающей выюорке
        self.lvl_2_train_users = self.lvl_2_train['user_id'].unique().tolist()

        users_lvl_2 = pd.DataFrame(self.lvl_2_train_users, columns=['user_id'])
        users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: self._recommend_lvl_1(x))

        df = pd.DataFrame({'user_id': users_lvl_2['user_id'].values.repeat(self.lvl_1_n_recommend),
                           'item_id': np.concatenate(users_lvl_2['candidates'].values)})

        targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
        targets_lvl_2['target'] = 1  # тут только покупки 

        targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
        targets_lvl_2['target'].fillna(0, inplace=True)

        targets_lvl_2 = targets_lvl_2.merge(self.lvl_2_train, on=['user_id'], how='left')        
        targets_lvl_2 = targets_lvl_2.merge(self.item_features, on=['item_id'], how='left')
        targets_lvl_2 = targets_lvl_2.merge(self.user_features, on=['user_id'], how='left')
        targets_lvl_2[self.categorical] = targets_lvl_2[self.categorical].astype('category')
        
        self.lvl_2_train_users_data = targets_lvl_2['user_id'][~targets_lvl_2['age_desc'].isnull()].unique().tolist()
        self.targets_lvl_2 = targets_lvl_2

    def _prepare_user_feat(self):
        self.user_features['age_desc'].replace(
            {'19-24': 0, '25-34': 1, '35-44': 2, '45-54': 3, '55-64': 4, '65+': 5},
            inplace=True)

        self.user_features['marital_status_code'].replace(
            {'U': 0, 'A': 1, 'B': 2}, inplace=True)

        self.user_features['income_desc'].replace(
            {'Under 15K': 0, '15-24K': 1, '25-34K':2, '35-49K': 3,
             '50-74K': 4, '75-99K': 5, '100-124K': 6, '125-149K': 7, 
             '150-174K': 8, '175-199K': 9, '200-249K': 10, '250K+':11}, inplace=True)

        self.user_features['homeowner_desc'].replace(
            {'Unknown': 0, 'Probable Renter': 1, 'Renter': 2,
             'Probable Owner': 3, 'Homeowner': 4}, inplace=True)

        self.user_features['hh_comp_desc'].replace(
            {'Unknown': 0, 'Single Male': 1, 'Single Female': 2,
             '1 Adult Kids': 3, '2 Adults No Kids': 4, '2 Adults Kids':5},inplace=True)

        self.user_features['household_size_desc'].replace({'5+': 5}, inplace=True) 

        self.user_features['kid_category_desc'].replace(
            {'None/Unknown': 0, '3+': 3}, inplace=True)
        
        self.user_features.fillna(-1, inplace=True)

    def _prepare_item_feat(self):
        commodities = self.item_features.commodity_desc.value_counts()
        commodities_list = commodities.keys().tolist()

        self.item_features['brand'] = np.where(self.item_features['brand']=='Private', 0, 1)

        for i, name in enumerate(commodities_list):
            self.item_features.loc[item_features['commodity_desc'] == name, 'commodity_category'] = i
    
    def _prepare_item_user_feat(self):
        X = self.lvl_2_train.copy()
        features = pd.DataFrame(X['user_id'].unique().tolist(), columns=['user_id'])

        # cреднее кол-во дней между покупками
        df = X.groupby('user_id')['day'].nunique().reset_index()
        df['mean_visits_interval'] = (X.groupby('user_id')['day'].max() - X.groupby('user_id')['day'].min()) / df['day']
        features = features.merge(df[['user_id', 'mean_visits_interval']], on=['user_id'])

        # средний чек корзины клиента
        df = X.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
        df = df.groupby('user_id')['sales_value'].mean().reset_index()
        df.columns = ['user_id', 'mean_check']
        features = features.merge(df, on=['user_id'])

        # кол-во транзакций клиента
        df = X.groupby(['user_id'])['item_id'].count().reset_index()
        df.columns = ['user_id', 'n_transactions']
        features = features.merge(df, on=['user_id'])

        # mean / max / std кол-ва уникальных товаров в корзине клиента
        df = X.groupby(['user_id', 'basket_id'])['item_id'].nunique().reset_index()
        df1 = df.groupby('user_id')['item_id'].mean().reset_index()
        df1.columns = ['user_id', 'mean_n_items_basket']
        features = features.merge(df1, on=['user_id'])

        df2 = df.groupby('user_id')['item_id'].max().reset_index()
        df2.columns = ['user_id', 'max_n_items_basket']
        features = features.merge(df2, on=['user_id'])

        df3 = df.groupby('user_id')['item_id'].std().reset_index()
        df3.columns = ['user_id', 'std_n_items_basket']
        features = features.merge(df3, on=['user_id'])

        # mean / max / std кол-ва уникальных категорий в корзине клиента
        X = X.merge(item_features[['item_id', 'commodity_desc']], on=['item_id'])
        df = X.groupby(['user_id', 'basket_id'])['commodity_desc'].nunique().reset_index()
        df1 = df.groupby('user_id')['commodity_desc'].mean().reset_index()
        df1.columns = ['user_id', 'mean_n_item_categories_basket']
        features = features.merge(df1, on=['user_id'])

        df2 = df.groupby('user_id')['commodity_desc'].max().reset_index()
        df2.columns = ['user_id', 'max_n_item_categories_basket']
        features = features.merge(df2, on=['user_id'])

        df3 = df.groupby('user_id')['commodity_desc'].std().reset_index()
        df3.columns = ['user_id', 'std_n_item_categories_basket']
        features = features.merge(df3, on=['user_id'])

        self.lvl_2_train = features
        
    def recommend(self, user, N=5):
        recommendations = self._recommend_lvl_1(user)
        recommendations = self._recommend_lvl_2(user, recommendations, N)
        return recommendations
    
    def _recommend_lvl_1(self, user):
        if user in self.lvl_1_train_users:
            return self.lvl_1_model.get_own_recommendations(user, self.lvl_1_n_recommend)

        return self.lvl_1_model.overall_top_purchases[:self.lvl_1_n_recommend]
    
    def _recommend_lvl_2(self, user, recommendations, N):
        X = pd.DataFrame([user], columns=['user_id']).merge(
            pd.DataFrame(recommendations, columns=['item_id']),
            how='cross'
        )

        X = X.merge(self.lvl_2_train, on=['user_id'], how='left') 
        X = X.merge(self.item_features, on=['item_id'], how='left')
        X = X.merge(self.user_features, on=['user_id'], how='left')
        X.fillna(0, inplace=True)
        X[self.categorical] = X[self.categorical].astype('category')

        if user in self.lvl_2_train_users_data:
            columns = self.lvl_2_feat_train + self.lvl_2_item_feat + self.lvl_2_feat_user
            X['pred_proba'] = self._lvl_2_model_users.predict_proba(X[columns])[:, 1]
        else:
            columns = self.lvl_2_feat_train + self.lvl_2_item_feat
            X['pred_proba'] = self._lvl_2_model_no_users.predict_proba(X[columns])[:, 1]

        X.sort_values(['pred_proba'], ascending=False, inplace=True)

        return X['item_id'].head(N).tolist()

In [12]:
recommender = Recommender()
recommender.fit(data_train_lvl_2, item_features, user_features)
recommender.recommend(1)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/3001 [00:00<?, ?it/s]



[878285, 1099862, 991024, 968164, 9655212]

In [13]:
result = test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

result['recom'] = result['user_id'].apply(lambda x: recommender.recommend(x))
precision = result.apply(lambda row: precision_at_k(row['recom'], row['actual']), axis=1).mean()
print('precision@5 =', precision)

precision@5 = 0.09103448275862054
