# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

## Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?

In [385]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit import approximate_als
from implicit.nearest_neighbours import ItemItemRecommender,  CosineRecommender

# Модель второго уровня
from lightgbm import LGBMClassifier
from utilis import prefilter_items, prefilter_items_v2
from recomenders5 import MainRecommender
from metrics import recall_at_k_mean, recall_at_k, precision_at_k


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def id_item_to_real_id_item(recs, k):
    recsals = np.array([],dtype='i')
    for row in recs:
        a = [id_to_itemid[el] for el in row]
        recsals = np.append(recsals,a)
    recsals = recsals.reshape(-1,k)    
    return recsals

In [4]:
data = pd.read_csv('retail_train.csv')
#data = prefilter_items(data)

item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

#column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [5]:
max_week = data['week_no'].max()
condition_train = (data['week_no'] < max_week - 9)
condition_valid = data['week_no'] >= max_week - 3
condition_test = ((data['week_no'] >= max_week - 9) & (data['week_no'] < max_week - 3))
                   
data_train_L1 = data[condition_train]
data_test_L1 = data[condition_test]

data_train_L2 = data_test_L1.copy()
data_valid_L2 = data[condition_valid]

In [25]:
data_l1_l2_val = pd.concat([data_valid_L2, data_test_L1], ignore_index = True)

In [6]:
popularity = data_train_L1.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()
data_train_L1.loc[~data_train_L1['item_id'].isin(top_5000), 'item_id'] = 999999

In [7]:
user_item_matrix = pd.pivot_table(data_train_L1, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.shape

(2498, 5001)

In [8]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [9]:
result = data_test_L1.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [10]:
result = result.loc[result['user_id']<=2497]

In [27]:
result_tot = data.groupby('user_id')['item_id'].unique().reset_index()
result_tot.columns=['user_id', 'actual']
result_tot = result_tot.loc[result_tot['user_id']<=2497]
result_train_1 = data_train_L1.groupby('user_id')['item_id'].unique().reset_index()
result_train_1.columns=['user_id', 'actual']
result_train_1 = result_train_1.loc[result_train_1['user_id']<=2497]
result_tot_without_1_train = data_l1_l2_val.groupby('user_id')['item_id'].unique().reset_index()
result_tot_without_1_train.columns=['user_id', 'actual']
result_tot_without_1_train = result_tot_without_1_train.loc[result_tot_without_1_train['user_id']<=2497]

# ALS

In [28]:
%%time

als = AlternatingLeastSquares(factors=8, 
                                regularization=0.001,
                                iterations=12, 
                                calculate_training_loss=True, 
                                num_threads=4, random_state=14)
als.fit(csr_matrix(user_item_matrix).tocsr(),  # На вход item-user matrix
          show_progress=True) #В ФИТЕ НЕ ТРАНСПОНИРУЕМ ОНО САМО ВНУТРИ СТРАНСПОНИРУЕТСЯ!!!!!

  0%|          | 0/12 [00:00<?, ?it/s]

CPU times: total: 9.78 s
Wall time: 1.68 s


### Recall на тестовой выборки 3-9 неделя

In [29]:
%%time
als_res = als.recommend(userid=result['user_id'], user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                    N=300, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]

CPU times: total: 750 ms
Wall time: 274 ms


In [32]:
als_res_conv = id_item_to_real_id_item(als_res, 300)
result['als'] = als_res_conv.tolist()

In [33]:
result.apply(lambda row: recall_at_k(row['als'], row['actual'], k=300), axis=1).mean()

0.17165413003382865

### Recall на всей выборке

In [39]:
%%time
als_res_tot = als.recommend(userid=result_tot['user_id'], user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                    N=300, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]

CPU times: total: 938 ms
Wall time: 315 ms


In [40]:
als_res_conv_tot = id_item_to_real_id_item(als_res_tot, 300)
result_tot['als'] = als_res_conv_tot.tolist()

In [41]:
result_tot.apply(lambda row: recall_at_k(row['als'], row['actual'], k=300), axis=1).mean()

0.1296125375345311

# ItemItem

In [42]:
item_5 = ItemItemRecommender(K=500, num_threads=4,) # K - кол-во билжайших соседей
item_5.fit(csr_matrix(user_item_matrix))

  0%|          | 0/5001 [00:00<?, ?it/s]

In [43]:
item_5_rec = item_5.recommend(userid=userids, 
                user_items=csr_matrix(user_item_matrix),   # на вход user-item matrix
                N=300, 
                filter_already_liked_items=False, 
                filter_items=[itemid_to_id[999999]], 
                recalculate_user=False)[0]

In [44]:
item_5_conv = id_item_to_real_id_item(item_5_rec, 300)
item_500 = pd.DataFrame({'user_id':userids, 'item_500':item_5_conv.tolist()})
result = result.merge(item_500, on='user_id')

In [45]:
result.apply(lambda row: recall_at_k(row['item_500'], row['actual'], k=300), axis=1).mean()

0.19930515112886824

## Собственные покупки + топ товаров

In [46]:
from recomender7 import MainRecommender

In [47]:
rec = MainRecommender(data_train_L1)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [48]:
own_plus_top = np.array([])
for ids in result['user_id']:
    a = rec.get_own_recommendations(ids, N=300)
    own_plus_top = np.append(own_plus_top, a)
    own_plus_top = own_plus_top.reshape(-1, 300)
own_plus_top

array([[1082185., 1127831., 1098066., ...,  933637., 1117514.,  970030.],
       [1082185., 1127831., 1098066., ...,  951412., 1101173.,  896369.],
       [1029743., 1096036., 5568378., ..., 1042907., 1094955.,  821083.],
       ...,
       [1007195., 1006184., 1126899., ...,  952163.,  864774.,  970202.],
       [1082185., 1029743., 1127831., ..., 1085604.,  986912., 1053690.],
       [1082185., 1029743., 1068719., ...,  861272.,  893501.,  908318.]])

In [49]:
result['own_plus_top'] = own_plus_top.tolist()

In [23]:
result.apply(lambda row: recall_at_k(row['own_plus_top'], row['actual'], k=300), axis=1).mean()

0.1669107140684147

## Изменение recall в зависимости от k

In [50]:
%%time
k_list = [50, 100, 150, 200, 300, 500]
k_dict = {}
for k in k_list:
    temp = rec.get_als_recommendations_to_bath_users(result['user_id'],N=k)
    k_dict[k] = recall_at_k_mean(result['actual'], temp)
    
k_dict

CPU times: total: 18.1 s
Wall time: 9.12 s


{50: 0.08671317829457365,
 100: 0.1233875968992248,
 150: 0.14717829457364343,
 200: 0.16671317829457366,
 300: 0.19617829457364339,
 500: 0.2352635658914729}

### Чем выше значение k тем соответсвено и выше полнота т.к чем больше элементов мы возьмем тем выше вероятность что они акажутся уже в актуальном.

In [None]:
data_train_L1

In [None]:
# your_code

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

## Средний чек ползователя (user)

Берем только тренировочную выборку, будет несовсем чесно генерировать признак из всего множества 

In [53]:
%%time
mean_check_train_lv1 = {}
for user in data_train_L1["user_id"].unique():
    week_holder = np.array([])
    for w in data['week_no'].unique():
        sum_val = data_train_L1.loc[(data_train_L1['user_id']==user)&(data_train_L1['week_no']==w)]['sales_value'].sum()
        week_holder = np.append(week_holder,sum_val)
    mean_check_train_lv1[user] = week_holder.mean()     

CPU times: total: 25min 25s
Wall time: 25min 27s


In [65]:
chek = pd.DataFrame({'user_id':mean_check_train_lv1.keys(), 'check':mean_check_train_lv1.values()})
chek.head(2)

Unnamed: 0,user_id,check
0,2375,22.313053
1,1364,21.675158


In [66]:
user_features = user_features.merge(chek, on ='user_id')

In [78]:
user_features['household_size_desc'].replace({'5+': 6}, inplace=True)
user_features['household_size_desc'] = user_features['household_size_desc'].astype(np.int32)

## Средний чек в зависимости от размера семьи

In [79]:
user_features['med_chek_for_household_size_desc'] = user_features['check']/user_features['household_size_desc']

## Топ 3 любимых товара (user-item)

In [230]:
user_purchase = {}
for user in data_train_L1["user_id"].unique():
    user_purchase[user] = [el for el in data_train_L1.loc[data_train_L1['user_id']==user]['item_id'].value_counts().index][:4]

In [231]:
purchase_top = pd.DataFrame({'user_id':user_purchase.keys(), 
                             'purchase':user_purchase.values()
                            })
purchase_top.head(3)

Unnamed: 0,user_id,purchase
0,2375,"[999999, 1036501, 849843, 899624]"
1,1364,"[999999, 1005186, 1082185, 862349]"
2,1130,"[999999, 1082185, 1133018, 995785]"


In [295]:
%%time
user_purchase_1 = {}
user_purchase_2 = {}
user_purchase_3 = {}
user_purchase_4 = {}
for user in data_train_L1["user_id"].unique():
    p_1 = [el for el in data_train_L1.loc[data_train_L1['user_id']==user]['item_id'].value_counts().index][:4]
    try:
        user_purchase_1[user] = p_1[0]
        user_purchase_2[user] = p_1[1]
        user_purchase_3[user] = p_1[2]
        user_purchase_4[user] = p_1[3]
    except IndexError:
        pass      
            

CPU times: total: 7.22 s
Wall time: 7.25 s


In [296]:
top1 = pd.DataFrame({'user_id':user_purchase_1.keys(), 'purchase_1':user_purchase_1.values()})
top2 = pd.DataFrame({'user_id':user_purchase_2.keys(), 'purchase_2':user_purchase_2.values()})
top3 = pd.DataFrame({'user_id':user_purchase_3.keys(), 'purchase_3':user_purchase_3.values()})
top4 = pd.DataFrame({'user_id':user_purchase_4.keys(), 'purchase_4':user_purchase_4.values()})

In [298]:
top1 = top1.merge(top2, how ='outer', on = 'user_id')
top1 = top1.merge(top3, how ='outer', on = 'user_id')
top1 = top1.merge(top4, how ='outer', on = 'user_id')

In [300]:
top1.isna().sum()

user_id        0
purchase_1     0
purchase_2    15
purchase_3    23
purchase_4    31
dtype: int64

In [302]:
top1.fillna(0,inplace=True)

In [303]:
top1.isna().sum()

user_id       0
purchase_1    0
purchase_2    0
purchase_3    0
purchase_4    0
dtype: int64

In [304]:
user_features = user_features.merge(top1, on ='user_id')

In [305]:
user_features.head(3)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,check,med_chek_for_household_size_desc,purchase_1,purchase_2,purchase_3,purchase_4
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,35.979053,17.989526,999999,856942.0,1082185.0,995242.0
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,24.533474,12.266737,999999,1082185.0,1122358.0,6944571.0
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8,43.677579,14.559193,999999,1082185.0,1133018.0,969932.0


## Популярность товара (для item-user)

In [306]:
popularity = data_train_L1.groupby('item_id')['user_id'].nunique().reset_index() 
popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
#popularity[share_unique_users]/ data_train_L1['user_id'].nunique()

In [307]:
popularity['share_unique_users'] = popularity['share_unique_users']*100/ data_train_L1['user_id'].nunique()

In [310]:
item_features = item_features.merge(popularity, on='item_id')

In [313]:
item_features.head(3)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,share_unique_users
0,202291,69,MISC SALES TRAN,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,0.080064
1,397896,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,2.121697
2,420647,69,MISC SALES TRAN,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,0.20016


# Подготовка к уровню 2

### Слияние USER и IITEM + добавление flag 0 если не покупал 1 если купил

In [326]:
users_lvl_2 = pd.DataFrame(data_train_L2['user_id'].unique())
users_lvl_2.columns = ['user_id']
users_lvl_2 = users_lvl_2.loc[users_lvl_2['user_id']<=2497] # конец фрецма не попал в обучение поэтому и предсказать не сможем для юсеров выше
# Пока только warm start
train_users = data_train_L1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: rec.get_own_recommendations(x, N=300))

In [328]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,1082185,1
0,2070,1029743,1
0,2070,1098066,1
0,2070,995785,1


## Добавление признаков Item и User к обучающему множеству

In [330]:
targets_lvl_2 = data_train_L2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [331]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,share_unique_users,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,check,med_chek_for_household_size_desc,purchase_1,purchase_2,purchase_3,purchase_4
0,2070,1082185,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,79.863891,...,Unknown,Unknown,1.0,None/Unknown,54.079684,54.079684,999999.0,1085604.0,834103.0,1055863.0
1,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,51.441153,...,Unknown,Unknown,1.0,None/Unknown,54.079684,54.079684,999999.0,1085604.0,834103.0,1055863.0


## Разделение на трейн и тест

In [332]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

## Предобработка обучающего множества

In [351]:
X_train.fillna(0, inplace=True)

In [333]:
from catboost import CatBoostClassifier

In [372]:
categorial_columns = ['department','manufacturer', 'brand', 'commodity_desc', \
                     'sub_commodity_desc', 'curr_size_of_product', 'marital_status_code',\
                      'age_desc','homeowner_desc', 'income_desc',\
                      'hh_comp_desc','kid_category_desc']

In [None]:
# 'purchase_1', 'purchase_2', 'purchase_3', 'purchase_3' признаки категориальны но изза большого числа данных catboost будет ругатся

In [370]:
X_train.replace({999999.0:0}, inplace=True)

## Обучение

In [373]:
%%time
cat = CatBoostClassifier(max_depth=7, cat_features=categorial_columns, random_state=14, silent=True)
cat.fit(X_train, y_train, cat_features=categorial_columns)

train_preds = cat.predict(X_train)

CPU times: total: 2h 2min 51s
Wall time: 18min 52s


In [375]:
train_preds = train_preds.astype(bool)

In [376]:
rec_items = X_train[train_preds].groupby(by=['user_id'])['item_id'].unique().reset_index()
rec_items.columns = ['user_id', 'model_preds']

In [380]:
result_l2 = data_valid_L2.groupby('user_id')['item_id'].unique().reset_index()
result_l2.columns = ['user_id', 'actual']
result_l2.head(5)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [382]:
result_l2 = result_l2.merge(rec_items,
                                  on='user_id',
                                  how='inner')

In [386]:
result_l2.apply(lambda row: precision_at_k(row['model_preds'], row['actual']), axis=1).mean()

0.5005507955936352

### Очень хорошая точность получилась от части из за того что был введен признак того насколько товар популярен среди пользователей share_unique_users так же были введены признаки любимого товара и усрелненого счета за неделю
### В дальнейшем можно расширить количесвто рекомендаций за счет уменьшения порога  и использования predict_proba для сортировки.

In [398]:
importance = []
for f, n in zip(cat.feature_importances_, cat.feature_names_):
    importance.append((f,n))
importance.sort(reverse=True)

In [405]:
importance[:10]

[(29.821125873863412, 'share_unique_users'),
 (10.610412540245227, 'user_id'),
 (9.406490224395966, 'item_id'),
 (5.780300932187047, 'check'),
 (5.572798975249155, 'commodity_desc'),
 (4.85888257213557, 'sub_commodity_desc'),
 (4.276437338739306, 'purchase_4'),
 (3.85133273192159, 'purchase_2'),
 (3.708360295660216, 'purchase_3'),
 (3.6307267794482962, 'med_chek_for_household_size_desc')]