**Импорт библиотек**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

import warnings
warnings.filterwarnings('ignore')

Для решения задачи класс MainRecommender был улучшен (в сравнении с версией Baseline) - изменены параметры обучения модели и выбора кандидатов, добавлена возможность при инициализации класса указать параметр (метрику) по которой строится матрица.

**Функции**

In [2]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [3]:
def calc_precision_at_k(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

**Загрука и преподготовка данных**

In [4]:
data = pd.read_csv('retail_train.csv') #user-item data
item_features = pd.read_csv('product.csv') #item data
user_features = pd.read_csv('hh_demographic.csv') #user data

In [5]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [6]:
# Генерируем доп. метрики для обучения модели первого уровня
data['log_quantity'] = np.log(data['quantity'])
data['log_sales_value'] = np.log(data['sales_value'])
data['log_quantity+1'] = np.log(data['quantity']+1)
data['log_sales_value+1'] = np.log(data['sales_value']+1)
data.loc[data['quantity']>0, 'boughten'] = 1
data.loc[data['quantity']==0, 'boughten'] = 0

Создание глобальных переменных

In [7]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 30

Разделение на train и test

In [8]:
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [9]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

Префильтрация данных

In [10]:
n_popular = 11000 

In [11]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=n_popular)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 11001


Обработка холодного старта

In [12]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (784420, 18) Users: 1915 Items: 10995
val_matcher
Shape: (163261, 17) Users: 1915 Items: 27118
train_ranker
Shape: (163261, 17) Users: 1915 Items: 27118
val_ranker
Shape: (115989, 17) Users: 1915 Items: 24042


**Baseline**

Сделаем рекомендации методом случайного подбора и оценим их эффективность (precision_at_k). Данный результат будем считать базовым и пытаться его улучшить.


In [13]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [14]:
import random
# список из всех возможных товаров
all_items = list(data_val_matcher['item_id'].unique())
# рекомендация для каждого юзера 5 случайных товаров
random.seed(42)
result_eval_matcher['random_recs'] = result_eval_matcher['user_id'].apply(lambda x: random.sample(all_items, N_PREDICT))
result_eval_matcher.head(3)

Unnamed: 0,user_id,actual,random_recs
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1047349, 5566855, 879143, 13876745, 998334, 1..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[7167441, 7142935, 896308, 838769, 13768063, 1..."
2,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[6423857, 966832, 13417451, 1121393, 12171707,..."


Посчитаем presicion at 5 для случайных рекомендаций.

In [15]:
TOPK_PRECISION = 5
sorted(calc_precision_at_k(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('random_recs', 0.0027154046997389042)]

Задача нашей модели - показать результат > 0.002

**Построение модели первого уровня**

Попробуем посмотреть на качество результата модели первого уровня при различных весах в user-item матрице 

In [16]:
# Довольно долго просчитывает, результат описан в заметке ниже, можно пропустить.

In [17]:
weights = ['quantity', 'sales_value',
           'log_quantity', 'log_sales_value',
           'log_quantity+1', 'log_sales_value+1', 'boughten']

In [18]:
%%time
for param in weights:
    own_name = ['own recs by '+param]
    als_name = ['als recs by '+param]
    
    #Создаем экземпляр класса
    recommender = MainRecommender(data_train_matcher, weighting = True, param=param)
    
    #Делаем предсказания
    result_eval_matcher[own_name] = result_eval_matcher['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
    result_eval_matcher[als_name] = result_eval_matcher['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))
      
#Считаем score   
TOPK_PRECISION = 5   
print(sorted(calc_precision_at_k(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True))
    

Model using base param:  quantity




  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10995 [00:00<?, ?it/s]

Model using base param:  sales_value


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10995 [00:00<?, ?it/s]

Model using base param:  log_quantity


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10995 [00:00<?, ?it/s]

Model using base param:  log_sales_value


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10995 [00:00<?, ?it/s]

Model using base param:  log_quantity+1


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10995 [00:00<?, ?it/s]

Model using base param:  log_sales_value+1


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10995 [00:00<?, ?it/s]

Model using base param:  boughten


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10995 [00:00<?, ?it/s]

[('own recs by quantity', 0.29838120104438465), ('own recs by sales_value', 0.29838120104438465), ('own recs by log_quantity', 0.29838120104438465), ('own recs by log_sales_value', 0.29838120104438465), ('own recs by log_quantity+1', 0.29838120104438465), ('own recs by log_sales_value+1', 0.29838120104438465), ('own recs by boughten', 0.29838120104438465), ('als recs by log_quantity', 0.1289817232375969), ('als recs by quantity', 0.12804177545691808), ('als recs by log_sales_value', 0.12720626631853682), ('als recs by sales_value', 0.12657963446475112), ('als recs by boughten', 0.12637075718015564), ('als recs by log_quantity+1', 0.1254308093994768), ('als recs by log_sales_value+1', 0.12375979112271432), ('random_recs', 0.0027154046997389042)]
Wall time: 4min 7s


Как мы видим, переборка весов влияла на качество работы als модели, но не изменила качество own recs. 
Наилучший результат показала модель own recs. 
Ее результаты мы возьмем для работы модели уровня 2.

Переобучим экземпляр класса MainRecommender с необходимым параметром.

In [19]:
recommender = MainRecommender(data_train_matcher, weighting = True, param='quantity')

Model using base param:  quantity


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10995 [00:00<?, ?it/s]

**Подготовка данных для трейна**

In [20]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [21]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [22]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [23]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [24]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,913210
0,2070,1029743
0,2070,5569374
0,2070,838186


In [25]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (57450, 2) Users: 1915 Items: 3736


**Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1**

In [26]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)

In [27]:
df_ranker_train['target'].mean()

0.21757623243096308

**Подготавливаем фичи для обучения модели**

In [28]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')
df_ranker_train['manufacturer'] = df_ranker_train['manufacturer'].astype(str)

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


Базовые фичи

In [29]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


Дополнительные фичи

In [30]:
# Средний чек
users_sales = data_train_ranker.groupby('user_id')['sales_value'].mean().reset_index()
users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')

# Средняя частота покупки
users_sales = data_train_ranker.groupby('user_id')['quantity'].mean().reset_index()
users_sales.rename(columns={'quantity': 'avg_quantity'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_quantity']], on='user_id', how='left')

# Средная цена купленных товаров пользователем
users_sales = data_train_ranker.groupby('user_id')[['sales_value', 'quantity']].sum().reset_index()
users_sales['avg_price_by_user'] = users_sales['sales_value'] / users_sales['quantity']
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_price_by_user']], on='user_id', how='left')

# Средняя цена товаров
items_sales = data_train_ranker.groupby('item_id')[['sales_value', 'quantity']].sum().reset_index()
items_sales['avg_price_by_item'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['avg_price_by_item'].fillna(0, inplace=True)
df_ranker_train = df_ranker_train.merge(items_sales[['item_id', 'avg_price_by_item']], on='item_id', how='left')

Еще больше фичей

In [31]:
def generate_user_item_features(user_data, item_data, result_df):
    user_cols = list(user_data.select_dtypes(include=['object']).columns)
    items_cols = item_data.columns.to_list()
    
    #Удаляем столбцы, содержащие id
    del_cols = ['user_id', 'basket_id', 'item_id', 'store_id']
    for el in del_cols:
        items_cols.remove(el)
        
    #Генерируем суммы
    for col in user_cols:
        for c in items_cols:
            name = str('sum'+' of '+c+' by '+col)
            tmp_df = user_data.copy()
            tmp_df = tmp_df.merge(item_data.groupby(USER_COL).agg(c).sum().rename(name), how='left', on=USER_COL)
            tmp_df = tmp_df.groupby(by=col).sum().reset_index()
            result_df=result_df.merge(tmp_df[[col, name]], how='left',on=col)
            
    #Генерируем ср. значения
    for col in user_cols:
        for c in items_cols:
            name = str('mean'+' of '+c+' by '+col)
            tmp_df = user_data.copy()
            tmp_df = tmp_df.merge(item_data.groupby(USER_COL).agg(c).mean().rename(name), how='left', on=USER_COL)
            tmp_df = tmp_df.groupby(by=col).mean().reset_index()
            result_df=result_df.merge(tmp_df[[col, name]], how='left',on=col)
    
    return result_df

In [32]:
%%time
df_ranker_train = generate_user_item_features(user_features, df_join_train_matcher, df_ranker_train)
print('total features ', len(df_ranker_train.columns.to_list()))

total features  213
Wall time: 20.1 s


In [33]:
def generate_item_item_features(user_data, item_data, result_df):
    user_cols = list(user_data.select_dtypes(include=['object']).columns)
    items_cols = item_data.columns.to_list()
    
    #Удаляем столбцы, содержащие id
    del_cols = ['user_id', 'basket_id', 'item_id', 'store_id']
    for el in del_cols:
        items_cols.remove(el)
        
    #Генерируем суммы
    for col in user_cols:
        for c in items_cols:
            name = str('sum'+' of '+c+' by '+col)
            tmp_df = user_data.copy()
            tmp_df = tmp_df.merge(item_data.groupby(ITEM_COL).agg(c).sum().rename(name), how='left', on=ITEM_COL)
            tmp_df = tmp_df.groupby(by=col).sum().reset_index()
            result_df=result_df.merge(tmp_df[[col, name]], how='left',on=col)
            
    #Генерируем ср. значения
    for col in user_cols:
        for c in items_cols:
            name = str('mean'+' of '+c+' by '+col)
            tmp_df = user_data.copy()
            tmp_df = tmp_df.merge(item_data.groupby(ITEM_COL).agg(c).mean().rename(name), how='left', on=ITEM_COL)
            tmp_df = tmp_df.groupby(by=col).mean().reset_index()
            result_df=result_df.merge(tmp_df[[col, name]], how='left',on=col)
    
    return result_df

In [34]:
%%time
df_ranker_train = generate_item_item_features(item_features, df_join_train_matcher, df_ranker_train)
print('total features ', len(df_ranker_train.columns.to_list()))

total features  343
Wall time: 34.9 s


In [35]:
df_ranker_train.columns.to_list()

['user_id',
 'item_id',
 'target',
 'manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'total_item_sales_value',
 'total_quantity_value',
 'item_freq',
 'user_freq',
 'total_user_sales_value',
 'item_quantity_per_week',
 'user_quantity_per_week',
 'item_quantity_per_basket',
 'user_quantity_per_baskter',
 'item_freq_per_basket',
 'user_freq_per_basket',
 'avg_cheque',
 'avg_quantity',
 'avg_price_by_user',
 'avg_price_by_item',
 'sum of day by age_desc',
 'sum of quantity by age_desc',
 'sum of sales_value by age_desc',
 'sum of retail_disc by age_desc',
 'sum of trans_time by age_desc',
 'sum of week_no by age_desc',
 'sum of coupon_disc by age_desc',
 'sum of coupon_match_disc by age_desc',
 'sum of log_quantity by age_desc',
 'sum of log_sales_value by age_desc',
 'sum of log_quantity+1 by age

**Обучаем модель второго уровня**

In [36]:
X_train = df_ranker_train.drop(columns = ['target'])
y_train = df_ranker_train[['target']]

In [37]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [38]:
%%time
lgb = LGBMClassifier(objective='binary',
                     max_depth=10, 
                     n_estimators=700, 
                     learning_rate=0.2,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

Wall time: 20.2 s


In [39]:
df_ranker_predict = df_ranker_train.copy()

In [40]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [41]:
df_ranker_predict.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,mean of trans_time by curr_size_of_product,mean of week_no by curr_size_of_product,mean of coupon_disc by curr_size_of_product,mean of coupon_match_disc by curr_size_of_product,mean of log_quantity by curr_size_of_product,mean of log_sales_value by curr_size_of_product,mean of log_quantity+1 by curr_size_of_product,mean of log_sales_value+1 by curr_size_of_product,mean of boughten by curr_size_of_product,proba_item_purchase
0,2070,913210,1.0,2,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,405.6 OZ,45-54,...,1537.442842,45.81594,-0.000899,0.0,-inf,-inf,0.742544,1.789636,0.997466,0.966736
1,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,1630.97474,49.090432,-0.001173,-7.067626e-07,-inf,-inf,0.764696,1.168223,0.999724,0.133192
2,2070,5569374,0.0,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,1565.781241,48.956838,-0.015155,-0.002193967,-inf,-inf,0.750705,1.432868,0.995978,0.005535
3,2070,838186,1.0,1790,GROCERY,National,BAKED SWEET GOODS,SW GDS:DONUTS,18.2 OZ,45-54,...,1567.599722,38.780708,-0.003247,0.0,-inf,-inf,0.729807,1.482357,0.999869,0.949918
4,2070,926905,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,1565.781241,48.956838,-0.015155,-0.002193967,-inf,-inf,0.750705,1.432868,0.995978,0.006468


**Сделаем предсказания на тесте**

In [42]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


In [43]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 7.83 s


In [44]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики
sorted(calc_precision_at_k(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.24689295039164288)]

Сделаем предсказания с учетом ранжирования

In [45]:
from collections import OrderedDict

def rerank(user_id):
    res = df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(N_PREDICT).item_id.tolist()
    # Делаем постфильтрацию, оставляя только уникальные значения
    res = list(OrderedDict.fromkeys(res))
    return res

In [46]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [47]:
sorted(calc_precision_at_k(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('reranked_own_rec', 0.2920104438642282), ('own_rec', 0.24689295039164288)]

In [48]:
0.2958

0.2958

Модель второго уровня показала результат лучше, чем модель первого уровня.

**Оценка на тесте для выполнения курсового проекта**

In [83]:
df_test = pd.read_csv('retail_test1.csv')
df_transactions = pd.read_csv('retail_train.csv')

In [84]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [85]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [86]:
result_test['own_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_test['reranked_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

**Постфильтрация результатов**

Проверим, есть ли строки где количество предсказаний <5

In [87]:
# Проверим, есть ли строки где количество предсказаний <5
nums = []
for num, row in enumerate(result_test['reranked_own_rec']): 
    if len(row) < 5:
        nums.append(num)
print(len(nums))

224


224 строки содержат меньше 5-ти предсказаний. Дозаполним пропуски из списка top-purchases.

In [88]:
random.seed(42)

top_recommends = recommender.top_purchases
users = top_recommends['user_id'].to_list()
top_list = top_recommends['item_id'].to_list()[:20]

for num, row in result_test.iterrows(): 
    if num in nums:
        if row.user_id in users:
            rec_qty = 5 - len(row.reranked_own_rec)
            new_rec = top_recommends[top_recommends['user_id']==row.user_id]['item_id'].head(rec_qty).to_list()
            for rec in new_recs:
                row.reranked_own_rec.append(rec)
        else:
            rec_qty = 5 - len(row.reranked_own_rec)
            new_recs = random.sample(top_list, rec_qty)
            for rec in new_recs:
                row.reranked_own_rec.append(rec)

Проверим, есть ли строки где количество предсказаний >5

In [89]:
# Проверим, есть ли строки где количество предсказаний > 5
nums = []
for num, row in enumerate(result_test['reranked_own_rec']): 
    if len(row) > 5:
        nums.append(num)
print(len(nums))

1660


1660 строк содержат больше 5-ти предсказаний, оставим только ТОП-5 из них.

In [90]:
for num, row in enumerate(result_test['reranked_own_rec']): 
    if num in nums:
        result_test.at[num, 'reranked_own_rec'] = row[:5]

Проверим, остались ли в датасете предсказания длиной != 5

In [91]:
nums = []
for num, row in enumerate(result_test['reranked_own_rec']): 
    if len(row) != 5:
        nums.append(num)
print(len(nums))

0


Оценим качество наших предсказаний на тесте.

In [92]:
sorted(calc_precision_at_k(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('reranked_own_rec', 0.21708222811670896), ('own_rec', 0.1969230769230747)]

Полученный результат на тесте: **presicion_at_5 = 0.2170**

In [93]:
# Сохраним рекомендации
recommendations = result_test[[USER_COL, 'reranked_own_rec']]
recommendations.rename(columns = {'reranked_own_rec' : 'recommendations'}, inplace = True)

In [94]:
recommendations

Unnamed: 0,user_id,recommendations
0,1,"[940947, 9297615, 9655212, 10149640, 856942]"
1,2,"[939860, 977374, 1097398, 1029743, 1074333]"
2,3,"[8020166, 939860, 986912, 6632283, 1029743]"
3,6,"[1024306, 1098844, 6548453, 878996, 1029743]"
4,7,"[1122358, 993638, 1106523, 1126899, 862682]"
...,...,...
1880,2496,"[1056509, 1106523, 1041796, 907631, 916122]"
1881,2497,"[1029743, 1135834, 1051323, 1040807, 5590613]"
1882,2498,"[1106523, 1070820, 1100379, 1126899, 1130858]"
1883,2499,"[5569327, 5568378, 1070820, 1060872, 899624]"


In [95]:
recommendations.to_csv('recommendations.csv', index=False)
recommendations.head(2)

Unnamed: 0,user_id,recommendations
0,1,"[940947, 9297615, 9655212, 10149640, 856942]"
1,2,"[939860, 977374, 1097398, 1029743, 1074333]"
