# Загрузка данных

In [1]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

from gensim.models import Word2Vec

# Модель второго уровня
from catboost import CatBoostClassifier

import os, sys
sys.path.insert(1, os.getcwd() + '/src/')

from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [2]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# Функции

In [3]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")
    
def make_recommendations(df_result, recommend_model, N_PREDICT=500, USER_COL='user_id'):
    return df_result[USER_COL].apply(lambda x: recommend_model(x, N=N_PREDICT))

def calc_recall(df_data, top_k, ACTUAL_COL='actual'):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()
        
def calc_precision(df_data, top_k, ACTUAL_COL='actual'):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()
        
def rerank(user_id, df, USER_COL='user_id', proba_col_name='proba_item_purchase', N=5):
    return df[df[USER_COL]==user_id].sort_values(proba_col_name, ascending=False).head(N).item_id.tolist()


# Подготовка данных

In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

In [5]:
VAL_MATCHER_WEEKS = 5
VAL_RANKER_WEEKS = 3

# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [6]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2136728, 12) Users: 2498 Items: 84180
val_matcher
Shape: (141762, 12) Users: 2097 Items: 25770
train_ranker
Shape: (141762, 12) Users: 2097 Items: 25770
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


Проведем префильтрацию данных

In [7]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=20000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 84180 to 20001


Оставим только пользователей, которые встречаются в тренировочном датасете

In [8]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (872777, 13) Users: 2496 Items: 20001
val_matcher
Shape: (141737, 12) Users: 2095 Items: 25768
train_ranker
Shape: (141737, 12) Users: 2095 Items: 25768
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


#  Построение модели первого уровня

In [9]:
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/20001 [00:00<?, ?it/s]

In [10]:
ACTUAL_COL = 'actual'
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [11]:
models = {'own_rec': recommender.get_own_recommendations, 
          'sim_item_rec': recommender.get_similar_items_recommendation, 
          'als_rec': recommender.get_als_recommendations, 
          'sim_user_rec': recommender.get_similar_users_recommendation}

for column_name, model in models.items():
    result_eval_matcher[column_name] = make_recommendations(result_eval_matcher, model)


In [12]:
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual,own_rec,sim_item_rec,als_rec,sim_user_rec
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067...","[7167962, 896666, 1087895, 900875, 1024128, 85...","[880007, 1031087, 5563739, 8091643, 9803207, 9...","[883589, 856942, 821292, 13008459, 900875, 110...","[9655175, 1092295, 819549, 825160, 904021, 103..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[9807181, 13133763, 1107760, 7463018, 944568, ...","[8090509, 1029743, 1059902, 985999, 920654, 11...","[1087547, 916122, 1029743, 881121, 5568729, 67...","[1082700, 7432119, 1071196, 933918, 1128882, 9..."


In [13]:
TOPK_RECALL = 500
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec', 0.16829698695837966),
 ('sim_item_rec', 0.12873881960612904),
 ('als_rec', 0.12302629421548907),
 ('sim_user_rec', 0.010128428859075247)]

 лучшие результаты показывает модель на основе предыдущих покупок пользователя

# Генерация признаков для модели второго уровня

In [14]:
# взяли пользователей из трейна для ранжирования
N_PREDICT = 500

df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

df_match_candidates['candidates'] = make_recommendations(df_match_candidates, recommender.get_own_recommendations, 
                                                         N_PREDICT=N_PREDICT)

df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

df_match_candidates.head()

Unnamed: 0,user_id,item_id
0,1827,942763
0,1827,913958
0,1827,928263
0,1827,842342
0,1827,6464173


In [15]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
0,1827,942763,0.0
1,1827,913958,0.0
2,1827,928263,0.0
3,1827,842342,0.0
4,1827,6464173,0.0


In [16]:
df_ranker_train.target.value_counts()

0.0    1022845
1.0      19403
Name: target, dtype: int64

In [17]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,,,,,,
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,,,,,,


Сгенерируем новые признаки и добавим их к датасету для обучения.



In [18]:
# Добавим параметр категории к исходному обучающему датасету для удобства создания новых фичей
data_department = data_train_ranker.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,1827,40702967646,601,891141,2,2.73,33923,0.0,7,87,0.0,0.0,PRODUCE
1,496,40739402373,603,891141,1,1.83,445,0.0,2226,87,0.0,0.0,PRODUCE


In [19]:
# Средная цена купленных товаров пользователем
users_sales = data_train_ranker.groupby(USER_COL)[['sales_value', 'quantity']].sum().reset_index()
users_sales['avg_price'] = users_sales['sales_value'] / users_sales['quantity']
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_price']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,,,,,,,2.208947
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,,,,,,,2.208947


In [20]:
# Количество покупок в каждой категории и средная сумма покупки в каждой категории для пользователя
users_sales_department = data_department.groupby([USER_COL, 'department'])\
                        [['sales_value', 'quantity']].sum().reset_index()
users_sales_department.rename(columns={'quantity': 'n_sold_category'}, inplace=True)
users_sales_department['avg_transaction_category'] = users_sales_department['sales_value']\
                                                    /users_sales_department['n_sold_category']
users_sales_department.drop(columns=['sales_value'], inplace=True)

df_ranker_train = df_ranker_train.merge(
    users_sales_department, on=[USER_COL, 'department'], how='left')
df_ranker_train['Missing n_sold_category'] = 0
df_ranker_train.loc[df_ranker_train['n_sold_category'].isna(), 'Missing n_sold_category'] = 1
df_ranker_train['n_sold_category'].fillna(0, inplace=True)

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,Missing n_sold_category
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,,,,,,,2.208947,2.0,1.0,0
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,,,,,,,2.208947,2.0,1.0,0


In [21]:
# Средняя сумма покупки в категории
department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
department_sales.rename(columns={'sales_value': 'mean_sales_value_category'}, inplace=True)
department_sales.tail(2)

n_weeks = data_department['week_no'].max() - data_department['week_no'].min() + 1

# Количество покупок юзером конкретной категории в неделю
users_department = data_department.groupby([USER_COL, 'department'])['quantity'].sum().reset_index()
users_department['quantity'] /= n_weeks
users_department.rename(columns={'quantity': 'n_sold_category_user_week'}, inplace=True)

df_ranker_train = df_ranker_train.merge(department_sales, on='department', how='left')
df_ranker_train = df_ranker_train.merge(users_department, on=[USER_COL, 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,...,,,,,2.208947,2.0,1.0,0,3.994219,0.4
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,...,,,,,2.208947,2.0,1.0,0,3.994219,0.4


In [22]:
# Цена
items_sales = data_department.groupby(ITEM_COL)[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['price'].fillna(0, inplace=True)

# Количество покупок товара в неделю
items_sales['quantity_per_week'] = items_sales['quantity'] / n_weeks


df_ranker_train = df_ranker_train.merge(items_sales[[ITEM_COL,'price', 'quantity_per_week']],
                                        on=ITEM_COL, how='left')

df_ranker_train['Missing price'] = 0
df_ranker_train.loc[df_ranker_train['price'].isna(), 'Missing price'] = 1
df_ranker_train['price'].fillna(0, inplace=True)

df_ranker_train['Missing quantity per week'] = 0
df_ranker_train.loc[df_ranker_train['quantity_per_week'].isna(), 'Missing quantity per week'] = 1
df_ranker_train['quantity_per_week'].fillna(0, inplace=True)

df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,avg_price,n_sold_category,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,...,2.208947,2.0,1.0,0,3.994219,0.4,0.0,0.0,1,1
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,...,2.208947,2.0,1.0,0,3.994219,0.4,0.0,0.0,1,1
2,1827,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,,...,2.208947,2.0,1.0,0,3.994219,0.4,7.99,2.0,0,0
3,1827,842342,0.0,42,DRUG GM,National,CANDY - PACKAGED,CANDY & BREATH MINTS (PKGD) (N,,,...,2.208947,2.0,1.0,0,3.994219,0.4,2.59,0.2,0,0
4,1827,6464173,0.0,938,GROCERY,National,FROZEN PIE/DESSERTS,FROZEN CREAM PIES,5.2 OZ,,...,2.208947,25.0,2.2408,0,2.541433,5.0,2.29,0.2,0,0


In [23]:
# Количество уникальных магазинов, в которых продавался товар
items_stores = data_department.groupby(ITEM_COL)['store_id'].nunique().reset_index()
items_stores.rename(columns={'store_id': 'n_unique_stores'}, inplace=True)
df_ranker_train = df_ranker_train.merge(items_stores, on=ITEM_COL, how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,n_sold_category,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,...,2.0,1.0,0,3.994219,0.4,0.0,0.0,1,1,
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,...,2.0,1.0,0,3.994219,0.4,0.0,0.0,1,1,


In [24]:
# Среднее количество транзакций клиента в неделю
users_transactions = data_department.groupby(USER_COL)[ITEM_COL].count().reset_index()
users_transactions.rename(columns={'item_id': 'n_transactions_per_week'}, inplace=True)
users_transactions['n_transactions_per_week'] /= n_weeks


df_ranker_train = df_ranker_train.merge(users_transactions, on=USER_COL, how='left')

df_ranker_train.tail(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week
1042246,1745,1051516,0.0,2,PRODUCE,National,VEGETABLES - ALL OTHERS,BEANS,25 LB,45-54,...,,1,2.32713,,1.523261,18.4,0,0,54.0,0.6
1042247,1745,6602729,0.0,1769,MEAT,National,TURKEY,GROUND TURKEY,1.3LB,45-54,...,,1,6.552912,,4.99,3.6,0,0,8.0,0.6


In [25]:
# Средний чек
users_sales = data_train_ranker.groupby(USER_COL)['sales_value'].mean().reset_index()
users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,...,0,3.994219,0.4,0.0,0.0,1,1,,6.0,2.798
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,...,0,3.994219,0.4,0.0,0.0,1,1,,6.0,2.798


In [26]:
# Среднее количество уникальных категорий в корзине
users_baskets = data_department.groupby([USER_COL, 'basket_id'])['department'].nunique().reset_index()
users_baskets = users_baskets.groupby(USER_COL)['department'].mean().reset_index()
users_baskets.rename(columns={'department': 'avg_basket_department'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,...,3.994219,0.4,0.0,0.0,1,1,,6.0,2.798,2.4
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,...,3.994219,0.4,0.0,0.0,1,1,,6.0,2.798,2.4


Построим признак, отражающий средний интервал между покупками пользователя.

In [27]:
users_days = data_department.groupby(USER_COL)['day'].unique().reset_index()
users_days['day'] = users_days['day'].apply(lambda x: sorted(x))
users_days.head()

Unnamed: 0,user_id,day
0,1,"[606, 608, 610, 620, 622, 632]"
1,2,"[608, 614, 620, 622]"
2,4,"[605, 617, 627]"
3,6,"[603, 607, 610, 611, 616, 619, 620, 624, 627, ..."
4,7,"[606, 610, 614, 623, 629]"


In [28]:
def avg_ndays(days):
    diff = 0
    if len(days) > 1:
        for i in range(len(days) - 1):
            diff += days[i+1] - days[i]
        return diff / (len(days) - 1)
    else:
        return 0
    
users_days['avg_interval'] = users_days['day'].apply(avg_ndays)

df_ranker_train = df_ranker_train.merge(users_days[['user_id', 'avg_interval']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,...,0.4,0.0,0.0,1,1,,6.0,2.798,2.4,7.0
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,...,0.4,0.0,0.0,1,1,,6.0,2.798,2.4,7.0


Построим признак, в котором будет закодировано место товара в пяти последних покупках клиента.

In [29]:
users_items = data_train_ranker.groupby(USER_COL)[ITEM_COL].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])
users_items.head()

Unnamed: 0,user_id,item_id
0,1,"[5577022, 8293439, 9526676, 9527558, 10149640]"
1,2,"[7407562, 10149597, 13776981, 15572067, 17215077]"
2,4,"[963365, 1038692, 1133312, 5570830, 7431408]"
3,6,"[1099058, 895268, 1017061, 1082185, 1119051]"
4,7,"[9837501, 12524016, 13072715, 13987153, 13987338]"


In [30]:
def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_ranker_train['Last5sales'] = df_ranker_train[[USER_COL, ITEM_COL]].apply(code_last_sales, axis=1)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,Last5sales
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,...,0.0,0.0,1,1,,6.0,2.798,2.4,7.0,0
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,...,0.0,0.0,1,1,,6.0,2.798,2.4,7.0,0


Построим модель Word2Vec для получения эмбеддингов товаров

In [31]:
df_ = data_train_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
df_.head()

Unnamed: 0,user_id,item_id
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67..."
3,6,"[873654, 994928, 1098844, 1122879, 8357613, 98..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886..."


In [32]:
purchases = []

for user in df_['user_id']:
    purchases.append([str(item) for item in df_[df_['user_id'] == user].item_id.values[0]])
    
print(f"Total # of Sessions: {len(purchases)}")

Total # of Sessions: 2095


In [33]:
w2v_model = Word2Vec(min_count=1, vector_size=100, sg=1, workers=3)
w2v_model.build_vocab(purchases, progress_per=100)
w2v_model.train(purchases, total_examples=w2v_model.corpus_count, epochs=12, report_delay=1)

(1450996, 1457916)

In [34]:
def word2vec_len(itemid):
    try:
        return sum([i**2 for i in w2v_model.wv[str(itemid)]])
    except:
        return -1

df_ranker_train['Word2Vec_length'] = df_ranker_train[ITEM_COL].apply(lambda x: word2vec_len(x))
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,Last5sales,Word2Vec_length
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,...,0.0,1,1,,6.0,2.798,2.4,7.0,0,-1.0
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,...,0.0,1,1,,6.0,2.798,2.4,7.0,0,-1.0


In [35]:
def avg_word2vec(items):
    return sum([w2v_model.wv[str(item)] for item in items]) / len(items)

df_['Avg_Word2Vec'] = df_[ITEM_COL].apply(avg_word2vec)
df_.head()

Unnamed: 0,user_id,item_id,Avg_Word2Vec
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067...","[-0.10641853, 0.1835677, 0.21097258, 0.0324307..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[-0.08931988, 0.255342, 0.19888788, 0.08634065..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[-0.097336695, 0.22907256, 0.21010162, 0.04670..."
3,6,"[873654, 994928, 1098844, 1122879, 8357613, 98...","[-0.08162051, 0.23737237, 0.19490752, 0.056757..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[-0.10343934, 0.21685547, 0.21724707, 0.032269..."


In [36]:
def get_w2v_distance(x, df=df_):
    avg_w2v = df.loc[df_[USER_COL] == x[0], 'Avg_Word2Vec'].item()
    try:
        return sum((w2v_model.wv[str(x[1])] - avg_w2v) ** 2)
    except:
        return -1
    
df_ranker_train['Word2Vec_distance_from_avg'] = df_ranker_train[[USER_COL, ITEM_COL]].\
                                                apply(get_w2v_distance, axis=1)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,Last5sales,Word2Vec_length,Word2Vec_distance_from_avg
0,1827,942763,0.0,4951,DRUG GM,National,COLD AND FLU,COLD AND FLU - PSE,24 CT,,...,1,1,,6.0,2.798,2.4,7.0,0,-1.0,-1.0
1,1827,913958,0.0,720,DRUG GM,National,ORAL HYGIENE PRODUCTS,ORAL HYGIENE BRUSHES,,,...,1,1,,6.0,2.798,2.4,7.0,0,-1.0,-1.0


# Построение модели второго уровня

In [37]:
X_train = df_ranker_train.drop(['target', 
                                'Missing n_sold_category', 
                                'n_sold_category_user_week', 
                                'mean_sales_value_category',], axis=1)
y_train = df_ranker_train['target']

In [38]:
cat_feats = ['manufacturer', 
             'department', 
             'brand', 
             'commodity_desc',
             'sub_commodity_desc',
             'curr_size_of_product',
             'age_desc',
             'marital_status_code',
             'income_desc',
             'homeowner_desc',
             'hh_comp_desc',
             'household_size_desc',
             'kid_category_desc',
             'Missing price',
             'Missing quantity per week',
             'Last5sales',
            ]


for column in cat_feats:
    X_train[column].fillna(0, inplace=True)
    
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [70]:
%%time
cb = CatBoostClassifier(learning_rate=0.1,
                        max_depth=12,
                        n_estimators=800,
                        random_state=42, 
                        cat_features=cat_feats, 
                        silent=False)

cb.fit(X_train, y_train)

train_preds = cb.predict_proba(X_train)

0:	learn: 0.4846482	total: 3.88s	remaining: 51m 39s
1:	learn: 0.3548829	total: 7.58s	remaining: 50m 25s
2:	learn: 0.2712886	total: 8.29s	remaining: 36m 41s
3:	learn: 0.2154374	total: 10.6s	remaining: 35m 9s
4:	learn: 0.1711487	total: 13.3s	remaining: 35m 20s
5:	learn: 0.1407452	total: 14.9s	remaining: 32m 56s
6:	learn: 0.1202079	total: 19.3s	remaining: 36m 21s
7:	learn: 0.1079716	total: 21.2s	remaining: 34m 58s
8:	learn: 0.0982735	total: 25.5s	remaining: 37m 25s
9:	learn: 0.0910219	total: 29.6s	remaining: 38m 57s
10:	learn: 0.0861731	total: 33.4s	remaining: 39m 54s
11:	learn: 0.0825683	total: 37.4s	remaining: 40m 54s
12:	learn: 0.0796318	total: 41.3s	remaining: 41m 38s
13:	learn: 0.0774122	total: 45.1s	remaining: 42m 9s
14:	learn: 0.0757445	total: 49.2s	remaining: 42m 56s
15:	learn: 0.0742896	total: 53.1s	remaining: 43m 21s
16:	learn: 0.0732508	total: 57.4s	remaining: 44m 5s
17:	learn: 0.0723591	total: 1m 2s	remaining: 44m 56s
18:	learn: 0.0716588	total: 1m 5s	remaining: 45m 1s
19:	lea

153:	learn: 0.0569675	total: 11m 17s	remaining: 47m 20s
154:	learn: 0.0569239	total: 11m 21s	remaining: 47m 17s
155:	learn: 0.0568571	total: 11m 26s	remaining: 47m 13s
156:	learn: 0.0568088	total: 11m 30s	remaining: 47m 7s
157:	learn: 0.0567294	total: 11m 34s	remaining: 47m 2s
158:	learn: 0.0566911	total: 11m 38s	remaining: 46m 56s
159:	learn: 0.0566326	total: 11m 43s	remaining: 46m 52s
160:	learn: 0.0565536	total: 11m 47s	remaining: 46m 47s
161:	learn: 0.0565207	total: 11m 51s	remaining: 46m 40s
162:	learn: 0.0564541	total: 11m 55s	remaining: 46m 35s
163:	learn: 0.0564299	total: 11m 59s	remaining: 46m 30s
164:	learn: 0.0563819	total: 12m 3s	remaining: 46m 24s
165:	learn: 0.0562997	total: 12m 7s	remaining: 46m 17s
166:	learn: 0.0561737	total: 12m 11s	remaining: 46m 13s
167:	learn: 0.0561323	total: 12m 16s	remaining: 46m 11s
168:	learn: 0.0560784	total: 12m 21s	remaining: 46m 7s
169:	learn: 0.0559826	total: 12m 25s	remaining: 46m 3s
170:	learn: 0.0558707	total: 12m 29s	remaining: 45m 58

301:	learn: 0.0490595	total: 22m 13s	remaining: 36m 39s
302:	learn: 0.0490313	total: 22m 18s	remaining: 36m 35s
303:	learn: 0.0490008	total: 22m 23s	remaining: 36m 31s
304:	learn: 0.0489562	total: 22m 27s	remaining: 36m 27s
305:	learn: 0.0488329	total: 22m 32s	remaining: 36m 23s
306:	learn: 0.0487637	total: 22m 37s	remaining: 36m 19s
307:	learn: 0.0486905	total: 22m 41s	remaining: 36m 15s
308:	learn: 0.0486300	total: 22m 46s	remaining: 36m 11s
309:	learn: 0.0485961	total: 22m 51s	remaining: 36m 8s
310:	learn: 0.0485620	total: 22m 56s	remaining: 36m 4s
311:	learn: 0.0484738	total: 23m 1s	remaining: 36m
312:	learn: 0.0484061	total: 23m 5s	remaining: 35m 56s
313:	learn: 0.0483652	total: 23m 10s	remaining: 35m 52s
314:	learn: 0.0482855	total: 23m 15s	remaining: 35m 49s
315:	learn: 0.0482190	total: 23m 20s	remaining: 35m 45s
316:	learn: 0.0481536	total: 23m 25s	remaining: 35m 41s
317:	learn: 0.0480916	total: 23m 30s	remaining: 35m 37s
318:	learn: 0.0480658	total: 23m 35s	remaining: 35m 34s


449:	learn: 0.0423723	total: 33m 35s	remaining: 26m 7s
450:	learn: 0.0423194	total: 33m 40s	remaining: 26m 3s
451:	learn: 0.0422441	total: 33m 46s	remaining: 25m 59s
452:	learn: 0.0422253	total: 33m 50s	remaining: 25m 55s
453:	learn: 0.0421780	total: 33m 56s	remaining: 25m 51s
454:	learn: 0.0421572	total: 34m 1s	remaining: 25m 48s
455:	learn: 0.0420906	total: 34m 7s	remaining: 25m 44s
456:	learn: 0.0420415	total: 34m 11s	remaining: 25m 40s
457:	learn: 0.0419626	total: 34m 16s	remaining: 25m 35s
458:	learn: 0.0418998	total: 34m 20s	remaining: 25m 30s
459:	learn: 0.0418519	total: 34m 24s	remaining: 25m 26s
460:	learn: 0.0418322	total: 34m 29s	remaining: 25m 21s
461:	learn: 0.0418150	total: 34m 34s	remaining: 25m 17s
462:	learn: 0.0418057	total: 34m 38s	remaining: 25m 12s
463:	learn: 0.0417907	total: 34m 43s	remaining: 25m 8s
464:	learn: 0.0417804	total: 34m 47s	remaining: 25m 4s
465:	learn: 0.0417638	total: 34m 52s	remaining: 24m 59s
466:	learn: 0.0416885	total: 34m 58s	remaining: 24m 56

597:	learn: 0.0376484	total: 45m 3s	remaining: 15m 13s
598:	learn: 0.0375471	total: 45m 8s	remaining: 15m 8s
599:	learn: 0.0375350	total: 45m 12s	remaining: 15m 4s
600:	learn: 0.0375148	total: 45m 16s	remaining: 14m 59s
601:	learn: 0.0374892	total: 45m 22s	remaining: 14m 55s
602:	learn: 0.0374723	total: 45m 26s	remaining: 14m 50s
603:	learn: 0.0374247	total: 45m 31s	remaining: 14m 46s
604:	learn: 0.0373800	total: 45m 35s	remaining: 14m 41s
605:	learn: 0.0373758	total: 45m 40s	remaining: 14m 37s
606:	learn: 0.0373585	total: 45m 45s	remaining: 14m 32s
607:	learn: 0.0373553	total: 45m 49s	remaining: 14m 28s
608:	learn: 0.0373382	total: 45m 54s	remaining: 14m 23s
609:	learn: 0.0372576	total: 46m	remaining: 14m 19s
610:	learn: 0.0372494	total: 46m 4s	remaining: 14m 15s
611:	learn: 0.0372448	total: 46m 9s	remaining: 14m 10s
612:	learn: 0.0372372	total: 46m 15s	remaining: 14m 6s
613:	learn: 0.0372261	total: 46m 19s	remaining: 14m 2s
614:	learn: 0.0372152	total: 46m 24s	remaining: 13m 57s
615:

746:	learn: 0.0338075	total: 56m 40s	remaining: 4m 1s
747:	learn: 0.0338007	total: 56m 44s	remaining: 3m 56s
748:	learn: 0.0337593	total: 56m 50s	remaining: 3m 52s
749:	learn: 0.0337071	total: 56m 54s	remaining: 3m 47s
750:	learn: 0.0337023	total: 56m 58s	remaining: 3m 43s
751:	learn: 0.0336716	total: 57m 3s	remaining: 3m 38s
752:	learn: 0.0336060	total: 57m 7s	remaining: 3m 33s
753:	learn: 0.0335785	total: 57m 12s	remaining: 3m 29s
754:	learn: 0.0335554	total: 57m 17s	remaining: 3m 24s
755:	learn: 0.0335539	total: 57m 22s	remaining: 3m 20s
756:	learn: 0.0335510	total: 57m 27s	remaining: 3m 15s
757:	learn: 0.0335060	total: 57m 32s	remaining: 3m 11s
758:	learn: 0.0334708	total: 57m 37s	remaining: 3m 6s
759:	learn: 0.0333788	total: 57m 41s	remaining: 3m 2s
760:	learn: 0.0333636	total: 57m 46s	remaining: 2m 57s
761:	learn: 0.0333584	total: 57m 50s	remaining: 2m 53s
762:	learn: 0.0333419	total: 57m 55s	remaining: 2m 48s
763:	learn: 0.0332734	total: 58m	remaining: 2m 43s
764:	learn: 0.03327

In [71]:
fi = pd.DataFrame(cb.feature_importances_, index=X_train.columns, columns=['importance'])
fi.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
n_sold_category,11.458737
avg_transaction_category,10.820623
n_unique_stores,8.043346
Word2Vec_distance_from_avg,7.516109
price,5.966379
Word2Vec_length,5.487594
n_transactions_per_week,5.041407
item_id,4.381218
avg_interval,4.339251
user_id,4.140321


In [72]:
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [73]:
N_PREDICT = 50
TOPK_PRECISION = 5

result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker['own_rec'] = make_recommendations(result_eval_ranker, 
                                                     recommender.get_own_recommendations, N_PREDICT=N_PREDICT)

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.08754901960784306)]

In [74]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].\
                                            apply(lambda user_id: rerank(user_id, df_ranker_predict))
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.24373333333333086)
('own_rec', 0.08754901960784306)


  return flags.sum() / len(recommended_list)


# Рекомендации для тестового датасета

In [75]:
data_test = pd.read_csv('retail_test1.csv')
data_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [76]:
df_test_candidates = pd.DataFrame(data_test[USER_COL].unique())
df_test_candidates.columns = [USER_COL]

df_test_candidates['recommendations'] = df_test_candidates[USER_COL].\
                                        apply(lambda user_id: rerank(user_id, df_ranker_predict))

In [77]:
df_test_candidates.to_csv('recommendations.csv', index=False)
df_test_candidates.head()

Unnamed: 0,user_id,recommendations
0,1340,"[1040970, 979707, 944486, 1082185, 852856]"
1,588,"[14025268, 12517411, 899624, 907631, 933835]"
2,2070,"[1082185, 904236, 1072917, 899624, 865456]"
3,1602,"[6632270, 15596872, 1006342, 829323, 885592]"
4,447,"[1072298, 1004906, 1008675, 899624, 856060]"
