In [None]:
import pandas as pd
import numpy as np

# Sparse-matrix utilities
from scipy.sparse import csr_matrix

# Matrix factorization
from implicit import als

from gensim.models import Word2Vec

# Second-level (ranking) model
from catboost import CatBoostClassifier

import os, sys
sys.path.insert(1, os.getcwd() + '/src/')

from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [None]:
# Load the three input tables from CSV files located in the current working directory.
# `data` is the interaction log that the rest of the notebook splits by `week_no`.
data = pd.read_csv('retail_train.csv')
# Item attributes; its `product_id` column is renamed to `item_id` in a later cell.
item_features = pd.read_csv('product.csv')
# User attributes; its `household_key` column is renamed to `user_id` in a later cell.
user_features = pd.read_csv('hh_demographic.csv')

In [None]:
# Several important functions:

def print_stats_data(df_data, name_df, USER_COL='user_id', ITEM_COL='item_id'):
    """Print a one-line summary (shape, distinct users, distinct items) of a frame.

    Args:
        df_data: DataFrame containing the user and item id columns.
        name_df: Label printed on its own line before the summary.
        USER_COL: Name of the user id column (defaults to 'user_id', matching the
            other helpers in this notebook).
        ITEM_COL: Name of the item id column (defaults to 'item_id').

    Raises:
        KeyError: If either column is missing from `df_data`.
    """
    # The column names are explicit parameters so the helper no longer depends on
    # module-level constants that are only defined in a later cell.
    print(name_df)
    n_users = df_data[USER_COL].nunique()
    n_items = df_data[ITEM_COL].nunique()
    print(f"Shape: {df_data.shape} Users: {n_users} Items: {n_items}")
    
def make_recommendations(df_result, recommend_model, N_PREDICT=500, USER_COL='user_id'):
    """Call a recommender once per user id and collect the results.

    Args:
        df_result: DataFrame with a `USER_COL` column of user ids.
        recommend_model: Callable invoked as `recommend_model(user_id, N=N_PREDICT)`.
        N_PREDICT: Value forwarded to the recommender as `N`.
        USER_COL: Name of the user id column.

    Returns:
        A Series aligned with `df_result` holding whatever the recommender returned
        for each user.
    """
    def recommend_for(user):
        return recommend_model(user, N=N_PREDICT)

    user_ids = df_result[USER_COL]
    return user_ids.apply(recommend_for)

def calc_recall(df_data, top_k, ACTUAL_COL='actual'):
    """Yield `(column name, mean recall@top_k)` for every recommendation column.

    Every column from the third one onward is treated as a recommendation column
    (the first two are presumably the user id and `ACTUAL_COL` -- verify against
    the caller's frame layout).

    Args:
        df_data: DataFrame with one row per user.
        top_k: Cut-off forwarded to `recall_at_k` as `k`.
        ACTUAL_COL: Column holding the items each user actually bought.

    Yields:
        Tuples of the column name and the row-wise mean of `recall_at_k`.
    """
    recommendation_columns = df_data.columns[2:]
    for col_name in recommendation_columns:
        per_user = df_data.apply(
            lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k),
            axis=1,
        )
        yield col_name, per_user.mean()
        
def calc_precision(df_data, top_k, ACTUAL_COL='actual'):
    """Yield `(column name, mean precision@top_k)` for every recommendation column.

    Every column from the third one onward is treated as a recommendation column
    (the first two are presumably the user id and `ACTUAL_COL` -- verify against
    the caller's frame layout).

    Args:
        df_data: DataFrame with one row per user.
        top_k: Cut-off forwarded to `precision_at_k` as `k`.
        ACTUAL_COL: Column holding the items each user actually bought.

    Yields:
        Tuples of the column name and the row-wise mean of `precision_at_k`.
    """
    recommendation_columns = df_data.columns[2:]
    for col_name in recommendation_columns:
        per_user = df_data.apply(
            lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k),
            axis=1,
        )
        yield col_name, per_user.mean()
        
def rerank(user_id, df, USER_COL='user_id', proba_col_name='proba_item_purchase', N=5,
           ITEM_COL='item_id'):
    """Return one user's top-N items ordered by predicted purchase probability.

    Args:
        user_id: User whose candidates should be ranked.
        df: DataFrame with one row per (user, candidate item) and a score column.
        USER_COL: Name of the user id column.
        proba_col_name: Name of the score column; higher scores rank first.
        N: Maximum number of items to return.
        ITEM_COL: Name of the item id column. It used to be hard-coded as
            `item_id`; the default keeps that behaviour.

    Returns:
        A list of at most N item ids; empty when the user has no rows in `df`.
    """
    user_rows = df[df[USER_COL] == user_id]
    best_first = user_rows.sort_values(proba_col_name, ascending=False)
    return best_first.head(N)[ITEM_COL].tolist()

In [None]:
# Input data preparation: canonical id column names shared by every table.
ITEM_COL, USER_COL = 'item_id', 'user_id'

# Lower-case all feature column names, then rename the raw id columns to the
# canonical names so the feature tables can be merged with the interaction log.
item_features = (
    item_features
    .rename(columns=str.lower)
    .rename(columns={'product_id': ITEM_COL})
)
user_features = (
    user_features
    .rename(columns=str.lower)
    .rename(columns={'household_key': USER_COL})
)

In [None]:
VAL_MATCHER_WEEKS = 5
VAL_RANKER_WEEKS = 3

# Week boundaries, counted back from the last week present in the data.
week_no = data['week_no']
matcher_val_start = week_no.max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)
ranker_val_start = week_no.max() - VAL_RANKER_WEEKS

# Training data for the matching (first-level) model: everything before the
# validation windows.
data_train_matcher = data[week_no < matcher_val_start]

# Validation data for the matching model: the window between the two boundaries.
data_val_matcher = data[(week_no >= matcher_val_start) & (week_no < ranker_val_start)]

# Training data for the ranking model. It starts as a copy of the matcher
# validation window (kept under its own name for clarity) and is filtered
# separately later on.
data_train_ranker = data_val_matcher.copy()

# Test data for both the ranking and the matching model.
# NOTE(review): `>=` makes this window cover `max - VAL_RANKER_WEEKS` through
# `max` inclusive, i.e. VAL_RANKER_WEEKS + 1 distinct week values if week_no is
# integer-valued -- confirm that is intended.
data_val_ranker = data[week_no >= ranker_val_start]

In [None]:
# Report size / user / item counts for every split, in pipeline order.
for split_name, split_df in (
    ('train_matcher', data_train_matcher),
    ('val_matcher', data_val_matcher),
    ('train_ranker', data_train_ranker),
    ('val_ranker', data_val_ranker),
):
    print_stats_data(split_df, split_name)

In [None]:
# Prefilter the matcher training data with the project helper `prefilter_items`
# (asked to keep 20000 popular items) and report how many distinct items remain.
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(
    data_train_matcher,
    item_features=item_features,
    take_n_popular=20000,
)

n_items_after = data_train_matcher['item_id'].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

In [None]:
# Keep only users that also occur in the (prefiltered) matcher training data, so
# every later split refers to users the first-level model has seen.
# `.unique()` deduplicates the ids: the raw column has one entry per transaction
# row, which makes `isin` hash far more values than there are users.
common_users = data_train_matcher[USER_COL].unique()

data_val_matcher = data_val_matcher[data_val_matcher[USER_COL].isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker[USER_COL].isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker[USER_COL].isin(common_users)]

print_stats_data(data_train_matcher, 'train_matcher')
print_stats_data(data_val_matcher, 'val_matcher')
print_stats_data(data_train_ranker, 'train_ranker')
print_stats_data(data_val_ranker, 'val_ranker')

### First-level model

In [None]:
# Build the first-level recommender from the prefiltered matcher training split.
# MainRecommender comes from the project's `recommenders` module; what it fits
# internally is not visible in this notebook.
recommender = MainRecommender(data_train_matcher)

In [None]:
ACTUAL_COL = 'actual'

# One row per user of the matcher validation window, with the array of distinct
# items that user actually bought stored under ACTUAL_COL.
result_eval_matcher = (
    data_val_matcher
    .groupby(USER_COL)[ITEM_COL]
    .unique()
    .rename(ACTUAL_COL)
    .reset_index()
)
result_eval_matcher.head(2)

In [None]:
# Candidate generators to compare; each key becomes a column of
# `result_eval_matcher` holding that generator's recommendations per user.
models = dict(
    own_rec=recommender.get_own_recommendations,
    sim_item_rec=recommender.get_similar_items_recommendation,
    als_rec=recommender.get_als_recommendations,
    sim_user_rec=recommender.get_similar_users_recommendation,
)

for rec_column, recommend_fn in models.items():
    result_eval_matcher[rec_column] = make_recommendations(result_eval_matcher, recommend_fn)

In [None]:
# Preview the evaluation frame now that the recommendation columns were added.
result_eval_matcher.head(2)

In [None]:
# Cut-off used for recall; equal to the default number of candidates requested
# by make_recommendations (N_PREDICT=500).
TOPK_RECALL = 500
# Rank the candidate generators by mean recall@TOPK_RECALL, best first.
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

##### Feature generation for the second-level model

In [None]:
# Build the candidate table for the ranking model: one row per
# (user from the ranker training split, candidate item).

# Number of candidates requested per user from the first-level model.
N_PREDICT = 500

# One row per distinct user of the ranker training split.
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

# Each cell of 'candidates' holds the recommender's result for that user.
df_match_candidates['candidates'] = make_recommendations(df_match_candidates, recommender.get_own_recommendations, 
                                                         N_PREDICT=N_PREDICT)

# Expand each candidate collection into a wide frame (one column per position),
# stack it into one row per (user row, candidate), and drop the position level
# so the index lines up with df_match_candidates for the join below.
# NOTE(review): apply(pd.Series) is slow on large frames; DataFrame.explode may
# be a faster alternative, but it yields a different dtype -- verify before swapping.
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'
# Replace the list column with the long-format item_id column (index join).
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

df_match_candidates.head()

In [None]:
# Actual (user, item) purchases from the ranker training window; every row here
# is a purchase, so the target is 1.
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1

# Left-join the purchases onto the candidates: candidates that were bought get
# target=1, the rest get NaN (filled with 0 below).
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# A pair bought several times appears several times after the merge; keep one row.
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form warns on recent pandas and does
# not modify the frame when Copy-on-Write is enabled.
df_ranker_train['target'] = df_ranker_train['target'].fillna(0)

df_ranker_train.head()

In [None]:
# Class balance of the ranking target (1 = candidate was bought, 0 = it was not).
df_ranker_train.target.value_counts()

In [None]:
# Attach the raw item and user attributes to every (user, candidate) row.
# Left joins keep candidates that have no matching feature record.
df_ranker_train = (
    df_ranker_train
    .merge(item_features, on='item_id', how='left')
    .merge(user_features, on='user_id', how='left')
)

df_ranker_train.head(2)

In [None]:
# Feature engineering for the ranking model starts here.
# Attach each item's department to the ranker training transactions so the
# department-level features below can be aggregated from a single frame.
# The inner join drops transactions whose item has no record in item_features.
item_departments = item_features[['item_id', 'department']]
data_department = data_train_ranker.merge(item_departments, on='item_id', how='inner')
data_department.head(2)

In [None]:
# Average price per unit bought by each user: total sales_value / total quantity.
users_sales = (
    data_train_ranker
    .groupby(USER_COL)[['sales_value', 'quantity']]
    .sum()
    .reset_index()
    .assign(avg_price=lambda totals: totals['sales_value'] / totals['quantity'])
)
df_ranker_train = df_ranker_train.merge(
    users_sales[['user_id', 'avg_price']], on='user_id', how='left'
)
df_ranker_train.head(2)

In [None]:
# Per (user, department): units bought (`n_sold_category`) and average spend per
# unit (`avg_transaction_category` = total sales_value / total quantity).
users_sales_department = (
    data_department
    .groupby([USER_COL, 'department'])[['sales_value', 'quantity']]
    .sum()
    .reset_index()
    .rename(columns={'quantity': 'n_sold_category'})
)
users_sales_department['avg_transaction_category'] = (
    users_sales_department['sales_value'] / users_sales_department['n_sold_category']
)
users_sales_department = users_sales_department.drop(columns=['sales_value'])

df_ranker_train = df_ranker_train.merge(
    users_sales_department, on=[USER_COL, 'department'], how='left')

# Flag candidates whose department the user never bought from, then fill the
# count with 0. The filled column is assigned back rather than using
# fillna(inplace=True) on a column selection, which does not update the frame
# under pandas Copy-on-Write.
df_ranker_train['Missing n_sold_category'] = (
    df_ranker_train['n_sold_category'].isna().astype('int64')
)
df_ranker_train['n_sold_category'] = df_ranker_train['n_sold_category'].fillna(0)

df_ranker_train.head(2)

In [None]:
# Mean sales_value of a transaction row within each department.
department_sales = (
    data_department
    .groupby('department')['sales_value']
    .mean()
    .rename('mean_sales_value_category')
    .reset_index()
)
# Not the last expression of the cell, so this preview is not displayed.
department_sales.tail(2)

# Number of week values spanned by the ranker training window (inclusive).
n_weeks = data_department['week_no'].max() - data_department['week_no'].min() + 1

# Units of each department bought by each user, per week of the window.
users_department = (
    data_department
    .groupby([USER_COL, 'department'])['quantity']
    .sum()
    .div(n_weeks)
    .rename('n_sold_category_user_week')
    .reset_index()
)

df_ranker_train = (
    df_ranker_train
    .merge(department_sales, on='department', how='left')
    .merge(users_department, on=[USER_COL, 'department'], how='left')
)
df_ranker_train.head(2)

In [None]:
# Item price: total sales_value / total quantity over the ranker training window.
# A 0/0 division yields NaN, which is replaced with 0.
items_sales = data_department.groupby(ITEM_COL)[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = (items_sales['sales_value'] / items_sales['quantity']).fillna(0)

# Units of the item sold per week of the window.
items_sales['quantity_per_week'] = items_sales['quantity'] / n_weeks

df_ranker_train = df_ranker_train.merge(items_sales[[ITEM_COL, 'price', 'quantity_per_week']],
                                        on=ITEM_COL, how='left')

# Candidates that were never sold inside the window have no price/quantity row.
# Record that in an indicator column, then fill the value with 0. Columns are
# assigned back instead of calling fillna(inplace=True) on a column selection,
# which does not update the frame under pandas Copy-on-Write.
for feature, missing_flag in (('price', 'Missing price'),
                              ('quantity_per_week', 'Missing quantity per week')):
    df_ranker_train[missing_flag] = df_ranker_train[feature].isna().astype('int64')
    df_ranker_train[feature] = df_ranker_train[feature].fillna(0)

df_ranker_train.head()

In [None]:
# Number of distinct stores in which each item was sold.
items_stores = (
    data_department
    .groupby(ITEM_COL)['store_id']
    .nunique()
    .rename('n_unique_stores')
    .reset_index()
)
df_ranker_train = df_ranker_train.merge(items_stores, on=ITEM_COL, how='left')

df_ranker_train.head(2)

In [None]:
# Average number of transaction rows per user per week of the window
# (each row of data_department counts as one transaction).
users_transactions = (
    data_department
    .groupby(USER_COL)[ITEM_COL]
    .count()
    .div(n_weeks)
    .rename('n_transactions_per_week')
    .reset_index()
)

df_ranker_train = df_ranker_train.merge(users_transactions, on=USER_COL, how='left')

df_ranker_train.tail(2)

In [None]:
# "Average cheque": mean sales_value over each user's transaction rows
# (a per-row mean, not a per-basket total).
users_sales = (
    data_train_ranker
    .groupby(USER_COL)['sales_value']
    .mean()
    .rename('avg_cheque')
    .reset_index()
)
df_ranker_train = df_ranker_train.merge(
    users_sales[['user_id', 'avg_cheque']], on='user_id', how='left'
)
df_ranker_train.head(2)

In [None]:
# Average number of distinct departments per basket, for each user.
departments_per_basket = (
    data_department
    .groupby([USER_COL, 'basket_id'])['department']
    .nunique()
    .reset_index()
)
users_baskets = (
    departments_per_basket
    .groupby(USER_COL)['department']
    .mean()
    .rename('avg_basket_department')
    .reset_index()
)
df_ranker_train = df_ranker_train.merge(
    users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left'
)
df_ranker_train.head(2)

#### Let's create a feature that reflects the average interval between user purchases.

In [None]:
# For every user, the sorted list of distinct days on which they bought something.
users_days = (
    data_department
    .groupby(USER_COL)['day']
    .unique()
    .apply(sorted)
    .reset_index()
)
users_days.head()

In [None]:
def avg_ndays(days):
    """Return the mean gap between consecutive entries of `days`.

    Args:
        days: Sequence of day numbers, expected in ascending order.

    Returns:
        The sum of consecutive differences divided by the number of gaps, or 0
        when there are fewer than two entries.
    """
    n_gaps = len(days) - 1
    if n_gaps < 1:
        return 0
    total_gap = sum(later - earlier for earlier, later in zip(days, days[1:]))
    return total_gap / n_gaps
    
# Average number of days between a user's purchase days, joined onto the
# ranking frame by user id.
users_days['avg_interval'] = users_days['day'].map(avg_ndays)

user_intervals = users_days[['user_id', 'avg_interval']]
df_ranker_train = df_ranker_train.merge(user_intervals, on='user_id', how='left')
df_ranker_train.head(2)

#### Let's create a feature that will encode the place of the product in the last five purchases of the client.

In [None]:
# For every user, the last five item ids in the row order of data_train_ranker
# (presumably chronological -- verify the frame is sorted by time).
users_items = (
    data_train_ranker
    .groupby(USER_COL)[ITEM_COL]
    .apply(lambda items: list(items)[-5:])
    .reset_index()
)
users_items.head()

In [None]:
def code_last_sales(x, df=None):
    """Encode where a candidate item occurs among a user's most recent purchases.

    Args:
        x: A (user id, item id) pair, e.g. a row of
            `df_ranker_train[[USER_COL, ITEM_COL]]`; only the first two values
            are used.
        df: Frame with a 'user_id' column and an 'item_id' column holding each
            user's list of recent items (oldest first). Defaults to the
            module-level `users_items`, looked up at call time.

    Returns:
        A string with one character per stored purchase, most recent first:
        '1' where the purchase equals the candidate item, '0' otherwise.

    Raises:
        ValueError: If `df` does not contain exactly one row for the user.
    """
    if df is None:
        df = users_items
    # Unpack by iteration instead of x[0]/x[1]: integer keys on a labelled
    # Series are a deprecated positional fallback in recent pandas.
    user_id, item_id = tuple(x)[:2]
    last_sales = df.loc[df['user_id'] == user_id, 'item_id'].item()
    # Iterate in reverse WITHOUT list.reverse(): the list object lives inside
    # `df`, so reversing it in place flipped the stored history on every call
    # and made consecutive rows of the same user get alternating encodings.
    return ''.join('1' if item == item_id else '0' for item in reversed(last_sales))

# Encode, for every (user, candidate) row, the candidate's position pattern in
# the user's last purchases; the resulting string is later used as a
# categorical feature ('Last5sales' is listed in cat_feats).
# NOTE(review): code_last_sales scans users_items once per row, so this apply is
# O(rows x users) -- expect it to be slow on large frames.
df_ranker_train['Last5sales'] = df_ranker_train[[USER_COL, ITEM_COL]].apply(code_last_sales, axis=1)
df_ranker_train.head(2)

#### Let's create a Word2Vec model to get product embeddings

In [None]:
# One row per user with the array of distinct items bought in the ranker
# training window; each array is used below as one Word2Vec "sentence".
df_ = data_train_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
df_.head()

In [None]:
# Word2Vec training corpus: one "sentence" per user, made of the user's item ids
# converted to strings. `df_` has exactly one row per user (it comes from a
# groupby), so iterating the item column directly yields the same sentences as
# re-filtering the frame for every user, without the quadratic scan.
purchases = [[str(item) for item in items] for items in df_[ITEM_COL]]

print(f"Total # of Sessions: {len(purchases)}")

In [None]:
# Train item embeddings on the per-user purchase "sentences".
# min_count=1 keeps every item in the vocabulary; vector_size=100 sets the
# embedding dimension; sg=1 selects the skip-gram architecture in gensim.
# NOTE(review): with workers=3 the training is multi-threaded, so the resulting
# vectors are presumably not bit-for-bit reproducible between runs -- confirm
# whether that matters for the downstream features.
w2v_model = Word2Vec(min_count=1, vector_size=100, sg=1, workers=3)
w2v_model.build_vocab(purchases, progress_per=100)
w2v_model.train(purchases, total_examples=w2v_model.corpus_count, epochs=12, report_delay=1)

In [None]:
def word2vec_len(itemid):
    """Return the squared L2 norm of an item's Word2Vec embedding.

    Despite the name, no square root is taken: the value is the sum of squared
    vector components.

    Args:
        itemid: Item id; converted with `str()` before the vocabulary lookup.

    Returns:
        The sum of squared components, or -1 when the item is not in the
        vocabulary of the module-level `w2v_model`.
    """
    try:
        vector = w2v_model.wv[str(itemid)]
    except KeyError:
        # Only "item not in vocabulary" maps to the -1 sentinel. The previous
        # bare `except:` also hid unrelated failures (e.g. an undefined model),
        # silently turning the whole feature into -1.
        return -1
    return sum(component ** 2 for component in vector)

# Squared embedding norm of every candidate item (-1 when the item has no vector).
candidate_items = df_ranker_train[ITEM_COL]
df_ranker_train['Word2Vec_length'] = candidate_items.apply(word2vec_len)
df_ranker_train.head(2)

In [None]:
def avg_word2vec(items):
    """Return the element-wise mean of the Word2Vec vectors of `items`.

    Args:
        items: Non-empty collection of item ids; each is converted with `str()`
            and looked up in the module-level `w2v_model`.

    Returns:
        The sum of the item vectors divided by `len(items)`.
    """
    vectors = [w2v_model.wv[str(item)] for item in items]
    return sum(vectors) / len(items)

# Mean embedding of everything each user bought in the ranker training window;
# used below as the reference point for the per-candidate distance feature.
df_['Avg_Word2Vec'] = df_[ITEM_COL].apply(avg_word2vec)
df_.head()

In [None]:
def get_w2v_distance(x, df=None):
    """Return the squared Euclidean distance between an item and a user's mean vector.

    Args:
        x: A (user id, item id) pair, e.g. a row of
            `df_ranker_train[[USER_COL, ITEM_COL]]`; only the first two values
            are used.
        df: Frame with a `USER_COL` column and an 'Avg_Word2Vec' column holding
            each user's mean embedding. Defaults to the module-level `df_`,
            looked up at call time.

    Returns:
        The sum of squared differences between the item's vector and the user's
        mean vector, or -1 when the item is not in the Word2Vec vocabulary.

    Raises:
        ValueError: If `df` does not contain exactly one row for the user.
    """
    if df is None:
        df = df_
    # Unpack by iteration instead of x[0]/x[1]: integer keys on a labelled
    # Series are a deprecated positional fallback in recent pandas.
    user_id, item_id = tuple(x)[:2]
    # The mask is built from `df` itself; it used to read the global `df_`, so a
    # caller-supplied frame was filtered with another frame's boolean index.
    avg_w2v = df.loc[df[USER_COL] == user_id, 'Avg_Word2Vec'].item()
    try:
        item_vector = w2v_model.wv[str(item_id)]
    except KeyError:
        # Only an out-of-vocabulary item maps to the -1 sentinel; other errors
        # are no longer swallowed.
        return -1
    return sum((item_vector - avg_w2v) ** 2)
    
# Squared distance between each candidate's embedding and the user's mean
# embedding (-1 when get_w2v_distance cannot look the item up).
# NOTE(review): get_w2v_distance filters the per-user frame once per row, so
# this apply is O(rows x users) -- expect it to be slow on large frames.
df_ranker_train['Word2Vec_distance_from_avg'] = df_ranker_train[[USER_COL, ITEM_COL]].\
                                                apply(get_w2v_distance, axis=1)
df_ranker_train.head(2)

### Training the second-level model

In [None]:
# Columns excluded from the feature matrix: the target itself plus three
# engineered columns that are left out of this model.
excluded_columns = [
    'target',
    'Missing n_sold_category',
    'n_sold_category_user_week',
    'mean_sales_value_category',
]
X_train = df_ranker_train.drop(columns=excluded_columns)
y_train = df_ranker_train['target']

In [None]:
# Columns CatBoost should treat as categorical: raw item attributes, raw user
# demographics, the missing-value indicators, and the last-purchases code string.
cat_feats = ['manufacturer',
             'department',
             'brand',
             'commodity_desc',
             'sub_commodity_desc',
             'curr_size_of_product',
             'age_desc',
             'marital_status_code',
             'income_desc',
             'homeowner_desc',
             'hh_comp_desc',
             'household_size_desc',
             'kid_category_desc',
             'Missing price',
             'Missing quantity per week',
             'Last5sales',
            ]

# Replace missing categorical values with 0 so they form a category of their own.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a column selection: that chained in-place form warns on recent pandas and
# leaves the frame untouched when Copy-on-Write is enabled.
for column in cat_feats:
    X_train[column] = X_train[column].fillna(0)

X_train[cat_feats] = X_train[cat_feats].astype('category')

In [None]:
%%time
# Hyper-parameters of the second-level (ranking) classifier.
cb_params = dict(
    learning_rate=0.1,
    max_depth=12,
    n_estimators=800,
    random_state=42,
    cat_features=cat_feats,
    silent=False,
)
cb = CatBoostClassifier(**cb_params)

cb.fit(X_train, y_train)

# Class probabilities for the same rows the model was fitted on (in-sample).
train_preds = cb.predict_proba(X_train)

In [None]:
# Feature importances reported by the fitted model, most important first.
fi = pd.DataFrame({'importance': cb.feature_importances_}, index=X_train.columns)
fi.sort_values(by='importance', ascending=False)

In [None]:
# Copy of the ranking frame with the predicted probability of the positive
# class (second column of predict_proba) attached to every (user, candidate) row.
df_ranker_predict = df_ranker_train.assign(proba_item_purchase=train_preds[:, 1])

In [None]:
N_PREDICT = 50
TOPK_PRECISION = 5

# One row per user of the final validation window with the distinct items they
# actually bought, plus the first-level baseline recommendations.
result_eval_ranker = (
    data_val_ranker
    .groupby(USER_COL)[ITEM_COL]
    .unique()
    .rename(ACTUAL_COL)
    .reset_index()
)
result_eval_ranker['own_rec'] = make_recommendations(
    result_eval_ranker,
    recommender.get_own_recommendations,
    N_PREDICT=N_PREDICT,
)

# Baseline precision@TOPK_PRECISION of the first-level model alone.
sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

In [None]:
# Rerank each validation user's candidates by the second-level model's score and
# compare precision@TOPK_PRECISION against the first-level baseline.
# rerank() returns an empty list for users that have no rows in df_ranker_predict.
# NOTE(review): df_ranker_predict holds probabilities predicted on the same rows
# the classifier was fitted on (train_preds = cb.predict_proba(X_train)), so the
# scores being reranked are in-sample -- confirm this evaluation setup is intended.
# NOTE(review): rerank() filters df_ranker_predict once per user; grouping the
# frame by user beforehand would avoid the repeated full scans.
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].\
                                            apply(lambda user_id: rerank(user_id, df_ranker_predict))
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')