### Курсовой проект
### Студент: Абрамов А.В.

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy.sparse import csr_matrix
from implicit import als

from lightgbm import LGBMClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [14]:
# Load the three raw inputs from CSV files in the current working directory.
# `data` is the transactions frame that is split by `week_no` further down;
# `item_features` / `user_features` are the product and household tables.
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

In [5]:
# Lower-case every column label of both feature tables.
item_features.rename(columns=str.lower, inplace=True)
user_features.rename(columns=str.lower, inplace=True)

# Rename the key columns to the names used on the transactions frame
# ('item_id' / 'user_id').
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [6]:
# Sizes (in weeks) of the two hold-out windows taken from the end of the data.
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Week boundaries, computed once instead of re-scanning `week_no` for every mask.
last_week = data['week_no'].max()
lvl_1_start_week = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)
lvl_2_start_week = last_week - val_lvl_2_size_weeks

# Every split is materialised with .copy(): the frames are modified later
# (prefilter_items adds a 'price' column), and doing that on a bare slice of
# `data` is what produced the SettingWithCopyWarning recorded in this notebook.
data_train_lvl_1 = data[data['week_no'] < lvl_1_start_week].copy()
data_val_lvl_1 = data[(data['week_no'] >= lvl_1_start_week) &
                      (data['week_no'] < lvl_2_start_week)].copy()

# The level-1 validation window doubles as the level-2 training window.
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= lvl_2_start_week].copy()

In [7]:
# Distinct users seen in each of the three time windows.
users_lvl_1 = data_train_lvl_1.user_id.unique()
users_lvl_2 = data_val_lvl_1.user_id.unique()
users_lvl_3 = data_val_lvl_2.user_id.unique()

# Build each set once and reuse it in the differences below.
seen_lvl_1, seen_lvl_2, seen_lvl_3 = map(set, (users_lvl_1, users_lvl_2, users_lvl_3))

# Users that first appear in window 2, and users that first appear in window 3.
new_users_lvl_2 = list(seen_lvl_2 - seen_lvl_1)
new_users_lvl_3 = list(seen_lvl_3 - (seen_lvl_1 | seen_lvl_2))

# Window-3 users that are absent from window 2.
add_to_lvl_2 = list(seen_lvl_3 - seen_lvl_2)

In [8]:
# Run the project's prefilter_items helper (take_n_popular=5000) on the
# level-1 training transactions, recording the number of distinct items
# before and after the call.
n_items_before = data_train_lvl_1['item_id'].nunique()
data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)
n_items_after = data_train_lvl_1['item_id'].nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


In [9]:
# Per-column replacement tables: each listed category label of a demographic
# column is replaced by a number. Labels that are not listed are left as-is.
demographic_encodings = {
    'age_desc': {'19-24': 22, '25-34': 30, '35-44': 40, '45-54': 50, '55-64': 60, '65+': 70},
    'marital_status_code': {'U': 0, 'A': 1, 'B': 2},
    'income_desc': {'Under 15K': 10, '15-24K': 20, '25-34K':30, '35-49K': 40,
                    '50-74K': 62, '75-99K': 87, '100-124K': 112, '125-149K': 137,
                    '150-174K': 162, '175-199K': 187, '200-249K': 225, '250K+':275},
    'homeowner_desc': {'Unknown': 0, 'Probable Renter': 1, 'Renter': 2,
                       'Probable Owner': 3, 'Homeowner': 4},
    'hh_comp_desc': {'Unknown': 0, 'Single Male': 1, 'Single Female': 2,
                     '1 Adult Kids': 3, '2 Adults No Kids': 4, '2 Adults Kids':5},
    # NOTE(review): only the open-ended labels are replaced in the next two
    # columns; the remaining values keep whatever type read_csv gave them, so
    # these columns may end up with mixed types — confirm before using them
    # as model features.
    'household_size_desc': {'5+': 5},
    'kid_category_desc': {'None/Unknown': 0, '3+': 3},
}

# Assign the result back to the column instead of calling
# `user_features[col].replace(..., inplace=True)`: that form mutates an
# intermediate Series and is not guaranteed to update the DataFrame (it is a
# no-op under pandas Copy-on-Write).
for column, encoding in demographic_encodings.items():
    user_features[column] = user_features[column].replace(encoding)

In [10]:
# Frequency encoding: for every listed column add '<column>_freq' holding the
# number of rows of item_features that share the row's value in that column.
names = ['manufacturer', 'department', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product']
for name in names:
    counts = item_features[name].value_counts()
    # A single vectorised lookup replaces one boolean scan plus .loc write per
    # distinct value. Missing values stay NaN (value_counts skips them), and
    # the cast keeps the float dtype the row-by-row fill produced.
    item_features[name + '_freq'] = item_features[name].map(counts).astype('float64')

# Binary brand flag: 0 for 'Private', 1 for everything else.
# NOTE: not idempotent — after the first run no value equals 'Private' any
# more, so re-running this line sets every row to 1.
item_features['brand'] = np.where(item_features['brand']=='Private', 0, 1)

# Ordinal code for commodity_desc: 0 for the most frequent value, 1 for the
# next one, and so on (order taken from value_counts()).
commodities = item_features.commodity_desc.value_counts()
commodities_list = commodities.keys().tolist()
commodity_rank = {commodity: rank for rank, commodity in enumerate(commodities_list)}
item_features['commodity_category'] = (
    item_features['commodity_desc'].map(commodity_rank).astype('float64'))

In [11]:
def get_user_item_features(data_train_lvl_1):
    """Build one feature row per (user_id, item_id) pair found in the transactions.

    Parameters
    ----------
    data_train_lvl_1 : pandas.DataFrame
        Transactions with at least the columns ``user_id``, ``item_id``,
        ``basket_id``, ``day``, ``trans_time``, ``sales_value`` and
        ``store_id``. The frame is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id`` plus:

        * ``median_sales_hour``, ``median_weekday`` - per (user, item) medians
          of ``trans_time // 100`` and ``day % 7``;
        * ``mean_visits_interval`` - per user, (last day - first day) divided
          by the number of distinct days;
        * ``mean_check`` - per user, mean of per-basket ``sales_value`` sums;
        * ``n_stores`` - per item, number of distinct stores;
        * ``n_items``, ``n_transactions`` - per user, distinct items and row count;
        * ``mean_n_items_basket``, ``max_n_items_basket``,
          ``std_n_items_basket`` - per user statistics of distinct items per basket;
        * ``factor_<i>`` / ``user_factor_<i>`` - item and user factors read
          from ``MainRecommender(...).model``.

        Every block is attached with an inner merge, so pairs whose user or
        item has no row in a block (e.g. no factors) are dropped.
    """
    X = data_train_lvl_1.copy()
    X['hour'] = X['trans_time'] // 100
    X['weekday'] = X['day'] % 7

    # Per (user, item): median purchase hour and weekday.
    user_item_features = (
        X.groupby(['user_id', 'item_id'])[['hour', 'weekday']]
        .median()
        .reset_index()
        .rename(columns={'hour': 'median_sales_hour', 'weekday': 'median_weekday'})
    )

    # Per user: mean interval between visits. All three statistics come from
    # one aggregation so they share the same row per user. The previous code
    # divided a user_id-indexed Series by a 0..n-1-indexed one, which pandas
    # aligns by label, attaching the values to the wrong users (or NaN).
    visits = X.groupby('user_id')['day'].agg(['min', 'max', 'nunique']).reset_index()
    visits['mean_visits_interval'] = (visits['max'] - visits['min']) / visits['nunique']

    # Per user: average basket total.
    basket_totals = X.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
    mean_check = basket_totals.groupby('user_id')['sales_value'].mean().reset_index()
    mean_check.columns = ['user_id', 'mean_check']

    # Per item: number of distinct stores it was sold in.
    n_stores = X.groupby('item_id')['store_id'].nunique().reset_index()
    n_stores.columns = ['item_id', 'n_stores']

    # Per user: distinct items and number of transaction rows.
    user_counts = (
        X.groupby('user_id')['item_id']
        .agg(n_items='nunique', n_transactions='count')
        .reset_index()
    )

    # Per user: statistics of the number of distinct items per basket.
    basket_sizes = X.groupby(['user_id', 'basket_id'])['item_id'].nunique().reset_index()
    basket_stats = (
        basket_sizes.groupby('user_id')['item_id']
        .agg(mean_n_items_basket='mean', max_n_items_basket='max', std_n_items_basket='std')
        .reset_index()
    )

    # NOTE(review): the output recorded in this notebook for a call to this
    # function is "TypeError: __init__() missing 1 required positional
    # argument: 'top_popular_n'", presumably raised by this constructor -
    # confirm the signature in src/recommenders.py.
    recommender = MainRecommender(X)
    n_factors = recommender.model.factors

    item_ids = list(recommender.id_to_itemid.values())
    item_factors = pd.DataFrame(recommender.model.item_factors, index=item_ids).reset_index()
    item_factors.columns = ['item_id'] + ['factor_' + str(i + 1) for i in range(n_factors)]

    user_ids = list(recommender.id_to_userid.values())
    user_factors = pd.DataFrame(recommender.model.user_factors, index=user_ids).reset_index()
    user_factors.columns = ['user_id'] + ['user_factor_' + str(i + 1) for i in range(n_factors)]

    # Attach every block with an inner merge on its key.
    feature_blocks = [
        (visits[['user_id', 'mean_visits_interval']], 'user_id'),
        (mean_check, 'user_id'),
        (n_stores, 'item_id'),
        (user_counts, 'user_id'),
        (basket_stats, 'user_id'),
        (item_factors, 'item_id'),
        (user_factors, 'user_id'),
    ]
    for block, key in feature_blocks:
        user_item_features = user_item_features.merge(block, on=[key])

    return user_item_features

In [12]:
# Build the per-(user, item) feature table from the filtered level-1 data.
# NOTE(review): the output recorded right after this call is
# "TypeError: __init__() missing 1 required positional argument: 'top_popular_n'",
# so with the imported MainRecommender this cell did not complete.
user_item_features = get_user_item_features(data_train_lvl_1)

TypeError: __init__() missing 1 required positional argument: 'top_popular_n'

In [None]:
def get_candidates(data_train_lvl_1, data_train_lvl_2, N, add_to_lvl_2):
    """Return a frame with one candidate list per level-2 user.

    Users are taken from ``data_train_lvl_2`` (in order of first appearance),
    followed by ``add_to_lvl_2`` when it is non-empty. Users that also occur
    in ``data_train_lvl_1`` get ``recommender.get_own_recommendations(user, N)``;
    all others get the first ``N`` entries of
    ``recommender.overall_top_purchases``.

    Returns a DataFrame with the columns ``user_id`` and ``candidates``.

    NOTE(review): a recorded traceback in this notebook suggests that
    MainRecommender may require a ``top_popular_n`` argument - confirm the
    constructor signature in src/recommenders.py.
    """
    recommender = MainRecommender(data_train_lvl_1)

    known_users = set(data_train_lvl_1['user_id'].unique())
    target_users = data_train_lvl_2['user_id'].unique().tolist()
    if add_to_lvl_2:
        target_users.extend(add_to_lvl_2)

    result = pd.DataFrame(target_users, columns=['user_id'])

    # Users already present in the level-1 data: personal recommendations.
    is_known = result['user_id'].isin(known_users)
    result.loc[is_known, 'candidates'] = result.loc[is_known, 'user_id'].apply(
        lambda user: recommender.get_own_recommendations(user, N))

    # Users without level-1 history: overall top purchases.
    is_new = ~is_known
    if is_new.any():
        result.loc[is_new, 'candidates'] = result.loc[is_new, 'user_id'].apply(
            lambda user: recommender.overall_top_purchases[:N])

    return result



def get_targets_lvl_2(data_train_lvl_1, data_train_lvl_2, user_item_features, N, add_to_lvl_2=None):
    """Build the level-2 (ranking) dataset: candidate pairs, target and features.

    Parameters
    ----------
    data_train_lvl_1 : pandas.DataFrame
        Transactions passed to ``get_candidates`` to generate candidates.
    data_train_lvl_2 : pandas.DataFrame
        Transactions that define the positive class: a candidate pair gets
        ``target == 1`` when the (user_id, item_id) pair occurs here.
    user_item_features : pandas.DataFrame
        Features keyed by ``user_id`` and ``item_id``, left-joined onto the pairs.
    N : int
        Number of candidates requested per user.
    add_to_lvl_2 : list, optional
        Extra user ids forwarded to ``get_candidates``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate pair (times the number of matching purchase
        rows) with ``user_id``, ``item_id``, ``target`` and the feature columns.
    """
    users_lvl_2 = get_candidates(data_train_lvl_1, data_train_lvl_2, N, add_to_lvl_2)

    # Repeat each user id by the actual length of its candidate list. A fixed
    # repeat(N) raises a length-mismatch error as soon as one list is not
    # exactly N items long.
    candidate_lists = users_lvl_2['candidates'].values
    list_lengths = [len(items) for items in candidate_lists]
    df = pd.DataFrame({'user_id': np.repeat(users_lvl_2['user_id'].values, list_lengths),
                       'item_id': np.concatenate(candidate_lists)})

    # NOTE(review): pairs bought more than once appear several times here, so
    # the left merge below duplicates those candidate rows - confirm whether
    # that weighting is intended.
    targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
    targets_lvl_2['target'] = 1

    targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
    # Assign back instead of fillna(inplace=True) on a column selection, which
    # is not guaranteed to modify the frame (no-op under Copy-on-Write).
    targets_lvl_2['target'] = targets_lvl_2['target'].fillna(0)

    targets_lvl_2 = targets_lvl_2.merge(
        user_item_features, on=['user_id', 'item_id'], how='left')

    return targets_lvl_2
# Number of candidates requested per user, then the level-2 training table
# built from the level-1/level-2 splits and the feature table.
N = 500
targets_lvl_2 = get_targets_lvl_2(data_train_lvl_1, data_train_lvl_2, user_item_features, N, add_to_lvl_2)

In [None]:
# Columns fed to the level-2 models.
# NOTE(review): in the code visible in this notebook the level-2 table is only
# merged with the output of get_user_item_features, which does not create the
# item/user demographic columns or the '*_n_item_categories_basket' columns
# listed here - confirm where they come from before selecting them.
SELECTED_FEATURES_NAMES = (
    [
        # item attributes
        'brand', 'manufacturer_freq', 'department_freq', 'commodity_desc_freq',
        'sub_commodity_desc_freq', 'curr_size_of_product_freq',
        'commodity_category',
        # user demographics
        'age_desc', 'marital_status_code', 'income_desc',
        # The comma after 'hh_comp_desc' was missing, so Python concatenated it
        # with the next literal into the single name 'hh_comp_descmanufacturer'.
        'homeowner_desc', 'hh_comp_desc',

        'manufacturer',

        # behavioural features
        'median_sales_hour', 'median_weekday', #'mean_visits_interval',
        'mean_check',
        'n_stores', 'n_items', 'n_transactions',
        'mean_n_items_basket', 'max_n_items_basket', 'std_n_items_basket',
        'mean_n_item_categories_basket', 'max_n_item_categories_basket',
        'std_n_item_categories_basket',
    ]
    # factor_1 .. factor_20 and user_factor_1 .. user_factor_20
    + [f'factor_{i}' for i in range(1, 21)]
    + [f'user_factor_{i}' for i in range(1, 21)]
)
# Subset of the columns above that LightGBM treats as categorical and that is
# excluded from the CatBoost feature list.
categorical = ['marital_status_code','homeowner_desc', 'hh_comp_desc', 'manufacturer','commodity_category']

In [None]:
# CatBoost feature list: the selected features minus the categorical ones.
SELECTED_FEATURES_NAMES_cb = [name for name in SELECTED_FEATURES_NAMES if name not in categorical]


def run_model_cb(targets_lvl_2):
    """Fit a CatBoost classifier on the level-2 table and return it.

    The rows of ``targets_lvl_2`` are split 80/20 (stratified on ``target``,
    ``random_state=16``). Features are the ``SELECTED_FEATURES_NAMES_cb``
    columns with missing values replaced by 0. The 20% part is passed as the
    evaluation set, so ``early_stopping_rounds`` applies to its AUC.
    """
    features = targets_lvl_2[SELECTED_FEATURES_NAMES_cb].fillna(0)
    labels = targets_lvl_2[['target']]
    X_train, X_valid, y_train, y_valid = train_test_split(
        features, labels,
        test_size=0.2, random_state=16,
        stratify=labels)

    train_pool = Pool(data=X_train, label=y_train)
    valid_pool = Pool(data=X_valid, label=y_valid)

    model_cb = CatBoostClassifier(
        n_estimators=5000,
        loss_function="Logloss",
        eval_metric="AUC",
        task_type="CPU",
        max_bin=30,
        early_stopping_rounds=30,
        verbose=1000,
        l2_leaf_reg=80,
        thread_count=6,
        random_seed=51,
    )
    model_cb.fit(train_pool, eval_set=[valid_pool])

    return model_cb

In [None]:
# Train CatBoost on the level-2 table and print the full parameter set of the
# fitted model.
model_cb = run_model_cb(targets_lvl_2)

print(model_cb.get_all_params())
def run_model_lgb(targets_lvl_2):
    """Train a LightGBM binary classifier on the level-2 table and return the booster.

    The rows of ``targets_lvl_2`` are split 80/20 (stratified on ``target``,
    ``random_state=16``). Features are the ``SELECTED_FEATURES_NAMES`` columns
    with missing values replaced by 0; the columns listed in ``categorical``
    are declared as categorical features. Both the train and the validation
    parts are passed as evaluation sets.
    """
    features = targets_lvl_2[SELECTED_FEATURES_NAMES].fillna(0)
    labels = targets_lvl_2[['target']]
    X_train, X_valid, y_train, y_valid = train_test_split(
        features, labels,
        test_size=0.2, random_state=16,
        stratify=labels)

    train_set = lgb.Dataset(X_train, y_train, categorical_feature=categorical)
    valid_set = lgb.Dataset(X_valid, y_valid, categorical_feature=categorical)

    params_lgb = dict(
        objective="binary",
        metric="auc",
        num_boost_round=10000,
        n_jobs=8,
        force_row_wise=True,
        seed=24,
    )

    # NOTE(review): verbose_eval / early_stopping_rounds are passed as keyword
    # arguments of lgb.train, exactly as before - confirm that the installed
    # LightGBM version still accepts them.
    model_lgb = lgb.train(params=params_lgb,
                          train_set=train_set,
                          valid_sets=[train_set, valid_set],
                          categorical_feature=categorical,
                          verbose_eval=1000,
                          early_stopping_rounds=30)

    return model_lgb
# Train LightGBM on the level-2 table; the trailing comment records the best
# iteration and AUC values from an earlier run.
model_lgb = run_model_lgb(targets_lvl_2)

#[1322]	training's auc: 0.954272	valid_1's auc: 0.912695

In [None]:
# Score every row of the level-2 table with both models.
# NOTE: this is the same table the models were fitted on (80% of its rows
# were their training split), so the AUC below is not a hold-out estimate.
predictions_lgb_train = model_lgb.predict(
    targets_lvl_2[SELECTED_FEATURES_NAMES].fillna(0))
predictions_cb_train = model_cb.predict_proba(
    targets_lvl_2[SELECTED_FEATURES_NAMES_cb].fillna(0))[:, 1]

# Ensemble score: row-wise mean of the two models' outputs.
preds_train = pd.DataFrame({'lgb': predictions_lgb_train,
                            'cb': predictions_cb_train}).mean(axis=1).values
roc_auc_score(targets_lvl_2['target'], preds_train)

def get_predictions(targets_lvl_2, raw_predictions, prefix='lgb', k=5):
    """Turn row-level scores into a top-``k`` item list per user.

    Parameters
    ----------
    targets_lvl_2 : pandas.DataFrame
        Frame with ``user_id`` and ``item_id`` columns, one row per scored
        pair (pairs may repeat). It is not modified.
    raw_predictions : array-like
        One score per row of ``targets_lvl_2``.
    prefix : str, default 'lgb'
        The output column is named ``prefix + '_recommendations'``.
    k : int, default 5
        Number of items kept per user.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``<prefix>_recommendations``; each cell holds
        the user's items ordered by decreasing score.
    """
    # assign() works on a fresh frame, so nothing is written into a selection
    # of the caller's table (the previous column assignment on the
    # two-column selection is the SettingWithCopy pattern).
    scored = targets_lvl_2[['user_id', 'item_id']].assign(predictions=raw_predictions)

    # Repeated (user, item) rows are collapsed to their median score.
    scored = scored.groupby(['user_id', 'item_id'])['predictions'].median().reset_index()
    top_k = scored.sort_values(['predictions'], ascending=False).groupby(['user_id']).head(k)

    # unique() keeps first-appearance order, i.e. the descending score order.
    recommendations = top_k.groupby('user_id')['item_id'].unique().reset_index()
    recommendations.columns = ['user_id', prefix + '_recommendations']

    return recommendations


def get_results(data_val_lvl_2, targets_lvl_2, preds_lgb, preds_cb, combined_preds,
                actual_col='item_id'):
    """Collect ground truth and the three recommendation lists per user.

    Parameters
    ----------
    data_val_lvl_2 : pandas.DataFrame
        Validation frame with ``user_id`` and the ground-truth column.
    targets_lvl_2 : pandas.DataFrame
        Scored candidate pairs, forwarded to ``get_predictions``.
    preds_lgb, preds_cb, combined_preds : array-like
        Row-level scores for ``targets_lvl_2`` from LightGBM, CatBoost and
        their ensemble.
    actual_col : str, default 'item_id'
        Column of ``data_val_lvl_2`` whose unique values per user form the
        ``actual`` column. The default keeps the previous behaviour; pass
        another name when the ground truth is stored under a different column.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``actual``, ``lgb_recommendations``,
        ``cb_recommendations`` and ``cb_lgb_recommendations``. Users without
        scored candidates get NaN in the recommendation columns (left merge).
    """
    result = data_val_lvl_2.groupby('user_id')[actual_col].unique().reset_index()
    result.columns = ['user_id', 'actual']

    named_predictions = zip(('lgb', 'cb', 'cb_lgb'),
                            (preds_lgb, preds_cb, combined_preds))
    for prefix, preds in named_predictions:
        recommendations = get_predictions(targets_lvl_2, preds, prefix)
        result = result.merge(recommendations, on='user_id', how='left')

    return result
# Per-user ground truth from the level-2 validation window next to the top
# recommendations of LightGBM, CatBoost and their averaged ensemble.
result_lvl_2 = get_results(data_val_lvl_2, targets_lvl_2, 
                           predictions_lgb_train,
                           predictions_cb_train, 
                           preds_train)

In [None]:
# Mean precision_at_k (k=5) over the rows of result_lvl_2 for each
# recommendation column; the trailing comments record values from an earlier run.
# NOTE(review): if these three expressions share one cell, only the last
# value is displayed - confirm the original cell boundaries.
#LightGBM
result_lvl_2.apply(lambda row: precision_at_k(row['lgb_recommendations'], row['actual'], 5), axis=1).mean()
# 0.29441723800195807

#CatBoost
result_lvl_2.apply(lambda row: precision_at_k(row['cb_recommendations'], row['actual'], 5), axis=1).mean()
# 0.2863858961802145

#Ensemble
result_lvl_2.apply(lambda row: precision_at_k(row['cb_lgb_recommendations'], row['actual'], 5), axis=1).mean()
# 0.30499510284035214

In [None]:
# Final run: hold out the trailing weeks of `data` and load the test file.
validation_weeks = 6
valid_start_week = data['week_no'].max() - validation_weeks

# .copy() detaches the splits from `data`: data_train is modified later by
# prefilter_items, and writing into a bare slice triggers
# SettingWithCopyWarning (see the warning recorded earlier in this notebook).
data_train = data[data['week_no'] < valid_start_week].copy()
data_valid = data[data['week_no'] >= valid_start_week].copy()
test = pd.read_csv('predictions_basic.csv')

# Distinct users of the train window, the validation window and the test file.
users_lvl_1 = data_train.user_id.unique()
users_lvl_2 = data_valid.user_id.unique()
users_lvl_3 = test.user_id.unique()

# Users first seen in the validation window / only in the test file.
new_users_lvl_2 = list(set(users_lvl_2) - set(users_lvl_1))
new_users_lvl_3 = list(set(users_lvl_3) - (set(users_lvl_1) | set(users_lvl_2)))

# Test users absent from the validation window; they are appended to the
# level-2 user list when candidates are generated.
add_to_lvl_2 = list(set(users_lvl_3) - (set(users_lvl_2)))

new_users_lvl_2, new_users_lvl_3, len(add_to_lvl_2)

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender
import os

os.chdir('src')
%run utils.py
n_items_before = data['item_id'].nunique()
data_train = prefilter_items(data_train, item_features=item_features, take_n_popular=5000)
n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))
user_item_features = get_user_item_features(data_train)
user_item_features.head(2)
import os
os.chdir('src')
%run recommenders.py
targets_test = get_targets_lvl_2(data_train, data_valid, user_item_features, N, add_to_lvl_2)

print(f'число пользователей: {targets_test.user_id.nunique()}')
print(f'среднее число покупок: {round(targets_test["target"].mean(), 4)}')

targets_test.head(2)
# Feature set for the final run: behavioural features plus the 20 item
# factors and 20 user factors. No column is treated as categorical.
SELECTED_FEATURES_NAMES = (
    [
        'median_sales_hour', 'median_weekday',
        'mean_check',
        'n_stores', 'n_items', 'n_transactions',
        'mean_n_items_basket', 'max_n_items_basket',
    ]
    + [f'factor_{i}' for i in range(1, 21)]
    + [f'user_factor_{i}' for i in range(1, 21)]
)
categorical = []

In [None]:
# Retrain both models on the final level-2 table.
model_lgb = run_model_lgb(targets_test)
# [1558]	training's auc: 0.900737	valid_1's auc: 0.87275

# run_model_cb reads this module-level list, so it is rebuilt from the
# current feature selection before the CatBoost fit.
SELECTED_FEATURES_NAMES_cb = [name for name in SELECTED_FEATURES_NAMES if name not in categorical]
model_cb = run_model_cb(targets_test)
# bestTest = 0.8620786343
# bestIteration = 3899

# Score every row of the table with both models and average the two outputs.
predictions_lgb_test = model_lgb.predict(
    targets_test[SELECTED_FEATURES_NAMES].fillna(0))
predictions_cb_test = model_cb.predict_proba(
    targets_test[SELECTED_FEATURES_NAMES_cb].fillna(0))[:, 1]

preds_test = pd.DataFrame({'lgb': predictions_lgb_test,
                           'cb': predictions_cb_test}).mean(axis=1).values
roc_auc_score(targets_test['target'], preds_test)
# 0.8887098749469134

In [None]:
def get_results_1(data_val_lvl_2, targets_lvl_2, preds_lgb, preds_cb, combined_preds):
    """Collect per-user ground truth and recommendation lists.

    Reads the ground truth from the ``actual`` column of ``data_val_lvl_2``
    (unique values per ``user_id``) and left-joins the ``lgb``, ``cb`` and
    ``cb_lgb`` recommendation columns produced by ``get_predictions``.

    NOTE(review): the unique values of ``actual`` are taken as-is - confirm
    how that column is stored in the frame passed in (e.g. one item per row
    versus a serialized list per user).
    """
    ground_truth = data_val_lvl_2.groupby('user_id')['actual'].unique()
    result = ground_truth.reset_index()
    result.columns = ['user_id', 'actual']

    named_predictions = zip(('lgb', 'cb', 'cb_lgb'),
                            (preds_lgb, preds_cb, combined_preds))
    for prefix, preds in named_predictions:
        recommendations = get_predictions(targets_lvl_2, preds, prefix)
        result = result.merge(recommendations, on='user_id', how='left')

    return result
# Per-user ground truth from the test file next to the top recommendations
# of LightGBM, CatBoost and their averaged ensemble.
result_test = get_results_1(test, targets_test, 
                           predictions_lgb_test,
                           predictions_cb_test, 
                           preds_test)

In [None]:
# Mean precision_at_k (k=5) over the rows of result_test for each
# recommendation column; the trailing comments record values from an earlier run.
#LightGBM
result_test.apply(lambda row: precision_at_k(row['lgb_recommendations'], row['actual'], 5), axis=1).mean()
# 0.3131034482758621

#CatBoost
result_test.apply(lambda row: precision_at_k(row['cb_recommendations'], row['actual'], 5), axis=1).mean()
# 0.33389920424403186

#Ensemble
result_test.apply(lambda row: precision_at_k(row['cb_lgb_recommendations'], row['actual'], 5), axis=1).mean()
# 0.3335809018567639
### Сохранение результатов

# Write the ensemble recommendations (one row per user) to a CSV file in the
# current working directory, without the index column.
df = result_test[['user_id', 'cb_lgb_recommendations']].copy()
df.to_csv('predictions_2.csv', index=False)