In [1]:
"""
04-03-23 - Added if statements for DO_LOCAL_VAL. Now test and cv use same pipeline
04-02-23 - For getting test preds, don't need to downsample or train
         - X*Make extra script for just getting final test preds
         - X*Maybe we can if statement the downsample and train part
           This is useful to reduce human error in using the same data pipeline
04-01-23 - To compare HP sweeps, should only look at clicks/carts/orders, not overall
         - Updated lgbm code to use for loop. Made d_preds do clicks, carts, orders
03-27-23 - Removed get_cands50 code. Instead, using suggest_preds with num_cand=50
03-27-23 - 2 parts, the CV, and the final preds for test
           *How to split the two? should probably use 1 nb? Memory usage?
03-26-23 - Moved get_recall into otto_utils.py
03-21-23 - Moved covisit preprocessing into function
         - Created test_aids_types.pkl file so we load instead of process
03-20-23 - Removed LGBM code
03-18-23 - Created separate val data in pkl dictionary
         - Now uniformly sampled across the week instead of chronologically
03-17-23 - Couldn't get starmap or partial+map to work faster than orig method
03-17-23 - Moved covisit code into otto_utils.py
03-16-23 - Make covisit preprocess indep func.
""";

In [2]:
import os
from collections import Counter
import warnings
from itertools import product

import pandas as pd
import numpy as np
from pandarallel import pandarallel
import mlflow

from otto_utils import trange, preprocess_covisits, get_preds, get_recall
# Below are ranker functions
from otto_utils import iterate_dict, create_config
from otto_utils import get_cands_pl, make_feats_radek_pl, make_all_feats_pl
from otto_utils import downsample_neg, train_lgbm, get_preds_lgbm

pandarallel.initialize(progress_bar=False)
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# The suggest_* functions live in the notebook rather than in utils.py: a
# version moved to utils.py could not match this speed. They read the
# module-level top_20_clicks / top_20_buys / top_20_buy2buy lookups.
num_cands = 50
def suggest_clicks(event, num_cands=num_cands, type_weights1={0: 0.5, 1: 9, 2:0.5}):
    """Build the click candidate list for one session.

    Args:
        event: pair ``(aids, types)`` holding the session's item ids and the
            matching event type ids, oldest first.
        num_cands: maximum number of candidates to return.
        type_weights1: multiplier per event type id, applied when re-ranking
            the session's own items (read only, never mutated).

    Returns:
        A list of at most ``num_cands`` item ids. Long sessions (at least
        ``num_cands`` distinct items) are answered purely by re-ranking their
        own items. Shorter sessions return their own items (most recent first),
        then co-visited items from ``top_20_clicks``, then items from
        ``popul_20_clicks`` as back-fill.
    """
    aids, types = event
    # Reverse before de-duplicating so that index 0 is the most recent item.
    unique_aids = list(dict.fromkeys(aids[::-1]))

    if len(unique_aids) >= num_cands:
        # Enough history: score each item by recency (log-spaced weights that
        # grow towards the end of the session) times its event-type weight,
        # accumulating over repeated occurrences.
        time_weights = np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        scores = {}
        for time_w, aid, type_aid in zip(time_weights, aids, types):
            scores[aid] = scores.get(aid, 0) + time_w*type_weights1[type_aid]
        ranked = sorted(scores, key=scores.get, reverse=True)
        return ranked[:num_cands]

    # Short history: pool the "clicks" co-visitation neighbours of every item,
    # most recent item first.
    pooled = [cand
              for aid in unique_aids if aid in top_20_clicks
              for cand in top_20_clicks[aid]]
    # Keep the most frequently suggested neighbours that are not already in
    # the session.
    in_session = set(unique_aids)
    top_candidates = [cand
                      for cand, _ in Counter(pooled).most_common(num_cands)
                      if cand not in in_session]
    result = unique_aids + top_candidates[:num_cands-len(unique_aids)]
    # Back-fill any remaining slots with popular click items not yet present.
    taken = set(result)
    backfill = [aid for aid in popul_20_clicks if aid not in taken]
    result += backfill[:num_cands - len(result)]
    return result

def suggest_carts(event, num_cands=num_cands, type_weights1={0: 0.5, 1: 9, 2:0.5}):
    """Build the cart/order candidate list for one session.

    Args:
        event: pair ``(aids, types)`` holding the session's item ids and the
            matching event type ids, oldest first.
        num_cands: maximum number of candidates to return.
        type_weights1: multiplier per event type id, applied when re-ranking
            the session's own items (read only, never mutated).

    Returns:
        A list of at most ``num_cands`` item ids. Sessions with at least 20
        distinct items are re-ranked from their own items plus a small bonus
        for ``top_20_buys`` neighbours. Shorter sessions return their own
        items, then co-visitation candidates from ``top_20_buys``,
        ``top_20_clicks`` and ``top_20_buy2buy``, then ``popul_20_carts`` items
        as back-fill.
    """
    aids, types = event
    # Reverse before de-duplicating so that index 0 is the most recent item.
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # NOTE(review): `typ<2` keeps type ids 0 and 1 although the name says
    # "buys" -- confirm this filter is intended.
    unique_buys = list(dict.fromkeys(
        [aid for aid, typ in zip(aids, types) if typ<2][::-1]
    ))  # idx=0 = most recent

    # NOTE(review): this threshold is a fixed 20 while suggest_clicks compares
    # against num_cands -- confirm the difference is deliberate.
    if len(unique_aids) >= 20:
        # Score the session's own items by recency weight times type weight,
        # accumulating over repeated occurrences.
        time_weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        scores = {}
        for time_w, aid, type_aid in zip(time_weights, aids, types):
            scores[aid] = scores.get(aid, 0) + time_w*type_weights1[type_aid]
        # Every appearance of an item among the "buys" co-visitation
        # neighbours adds a small fixed bonus.
        for aid in unique_buys:
            if aid in top_20_buys:
                for cand in top_20_buys[aid]:
                    scores[cand] = scores.get(cand, 0) + 0.03
        ranked = sorted(scores, key=scores.get, reverse=True)
        return ranked[:num_cands]

    # Short history: pool candidates from the three co-visitation lookups.
    # Each list is most-recent-item first.
    from_buys = [cand
                 for aid in unique_aids if aid in top_20_buys
                 for cand in top_20_buys[aid]]
    from_clicks = [cand
                   for aid in unique_buys if aid in top_20_clicks
                   for cand in top_20_clicks[aid]]
    from_buy2buy = [cand
                    for aid in unique_buys if aid in top_20_buy2buy
                    for cand in top_20_buy2buy[aid]]
    # Original author's note: Deotte's version uses 20 here rather than a
    # larger cut-off.
    in_session = set(unique_aids)
    pooled_counts = Counter(from_buys+from_clicks+from_buy2buy)
    top_candidates = [cand
                      for cand, _ in pooled_counts.most_common(num_cands)
                      if cand not in in_session]
    result = unique_aids + top_candidates[:num_cands-len(unique_aids)]
    # Back-fill any remaining slots with popular cart items not yet present.
    taken = set(result)
    backfill = [aid for aid in popul_20_carts if aid not in taken]
    result += backfill[:num_cands - len(result)]
    return result

In [4]:
# CV_NUM selects which pre-sampled validation file is loaded:
# 0: 5%, 1: 10%, 2: 25%, 3: 50%, 4: 100%
CV_NUM = 4
DO_LOCAL_VALIDATION=True
# DO_LOCAL_VALIDATION=False
# DATE tags every dated data / covisit file read in this cell; the paths below
# are derived from it so the tag only has to be changed in one place.
DATE = 230313

## 3-15-23 Adding back covisit matrix code
# Data + data_test had best covisit matrix performance
data = pd.read_parquet(f'data/{DATE}_df_train.pqt')
data_test = pd.read_parquet(f'data/{DATE}_df_test.pqt')
d_id2type = pd.read_pickle('data/d_id2type.pkl')
d_type2id = pd.read_pickle('data/d_type2id.pkl')
df_all_sm = pd.concat([data, data_test]).reset_index(drop=True)
trange(df_all_sm)

# Make all covisit matrices if they do not exist
preprocess_covisits(df_all_sm, DATE)

top_20_clicks = pd.read_pickle(f'covisit/{DATE}_top_20_clicks_data_datatest.pkl')
top_20_buys = pd.read_pickle(f'covisit/{DATE}_top_20_buys_data_datatest.pkl')
top_20_buy2buy = pd.read_pickle(f'covisit/{DATE}_top_20_buy2buy_data_datatest.pkl')

if DO_LOCAL_VALIDATION:
    # Train weeks 1-3, and validate on 4
    df_train = pd.read_parquet(f'data/{DATE}_train_1to3.pqt')
    df_val = pd.read_parquet(f'data/{DATE}_val.pqt')
    d = pd.read_pickle(f'data/preload/{DATE}_val_aids_types_{CV_NUM}.pkl')
else:
    # Keep train to 4 weeks to maintain input distribution for LGBMranker
    # Train weeks 2-4; df_val is the test set in this mode
    df_train = pd.read_parquet(f'data/{DATE}_train_2to4.pqt')
    df_val = pd.read_parquet(f'data/{DATE}_df_test.pqt')
    d = pd.read_pickle('data/preload/test_aids_types.pkl')
# Both preload files expose the per-session item ids and event types under
# the same two keys.
test_aids, test_types = d['aids'], d['types']

# One (aids, types) pair per session: the input format of the suggest_* functions.
t = list(zip(test_aids, test_types))
print(len(t)*3)
print(df_train.shape, df_val.shape)

## {0: 'clicks', 1: 'carts', 2: 'orders'}
# 20 most frequent items per event type in df_val, used as back-fill candidates.
popul_20_clicks = df_val.loc[df_val['type']==0,'aid'].value_counts().index.values[:20].tolist()
popul_20_carts = df_val.loc[df_val['type']== 1,'aid'].value_counts().index.values[:20].tolist()
popul_20_orders = df_val.loc[df_val['type']==2,'aid'].value_counts().index.values[:20].tolist()

2022-07-31 22:00:00
2022-09-04 21:59:51
34 days 23:59:51
*Start covisit preprocessing
top_20_clicks already exists
top_20_buys already exists
top_20_buy2buy already exists
*Finished covisit preprocessing
5015409
(163555218, 4) (6928123, 4)


In [5]:
%%time
# Run both suggest functions over every session in `t`, clicks first, then
# carts. get_preds is an otto_utils helper; presumably it applies the given
# function to each (aids, types) pair.
pclicks, pcarts = (get_preds(t, suggest_fn)
                   for suggest_fn in (suggest_clicks, suggest_carts))
# Orders reuse the cart candidates, so the layout is clicks + carts + carts.
preds = pclicks + pcarts + pcarts

# The same candidates truncated to the first 20 per session, in the same layout.
pclicks20, pcarts20 = ([cands[:20] for cands in per_session]
                       for per_session in (pclicks, pcarts))
preds20 = pclicks20 + pcarts20 + pcarts20

CPU times: user 38.2 s, sys: 7.93 s, total: 46.2 s
Wall time: 1min 1s


In [6]:
# Session ids come from the index of test_aids. The frame stacks three copies
# of them (clicks, carts, orders), matching the clicks + carts + orders layout
# of `preds`.
sess = test_aids.index.to_list()
sub = {
    'session': sess*3,
    'type': [atype for atype in ('clicks', 'carts', 'orders') for _ in sess],
}
submission = pd.DataFrame(sub)

if not DO_LOCAL_VALIDATION:
    # Test mode: labels are written as one space-separated string per row.
    l_preds = [' '.join(map(str, lls)) for lls in preds]
    submission['labels'] = l_preds
else:
    # Local validation keeps the raw candidate lists for get_recall.
    submission['labels'] = preds
submission.head(3)

Unnamed: 0,session,type,labels
0,12899779,clicks,59625 1253524 737445 438191 731692 1790770 942...
1,12899780,clicks,1142000 736515 973453 582732 1502122 889686 48...
2,12899781,clicks,918667 199008 194067 57315 141736 1460571 7594...


In [7]:
%%time
# Compare covisit 20 and 50 cand recall
# Only meaningful in local validation mode; in test mode this cell does nothing.
if DO_LOCAL_VALIDATION:
    # First call scores the labels currently stored in `submission`.
    get_recall(submission, DO_LOCAL_VALIDATION, DATE)
    # Overwrites the labels column in place with the top-20 candidates, so
    # `submission` holds preds20 after this cell has run.
    submission['labels'] = preds20
    get_recall(submission, DO_LOCAL_VALIDATION, DATE)

clicks Recall: 0.6425
carts Recall: 0.5051
orders Recall: 0.6914
Overall Recall: 0.63064
CPU times: user 6.71 s, sys: 2.46 s, total: 9.17 s
Wall time: 17.4 s


(0.6424781171022736,
 0.5051080625090656,
 0.6914356279083776,
 0.6306416072079737)

In [None]:
%%time
# Submit submission.zip to the OTTO competition through the kaggle CLI, with
# INFO as the submission message.
# NOTE(review): no visible cell writes submission.zip -- confirm the file is
# produced (and up to date) before running this cell.
INFO = "0307 refactored covisit with lgbmranker carts and orders, no embeddings"
# INFO is interpolated into a shell command inside single quotes, so it must
# not contain a single quote.
s = f"kaggle competitions submit -c otto-recommender-system -f submission.zip -m '{INFO}'"
# os.system returns the command's exit status, which is displayed as the cell output.
os.system(s)

In [7]:
# START LGBM SECTION
# Notes: to pick the best hyperparameters (HPs), run the sweep, sort the
# results by action type (atype), and use the best-scoring HPs for each atype
# separately. DO NOT USE OVERALL RECALL to choose them.
# First build a dictionary of model hyperparameter sweeps, then combine it
# with the data-preprocessing hyperparameters.
def create_model_configs(model)->list:
    """Expand the hyperparameter grid of `model` into one dict per combination.

    Every key of the grid maps to a list of values to try; the result holds
    the Cartesian product of those lists.

    Args:
        model: name of the model family. Only ``'lgbm'`` is configured.

    Returns:
        A list of dicts, one per hyperparameter combination, keyed by
        parameter name.

    Raises:
        ValueError: if `model` is not a configured model name.
    """
    # Fail with a clear message instead of hitting an unbound local `d` below.
    if model != 'lgbm':
        raise ValueError(f"Unsupported model {model!r}; only 'lgbm' is configured")

    d = {'model': ['lgbm'], # baseline highest score with extra trees
         'boosting_type': ['gbdt'], # ['gbdt', 'dart']
         'n_estimators': [499,],
         'subsample': [0.8,],
         'subsample_freq': [1],
         'learning_rate': [0.1,],
         'num_leaves': [50], # less than 2^max_depth
         'min_data_in_leaf': [100],
         'max_depth': [6,],
         'is_unbalance': [False],
         # 'extra_trees': [False, True], # False is better
         'boost_from_average': [True], # default True
         'early_stopping': [50],
         'verbose': [-1],
         'random_state': [42],
         'device': ['gpu'],
         'gpu_platform_id': [0],
         'gpu_device_id': [0], # a6000 = device_id 0
        }

    keys, values = zip(*d.items()) # Gets keys and values in deterministic order
    # One row per combination of values, one column per parameter.
    df_config = pd.DataFrame(list(product(*values)), columns=keys)
    d_configs = df_config.to_dict('records')
    return d_configs

# Model hyperparameter combinations (currently a single LGBM configuration).
cfgl = create_model_configs('lgbm')
cfgs = cfgl

# Data/training hyperparameters; every value is a list of options to sweep.
d_mdl = {
    'num_splits': [5],
    'num_avg': [1], # number of folds to use and then avg. Use 1 for max speed
    'neg_frac': [0.3,],
    'metric': ['map'],
    'eval_at': [20],
    'MDL': cfgs
    }

# iterate_dict / create_config are otto_utils helpers; presumably they expand
# the option lists into one row per configuration -- TODO confirm.
d = {'MDL': iterate_dict(d_mdl)}
df_config = create_config(d)
# Result columns start as floats: the training loop later writes float
# recalls into them with .loc, and assigning a float into an integer column
# is an incompatible-dtype assignment in recent pandas.
df_config['recall'] = 0.
df_config['r_click'] = 0.
df_config['r_cart'] = 0.
df_config['r_order'] = 0.
df_config

Unnamed: 0,MDL,recall,r_click,r_cart,r_order
0,"{'num_splits': 5, 'num_avg': 1, 'neg_frac': 0....",0.0,0,0,0


In [8]:
# Minor preprocessing hacks to get the loop working. Refactor in future.
# Predictions are kept per action type in a dict so that each entry can be
# replaced once the LGBM ranker has produced predictions for it. Clicks keep
# the top-20 covisit list; carts and orders start from the full cart list.
dpreds = dict(clicks=pclicks20, carts=pcarts, orders=pcarts)

# Sessions were subsampled for speed, so restrict df_val to the ones in `sess`.
df_val_cv = df_val.loc[df_val['session'].isin(sess)].reset_index(drop=True)
df_val_cv.head(3)

Unnamed: 0,session,ts,type,aid
0,12899779,1661724000,0,59625
1,12899780,1661724000,0,1142000
2,12899780,1661724058,0,582732


In [9]:
%%time
# Set LOG = True to record params and metrics in mlflow (local validation only).
LOG = False

# Re-rank carts and orders only; use ['clicks', 'carts', 'orders'] to include
# clicks, or ['carts'] for a quick run.
# NOTE(review): candidates always come from `pcarts`, also for 'clicks' --
# confirm before enabling clicks.
for atype in ['carts', 'orders']:
    print(f'***Starting {atype}')
    # 1) Get features for candidates
    # Make user, item, user-item, covisit features
    # (get_cands_pl / make_feats_radek_pl / make_all_feats_pl are otto_utils
    # helpers; the .sort(by=...) / .to_pandas() calls suggest they return
    # polars frames -- TODO confirm.)
    grps = [len(x) for x in pcarts]  # number of candidates per session
    dfc = get_cands_pl(pcarts, df_train, df_val_cv, atype, grps, DATE)
    df_val_r = make_feats_radek_pl(df_val_cv)
    dfc = make_all_feats_pl(dfc, df_val_r, atype)
    dfc = dfc.sort(by=['user', 'ts'])
    dfc = dfc.to_pandas()
    # Fillna: missing values in these features are replaced by -1
    cols_fillna = ['session_length', 'action_num_reverse_chrono', 'log_recency_score', ]
    for col in cols_fillna:
        dfc[col] = dfc[col].fillna(-1)
    # Change datatypes to save memory
    dfc['ts'] = dfc['ts'].astype('float32')
    dfc['type'] = dfc['type'].astype('float16')
    dfc['session_length'] = dfc['session_length'].astype('float16')
    dfc['action_num_reverse_chrono'] = dfc['action_num_reverse_chrono'].astype('float16')
    # Share of candidate rows whose target column is set.
    print(f'{atype} Density: {dfc[atype].sum()/len(dfc):0.5f}')

    ############# START LGBM TRAINING #############
    # Identifier, target and raw timestamp/type columns are not used as features.
    cols_remove = ['user', 'item', 'carts', 'clicks', 'orders', 'ts', 'type',]
    feats = [col for col in dfc.columns if col not in cols_remove]
    train_plots = []

    # One run per configuration row. Results are written back into df_config
    # by row, so a later atype overwrites the values stored by an earlier one.
    for CFG in df_config.itertuples():
        try:
            with mlflow.start_run(experiment_id = None,
                                  run_name=''):
                if LOG and DO_LOCAL_VALIDATION:
                    mlflow.log_params({k:v for k,v in CFG.MDL.items() if k!='MDL'})
                    mlflow.log_params(CFG.MDL['MDL'])
                    mlflow.log_params({'atype': atype})

                ###### Code
                # Downsampling and training only happen in local validation;
                # test mode goes straight to prediction.
                if DO_LOCAL_VALIDATION:
                    dfc_sm = downsample_neg(dfc, atype, CFG.MDL['neg_frac'])
                    ranker, train_curves = train_lgbm(dfc_sm, atype, feats, CFG.MDL)
                    train_plots.append(train_curves)

                # Replace this atype's covisit predictions with the ranker's
                # and rebuild the clicks + carts + orders list.
                dpreds[atype] = get_preds_lgbm(dfc, feats, atype, CFG.MDL)
                preds = dpreds['clicks'] + dpreds['carts'] + dpreds['orders']

                if DO_LOCAL_VALIDATION:
                    submission['labels'] = preds
                    r_click, r_cart, r_order, recall = get_recall(submission,
                                                                  DO_LOCAL_VALIDATION,
                                                                  DATE)
                    # Validation curve of the configured metric, e.g. 'map@20'.
                    metric_key = f"{CFG.MDL['metric']}@{CFG.MDL['eval_at']}"
                    eval_curve = ranker.evals_result_['valid_0'][metric_key]
                    # Average the last `avg_last` recorded iterations.
                    avg_last=20
                    mmap = np.mean(eval_curve[-avg_last:])

                    df_config.loc[CFG.Index, 'recall'] = recall
                    df_config.loc[CFG.Index, 'r_click'] = r_click
                    df_config.loc[CFG.Index, 'r_cart'] = r_cart
                    df_config.loc[CFG.Index, 'r_order'] = r_order
                    df_config.loc[CFG.Index, 'map'] = mmap
                    if LOG:
                        metrics = {
                            'r_click': r_click,
                            'r_carts': r_cart,
                            'r_orders': r_order,
                            'recall': recall,
                            'stop_iter': len(eval_curve),
                            'map': mmap
                        }
                        mlflow.log_metrics(metrics)
        except Exception as err:
            # Best effort: a failing configuration must not abort the sweep,
            # but report which run failed and why. KeyboardInterrupt is not
            # an Exception subclass, so interrupting the kernel still works.
            print(f'error ({atype}, config {CFG.Index}): {err!r}')

***Starting carts
carts Density: 0.00000
pred fold: 0
***Starting orders
orders Density: 0.00000
pred fold: 0
CPU times: user 1h 5min, sys: 1min 52s, total: 1h 6min 52s
Wall time: 9min 34s


In [12]:
# Postprocess for LGBM+covisit test submission
# Each row's candidate list becomes one space-separated string of item ids.
# `preds` is whatever the last executed cell assigned to it (the LGBM loop
# when run top to bottom).
l_preds = [' '.join([str(l) for l in lls]) for lls in preds]
# Overwrites the labels column of `submission` in place.
submission['labels'] = l_preds
# NOTE(review): get_recall is called here with string labels -- confirm it
# handles this format in the current mode.
get_recall(submission, DO_LOCAL_VALIDATION, DATE)

In [13]:
%%time
# Submit submission.zip to the OTTO competition through the kaggle CLI, with
# INFO as the submission message.
# NOTE(review): no visible cell writes submission.zip -- confirm the file is
# produced (and up to date) before running this cell.
INFO = "0307 lgbm+covisit refactored covisit with lgbmranker carts and orders, no embeddings"
# INFO is interpolated into a shell command inside single quotes, so it must
# not contain a single quote.
s = f"kaggle competitions submit -c otto-recommender-system -f submission.zip -m '{INFO}'"
# os.system returns the command's exit status, which is displayed as the cell output.
os.system(s)



100%|██████████| 344M/344M [03:58<00:00, 1.51MB/s]   

Successfully submitted to OTTO – Multi-Objective Recommender SystemCPU times: user 443 ms, sys: 131 ms, total: 574 ms
Wall time: 4min





0

In [8]:
# Compare after LGBM preds with p20, not p50
# Overwrites the labels column in place with the top-20 covisit candidates
# (preds20), then scores them with the otto_utils recall helper.
submission['labels'] = preds20
get_recall(submission, DO_LOCAL_VALIDATION, DATE)

clicks Recall: 0.5912
carts Recall: 0.4304
orders Recall: 0.6445
Overall Recall: 0.57494


(0.5912450178094085,
 0.4303918649657605,
 0.6445010014607774,
 0.5749426621471354)