In [1]:
"""
03-26-23 - Moved get_recall into otto_utils.py
03-21-23 - Moved covisit preprocessing into function
         - Created test_aids_types.pkl file so we load instead of process
03-20-23 - Removed LGBM code
03-18-23 - Created separate val data in pkl dictionary
         - Now uniformly sampled across the week instead of chronologically
03-17-23 - Couldn't get starmap or partial+map to work faster than orig method
03-17-23 - Moved covisit code into otto_utils.py
03-16-23 - Make covisit preprocess indep func.
""";

In [2]:
import os
import subprocess
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from pandarallel import pandarallel

from otto_utils import trange, preprocess_covisits, get_preds, get_recall

# Initialise pandarallel with its progress bar turned off (the recorded output
# reports the worker count it picked).
pandarallel.initialize(progress_bar=False)
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library deprecation warnings are
# hidden too - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
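# The suggest_* functions are kept in this notebook for speed: a version moved
# into the utils module could not match the speed of the in-notebook one.
# They read the notebook-level globals top_20_clicks / top_20_buys /
# top_20_buy2buy and popul_20_clicks / popul_20_carts.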
def suggest_clicks(event, num_cands=20, type_weights1={0: 0.5, 1: 9, 2:0.5}):
    """Rank up to ``num_cands`` click candidates for a single session.

    Sessions with at least ``num_cands`` distinct aids are answered purely from
    the session's own aids, scored by position in the session and event type.
    Shorter sessions keep their own aids (most recent first) and are padded with
    co-visitation neighbours from the global ``top_20_clicks`` and then with the
    global ``popul_20_clicks`` list.

    Args:
        event: ``(aids, types)`` pair of parallel sequences for one session.
        num_cands: maximum number of aids returned, and the distinct-aid count
            at which the session-only rerank branch is taken.
        type_weights1: multiplier per event type id; only read, never mutated.

    Returns:
        List of aids, best candidate first.
    """
    aids, types = event
    # Reverse before de-duplicating so index 0 holds the latest aid.
    recent_first = list(dict.fromkeys(aids[::-1]))

    if len(recent_first) >= num_cands:
        # Enough history: score each event by (position weight * type weight)
        # and sum per aid, so repeated aids accumulate.
        recency = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        scores = {}
        for weight, aid, kind in zip(recency, aids, types):
            scores[aid] = scores.get(aid, 0) + weight * type_weights1[kind]
        ranked = sorted(scores, key=scores.get, reverse=True)
        return ranked[:num_cands]

    # Short session: gather co-visited aids for every aid seen, latest first.
    pool = [
        neighbour
        for aid in recent_first
        if aid in top_20_clicks
        for neighbour in top_20_clicks[aid]
    ]
    # Keep the most frequent neighbours that are not already in the session.
    fresh = [
        cand
        for cand, _ in Counter(pool).most_common(num_cands)
        if cand not in recent_first
    ]
    result = recent_first + fresh[:num_cands - len(recent_first)]

    # Top up with globally popular clicks that are not yet included.
    already = set(result)
    fillers = [aid for aid in popul_20_clicks if aid not in already]
    result.extend(fillers[:num_cands - len(result)])
    return result

def suggest_carts(event, num_cands=20, type_weights1={0: 0.5, 1: 9, 2:0.5}):
    """Rank up to ``num_cands`` cart/order candidates for a single session.

    Reads the notebook-level globals ``top_20_clicks``, ``top_20_buys``,
    ``top_20_buy2buy`` and ``popul_20_carts``.

    Args:
        event: ``(aids, types)`` pair of parallel sequences for one session.
        num_cands: maximum number of aids returned, and the distinct-aid count
            at which the session-only rerank branch is taken.
        type_weights1: multiplier per event type id; only read, never mutated.

    Returns:
        List of aids, best candidate first.
    """
    aids, types = event
    unique_aids = list(dict.fromkeys(aids[::-1]))  # index 0 = most recent
    # NOTE(review): ``typ < 2`` keeps type ids 0 and 1. Verify against the
    # id -> type mapping that this is the intended "buys" subset.
    unique_buys = [aid for aid, typ in zip(aids, types) if typ<2][::-1]
    unique_buys = list(dict.fromkeys(unique_buys))  # index 0 = most recent

    # RERANK CANDIDATES USING WEIGHTS
    # The threshold follows ``num_cands`` (it used to be a literal 20). With a
    # literal, a session holding more than ``num_cands`` but fewer than 20
    # distinct aids fell through to the co-visitation branch, whose
    # ``[:num_cands - len(...)]`` slices then went negative and returned more
    # than ``num_cands`` items.
    if len(unique_aids) >= num_cands:
        time_weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = {}
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for time_w, aid, type_aid in zip(time_weights, aids, types):
            type_w = type_weights1[type_aid]
            aids_temp[aid] = aids_temp.get(aid, 0) + time_w*type_w
        # RERANK CANDIDATES USING "BUYS" CO-VISITATION MATRIX
        candidates = []
        for aid in unique_buys:
            if aid in top_20_buys:
                candidates.extend(top_20_buys[aid])
        # Each co-visitation hit adds a small fixed bonus to that candidate.
        for candidate in candidates:
            aids_temp[candidate] = aids_temp.get(candidate, 0) + 0.03
        sorted_aids = [k for k, v in sorted(aids_temp.items(), key=lambda x: x[1], reverse=True)]
        return sorted_aids[:num_cands]

    # USE "CART ORDER" CO-VISITATION MATRIX
    candidates1, candidates2, candidates3 = [], [], []
    for aid in unique_aids:  # most recent AID is first here
        if aid in top_20_buys:
            candidates1.extend(top_20_buys[aid])
    # USE "CLICKS" CO-VISITATION MATRIX
    for aid in unique_buys:  # most recent AID is first here
        if aid in top_20_clicks:
            candidates2.extend(top_20_clicks[aid])
    # USE "BUY2BUY" CO-VISITATION MATRIX
    for aid in unique_buys:  # most recent AID is first here
        if aid in top_20_buy2buy:
            candidates3.extend(top_20_buy2buy[aid])

    # RERANK CANDIDATES
    top_candidates = []
    # Deotte uses 20 instead of 40 here
    for candidate, _ in Counter(candidates1+candidates2+candidates3).most_common(num_cands):
        if candidate not in unique_aids:
            top_candidates.append(candidate)
    result = unique_aids + top_candidates[:num_cands-len(unique_aids)]
    # Top up with globally popular cart aids that are not yet included.
    set_result = set(result)
    result += [i for i in popul_20_carts if i not in set_result][:num_cands - len(result)]
    return result

In [4]:
## 3-15-23 Adding back covisit matrix code
# Data + data_test had best covisit matrix performance
# Load the train and test event tables.
data = pd.read_parquet('data/230313_df_train.pqt')
data_test = pd.read_parquet('data/230313_df_test.pqt')

# Type lookups in both directions (presumably id -> name and name -> id dicts,
# going by the file names - TODO confirm). Not used further in the visible cells.
d_id2type = pd.read_pickle('data/d_id2type.pkl')
d_type2id = pd.read_pickle('data/d_type2id.pkl')

# Stack train on top of test with a fresh index; this combined frame is what
# gets passed to preprocess_covisits.
df_all_sm = pd.concat([data, data_test]).reset_index(drop=True)
# Per the recorded output this reports the first timestamp, last timestamp and span.
trange(df_all_sm)

2022-07-31 22:00:00
2022-09-04 21:59:51
34 days 23:59:51


In [5]:
# Date tag embedded in the covisit / preload file names and passed to get_recall.
DATE = 230313
# Build the co-visitation pickles from the combined frame; the recorded output
# shows it reports files that already exist rather than rebuilding them.
preprocess_covisits(df_all_sm, DATE)

# Co-visitation lookups read as globals by suggest_clicks / suggest_carts.
top_20_clicks = pd.read_pickle(f'covisit/{DATE}_top_20_clicks_data_datatest.pkl')
top_20_buys = pd.read_pickle(f'covisit/{DATE}_top_20_buys_data_datatest.pkl')
top_20_buy2buy = pd.read_pickle(f'covisit/{DATE}_top_20_buy2buy_data_datatest.pkl')

*Start covisit preprocessing
top_20_clicks already exists
top_20_buys already exists
top_20_buy2buy already exists
*Finished covisit preprocessing


In [6]:
# CV_NUM picks which preloaded validation file is read.
# 0: 5%, 1: 10%, 2: 25%, 3: 50%, 4: 100%
CV_NUM = 2
# Set to False to load the real test sessions instead of the validation split.
DO_LOCAL_VALIDATION = True

if DO_LOCAL_VALIDATION:
    # Train weeks 1-3, and validate on 4
    df_train = pd.read_parquet('data/230313_train_1to3.pqt')
    df_val = pd.read_parquet('data/230313_val.pqt')
    preload_file = f'data/preload/{DATE}_val_aids_types_{CV_NUM}.pkl'
else:
    # Keep train to 4 weeks to maintain input distribution for LGBMranker
    # Train weeks 2-4, validate on 4 (for sanity check)
    df_train = pd.read_parquet('data/230313_train_2to4.pqt')
    df_val = pd.read_parquet('data/230313_df_test.pqt')
    preload_file = 'data/preload/test_aids_types.pkl'

# Both branches unpack the same two entries from the preloaded dictionary.
d = pd.read_pickle(preload_file)
test_aids = d['aids']
test_types = d['types']

# One (aids, types) pair per session: the `event` argument of suggest_*.
t = list(zip(test_aids, test_types))
print(3 * len(t))
print(df_train.shape, df_val.shape)

1350939
(163955181, 4) (7684122, 4)


In [7]:
## {0: 'clicks', 1: 'carts', 2: 'orders'}
def _top_aids_of_type(type_id, top_n=20):
    """Return the ``top_n`` most frequent aids among ``df_val`` rows of ``type_id``."""
    aid_counts = df_val.loc[df_val['type'] == type_id, 'aid'].value_counts()
    return aid_counts.index.values[:top_n].tolist()


# Popularity fall-backs, one list per event type.
popul_20_clicks = _top_aids_of_type(0)
popul_20_carts = _top_aids_of_type(1)
popul_20_orders = _top_aids_of_type(2)

In [8]:
%%time
# Run each suggest_* function over every (aids, types) session pair.
pclicks = get_preds(t, suggest_clicks)
pcarts = get_preds(t, suggest_carts)
# The carts predictions are reused for orders. Assuming get_preds returns
# lists, `+` concatenates them as clicks, carts, orders - the same block order
# as the `type` column of the submission frame.
preds = pclicks + pcarts + pcarts

CPU times: user 7 s, sys: 2.34 s, total: 9.34 s
Wall time: 14.6 s


In [9]:
# Session ids, repeated once per target type: all clicks rows first, then
# carts, then orders.
sess = test_aids.index.to_list()
sub = {
    'session': sess + sess + sess,
    'type': [target for target in ('clicks', 'carts', 'orders') for _ in sess],
}
submission = pd.DataFrame(sub)
submission.head(3)

Unnamed: 0,session,type
0,11098530,clicks
1,11098531,clicks
2,11098535,clicks


In [10]:
if not DO_LOCAL_VALIDATION:
    # Test run: flatten each prediction list into one space-separated string.
    l_preds = [' '.join(map(str, lls)) for lls in preds]
    submission['labels'] = l_preds
else:
    # Validation run: keep the raw prediction lists.
    submission['labels'] = preds
submission.head(3)

Unnamed: 0,session,type,labels
0,11098530,clicks,"[409236, 264500, 1603001, 963957, 254154, 5830..."
1,11098531,clicks,"[1271998, 624163, 1553691, 396199, 1728212, 13..."
2,11098535,clicks,"[745365, 767201, 1750442, 803918, 896972, 1371..."


In [11]:
%%time
# Score the predictions; per the recorded output this prints recall for
# clicks, carts and orders plus an overall figure.
get_recall(submission, DO_LOCAL_VALIDATION, DATE)

clicks Recall: 0.5906
carts Recall: 0.4288
orders Recall: 0.6419
Overall Recall: 0.57284
CPU times: user 5.37 s, sys: 1.48 s, total: 6.85 s
Wall time: 9.18 s


In [12]:
%%time
INFO = "#2 230321 nb 1005 check, pclicks pcarts pcarts, covisit with 230313 covisits"
s = f"kaggle competitions submit -c otto-recommender-system -f submission.zip -m '{INFO}'"
os.system(s)