# ***Please use Kaggle Env to Load this notebook***
* https://www.kaggle.com/code/gambitwister/co-visitation-matrix-pred
# ***Please add the following datasets:***
* https://www.kaggle.com/datasets/columbia2131/otto-chunk-data-inparquet-format
* https://www.kaggle.com/datasets/gambitwister/co-visitation-matrix-9417
* https://www.kaggle.com/datasets/adaubas/otto-valid-test-list

# Introduction
This notebook speeds up the prediction step in two ways: it runs the per-session predictions in parallel across the available CPU cores, and it operates on plain Python lists rather than DataFrames.

In [19]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools

# Load the test file:

In [20]:
# Integer code for each event type, and the weight applied to each code when
# the aids of a long session are scored.
type_labels = {'clicks':0, 'carts':1, 'orders':2}
type_weight = {0:1, 1:6, 2:3}

dataframes = []
# glob returns paths in a filesystem-dependent order; sorting makes the row
# order of test_df (and anything derived from it) reproducible between runs.
for test_file_parquet in sorted(glob.glob('../input/otto-chunk-data-inparquet-format/test_parquet/*')):
    test_file_df = pd.read_parquet(test_file_parquet)
    # change millisec to sec
    test_file_df['ts'] = (test_file_df['ts'] / 1000).astype('int32')
    test_file_df['type'] = test_file_df['type'].map(type_labels).astype('int8')
    dataframes.append(test_file_df)

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not dataframes:
    raise FileNotFoundError(
        "No test parquet chunks found under "
        "'../input/otto-chunk-data-inparquet-format/test_parquet/' - "
        "is the dataset attached to the notebook?"
    )

test_df = pd.concat(dataframes).reset_index(drop=True)
print("Test data shape: ", test_df.shape)
test_df.head()

Test data shape:  (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,13099779,245308,1661795832,0
1,13099779,245308,1661795862,1
2,13099779,972319,1661795888,0
3,13099779,972319,1661795898,1
4,13099779,245308,1661795907,0


# Load the three co-visitation matrices as dictionaries:

In [21]:
%%time

# Number of on-disk pieces read for each multi-piece co-visitation matrix.
DISK_PIECES = 4

def to_dict(df):
    """Turn a co-visitation frame into a ``{aid_x: [aid_y, ...]}`` mapping.

    Rows are grouped by ``aid_x``; within each group the ``aid_y`` values are
    collected into a list in the order they appear in ``df``.
    """
    paired_aids = df.groupby('aid_x')['aid_y']
    neighbours_by_aid = paired_aids.apply(list)
    return neighbours_by_aid.to_dict()

# Cart/order matrix: only piece 0 is read.
carts_orders_df = pd.read_parquet('../input/co-visitation-matrix-9417/carts_orders_0.pqt')
top_carts_orders = to_dict(carts_orders_df)

# Click matrix: start from piece 0, then merge in the remaining pieces.
top_clicks = to_dict(pd.read_parquet('../input/co-visitation-matrix-9417/clicks_0.pqt'))
for piece in range(1, DISK_PIECES):
    clicks_piece_df = pd.read_parquet(f'../input/co-visitation-matrix-9417/clicks_{piece}.pqt')
    top_clicks.update(to_dict(clicks_piece_df))

# Click/cart/order matrix: same piece-wise merge.
top_clicks_carts_orders = to_dict(pd.read_parquet('../input/co-visitation-matrix-9417/clicks_carts_orders_0.pqt'))
for piece in range(1, DISK_PIECES):
    cco_piece_df = pd.read_parquet(f'../input/co-visitation-matrix-9417/clicks_carts_orders_{piece}.pqt')
    top_clicks_carts_orders.update(to_dict(cco_piece_df))

# The 20 most frequent clicked / ordered aids in the test data, used later as
# fallback candidates.
click_aid_counts = test_df.loc[test_df['type'] == 0, 'aid'].value_counts()
order_aid_counts = test_df.loc[test_df['type'] == 2, 'aid'].value_counts()
test_top_clicks = click_aid_counts.index.values[:20]
test_top_orders = order_aid_counts.index.values[:20]

for matrix_label, matrix in (
    ('click', top_clicks),
    ('cart_order', top_carts_orders),
    ('click_cart_order', top_clicks_carts_orders),
):
    print(f'Size of {matrix_label} Co-visitation matrix: ', len(matrix))

Size of click Co-visitation matrix:  1513227
Size of cart_order Co-visitation matrix:  716395
Size of click_cart_order Co-visitation matrix:  1513227
CPU times: user 2min 36s, sys: 13.1 s, total: 2min 49s
Wall time: 2min 35s


# To speed up the prediction, run it on multiple CPU cores in parallel:
* Modified from https://www.kaggle.com/code/adaubas/otto-fast-handcrafted-model-recall-20/notebook?scriptVersionId=113646958

In [22]:
# multiprocessing
import psutil
from multiprocessing import Pool

# Number of CPU cores used to size the worker pool.
# psutil.cpu_count() returns None when the count cannot be determined; fall
# back to a single worker so the pool-size computation never sees None.
CORES_NUM = psutil.cpu_count() or 1
print(f"Core num: {CORES_NUM}")

Core num: 4


In [24]:
def df_parallelize_run(func, t_split):
    """Apply ``func`` to every element of ``t_split`` using a process pool.

    Parameters
    ----------
    func : callable
        Function applied to each element; it is sent to worker processes, so
        it has to be picklable (e.g. defined at module level).
    t_split : sequence
        Items to process. Must support ``len()``.

    Returns
    -------
    list
        ``func(item)`` for each item, in the same order as ``t_split``.
        An empty input yields an empty list without starting any process.
    """
    # Pool(0) raises ValueError, so short-circuit on empty input.
    if len(t_split) == 0:
        return []
    # Never start more workers than there are items to process.
    num_cores = min(CORES_NUM, len(t_split))
    # The context manager tears the pool down even if func raises in a worker.
    with Pool(num_cores) as pool:
        return pool.map(func, t_split)

# Use lists instead of dataframes to speed up calculations:

In [26]:
%%time

# Number of pickle files the per-session list is split across.
PIECES = 5

# NOTE(review): pickle.load executes arbitrary code from the file; only use
# this with a dataset you trust.
test_bysession_list = []
for part_idx in range(PIECES):
    part_path = f'../input/otto-valid-test-list/test_group_tolist_{part_idx}_1.pkl'
    with open(part_path, 'rb') as part_file:
        part_sessions = pickle.load(part_file)
    test_bysession_list.extend(part_sessions)
print(len(test_bysession_list))

1671803
CPU times: user 4.86 s, sys: 743 ms, total: 5.61 s
Wall time: 5.52 s


# Set up the click prediction function:

In [27]:
def click_pred(df):
    """Predict up to 20 click candidates for one session.

    Parameters
    ----------
    df : sequence
        ``df[0]`` is the session id, ``df[1]`` the list of aids and ``df[2]``
        the matching event-type codes (keys of ``type_weight``).
        Presumably the events are in chronological order - TODO confirm
        against the code that built the pickles.

    Returns
    -------
    tuple
        ``(session, aids)`` where ``aids`` is a list of at most 20 distinct
        aids, best candidate first.
    """
    session = df[0]
    aids = df[1]
    types = df[2]
    # De-duplicate while walking the session backwards (last event first).
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # Sessions with 20+ distinct aids: rank the session's own aids only.
    if len(unique_aids) >= 20:
        # Later positions get larger weights (about 0.07 up to 1.0).
        weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        # Repeated aids accumulate score, scaled by the event-type weight.
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight[t]
        return session, [aid for aid, _ in aids_temp.most_common(20)]

    # Shorter sessions: extend with neighbours from the click co-visitation
    # matrix, ranked by how often they are suggested.
    seen = set(unique_aids)
    neighbour_counts = Counter(
        itertools.chain.from_iterable(
            top_clicks[aid] for aid in unique_aids if aid in top_clicks
        )
    )
    top_aids2 = [aid2 for aid2, _ in neighbour_counts.most_common(20) if aid2 not in seen]
    result = unique_aids + top_aids2[:20 - len(unique_aids)]

    # Pad with the most popular test clicks, skipping aids already present so
    # that no prediction slot is spent on a duplicate.
    seen.update(result)
    for aid in test_top_clicks:
        if len(result) >= 20:
            break
        if aid not in seen:
            seen.add(aid)
            result.append(aid)
    return session, result

# Set up the cart-order prediction function:

In [29]:
def cart_order_pred(df):
    """Predict up to 20 cart/order candidates for one session.

    Parameters
    ----------
    df : sequence
        ``df[0]`` is the session id, ``df[1]`` the list of aids and ``df[2]``
        the matching event-type codes (keys of ``type_weight``).
        Presumably the events are in chronological order - TODO confirm
        against the code that built the pickles.

    Returns
    -------
    tuple
        ``(session, aids)`` where ``aids`` is a list of at most 20 distinct
        aids, best candidate first.
    """
    session = df[0]
    aids = df[1]
    types = df[2]
    # De-duplicate while walking the session backwards (last event first).
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # Same, restricted to events whose type code is 1 or 2 (cart / order).
    cart_order_aids = [aid for aid, t in zip(aids, types) if t in (1, 2)]
    unique_cart_order = list(dict.fromkeys(reversed(cart_order_aids)))

    # Sessions with 20+ distinct aids: rank the session's own aids, nudged by
    # the cart-order co-visitation matrix.
    if len(unique_aids) >= 20:
        # Later positions get larger weights (about 0.41 up to 1.0).
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        # Repeated aids accumulate score, scaled by the event-type weight.
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight[t]
        # Each co-visitation suggestion adds a small fixed bonus.
        for aid in unique_cart_order:
            for neighbour in top_carts_orders.get(aid, ()):
                aids_temp[neighbour] += 0.1
        return session, [aid for aid, _ in aids_temp.most_common(20)]

    # Shorter sessions: pool suggestions from the click-cart-order matrix
    # (all session aids) and the cart-order matrix (cart/order aids only).
    seen = set(unique_aids)
    neighbour_counts = Counter()
    for aid in unique_aids:
        neighbour_counts.update(top_clicks_carts_orders.get(aid, ()))
    for aid in unique_cart_order:
        neighbour_counts.update(top_carts_orders.get(aid, ()))
    top_aids2 = [aid2 for aid2, _ in neighbour_counts.most_common(20) if aid2 not in seen]
    result = unique_aids + top_aids2[:20 - len(unique_aids)]

    # Pad with the most popular test orders, skipping aids already present so
    # that no prediction slot is spent on a duplicate.
    seen.update(result)
    for aid in test_top_orders:
        if len(result) >= 20:
            break
        if aid not in seen:
            seen.add(aid)
            result.append(aid)
    return session, result

# Make the predictions on all sessions in parallel

### click predictions:

In [30]:
%%time

# Each result is a (session, predicted aids) pair; index the predictions by
# session and tag the index with the target type.
temp = df_parallelize_run(click_pred, test_bysession_list)
click_sessions = [session for session, _ in temp]
click_labels = [labels for _, labels in temp]
clicks_pred_df = pd.Series(click_labels, index=click_sessions).add_suffix("_clicks")
clicks_pred_df.head()

CPU times: user 50.6 s, sys: 5.06 s, total: 55.7 s
Wall time: 1min 3s


12899779_clicks    [59625, 1790770, 637538, 941596, 1246235, 2739...
12899780_clicks    [1142000, 736515, 973453, 582732, 889686, 6361...
12899781_clicks    [918667, 199008, 194067, 57315, 141736, 146057...
12899782_clicks    [834354, 595994, 740494, 889671, 987399, 77947...
12899783_clicks    [1817895, 607638, 1754419, 1216820, 1729553, 3...
dtype: object

### cart-order prediction:

In [32]:
%%time

# One prediction list per session, reused for both the orders and carts targets.
temp = df_parallelize_run(cart_order_pred, test_bysession_list)
cart_order_sessions = [session for session, _ in temp]
cart_order_labels = [labels for _, labels in temp]
orders_carts_pred_df = pd.Series(cart_order_labels, index=cart_order_sessions)
orders_pred_df = orders_carts_pred_df.add_suffix("_orders")
carts_pred_df = orders_carts_pred_df.add_suffix("_carts")

CPU times: user 1min 5s, sys: 8.55 s, total: 1min 14s
Wall time: 1min 25s


### final predictions:

In [33]:
# Stack the three targets, then move the "<session>_<type>" index into a column.
stacked_preds = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df = stacked_preds.reset_index()
pred_df.columns = ["session_type", "labels"]
# The submission expects each label list as a space-separated string.
pred_df["labels"] = [" ".join(str(aid) for aid in aid_list) for aid_list in pred_df["labels"]]
pred_df.to_csv("submission.csv", index=False)
pred_df.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1790770 637538 941596 1246235 273918 448...
1,12899780_clicks,1142000 736515 973453 582732 889686 636101 487...
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 9507...
3,12899782_clicks,834354 595994 740494 889671 987399 779477 1344...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...
