# Generate candidates for orders
This notebook uses the click2buy matrix to generate candidates. It produces four outputs: candidates for the first cross-validation set (used to train the re-ranking model), candidates for the second cross-validation set (also used to train the re-ranking model), the top 75 candidates for the test dataset (used as the test input for the re-ranking model to produce the final results), and the top 20 candidates for the test dataset (used to check the results on the leaderboard).

Until the very end of the competition, the carts model used 50 candidates, and all the constants for 50 candidates are still in the code. The final submission pipeline, however, uses only 75 candidates.
## Imports and definitions

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
        
import gc
from datetime import datetime
from humanize import naturalsize
import itertools
from collections import Counter

# functions and classes common for several notebooks of current project
import otto_common

In [2]:
def suggest_buys(df, n_candidates, top_dict, click2buy_dict):
    """Build the ordered list of candidate AIDs for a single session.

    Candidates are collected in priority order:
      1. the session's own AIDs, most recent first (AIDs from rows with
         ``type`` 1 or 2 -- the "buys" -- go first when there are any),
      2. AIDs that ``click2buy_dict`` lists for the session's AIDs, ranked by
         how many times they are suggested,
      3. the top AIDs of the session's day of week, to fill remaining slots.

    Parameters
    ----------
    df : pandas.DataFrame
        Events of one session with columns ``aid``, ``type`` and
        ``day_of_week``. "Most recent first" assumes rows are already in
        chronological order.
    n_candidates : int
        Number of candidates to produce; must be 20, 50 or 75.
    top_dict : mapping
        Maps a day of week to an indexable sequence of AIDs.
    click2buy_dict : mapping
        Maps an AID to a sliceable sequence of related AIDs.

    Returns
    -------
    list
        Unique AIDs, at most ``n_candidates`` of them. The list is shorter
        only when the day-of-week top list cannot fill the remaining slots.

    Raises
    ------
    ValueError
        If ``n_candidates`` is not one of the supported values.
    """
    # (aid_length, max_from_buys) for every supported number of candidates.
    limits = {20: (18, 5), 50: (32, 10), 75: (35, 12)}
    if n_candidates not in limits:
        raise ValueError(
            f"n_candidates must be one of {sorted(limits)}, got {n_candidates!r}"
        )
    aid_length, max_from_buys = limits[n_candidates]

    # Select all last AIDs and some other info from dataframe.
    # dict.fromkeys on the reversed list de-duplicates while keeping the
    # most recent occurrence first.
    aids = df.aid.tolist()
    unique_aids = list(dict.fromkeys(aids[::-1]))
    day_of_week = max(df.day_of_week.tolist())
    buys = df.loc[(df['type'] == 1) | (df['type'] == 2)]
    unique_buys = list(dict.fromkeys(buys.aid.tolist()[::-1]))
    recent_aids = unique_aids[:aid_length]

    # Use AIDs, suggested from buys.
    if len(unique_buys) > 0:
        root_aids = unique_aids[:max_from_buys]
        result = unique_buys[:aid_length]
        for aid in recent_aids:
            if aid not in result:
                result.append(aid)
        if len(result) >= n_candidates:
            return result[:n_candidates]
        # len(result) < n_candidates here, so dict_limit is always >= 1.
        dict_limit = min(5, n_candidates - len(result))
        aids2 = list(itertools.chain(*[click2buy_dict[aid][:dict_limit] for aid in unique_buys
                                       if aid in click2buy_dict]))
        top_aids2 = [aid2 for aid2, cnt in Counter(aids2).most_common(n_candidates) if aid2 not in result]
    else:
        root_aids = unique_aids[:n_candidates]
        top_aids2 = []
        result = list(recent_aids)

    # Select AIDs, suggested from all the last clicks.
    # NOTE(review): result already contains the recent AIDs, so they are
    # subtracted a second time here. n_left can therefore be zero or negative,
    # in which case the slice below keeps nothing / drops the last |n_left|
    # entries instead of taking the first n_left. Left unchanged because it
    # alters which candidates are generated -- TODO confirm the intent.
    n_left = n_candidates - len(result) - len(recent_aids)
    aids3 = list(itertools.chain(*[click2buy_dict[aid][:n_left] for aid in root_aids if aid in click2buy_dict]))
    top_aids3 = [aid3 for aid3, cnt in Counter(aids3).most_common(n_candidates) if aid3 not in result]

    # Check for duplicates and merge all the previously selected AIDs.
    suggested_aids = top_aids2[:max_from_buys] + top_aids3[:n_candidates]
    for aid in suggested_aids:
        if len(result) >= n_candidates:
            break
        if aid not in result:
            result.append(aid)

    # Add items from day top, if free slots.
    if len(result) < n_candidates:
        days_top = top_dict[day_of_week]
        i = 0
        # Stop at the end of the top list instead of indexing past it.
        while len(result) < n_candidates and i < len(days_top):
            if days_top[i] not in result:
                result.append(days_top[i])
            i += 1
    return result

In [3]:
# Prepare all the data for candidate generation.

def generate_order_candidates(click2buy_matrix_path, sessions_path, n_candidates, answers_path=None):
    """Generate order candidates for every session stored at ``sessions_path``.

    Parameters
    ----------
    click2buy_matrix_path : str
        Parquet file with the click2buy matrix, handed to
        ``otto_common.matrix_to_dict``.
    sessions_path : str
        Parquet file with the session events.
    n_candidates : int
        Number of candidates per session (see ``suggest_buys`` for the
        supported values).
    answers_path : str, optional
        Ground-truth file of a cross-validation split. When given, the input
        is reduced via ``otto_common.reduce_df_prepare_answers`` and the
        result is passed through ``otto_common.calculate_stats``.

    Returns
    -------
    pandas.DataFrame
        Without ``answers_path``: columns ``session`` and
        ``order_predictions``. With ``answers_path``: whatever
        ``otto_common.calculate_stats`` returns.
    """
    # The matrix is turned into a dictionary to speed up candidate generation.
    click2buy_dict = otto_common.matrix_to_dict(click2buy_matrix_path, n_candidates)

    # Load the sessions and derive the weekday of every event.
    sessions = pd.read_parquet(sessions_path)
    sessions = otto_common.add_datetime(sessions)
    sessions['day_of_week'] = sessions['time'].dt.dayofweek.astype(np.int8)

    # The timestamp column is only needed for the weekday; drop it to save memory.
    del sessions['time']
    gc.collect()

    # Top lists per day of week.
    top_dict = otto_common.build_top_dict(sessions, n_candidates, 2)

    with_answers = bool(answers_path)

    # For cross-validation, reduce the inputs with the ground truth and
    # collect per-session counts that the statistics need later.
    if with_answers:
        sessions, df_answers = otto_common.reduce_df_prepare_answers(sessions, answers_path, 'orders')

        aid_counts = sessions.groupby(["session"]).aid.nunique()
        unique_session_aids = pd.DataFrame({'session': aid_counts.index,
                                            'all_aids': aid_counts.values})

        buy_counts = sessions.loc[sessions['type'] > 0].groupby(["session"]).aid.nunique()
        unique_session_buys = pd.DataFrame({'session': buy_counts.index,
                                            'buys': buy_counts.values})

    def candidates_for_session(session_events):
        # One call per session; events arrive sorted by timestamp.
        return suggest_buys(session_events, n_candidates, top_dict, click2buy_dict)

    # The candidate generation itself.
    per_session = (
        sessions.sort_values(["session", "ts"])
        .groupby(["session"])
        .apply(candidates_for_session)
    )

    # The event frame is no longer needed; release it before building the output.
    del sessions
    gc.collect()

    candidates = pd.DataFrame({'session': per_session.index,
                               'order_predictions': per_session.values})

    # With ground truth available, compute statistics; otherwise keep only
    # the two result columns.
    if with_answers:
        candidates = otto_common.calculate_stats(candidates, df_answers, unique_session_aids,
                                                 unique_session_buys, 'orders', n_candidates)
    else:
        candidates = candidates[['session', 'order_predictions']]

    return candidates

In [4]:
# All the paths (Kaggle input datasets).
# Other co-visitation matrices were tried, but the final version only uses the "regular" matrix.

# click2buy matrix used for the cross-validation runs.
click2buy_matrix_path = '/kaggle/input/otto-prepare-candidates-buys/matrix_click2buy_cv_top_n.parquet'
# click2buy matrix used for the test-dataset runs.
click2buy_full_matrix_path = '/kaggle/input/otto-prepare-candidates-buys/matrix_click2buy_test_top_n.parquet'

# Session inputs: first and second cross-validation sets, and the test dataset.
sessions_path_cv1 = '/kaggle/input/otto-prepare-cv/cv_inputs.parquet'
sessions_path_cv2 = '/kaggle/input/otto-prepare-cv/cv_inputs2.parquet'
sessions_path_test = '/kaggle/input/otto-prepare-cv/test.parquet'

# Ground-truth labels for the two cross-validation sets (the test dataset has none here).
answers_path_cv1 = '/kaggle/input/otto-prepare-cv/cv_labels.parquet'
answers_path_cv2 = '/kaggle/input/otto-prepare-cv/cv_labels2.parquet'

In [5]:
%%time
# Scratch cell used for ad-hoc tests and checks during development; the calls below are intentionally commented out.

#n_candidates = 75

#df_result =  generate_order_candidates(click2buy_matrix_path, sessions_path_cv1, n_candidates, answers_path_cv1)
#df_result =  generate_order_candidates(click2buy_matrix_path, sessions_path_cv2, n_candidates, answers_path_cv2)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.15 µs


In [6]:
%%time
# First cross-validation set: 75 candidates per session, with statistics
# computed against the cv1 labels.

n_candidates = 75

df_result = generate_order_candidates(
    click2buy_matrix_path,
    sessions_path_cv1,
    n_candidates,
    answers_path=answers_path_cv1,
)
df_result.to_parquet('candidates_order_cv1.parquet')

Total orders:  311302
Total orders clipped:  311302
Total orders guessed:  214629
Total orders with buys in history:  227882
Orders with buys in history guessed:  166746
Total orders with no buys in history:  83420
Orders with no buys in history guessed:  47883
Total:  68.95%
Buys:  73.17%
No buys:  57.40%
CPU times: user 3min 46s, sys: 7.32 s, total: 3min 54s
Wall time: 3min 55s


In [7]:
%%time
# Generate candidates for the second cross-validation.

# Set explicitly so this cell does not depend on whichever cell last assigned
# n_candidates (the 20-candidate cell at the end of the notebook changes it).
n_candidates = 75

df_result = generate_order_candidates(click2buy_matrix_path, sessions_path_cv2, n_candidates, answers_path_cv2)
df_result.to_parquet('candidates_order_cv2.parquet')

Total orders:  311762
Total orders clipped:  311762
Total orders guessed:  214972
Total orders with buys in history:  228320
Orders with buys in history guessed:  167111
Total orders with no buys in history:  83442
Orders with no buys in history guessed:  47861
Total:  68.95%
Buys:  73.19%
No buys:  57.36%
CPU times: user 3min 49s, sys: 5.96 s, total: 3min 55s
Wall time: 3min 55s


In [8]:
%%time
# Generate candidates for the test dataset.

# Set explicitly so this cell does not depend on whichever cell last assigned
# n_candidates (the 20-candidate cell at the end of the notebook changes it).
n_candidates = 75

df_result = generate_order_candidates(click2buy_full_matrix_path, sessions_path_test, n_candidates)
df_result.to_parquet('candidates_order_test.parquet')
print('max candidates test ready')

max candidates test ready
CPU times: user 29min 30s, sys: 29.8 s, total: 30min
Wall time: 29min 56s


In [9]:
%%time
# Test dataset again, this time with only 20 candidates per session
# (used to check the results on the leaderboard).

n_candidates = 20
df_result = generate_order_candidates(
    click2buy_full_matrix_path,
    sessions_path_test,
    n_candidates,
)
df_result.to_parquet('candidates_order_test20.parquet')

CPU times: user 25min 24s, sys: 20.2 s, total: 25min 45s
Wall time: 25min 36s
