# Generate candidates for clicks
This notebook uses the regular click2click co-visitation matrix to generate candidates. It produces four outputs: candidates for the first cross-validation set (used to train the re-ranking model); candidates for the second cross-validation set (originally also intended for training, but that plan was abandoned, so this output is not used); the top 50 candidates for the test dataset (used as the test input for the re-ranking model to produce the final results); and the top 20 candidates for the test dataset (used to check the results on the leaderboard).

I prepared code that can generate the 75 best candidates, but because of memory limitations and lack of time I decided not to switch the clicks model to 75 candidates.

## Imports and definitions

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
        
import gc
from humanize import naturalsize
import itertools
from collections import Counter

# functions and classes common for several notebooks of current project
import otto_common

In [2]:
# Generate candidates for clicks.

def suggest_clicks(df, n_candidates, click_dict, top_dict):
    """Build an ordered list of click candidates for a single session.

    The result is assembled in four stages, each one only adding aids that
    are not in the list yet:

    1. the session's own most recent unique aids (newest first);
    2. the aids suggested by ``click_dict`` for the very last aid;
    3. the aids most frequently suggested for the last few "root" aids
       (history aids that have an entry in ``click_dict``);
    4. the entries of ``top_dict`` for the session's day, if slots remain.

    Args:
        df: Rows of one session in chronological order. Must expose the
            columns ``aid`` and ``day_of_week``.
        n_candidates: Number of candidates to produce; must be 20, 50 or 75.
        click_dict: Mapping ``aid -> list of suggested aids``
            (presumably ordered best first -- confirm against the code that
            builds it).
        top_dict: Mapping ``day_of_week -> list of aids`` used to fill any
            remaining slots.

    Returns:
        A list of at most ``n_candidates`` unique aids. It is shorter only
        when the day-top list is exhausted before every slot is filled.

    Raises:
        ValueError: If ``n_candidates`` is not 20, 50 or 75.
    """
    # n_candidates -> (history length, suggestions kept for the last aid,
    #                  number of root aids)
    settings = {
        20: (10, 10, 4),
        50: (20, 25, 4),
        75: (26, 35, 5),
    }
    if n_candidates not in settings:
        raise ValueError(
            f"n_candidates must be one of {sorted(settings)}, got {n_candidates!r}"
        )
    aid_length, top1_dict_limit, n_root_aids = settings[n_candidates]

    # Session history, newest first, de-duplicated while keeping order.
    aids = df.aid.tolist()
    unique_aids = list(dict.fromkeys(reversed(aids)))[:aid_length]
    history = set(unique_aids)

    # Suggestions for the exact last aid (none if it is unknown to the matrix
    # or the session has no history at all).
    if unique_aids and unique_aids[0] in click_dict:
        top_aids = click_dict[unique_aids[0]][:top1_dict_limit]
    else:
        top_aids = []

    # The most recent history aids that have an entry in the matrix.
    root_aids = list(
        itertools.islice(
            (aid for aid in unique_aids if aid in click_dict), n_root_aids
        )
    )

    # With 75 candidates and more than two roots, take fewer suggestions per root.
    top_n = 60 if (n_candidates == 75 and len(root_aids) > 2) else n_candidates

    # Rank suggestions by how many root aids proposed them.
    suggestion_counts = Counter(
        itertools.chain.from_iterable(click_dict[aid][:top_n] for aid in root_aids)
    )
    top_aids2 = [
        aid2
        for aid2, _ in suggestion_counts.most_common(n_candidates)
        if aid2 not in history
    ]

    # Merge history and both suggestion lists, keeping first occurrences only.
    result = list(unique_aids)
    seen = set(result)
    for aid in itertools.chain(top_aids, top_aids2):
        if len(result) >= n_candidates:
            break
        if aid not in seen:
            seen.add(aid)
            result.append(aid)

    # Fill the remaining slots from the day's top aids. Iterating (instead of
    # indexing with an unbounded counter) stops cleanly when the list runs out.
    if len(result) < n_candidates:
        day_of_week = max(df.day_of_week.tolist())
        for aid in top_dict[day_of_week]:
            if len(result) >= n_candidates:
                break
            if aid not in seen:
                seen.add(aid)
                result.append(aid)
    return result

In [3]:
# Print some statistics after candidate generation.
def print_stats(df_check):
    """Print how often the ground-truth click is among the candidates.

    Counts are reported for all sessions and separately for "long" sessions
    (``aids >= 10``) and "short" sessions (``aids < 10``), followed by the
    corresponding hit percentages.

    Args:
        df_check: DataFrame with the columns ``clicks`` (``-1`` marks a
            session without a click label), ``pred_true`` (truthy when the
            label is among the candidates) and ``aids`` (number of unique
            aids in the session).

    Returns:
        None. The statistics are written to standard output. A percentage
        whose denominator is zero is printed as ``n/a`` instead of raising
        ``ZeroDivisionError``.
    """
    has_click = df_check['clicks'] != -1
    guessed = df_check['pred_true'] == 1
    is_long = df_check['aids'] >= 10
    is_short = df_check['aids'] < 10

    def _count(mask):
        # Number of rows selected by a boolean mask.
        return int(mask.sum())

    def _percent(hits, total):
        # Guard against empty buckets (e.g. no long sessions in a sample).
        return f"{100*hits/total:.2f}%" if total else "n/a"

    non_zero_clicks = _count(has_click)
    guessed_clicks = _count(guessed)
    print(f"Total_sessions_with_clicks {non_zero_clicks}")
    print(f"Sessions_with_correctly_predicted_clicks {guessed_clicks}")

    non_zero_clicks_long = _count(has_click & is_long)
    guessed_clicks_long = _count(guessed & is_long)
    print(f"Total_long_sessions_with_clicks {non_zero_clicks_long}")
    print(f"Long_sessions_with_correctly_predicted_clicks {guessed_clicks_long}")

    non_zero_clicks_short = _count(has_click & is_short)
    guessed_clicks_short = _count(guessed & is_short)
    print(f"Total_short_sessions_with_clicks {non_zero_clicks_short}")
    # The label previously read "Long_short_..." (copy-paste slip).
    print(f"Short_sessions_with_correctly_predicted_clicks {guessed_clicks_short}")

    print(f"Percent= {_percent(guessed_clicks, non_zero_clicks)}")
    print(f"Percent_long= {_percent(guessed_clicks_long, non_zero_clicks_long)}")
    print(f"Percent_short= {_percent(guessed_clicks_short, non_zero_clicks_short)}")

In [4]:
# Prepare all the data for candidate generation. 
# If candidates are generated for the cross-validation dataset, this function also prints some statistics for percentage of ground truth aids among candidates.
# n_candidates should be 20 or 50 or 75

def generate_click_candidates(matrix_path, sessions_path, n_candidates, answers_path=None):
    """Generate click candidates for every session stored in a parquet file.

    Args:
        matrix_path: Path passed to ``otto_common.matrix_to_dict`` to load the
            co-visitation matrix.
        sessions_path: Parquet file with the session events. The code reads
            the columns ``session``, ``aid`` and ``ts`` from it.
        n_candidates: Number of candidates per session (20, 50 or 75).
        answers_path: Optional parquet file with ground truth. It is merged on
            ``session`` and must provide a ``clicks`` column. When given,
            hit statistics are printed via ``print_stats``.

    Returns:
        DataFrame with the columns ``session`` and ``click_predictions``
        (one list of candidates per session). When ``answers_path`` is given
        it additionally contains the merged answer columns, ``pred_true`` and
        ``aids``; the merges use pandas' default join, so only sessions
        present in the answers file are kept.
    """
    # Load co-visitation matrix and transform it to dictionary (to speed up candidate generation)
    click_dict = otto_common.matrix_to_dict(matrix_path, n_candidates)

    # Load inputs and prepare some data for statistics.
    df = pd.read_parquet(sessions_path)
    if answers_path:
        # Number of unique aids per session, used to split statistics into
        # long and short sessions.
        unique_session_aids = df.groupby(["session"]).aid.nunique()
        unique_session_aids = pd.DataFrame({'session':unique_session_aids.index, 'aids':unique_session_aids.values})

    # Add a weekday column and calculate top clicks for each day.
    # The return value is intentionally not bound: it was never used, and an
    # extra reference could keep the frame alive past the `del df` below.
    # NOTE(review): presumably add_datetime adds the 'time' column to df in
    # place -- confirm in otto_common.
    otto_common.add_datetime(df)
    df['day_of_week'] = df['time'].dt.dayofweek.astype(np.int8)
    del df['time']
    gc.collect()
    top_dict = otto_common.build_top_dict(df, n_candidates, 0)

    # The candidate generation itself: events are ordered by time inside each
    # session because suggest_clicks treats the last rows as the most recent.
    pred_df_clicks = df.sort_values(["session", "ts"]).groupby(["session"]).apply(
        lambda x: suggest_clicks(x, n_candidates, click_dict, top_dict)
        )

    # The raw events are no longer needed; release them before building the result.
    del df
    gc.collect()

    pred_df_clicks = pd.DataFrame({'session':pred_df_clicks.index, 'click_predictions':pred_df_clicks.values})

    # If generating candidates for a cross-validation set, use ground truth to calculate some statistics.
    if answers_path:
        df_answers = pd.read_parquet(answers_path)
        pred_df_clicks = pd.merge(pred_df_clicks, df_answers, on = 'session')
        # True when the ground-truth click is among the generated candidates.
        pred_df_clicks['pred_true'] = pred_df_clicks.apply(lambda x: x.clicks in x.click_predictions, axis=1)
        pred_df_clicks = pd.merge(pred_df_clicks, unique_session_aids, on = 'session')
        print_stats(pred_df_clicks)

    return pred_df_clicks


## Generate candidates

In [5]:
# All input paths used by the cells below.
# Several co-visitation matrices were tried, but the final version only uses the "regular" matrix.

# Matrix used together with the cross-validation sessions.
matrix_path_regular = '/kaggle/input/otto-prepare-candidates-clicks/regular_click2click_matrix_cv_top_n.parquet'
# Matrix used together with the test sessions.
matrix_path_regular_full = '/kaggle/input/otto-prepare-candidates-clicks/regular_click2click_matrix_test_top_n.parquet'

# Session events: first and second cross-validation sets, and the test set.
sessions_path_cv1 = '/kaggle/input/otto-prepare-cv/cv_inputs.parquet'
sessions_path_cv2 = '/kaggle/input/otto-prepare-cv/cv_inputs2.parquet'
sessions_path_test = '/kaggle/input/otto-prepare-cv/test.parquet'

# Ground-truth labels for the two cross-validation sets (the test set has none here).
answers_path_cv1 = '/kaggle/input/otto-prepare-cv/cv_labels.parquet'
answers_path_cv2 = '/kaggle/input/otto-prepare-cv/cv_labels2.parquet'

In [6]:
%%time
# This cell was used for tests and checks.

#n_candidates = 20

#df_result =  generate_click_candidates(matrix_path_regular, sessions_path_cv1, n_candidates, answers_path_cv1)
#df_result =  generate_click_candidates(matrix_path_regular, sessions_path_cv2, n_candidates, answers_path_cv2)


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs


In [7]:
# At some point I planned to switch the clicks model to 75 candidates, but at the end of the competition I decided to try other improvements instead.
# So the carts and orders models use 75 candidates, while clicks is the only model that uses 50 candidates.

# Generate candidates for the first cross-validation set.
# n_candidates set here is also reused by the following cells that do not reassign it.
n_candidates = 50

df_result =  generate_click_candidates(matrix_path_regular, sessions_path_cv1, n_candidates, answers_path_cv1)
df_result.to_parquet('candidates_click.parquet')

Total_sessions_with_clicks 1738122
Sessions_with_correctly_predicted_clicks 1050415
Total_long_sessions_with_clicks 94739
Long_sessions_with_correctly_predicted_clicks 50495
Total_short_sessions_with_clicks 1643383
Long_short_sessions_with_correctly_predicted_clicks 999920
Percent= 60.43%
Percent_long= 53.30%
Percent_short= 60.85%


In [8]:
# I planned to use both the first and the second cross-validation sets and then average the predictions, but only did that for the orders model.
# The other models have candidates prepared for both cross-validation datasets, but features are engineered only for the first one.

# Generate candidates for the second cross-validation set (uses the n_candidates value set in an earlier cell).
df_result =  generate_click_candidates(matrix_path_regular, sessions_path_cv2, n_candidates, answers_path_cv2)
df_result.to_parquet('candidates_click2.parquet')

Total_sessions_with_clicks 1738205
Sessions_with_correctly_predicted_clicks 1050656
Total_long_sessions_with_clicks 95434
Long_sessions_with_correctly_predicted_clicks 50637
Total_short_sessions_with_clicks 1642771
Long_short_sessions_with_correctly_predicted_clicks 1000019
Percent= 60.44%
Percent_long= 53.06%
Percent_short= 60.87%


In [9]:
# Generate 50 candidates for the test dataset (n_candidates is still 50 from an earlier cell).
# No answers path is passed, so no statistics are printed.

df_result =  generate_click_candidates(matrix_path_regular_full, sessions_path_test, n_candidates)
df_result.to_parquet('candidates_test.parquet')

In [10]:
# Generate 20 candidates for the test dataset (to check the result on the leaderboard).
# No answers path is passed, so no statistics are printed.
n_candidates = 20

df_result =  generate_click_candidates(matrix_path_regular_full, sessions_path_test, n_candidates)
df_result.to_parquet('candidates_test_20.parquet')