# Feature engineering for clicks model
This notebook builds most features for the clicks model (except for w2vec features that are calculated in a separate notebook).
It takes as input already generated candidates, co-visitation matrixes and some pre-calculations made aside in create_counts_for_clicks notebook. Most of the features are built using functions common for all the three models that are defined in otto_common_fe notebook, but a few features unique to clicks model are built using functions defined in this notebook.
## Imports and definitions

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
     
import gc
from humanize import naturalsize

# functions and classes common for several notebooks of current project
import otto_common, otto_common_fe

Collecting polars
  Downloading polars-0.16.14-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.2/16.2 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars
Successfully installed polars-0.16.14
[0m

In [2]:
# Polars library is used to generate features from co-visitation matrixes.
!pip install polars
import polars as pl

[0m

In [3]:
# This function was used for data exploration only.
# It selects ground truth aids that were missed during candidate generation.
def cand_read_and_reduce_test_not_cands(cand_path, is_cv):
    df = pd.read_parquet(cand_path)
    df = df.loc[df['pred_true'] == 0]
    df = df[['session', 'clicks']]
    df['session'] = df['session'].astype(np.int32)
    df['click_predictions'] = df['clicks'].astype(np.int32)
    del df['clicks']
    gc.collect()
    return df

In [4]:
# Apply co-visitation matrix to the last two aids in each truncated session.
# Columns for the last two aids created by add_history_agg_features function.
def add_matrix_calculations(input_path, df_candidate):
    df_matrix = pd.read_parquet(input_path)
    df_candidate = pd.merge(df_candidate, df_matrix, how='left', 
                   left_on=['first_aid','click_predictions'], right_on=['aid_x','aid_y'])
    del df_candidate['aid_x'], df_candidate['aid_y']
    gc.collect()
    df_candidate = pd.merge(df_candidate, df_matrix, how='left', 
                   left_on=['second_aid','click_predictions'], right_on=['aid_x','aid_y'])
    del df_candidate['aid_x'], df_candidate['aid_y']
    gc.collect()
    df_candidate = df_candidate.rename(columns={'wgt_x' : 'wgt_last', 'wgt_y' : 'wgt_before_last'})
    df_candidate['wgt_last'] = df_candidate['wgt_last'].fillna(0)
    df_candidate['wgt_before_last'] = df_candidate['wgt_before_last'].fillna(0)    
    return df_candidate

In [5]:
# Count total number of events for each aid and add this count as a feature.
def add_total_counts(train_path, df_candidate):
    total_counts = pd.read_parquet(train_path)
    total_counts = total_counts.groupby('aid')['session'].nunique()
    gc.collect()
    total_counts.name = "aid_counts"
    total_counts = total_counts.astype(np.int32)
    df_candidate = pd.merge(df_candidate, total_counts, how='left', left_on='click_predictions', right_index=True)
    return df_candidate

In [6]:
# A function that performs join operation between polar dataframes chunk-by-chunk.
def join_matrice_chunks(df, df_matrice, n_chunks, fields_df):
    for j in range(n_chunks):
        print('j = ' + str(j))
        max_total_aid_x = df_matrice.select(pl.col("aid_x").max())
        aid_x_min = j*max_total_aid_x/n_chunks
        if j + 1 == n_chunks:
            aid_x_max = max_total_aid_x + 1
        else:
            aid_x_max = (j+1)*(max_total_aid_x)/n_chunks
        df_matrice_chunk = df_matrice.filter((pl.col('aid_x') >= aid_x_min) & (pl.col('aid_x') < aid_x_max))
        df = df.join(df_matrice_chunk, left_on=fields_df, how='left', right_on=['aid_x','aid_y'])
        if j == 0:
            df = df.rename({"wgt": "wgt_all"})
        else:
            df = df.with_columns(pl.col("wgt_all").fill_null(pl.col("wgt"))).drop('wgt')
    df = df.rename({"wgt_all": "wgt"}).drop('aid')
    return df
            

In [7]:
# A special function to build features using full click2click co-visitation matrix. This matrix is too big and causes memory crush if using a standard
# function from otto_common_fe. So, I had to write another function that performs the join operation for click2click co-visitation matrix 
# chunk-by-chunk.
def add_big_matrice_data_polars(df_test, count_matrice, df_candidate, col_name, n_max, prediction_col):
    print('start loading matrice')
    df_matrice = pl.read_parquet(count_matrice)
    print('success loading matrice')
    df_matrice = df_matrice.drop('__index_level_0__')
    print(col_name)
    for i in range(n_max):
        print(str(i))
        df_test_i = df_test.filter(pl.col("n") == i).drop('n')
        df_candidate = df_candidate.join(df_test_i, on='session', how='left')
        df_candidate = join_matrice_chunks(df_candidate, df_matrice, 5, ['aid', prediction_col])
        gc.collect()
        if i == 0:
            df_candidate = df_candidate.with_columns(pl.col("wgt").fill_null(0).alias(col_name))
        else:
            df_candidate = df_candidate.with_columns((pl.col("wgt").fill_null(0) + pl.col(col_name))
                                                    .alias(col_name).cast(pl.Float32))
        df_candidate = df_candidate.drop('wgt')
        gc.collect()
    df_candidate = df_candidate.to_pandas()
    return df_candidate

In [8]:
# Builds a feature from "experimental" co-visitation matrix using standard function add_matrice_data_polars from otto_common_fe.
def add_history_experiment_features_pl(input_path, experiment_matrix_path, df_candidate):
    n_max = 10
    col_name = 'wgt_exp'
    time_frame = 5 * 60
    df_test = pd.read_parquet(input_path)
    df_test= otto_common.filter_by_time_and_n_max(df_test, time_frame, n_max)
    df_candidate = pl.from_pandas(df_candidate)
    df_test = pl.from_pandas(df_test)
    df_candidate = otto_common_fe.add_matrice_data_polars(df_test, experiment_matrix_path, df_candidate, col_name, n_max,
                                                          'click_predictions', normalize=True, divide=True)
    return df_candidate

In [9]:
# Builds a feature from regular co-visitation matrix using a modified function, that performs join operation chunk-by-chunk to limit memory usage,
# as regular click2click co-visitation matrix is larger than all the other co-visitation matrixes and using a straightforward join causes memory error.
def add_matrix_weight_pl(input_path, matrix_path, df_candidate):
    df_test = pd.read_parquet(input_path)
    n_max = 5
    col_name = 'wgt_matrix'
    time_frame = 5 * 60
    df_test = pd.read_parquet(input_path)
    df_test= otto_common.filter_by_time_and_n_max(df_test, time_frame, n_max)
    df_candidate = pl.from_pandas(df_candidate)
    df_test = pl.from_pandas(df_test)
    df_candidate = add_big_matrice_data_polars(df_test, matrix_path, df_candidate, col_name, n_max, 'click_predictions')
    return df_candidate

## Feature engineering for cross-validation dataset

In [10]:
# Input paths for cross-validation dataset.
cand_path = '/kaggle/input/otto-click-candidates-validation/candidates_click.parquet'
cv_path = '/kaggle/input/otto-prepare-cv/cv_inputs.parquet'
cv_matrix_path = '/kaggle/input/create-counts-for-clicks/matrix_exact_next_counts_cv.parquet'
experiment_matrix_path = '/kaggle/input/otto-experiment-prepare-candidates-clicks/matrix_experimental_cv.parquet'
matrix_path_cv = '/kaggle/input/otto-prepare-candidates-clicks/regular_click2click_matrix_cv.parquet'
return_rate_path_cv = '/kaggle/input/create-counts-for-clicks/return_rate_cv.parquet'
median_time_viewed_cv = '/kaggle/input/create-counts-for-clicks/time_viewed_agg_cv.parquet'

counts_before_path = '/kaggle/input/create-counts-for-clicks/daily_counts_train.parquet'
counts_during_path = '/kaggle/input/create-counts-for-clicks/daily_counts_cv1.parquet'
train_path = '/kaggle/input/otto-prepare-cv/cv_train.parquet'

In [11]:
%%time
# Building features for the first cross-validation dataset.

df_cand = pd.read_parquet(cand_path)
df_cand = otto_common_fe.cand_read_and_reduce(df_cand, 'click', True)
df_cand = add_matrix_weight_pl(cv_path, matrix_path_cv, df_cand)
df_cand = add_history_experiment_features_pl(cv_path, experiment_matrix_path, df_cand)
df_cand = otto_common_fe.add_history_aid_features(cv_path, df_cand, 'click_predictions')
df_cand = otto_common_fe.add_history_agg_features(cv_path, df_cand, False)
df_cand = otto_common_fe.add_time_viewed(cv_path, df_cand, 'click_predictions')
df_cand = otto_common_fe.add_daily_averages(counts_before_path,counts_during_path, df_cand, 'click_predictions')
df_cand = otto_common_fe.add_daily_averages_same_day(counts_during_path, df_cand, 'click_predictions')
df_cand = otto_common_fe.add_weekly_averages(counts_before_path,counts_during_path, df_cand, 'click_predictions')
df_cand = add_matrix_calculations(cv_matrix_path, df_cand)
df_cand = otto_common_fe.add_median_time_viewed(median_time_viewed_cv, df_cand, 'click_predictions')
df_cand = add_total_counts(train_path, df_cand)
df_cand = otto_common_fe.add_type_last(cv_path, df_cand, 'click_predictions')

start loading matrice
success loading matrice
wgt_matrix
0
j = 0
j = 1
j = 2
j = 3
j = 4
1
j = 0
j = 1
j = 2
j = 3
j = 4
2
j = 0
j = 1
j = 2
j = 3
j = 4
3
j = 0
j = 1
j = 2
j = 3
j = 4
4
j = 0
j = 1
j = 2
j = 3
j = 4
start normalizing
wgt_exp
0
1
2
3
4
5
6
7
8
9
CPU times: user 49min 53s, sys: 15min 8s, total: 1h 5min 1s
Wall time: 39min 18s


In [12]:
# Check file size and export to file.
size = df_cand.memory_usage(deep='True').sum()
print(naturalsize(size))
df_cand.to_parquet('cv1_features.parquet')

del df_cand
gc.collect()

4.0 GB


0

In [13]:
'''
%%time
# this sell was used to see feature values for ground truth aids that were NOT selected during candidate generation

df_cand = pd.read_parquet(cand_path)
df_cand = cand_read_and_reduce_test_not_cands(cand_path, True)
df_cand = add_history_experiment_features_pl(cv_path, experiment_matrix_path, df_cand)
df_cand = otto_common_fe.add_history_aid_features(cv_path, df_cand, 'click_predictions')
df_cand = otto_common_fe.add_history_agg_features(cv_path, df_cand, False)
df_cand = otto_common_fe.add_time_viewed(cv_path, df_cand, 'click_predictions')
df_cand = otto_common_fe.add_daily_averages(counts_before_path,counts_during_path, df_cand, 'click_predictions')
df_cand = add_daily_averages_same_day(counts_during_path, df_cand)
df_cand = otto_common_fe.add_weekly_averages(counts_before_path,counts_during_path, df_cand, 'click_predictions')
df_cand = add_matrix_calculations(cv_matrix_path, df_cand)
df_cand = otto_common_fe.add_median_time_viewed(median_time_viewed_cv, df_cand, 'click_predictions')
df_cand = add_total_counts(train_path, df_cand)
df_cand = otto_common_fe.add_type_last(cv_path, df_cand, 'click_predictions')
'''

"\n%%time\n# this sell was used to see feature values for ground truth aids that were NOT selected during candidate generation\n\ndf_cand = pd.read_parquet(cand_path)\ndf_cand = cand_read_and_reduce_test_not_cands(cand_path, True)\ndf_cand = add_history_experiment_features_pl(cv_path, experiment_matrix_path, df_cand)\ndf_cand = otto_common_fe.add_history_aid_features(cv_path, df_cand, 'click_predictions')\ndf_cand = otto_common_fe.add_history_agg_features(cv_path, df_cand, False)\ndf_cand = otto_common_fe.add_time_viewed(cv_path, df_cand, 'click_predictions')\ndf_cand = otto_common_fe.add_daily_averages(counts_before_path,counts_during_path, df_cand, 'click_predictions')\ndf_cand = add_daily_averages_same_day(counts_during_path, df_cand)\ndf_cand = otto_common_fe.add_weekly_averages(counts_before_path,counts_during_path, df_cand, 'click_predictions')\ndf_cand = add_matrix_calculations(cv_matrix_path, df_cand)\ndf_cand = otto_common_fe.add_median_time_viewed(median_time_viewed_cv, df_ca

## Feature engineering for the test dataset

In [14]:
# Input paths for the test dataset.
cand_test_path = '/kaggle/input/otto-click-candidates-validation/candidates_test.parquet'
test_path = '/kaggle/input/otto-prepare-cv/test.parquet'
test_matrix_path = '/kaggle/input/create-counts-for-clicks/matrix_exact_next_counts_full.parquet'
experiment_matrix_path = '/kaggle/input/otto-experiment-prepare-candidates-clicks/matrix_experimental_test.parquet'
matrix_path_test = '/kaggle/input/otto-prepare-candidates-clicks/regular_click2click_matrix_test.parquet'
return_rate_path_test = '/kaggle/input/create-counts-for-clicks/return_rate_test.parquet'
median_time_viewed_test = '/kaggle/input/create-counts-for-clicks/time_viewed_agg_test.parquet'

counts_before_path = '/kaggle/input/create-counts-for-clicks/daily_counts_test_full.parquet'
counts_during_path = '/kaggle/input/create-counts-for-clicks/daily_counts_test_trunked.parquet'
train_full_path = '/kaggle/input/otto-prepare-cv/train_full.parquet'



In [15]:
# Split test dataset into 2 chunks and build features for each chunk.
n_splits=2

for i in range(n_splits):
    df = pd.read_parquet(cand_test_path)
    df_cand = otto_common.divide_df_by_column(df, n_splits, i, 'session')
    del df
    gc.collect()
    df_cand = otto_common_fe.cand_read_and_reduce(df_cand, 'click', False)
    df_cand = add_matrix_weight_pl(test_path, matrix_path_test, df_cand)
    df_cand = add_history_experiment_features_pl(test_path, experiment_matrix_path, df_cand)
    df_cand = otto_common_fe.add_history_aid_features(test_path, df_cand, 'click_predictions')
    df_cand = otto_common_fe.add_history_agg_features(test_path, df_cand, False)
    df_cand = otto_common_fe.add_time_viewed(test_path, df_cand, 'click_predictions')
    df_cand = otto_common_fe.add_daily_averages(counts_before_path,counts_during_path, df_cand, 'click_predictions')
    df_cand = otto_common_fe.add_daily_averages_same_day(counts_during_path, df_cand, 'click_predictions')
    df_cand = otto_common_fe.add_weekly_averages(counts_before_path,counts_during_path, df_cand, 'click_predictions')
    df_cand = add_matrix_calculations(test_matrix_path, df_cand)
    df_cand = otto_common_fe.add_median_time_viewed(median_time_viewed_test, df_cand, 'click_predictions')
    df_cand = add_total_counts(train_full_path, df_cand)
    df_cand = otto_common_fe.add_type_last(test_path, df_cand, 'click_predictions')
    gc.collect()
    string_i = 'test_features_cart_part_' + str(i) + '.parquet'
    df_cand.to_parquet(string_i)


start loading matrice
success loading matrice
wgt_matrix
0
j = 0
j = 1
j = 2
j = 3
j = 4
1
j = 0
j = 1
j = 2
j = 3
j = 4
2
j = 0
j = 1
j = 2
j = 3
j = 4
3
j = 0
j = 1
j = 2
j = 3
j = 4
4
j = 0
j = 1
j = 2
j = 3
j = 4
start normalizing
wgt_exp
0
1
2
3
4
5
6
7
8
9
start loading matrice
success loading matrice
wgt_matrix
0
j = 0
j = 1
j = 2
j = 3
j = 4
1
j = 0
j = 1
j = 2
j = 3
j = 4
2
j = 0
j = 1
j = 2
j = 3
j = 4
3
j = 0
j = 1
j = 2
j = 3
j = 4
4
j = 0
j = 1
j = 2
j = 3
j = 4
start normalizing
wgt_exp
0
1
2
3
4
5
6
7
8
9


In [16]:
size = df_cand.memory_usage(deep='True').sum()
print(naturalsize(size))

3.1 GB
