# OTTO common functions for feature engineering
This notebook contains the functions used for feature engineering in the OTTO project.
The project's three feature engineering notebooks share a large number of nearly identical functions, and copying those functions between notebooks made them too long and hard to manage. I therefore moved the functions common to those notebooks into this shared notebook.
## Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
     
import gc
from datetime import datetime

# functions and classes common for several notebooks of current project
import otto_common

In [2]:
# Use polars to speed up the most time-consuming operations.
!pip install polars
import polars as pl

[0m

## General functions for feature engineering
These functions are used in more than one of the feature engineering notebooks, i.e. the notebooks that import already generated candidates and add information to them so that the ranker model can rank the candidates.

In [3]:
# Load the candidates, reduce datatypes and in case of cross-validation dataset also filter out sessions without positive candidates. 
def cand_read_and_reduce(df, target_string, is_cv):
    """Explode per-session candidate lists into one row per (session, candidate).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'session' and '<target_string>_predictions' (a list-like of
        candidate aids per session). When ``is_cv`` is true it must also contain
        'pred_true' and the ground-truth column '<target_string>s'.
    target_string : str
        Target name without the plural 's', e.g. 'click', 'cart', 'order'.
    is_cv : bool
        If true, sessions with ``pred_true <= 0`` are dropped and a boolean
        'target' column is added; the ground-truth column is removed afterwards.

    Returns
    -------
    pandas.DataFrame
        Columns 'session' (int32), '<target_string>_predictions' (int32) and,
        for cross-validation data, 'target' (bool). The input frame is not modified.
    """
    target_predictions = target_string + '_predictions'
    keep_columns = ['session', target_predictions]
    if is_cv:
        ground_truth = target_string + 's'
        keep_columns.append(ground_truth)
        df = df.loc[df['pred_true'] > 0]
    # Explicit copy: the assignments below must not write into a view of the caller's frame.
    df = df[keep_columns].copy()
    df['session'] = df['session'].astype(np.int32)
    df = df.explode(target_predictions).reset_index(drop=True)
    df[target_predictions] = df[target_predictions].astype(np.int32)
    gc.collect()
    if is_cv:
        if target_string == 'click':
            # Clicks have a single ground-truth aid per session, so plain equality is enough.
            df['target'] = df[target_predictions] == df[ground_truth]
        else:
            # Carts/orders have a collection of ground-truth aids per session.
            # A list comprehension avoids the slow row-wise DataFrame.apply and also
            # works on an empty frame (apply(axis=1) returns a DataFrame there).
            df['target'] = [
                candidate in truth
                for candidate, truth in zip(df[target_predictions], df[ground_truth])
            ]
        df['target'] = df['target'].astype(bool)
        df = df.drop(columns=ground_truth)
        gc.collect()
    return df

In [4]:
# For candidates present in session history, add a few features, including:
# A. position in session history ('n')
# B. time in seconds from last mention in session history to last known event ('time_delta')
# C. number of interactions with the item ('count_views')
# For candidates that do not come from session history, the function fills in default values.
def add_history_aid_features(input_path, df_candidate, prediction_col):
    """Attach per-(session, aid) history features to the candidates.

    Parameters
    ----------
    input_path : str or path-like
        Parquet file with session events; the columns 'session', 'aid' and 'ts' are read.
    df_candidate : pandas.DataFrame
        Candidates with a 'session' column and the candidate aid in ``prediction_col``.
    prediction_col : str
        Name of the candidate aid column in ``df_candidate``.

    Returns
    -------
    pandas.DataFrame
        ``df_candidate`` with three extra columns:
        'n' (int16) - smallest position of the aid counted from the end of the session
        (0 = last event), 10000 if the aid is not in the session history;
        'time_delta' (int32) - smallest gap between an event with the aid and the last
        event of the session, 1000000 if absent;
        'count_views' (int8) - number of distinct timestamps with the aid, clipped to 125,
        0 if absent.
    """
    df_test = pd.read_parquet(input_path)
    # Newest event first, so cumcount() gives the position from the end of the session.
    df_test = df_test.sort_values(['session', 'ts'], ascending=[True, False])
    df_test['n'] = df_test.groupby('session').cumcount().astype(np.int16)
    # String aliases instead of np.max / np.min: passing numpy callables to groupby
    # transform/agg is deprecated in recent pandas.
    session_last_ts = df_test.groupby('session')['ts'].transform('max').astype(np.int32)
    df_test['time_delta'] = session_last_ts - df_test['ts']
    df_test = df_test.groupby(['session', 'aid']).agg(
        {'n': 'min', 'time_delta': 'min', 'ts': 'nunique'})
    df_test = df_test.rename(columns={'ts': 'count_views'}).reset_index()
    df_candidate = pd.merge(df_candidate, df_test, how='left',
                            left_on=['session', prediction_col], right_on=['session', 'aid'])
    del df_candidate['aid']
    gc.collect()
    # Defaults for candidates that never appeared in the session history.
    df_candidate['n'] = df_candidate['n'].fillna(10000).astype(np.int16)
    df_candidate['time_delta'] = df_candidate['time_delta'].fillna(1000000).astype(np.int32)
    df_candidate['count_views'] = df_candidate['count_views'].fillna(0).clip(0, 125).astype(np.int8)
    return df_candidate

In [5]:
# Adds columns, including last aid, aid before last, day of week for the last event and time between last two events.
# Time between last two aids in session is used as a feature in all the models, rest of columns are used to engineer other features.

def add_history_agg_features(input_path, df_candidate, remove_first_second=True):
    """Attach session-level features derived from the last two events of each session.

    Reads the events from the parquet file at ``input_path`` and, per session, takes the
    most recent event ('first') and the one before it ('second'). The candidates receive:

    * 'ts_diff' (int32)   - ts of the last event minus ts of the previous one,
      1000000 when the session has a single event;
    * 'day_of_week' (int8) - day of week of the last event, taken from the 'time' column
      that ``otto_common.add_datetime`` is expected to add (helper not visible here);
    * 'first_aid' / 'second_aid' - aids of the last two events (second_aid is -1 when
      missing); only kept when ``remove_first_second`` is False.

    Returns the candidates left-joined with these columns on 'session'.
    """
    events = pd.read_parquet(input_path)
    events = events.sort_values(['session', 'ts'], ascending=[True, False])
    # Position counted from the end of the session: 0 = last event, 1 = the one before.
    events['n'] = events.groupby('session').cumcount().astype(np.int16)

    def _events_at(position, prefix):
        # Rows at the given position from the end, with every column name prefixed.
        return events.loc[events['n'] == position].drop('n', axis=1).add_prefix(prefix)

    last_two = pd.merge(_events_at(0, 'first_'), _events_at(1, 'second_'),
                        left_on='first_session', right_on='second_session', how='left')
    last_two['ts_diff'] = last_two['first_ts'] - last_two['second_ts']
    last_two = last_two[['first_session', 'first_aid', 'first_ts', 'second_aid', 'ts_diff']]
    # Sessions with a single event have no 'second' row: fill with sentinel values.
    last_two['second_aid'] = last_two['second_aid'].fillna(-1).astype(np.int32)
    last_two['ts_diff'] = last_two['ts_diff'].fillna(1000000).astype(np.int32)
    last_two = last_two.rename(columns={'first_session': 'session'})
    last_two = otto_common.add_datetime(last_two, 'first_ts')
    last_two['day_of_week'] = last_two['time'].dt.dayofweek.astype(np.int8)
    last_two = last_two[['session', 'first_aid', 'second_aid', 'ts_diff', 'day_of_week']]

    df_candidate = pd.merge(df_candidate, last_two, how='left', on='session')
    if remove_first_second:
        df_candidate = df_candidate.drop(columns=['first_aid', 'second_aid'])
        gc.collect()
    return df_candidate

In [6]:
# Calculates time from a moment user interacted with aid to the next event for candidates, present in session history.
# Those values are clipped to 180 seconds and are then summed up if a user interacted with aid more than once.
def add_time_viewed(input_path, df_candidate, prediction_col):
    """Add 'time_viewed': total time between events with the candidate aid and the next event.

    Events are read from the parquet file at ``input_path`` and numbered per session in
    file order (no sorting is done here). Each event is paired with the following event of
    the same session; the ts difference is clipped to [0, 180] and summed per
    (session, aid). Candidates without a match get 0. The column is stored as int16.
    """
    events = pd.read_parquet(input_path)
    events['n'] = events.groupby('session').cumcount().astype(np.int16)
    events = events.drop(columns='type')
    gc.collect()

    # Shift the numbering by one so that joining on (session, n) pairs every event
    # (suffix _x) with the event that follows it (suffix _y).
    following = events.assign(n=events['n'] - 1)
    pairs = events.merge(following, how='inner', on=['session', 'n'])
    del following
    gc.collect()

    dwell = (pairs['ts_y'] - pairs['ts_x']).clip(0, 180)
    viewed = (
        pairs[['session', 'aid_x']]
        .assign(time_viewed=dwell)
        .groupby(['session', 'aid_x'])
        .agg({'time_viewed': 'sum'})
        .reset_index()
    )

    df_candidate = df_candidate.merge(viewed, how='left',
                                      left_on=['session', prediction_col],
                                      right_on=['session', 'aid_x'])
    df_candidate['time_viewed'] = df_candidate['time_viewed'].fillna(0).astype(np.int16)
    return df_candidate.drop('aid_x', axis=1)

In [7]:
# Adds values for averaged daily clicks/carts/orders of every aid.
# Those values are pre-calculated in another notebook (counts_for_clicks or counts_for_buys).
def add_daily_averages(daily_counts_before, daily_counts_during, df_candidate, prediction_col):
    """Add 'daily_aid_count': the aid's pre-calculated count for the day before the session's last day.

    Parameters
    ----------
    daily_counts_before, daily_counts_during : str or path-like
        Parquet files with the columns 'aid', 'day_of_week' and 'aid_count'.
        From the "before" file only day 6 is used (it serves candidates whose
        'day_of_week' is 0); from the "during" file days 0..5 are used (day ``d``
        serves candidates whose 'day_of_week' is ``d + 1``).
    df_candidate : pandas.DataFrame
        Candidates with a 'day_of_week' column and the candidate aid in ``prediction_col``.
    prediction_col : str
        Name of the candidate aid column.

    Returns
    -------
    pandas.DataFrame
        ``df_candidate`` plus 'daily_aid_count' (float32, 0 where no count is known).
    """
    count_columns = ['aid', 'day_of_week', 'aid_count']

    df_before = pd.read_parquet(daily_counts_before)
    # .copy() so that re-labelling the day does not write into a slice of df_before.
    previous_week_end = df_before.loc[df_before['day_of_week'] == 6, count_columns].copy()
    previous_week_end['day_of_week'] = 0
    del df_before

    df_during = pd.read_parquet(daily_counts_during)
    previous_days = df_during.loc[df_during['day_of_week'].isin(range(6)), count_columns].copy()
    previous_days['day_of_week'] = previous_days['day_of_week'] + 1
    del df_during
    gc.collect()

    # Every shifted day value comes from exactly one source, so a single lookup table and
    # one merge give the same result as merging the (large) candidate frame once per day.
    lookup = pd.concat([previous_week_end, previous_days], ignore_index=True)
    lookup = lookup.rename(columns={'aid_count': 'daily_aid_count'})

    df_candidate = pd.merge(df_candidate, lookup, how='left',
                            left_on=[prediction_col, 'day_of_week'],
                            right_on=['aid', 'day_of_week'])
    df_candidate = df_candidate.drop('aid', axis=1)
    df_candidate['daily_aid_count'] = df_candidate['daily_aid_count'].fillna(0).astype(np.float32)
    return df_candidate

In [8]:
# Calculates average weekly clicks/carts/orders by summing up the pre-calculated daily clicks/carts/orders.
# Those values are pre-calculated in another notebook (counts_for_clicks or counts_for_buys).
def add_weekly_averages(daily_counts_before, daily_counts_during, df_candidate, prediction_col):
    """Add 'aid_count_weekly': the aid's count summed over the 7 days preceding the session's last day.

    Parameters
    ----------
    daily_counts_before, daily_counts_during : str or path-like
        Parquet files with the columns 'aid', 'day_of_week' and 'aid_count'.
    df_candidate : pandas.DataFrame
        Candidates with a 'day_of_week' column and the candidate aid in ``prediction_col``.
    prediction_col : str
        Name of the candidate aid column.

    Returns
    -------
    pandas.DataFrame
        ``df_candidate`` plus 'aid_count_weekly' (float32, 0 where no count is known).

    Notes
    -----
    For a candidate whose 'day_of_week' is ``d`` the 7-day window consists of days
    ``d..6`` of the "before" file followed by days ``0..d-1`` of the "during" file.
    Missing counts are treated as 0.
    """
    count_columns = ['aid', 'aid_count']
    df_before = pd.read_parquet(daily_counts_before)
    df_during = pd.read_parquet(daily_counts_during)

    # Build one long table (aid, aid_count, target day) and aggregate it once, instead of
    # chaining outer merges per day and merging the large candidate frame 7 times.
    windows = []
    for day in range(7):
        from_before = df_before.loc[df_before['day_of_week'].between(day, 6), count_columns]
        from_during = df_during.loc[df_during['day_of_week'].between(0, day - 1), count_columns]
        windows.append(pd.concat([from_before, from_during]).assign(day_of_week=day))
    weekly = (
        pd.concat(windows, ignore_index=True)
        .groupby(['aid', 'day_of_week'], as_index=False)['aid_count']
        .sum()
        .rename(columns={'aid_count': 'aid_count_weekly'})
    )
    del windows, df_before, df_during
    gc.collect()

    df_candidate = pd.merge(df_candidate, weekly, how='left',
                            left_on=[prediction_col, 'day_of_week'],
                            right_on=['aid', 'day_of_week'])
    df_candidate = df_candidate.drop('aid', axis=1)
    df_candidate['aid_count_weekly'] = df_candidate['aid_count_weekly'].fillna(0).astype(np.float32)
    return df_candidate

In [9]:
# Data for a median time aid was viewed (means median time from an event with aid to any next event).
# Those median times are calculated in create_counts_for_clicks notebook.
def add_median_time_viewed(time_viewed_path, df_candidate, prediction_col):
    """Add the pre-calculated 'time_viewed_clipped' value of every candidate aid.

    The lookup table is read from the parquet file at ``time_viewed_path`` and joined
    on its 'aid_x' column against ``prediction_col``. Candidates without an entry get
    the default value 60.
    """
    median_times = pd.read_parquet(time_viewed_path)
    enriched = df_candidate.merge(median_times, how='left',
                                  left_on=prediction_col, right_on='aid_x')
    enriched['time_viewed_clipped'] = enriched['time_viewed_clipped'].fillna(60)
    return enriched.drop(columns='aid_x')

In [10]:
# Calculates the most recent event type for aids present in session history.
# If aid was either added to cart or ordered, the function selects the last of these event types.
# Click event type is only selected for aids that were clicked, but never carted or ordered.
def add_type_last(input_path, df_candidate, prediction_col):
    """Add 'type_last': the last non-zero event type recorded for the candidate aid in its session.

    Events with ``type > 0`` are read from the parquet file at ``input_path`` and, per
    (session, aid), the last one in file order is kept. Candidates whose aid has no such
    event in the session get 0. The column is stored as int8.
    """
    events = pd.read_parquet(input_path)
    non_click_events = events.loc[events['type'] > 0]
    last_type = (
        non_click_events
        .groupby(['session', 'aid'])
        .agg({'type': 'last'})
        .rename(columns={'type': 'type_last'})
    )
    # Join the candidate columns against the (session, aid) MultiIndex of the lookup.
    df_candidate = pd.merge(df_candidate, last_type, how='left',
                            left_on=['session', prediction_col], right_index=True)
    df_candidate['type_last'] = df_candidate['type_last'].fillna(0).astype(np.int8)
    return df_candidate

In [11]:
# Total number of events in last 3 hours of the session.
def count_events_3hours(input_path, df_candidate):
    """Add 'events_last_3hours': number of session events within 3*60*60 ts units of the last event.

    Parameters
    ----------
    input_path : str or path-like
        Parquet file with session events; the columns 'session' and 'ts' are read.
    df_candidate : pandas.DataFrame
        Candidates with a 'session' column.

    Returns
    -------
    pandas.DataFrame
        ``df_candidate`` plus 'events_last_3hours' (int8, clipped to 125). Sessions that
        do not occur in the events file get 0.
    """
    df_sessions = pd.read_parquet(input_path)
    # String alias instead of np.max: numpy callables in groupby.transform are deprecated.
    session_last_ts = df_sessions.groupby('session')['ts'].transform('max').astype(np.int32)
    recent = df_sessions.loc[(session_last_ts - df_sessions['ts']) < 3 * 60 * 60]
    events_count = recent.groupby('session').size().rename('events_last_3hours')
    df_candidate = pd.merge(df_candidate, events_count, how='left',
                            left_on='session', right_index=True)
    # fillna before the integer cast: a candidate session missing from the events file
    # would otherwise leave NaN and make astype(np.int8) raise.
    df_candidate['events_last_3hours'] = (
        df_candidate['events_last_3hours'].fillna(0).clip(0, 125).astype(np.int8)
    )
    return df_candidate

In [12]:
# W2vec mean similarity between last aid and previous 4 aids before the last one.
# Loads similarities pre-calculated in the create_counts_for_buys notebook.
def add_history_similarity(history_path, df_candidate):
    """Left-join the per-session table stored at ``history_path`` onto the candidates.

    The parquet file must contain a 'session' column; all of its other columns are
    appended to ``df_candidate``. Sessions without an entry get missing values.
    """
    session_similarity = pd.read_parquet(history_path)
    return df_candidate.merge(session_similarity, how='left', on='session')

In [13]:
# Total amount of buys in session.
def add_total_buys_in_session(trunked_sessions, df_candidate):
    """Add 'buys_this_session': number of events with ``type > 0`` in the candidate's session.

    Events are read from the parquet file at ``trunked_sessions``. Sessions without
    such events get 0. The column is stored as int16.
    """
    events = pd.read_parquet(trunked_sessions)
    buys_per_session = (
        events.loc[events['type'] > 0]
        .groupby('session')
        .size()
        .rename("buys_this_session")
    )
    gc.collect()
    df_candidate = pd.merge(df_candidate, buys_per_session, how='left',
                            left_on='session', right_index=True)
    df_candidate['buys_this_session'] = (
        df_candidate['buys_this_session'].fillna(0).astype(np.int16)
    )
    return df_candidate

In [14]:
# Time in seconds from first to last event.
def add_session_time(input_path, df_candidate):
    """Add 'session_time': difference between the largest and smallest ts of the session.

    Parameters
    ----------
    input_path : str or path-like
        Parquet file with session events; the columns 'session' and 'ts' are read.
    df_candidate : pandas.DataFrame
        Candidates with a 'session' column.

    Returns
    -------
    pandas.DataFrame
        ``df_candidate`` plus 'session_time'. The per-session value is computed as
        int32; sessions missing from the events file are left as NaN (no default is
        filled in), which turns the merged column into a float column.
    """
    df_sessions = pd.read_parquet(input_path)
    ts_by_session = df_sessions.groupby('session')['ts']
    # Built-in groupby reductions instead of agg(np.min / np.max): passing numpy
    # callables to groupby.agg is deprecated in recent pandas.
    session_time = (ts_by_session.max() - ts_by_session.min()).astype(np.int32)
    session_time = session_time.rename('session_time')
    df_candidate = pd.merge(df_candidate, session_time, how='left',
                            left_on='session', right_index=True)
    return df_candidate

In [15]:
# Average clicks on each aid before it is bought for the first time.
# Loads values, pre-calculated in create_counts_for_buys notebook.
def add_clicks_before_buy(clicks_before_buy_path, df_candidate, prediction_col):
    """Add the pre-calculated 'clicks_before_buy' value of every candidate aid.

    The lookup table is read from the parquet file at ``clicks_before_buy_path`` and
    joined on its 'aid' column against ``prediction_col``. Candidates without an entry
    get the default 2. The column is stored as float32.

    Note: the lookup's 'aid' key column is not dropped and stays in the returned frame.
    """
    clicks_lookup = pd.read_parquet(clicks_before_buy_path)
    enriched = df_candidate.merge(clicks_lookup, how='left',
                                  left_on=prediction_col, right_on='aid')
    filled = enriched['clicks_before_buy'].fillna(2)
    enriched['clicks_before_buy'] = filled.astype(np.float32)
    return enriched

In [16]:
# Load daily averages for the day of last known event in session.
def add_daily_averages_same_day(daily_counts_during, df_candidate, prediction_col):
    """Add 'same_day_aid_count': the aid's pre-calculated count for the candidate's own day of week.

    Parameters
    ----------
    daily_counts_during : str or path-like
        Parquet file with the columns 'aid', 'day_of_week' and 'aid_count'.
        Only rows with 'day_of_week' in 0..6 are used.
    df_candidate : pandas.DataFrame
        Candidates with a 'day_of_week' column and the candidate aid in ``prediction_col``.
        The frame is not modified in place.
    prediction_col : str
        Name of the candidate aid column.

    Returns
    -------
    pandas.DataFrame
        ``df_candidate`` plus 'same_day_aid_count' (float32, 0 where no count is known).
    """
    df_during = pd.read_parquet(daily_counts_during)
    same_day = df_during.loc[df_during['day_of_week'].isin(range(7)),
                             ['aid', 'day_of_week', 'aid_count']]
    same_day = same_day.rename(columns={'aid_count': 'same_day_aid_count'})
    del df_during
    gc.collect()
    # The join key already contains day_of_week, so one merge is equivalent to merging
    # the candidates separately for each of the 7 days. This also removes the need to
    # pre-create the column with np.NaN, an alias that no longer exists in NumPy 2.0.
    df_candidate = pd.merge(df_candidate, same_day, how='left',
                            left_on=[prediction_col, 'day_of_week'],
                            right_on=['aid', 'day_of_week'])
    df_candidate = df_candidate.drop('aid', axis=1)
    df_candidate['same_day_aid_count'] = (
        df_candidate['same_day_aid_count'].fillna(0).astype(np.float32)
    )
    return df_candidate

## Functions to build features from co-visitation matrices

In [17]:
# This function normalizes matrix before calculating features.
# Normalize here means to divide all weights by sum of weights per aid_x.
# Some co-visitation matrices are normalized before calculating features, while others are not.
def normalize_matrice(df):
    """Rescale the 'wgt' column so that the weights of every 'aid_x' sum to 100.

    Parameters
    ----------
    df : polars.DataFrame
        Co-visitation matrix with at least the columns 'aid_x' and 'wgt'.

    Returns
    -------
    polars.DataFrame
        Same columns as the input, with 'wgt' replaced by
        ``100 * wgt / sum(wgt over aid_x)`` cast to Float32.
    """
    print('start normalizing')
    # with_columns replaces DataFrame.with_column, which has been removed from polars.
    # The per-aid_x sum is computed as a window expression, so no helper column is needed.
    return df.with_columns(
        (100 * pl.col("wgt") / pl.col("wgt").sum().over("aid_x"))
        .cast(pl.Float32)
        .alias("wgt")
    )

In [18]:
# This function is used to build features based on co-visitation matrixes. 
# It sums weights for n_max last aids in each session (aid_x) and the candidate aid (aid_y).
# This function is computationally heavy, but it builds a number of features with 
# very high feature importance. I had to rewrite it using polars, to speed up feature generation.
# Same function on pandas runs too slow.

def add_matrice_data_polars(df_test, count_matrice, df_candidate, col_name, n_max, prediction_col,
                            normalize=False, divide=False):
    """Sum co-visitation weights between the last ``n_max`` session aids and each candidate.

    Parameters
    ----------
    df_test : polars.DataFrame
        Session history; filtered on 'n' and joined on 'session', and must provide the
        'aid' column used as the left side of the matrix lookup.
    count_matrice : str or path-like
        Parquet file with the co-visitation matrix (columns 'aid_x', 'aid_y', 'wgt').
    df_candidate : polars.DataFrame
        Candidates with a 'session' column and the candidate aid in ``prediction_col``.
    col_name : str
        Name of the feature column to create.
    n_max : int
        Number of history positions (``n = 0 .. n_max - 1``) to accumulate.
    prediction_col : str
        Name of the candidate aid column.
    normalize : bool, default False
        If true, the matrix is passed through ``normalize_matrice`` first.
    divide : bool, default False
        If true, the weight found for position ``i`` is divided by ``i + 1`` before
        it is added, so older history items contribute less.

    Returns
    -------
    pandas.DataFrame
        The candidates converted to pandas, with the extra column ``col_name``.
    """
    df_matrice = pl.read_parquet(count_matrice)
    df_matrice = df_matrice.unique(subset=['aid_x', 'aid_y'])
    # The pandas index column is only present when the file was written with an index;
    # dropping it unconditionally fails on files that do not have it.
    if '__index_level_0__' in df_matrice.columns:
        df_matrice = df_matrice.drop('__index_level_0__')
    if normalize:
        df_matrice = normalize_matrice(df_matrice)
    print(col_name)
    for i in range(n_max):
        print(str(i))
        # History aid at position i of every session.
        df_test_i = df_test.filter(pl.col("n") == i).drop('n')
        df_candidate = df_candidate.join(df_test_i, on='session', how='left')
        # Look up the weight of the pair (history aid, candidate aid).
        df_candidate = df_candidate.join(df_matrice, left_on=['aid', prediction_col], how='left',
                                         right_on=['aid_x', 'aid_y'])
        df_candidate = df_candidate.drop('aid')
        gc.collect()
        # with_columns replaces DataFrame.with_column, which has been removed from polars.
        if i == 0:
            df_candidate = df_candidate.with_columns(pl.col("wgt").fill_null(0).alias(col_name))
        else:
            if divide:
                df_candidate = df_candidate.with_columns(
                    (pl.col("wgt").fill_null(0) / (i + 1) + pl.col(col_name))
                    .alias(col_name).cast(pl.Float32))
            else:
                df_candidate = df_candidate.with_columns(
                    (pl.col("wgt").fill_null(0) + pl.col(col_name))
                    .alias(col_name).cast(pl.Float32))
        df_candidate = df_candidate.drop('wgt')
        gc.collect()
    df_candidate = df_candidate.to_pandas()
    return df_candidate

In [19]:
# NOTE: the triple-quoted block below is a bare string expression, so nothing inside it is
# executed or defined. It only archives the earlier pandas implementation for reference.
'''
# The same function on pandas, the one that works slowly.
def add_matrice_data(df_test, count_matrice, df_candidate, n_max, col_name, time_frame, normalize=False):
    df_test = df_test.sort_values(['session','ts'],ascending=[True,False])
    df_test['n'] = df_test.groupby('session').cumcount().astype(np.int16)
    df_test['time_delta'] = df_test.groupby('session')['ts'].transform(np.max).astype(np.int32)
    df_test['time_delta'] = df_test['time_delta'] - df_test['ts']
    df_test = df_test.loc[df_test['time_delta'] < time_frame]
    gc.collect()
    df_test = df_test[['session', 'aid', 'n']]
    df_test = df_test.loc[df_test['n'] < n_max]
    gc.collect()
    df_matrice = pd.read_parquet(count_matrice)
    df_matrice = df_matrice.drop_duplicates(subset=['aid_x', 'aid_y'])
    df_matrice = df_matrice.set_index(['aid_x', 'aid_y'])
    print(col_name)
    for i in range(n_max):
        print(str(i))
        df_test_i = df_test.loc[df_test['n'] == i]
        df_test_i = df_test_i[['session', 'aid']]
        df_test_i = df_test_i.set_index('session')
        df_candidate = df_candidate.join(df_test_i, on='session', how='left')
        df_candidate = df_candidate.join(df_matrice, on=['aid', 'click_predictions'], how='left')
        df_candidate = df_candidate.drop('aid', axis=1)
        gc.collect()
        if i == 0:
            df_candidate[col_name] = df_candidate['wgt'].fillna(0)
            df_candidate[col_name] = df_candidate[col_name].astype(np.float32)
        else:
            if normalize:
                df_candidate[col_name] = df_candidate[col_name] + (df_candidate['wgt'].fillna(0))/(i+1)
            else:
                df_candidate[col_name] = df_candidate[col_name] + df_candidate['wgt'].fillna(0)
        df_candidate = df_candidate.drop(['wgt'], axis=1)
        gc.collect()
    return df_candidate
'''

"\n# The same function on pandas, the one that works slowly.\ndef add_matrice_data(df_test, count_matrice, df_candidate, n_max, col_name, time_frame, normalize=False):\n    df_test = df_test.sort_values(['session','ts'],ascending=[True,False])\n    df_test['n'] = df_test.groupby('session').cumcount().astype(np.int16)\n    df_test['time_delta'] = df_test.groupby('session')['ts'].transform(np.max).astype(np.int32)\n    df_test['time_delta'] = df_test['time_delta'] - df_test['ts']\n    df_test = df_test.loc[df_test['time_delta'] < time_frame]\n    gc.collect()\n    df_test = df_test[['session', 'aid', 'n']]\n    df_test = df_test.loc[df_test['n'] < n_max]\n    gc.collect()\n    df_matrice = pd.read_parquet(count_matrice)\n    df_matrice = df_matrice.drop_duplicates(subset=['aid_x', 'aid_y'])\n    df_matrice = df_matrice.set_index(['aid_x', 'aid_y'])\n    print(col_name)\n    for i in range(n_max):\n        print(str(i))\n        df_test_i = df_test.loc[df_test['n'] == i]\n        df_test_