# Calculations for buys
This notebook performs calculations that are later used to create features for buys (that is, for both the carts and the orders models). These calculations do not need any information about cart/order candidates and can therefore be done separately.
Those calculations include:
* conversion rate - means conversion from click to either cart or order
* conversion to carts - conversion from either clicks, previously carted aids or previously ordered aids to carts
* conversion to orders - conversion from either clicks, carts or previously ordered aids to new orders
* average number of clicks per aid before a buy
* daily total carts/orders per aid
* average word2vec similarity between the last aid in a session and up to 4 aids preceding it

## Imports and definitions

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc
from humanize import naturalsize
from gensim.models import Word2Vec
from pandarallel import pandarallel

# functions and classes common for several notebooks of current project
import otto_common

In [2]:
# Count how often users buy a product given they clicked to view it
def count_conversion(train_data):
    """Compute a smoothed "seen -> bought" conversion rate for every aid.

    For each aid the rate is ``(2 + buy_sessions) / (18 + sessions)`` where
    ``sessions`` is the number of distinct sessions containing the aid and
    ``buy_sessions`` the number of distinct sessions in which the aid has an
    event with ``type > 0``. The additive constants pull aids with little
    data towards 2/18.

    Args:
        train_data: Location handed to ``pd.read_parquet``; the data must
            provide the columns ``aid``, ``session`` and ``type``.

    Returns:
        DataFrame with the columns ``aid`` (int32) and ``conv`` (float32),
        one row per aid.
    """
    events = pd.read_parquet(train_data)

    # Distinct sessions per aid: all events vs. events with type > 0.
    seen = events.groupby('aid')['session'].nunique().rename('aid_counts')
    bought = (
        events[events['type'] > 0]
        .groupby('aid')['session']
        .nunique()
        .rename('aid_buys')
    )

    # Left join keeps every aid; aids that were never bought get 0 buys.
    stats = seen.to_frame().join(bought, how='left')
    stats['aid_buys'] = stats['aid_buys'].fillna(0)
    stats['conv'] = ((2 + stats['aid_buys']) / (18 + stats['aid_counts'])).astype(np.float32)

    stats = stats.reset_index()
    stats['aid'] = stats['aid'].astype(np.int32)
    return stats[['aid', 'conv']]

In [3]:
# Count how often users order a product if they clicked, carted or ordered it before
def count_2order_conversion(train_data):
    """Compute smoothed per-aid conversion rates towards an order.

    All counts are numbers of distinct sessions per aid:

    * ``click2order_conv = (1 + order_sessions) / (31 + sessions)``
    * ``cart2order_conv  = (7 + order_sessions) / (28 + cart_sessions)``
    * ``order2order_conv = (1 + repeat_order_sessions) / (18 + order_sessions)``

    where a "repeat order" session is one in which the aid has more than one
    event of ``type == 2``.

    Args:
        train_data: Location handed to ``pd.read_parquet``; the data must
            provide the columns ``aid``, ``session`` and ``type``.

    Returns:
        DataFrame with the columns ``aid``, ``click2order_conv``,
        ``cart2order_conv`` and ``order2order_conv`` (rates as float32).
    """
    events = pd.read_parquet(train_data)
    order_events = events[events['type'] == 2]

    def sessions_per_aid(frame, name):
        # Number of distinct sessions in which each aid occurs in `frame`.
        return frame.groupby('aid')['session'].nunique().rename(name)

    n_sessions = sessions_per_aid(events, 'aid_counts')
    n_carts = sessions_per_aid(events[events['type'] == 1], 'aid_carts')
    n_orders = sessions_per_aid(order_events, 'aid_orders')

    # (aid, session) pairs holding more than one order event.
    orders_per_pair = order_events.groupby(['aid', 'session']).size()
    repeated_pairs = orders_per_pair[orders_per_pair > 1].reset_index()
    n_repeat_orders = sessions_per_aid(repeated_pairs, 'double_orders')

    # Left joins keep every aid; missing counts mean "never happened".
    stats = n_sessions.to_frame().join(n_carts).join(n_orders).join(n_repeat_orders)
    for column in ('aid_carts', 'aid_orders', 'double_orders'):
        stats[column] = stats[column].fillna(0)

    stats['click2order_conv'] = (
        (1 + stats['aid_orders']) / (31 + stats['aid_counts'])
    ).astype(np.float32)
    stats['cart2order_conv'] = (
        (7 + stats['aid_orders']) / (28 + stats['aid_carts'])
    ).astype(np.float32)
    stats['order2order_conv'] = (
        (1 + stats['double_orders']) / (18 + stats['aid_orders'])
    ).astype(np.float32)

    wanted = ['aid', 'click2order_conv', 'cart2order_conv', 'order2order_conv']
    return stats.reset_index()[wanted]

In [4]:
# Count how often users put a product in a cart if they clicked, carted or ordered it before.
def count_2cart_conversion(train_data):
    """Compute smoothed per-aid conversion rates towards a cart event.

    All counts are numbers of distinct sessions per aid:

    * ``click2cart_conv = (2 + cart_sessions) / (18 + sessions)``
    * ``order2cart_conv = (1 + carts_after_orders) / (50 + order_sessions)``
    * ``cart2cart_conv  = (2 + double_cart_sessions) / (20 + cart_sessions)``

    ``double_cart_sessions`` counts sessions in which the aid was carted more
    than once. ``carts_after_orders`` counts sessions in which the earliest
    order of the aid happened before its latest cart event.

    Args:
        train_data: Location handed to ``pd.read_parquet``; the data must
            provide the columns ``aid``, ``session``, ``ts`` and ``type``.

    Returns:
        DataFrame with the columns ``aid``, ``click2cart_conv``,
        ``order2cart_conv`` and ``cart2cart_conv`` (rates as float32).
    """
    df = pd.read_parquet(train_data)
    # Filter each event type once and reuse the result everywhere below.
    cart_events = df.loc[df['type'] == 1]
    order_events = df.loc[df['type'] == 2]

    df_events = df.groupby('aid')['session'].nunique().rename('aid_counts')
    df_carts = cart_events.groupby('aid')['session'].nunique().rename('aid_carts')
    df_orders = order_events.groupby('aid')['session'].nunique().rename('aid_orders')

    # Sessions in which the same aid was put into the cart more than once.
    carts_per_pair = cart_events.groupby(['aid', 'session']).size()
    df_double_carts = (
        carts_per_pair.loc[carts_per_pair > 1]
        .reset_index()
        .groupby('aid')['session']
        .nunique()
        .rename('double_carts')
    )

    # Sessions in which the aid was carted again after it had been ordered:
    # earliest order timestamp < latest cart timestamp of the same aid.
    # The string aliases 'min'/'max' are used instead of np.min/np.max, which
    # pandas deprecated as groupby aggregation callables.
    first_order = order_events.groupby(['aid', 'session']).agg(ts_min=('ts', 'min'))
    last_cart = cart_events.groupby(['aid', 'session']).agg(ts_max=('ts', 'max'))
    order_then_cart = pd.merge(first_order, last_cart, how='left',
                               left_index=True, right_index=True)
    # Pairs without any cart have ts_max = NaN and drop out of the comparison.
    order_then_cart = order_then_cart.loc[
        order_then_cart['ts_min'] < order_then_cart['ts_max']
    ].reset_index()
    df_carts_after_orders = (
        order_then_cart.groupby('aid')['session'].nunique().rename('carts_after_orders')
    )

    # Left merges keep every aid; a missing count means "never happened".
    df = pd.merge(df_events, df_carts, how='left', left_index=True, right_index=True)
    for counts in (df_orders, df_double_carts, df_carts_after_orders):
        df = pd.merge(df, counts, how='left', left_index=True, right_index=True)
    for column in ('aid_carts', 'aid_orders', 'double_carts', 'carts_after_orders'):
        df[column] = df[column].fillna(0)

    df['click2cart_conv'] = ((2 + df['aid_carts']) / (18 + df['aid_counts'])).astype(np.float32)
    df['order2cart_conv'] = (
        (1 + df['carts_after_orders']) / (50 + df['aid_orders'])
    ).astype(np.float32)
    df['cart2cart_conv'] = ((2 + df['double_carts']) / (20 + df['aid_carts'])).astype(np.float32)
    df = df.reset_index()

    return df[['aid', 'click2cart_conv', 'order2cart_conv', 'cart2cart_conv']]

In [5]:
# Average number of clicks on an item before buy.
def clicks_before_buy(train_data):
    """Average, per aid, how many events on the aid precede (and include) its first buy.

    For every (session, aid) pair that contains a buy (``type > 0``) the
    events on that aid up to and including the first buy are counted and the
    count is clipped to the range 0..30. Two artificial sessions with a count
    of 2 are added for every bought aid before averaging, which smooths the
    mean of rarely bought aids.

    Event positions are taken from the row order within each session as
    stored in the file (presumably chronological - verify against the data
    preparation step).

    Args:
        train_data: Location handed to ``pd.read_parquet``; the data must
            provide the columns ``session``, ``aid`` and ``type``.

    Returns:
        DataFrame indexed by ``aid`` with the single column
        ``clicks_before_buy`` (mean count per aid).
    """
    events = pd.read_parquet(train_data)
    # Position of every event inside its session, in stored row order.
    events['n'] = events.groupby('session').cumcount().astype(np.int16)

    # Position of the first buy for every (session, aid) pair.
    first_buy = (
        events[events['type'] > 0]
        .groupby(['session', 'aid'])['n']
        .min()
        .rename('first_buy_n')
        .reset_index()
    )
    # The inner merge drops pairs that never resulted in a buy.
    events = events.merge(first_buy, how='inner', on=['session', 'aid'])
    del first_buy
    gc.collect()

    up_to_buy = events[events['n'] <= events['first_buy_n']]
    per_pair = (
        up_to_buy.groupby(['session', 'aid'])
        .size()
        .reset_index(name='clicks_before_buy')
    )
    per_pair['clicks_before_buy'] = per_pair['clicks_before_buy'].clip(0, 30)

    # Two pseudo-sessions per aid, each contributing a count of 2.
    last_session = per_pair['session'].max()
    bought_aids = per_pair['aid'].unique()
    pseudo_sessions = [
        pd.DataFrame({
            'session': last_session + offset,
            'aid': bought_aids,
            'clicks_before_buy': 2,
        })
        for offset in (1, 2)
    ]
    smoothed = pd.concat([per_pair] + pseudo_sessions)

    return smoothed.groupby(['aid']).agg({'clicks_before_buy': 'mean'})

In [6]:
# Function to calculate mean w2vec similarity between the last aid and up to 4 previous aids.
def agg_mean(x, w2v_model):
    """Return the mean word2vec similarity between ``x.aid_last`` and each aid in ``x.aid``.

    Args:
        x: Row-like object exposing ``aid`` (iterable of aids) and
            ``aid_last`` (the aid every element is compared with).
        w2v_model: Model whose ``wv.similarity(a, b)`` yields the similarity
            of two aids.

    Returns:
        ``np.mean`` over the individual similarities.
    """
    return np.mean(
        [w2v_model.wv.similarity(previous_aid, x.aid_last) for previous_aid in x.aid]
    )

In [7]:
# Load cross-validation or test sessions and select the last aid and up to 4 previous aids.
def prepare_sessions_4_w2vec(sessions_path):
    """Pair the last aid of every session with the up to 4 aids that precede it.

    Events are ordered by descending ``ts`` inside each session, so position
    0 is the most recent event. Sessions that consist of a single event have
    no preceding aids and are dropped by the inner merge.

    Args:
        sessions_path: Location handed to ``pd.read_parquet``; the data must
            provide the columns ``session``, ``aid`` and ``ts``.

    Returns:
        DataFrame with the columns ``aid`` (list of the preceding aids, most
        recent first), ``session`` and ``aid_last``.
    """
    events = pd.read_parquet(sessions_path).sort_values(
        ['session', 'ts'], ascending=[True, False]
    )
    # Recency rank inside the session: 0 = last event, 1 = the one before, ...
    events['n'] = events.groupby('session').cumcount().astype(np.int16)
    rank = events['n']

    last_aids = events.loc[rank == 0, ['session', 'aid']].rename(columns={'aid': 'aid_last'})

    history = (
        events.loc[(rank > 0) & (rank < 5)]
        .groupby('session')
        .agg({'aid': lambda aids: aids.tolist()})
    )

    return pd.merge(history, last_aids, how='inner', left_index=True, right_on='session')

In [8]:
# Function to manage the workflow for the w2vec session similarity.
def calculate_w2vec_history_feature(sessions_path, w2v_model_path):
    """Build the per-session ``history_mean`` word2vec feature.

    Loads the word2vec model, prepares the (preceding aids, last aid) rows
    with ``prepare_sessions_4_w2vec`` and evaluates ``agg_mean`` on every row
    using pandarallel with 4 workers.

    Args:
        sessions_path: Parquet location of the sessions to process.
        w2v_model_path: Location of the model, loaded via ``Word2Vec.load``.

    Returns:
        DataFrame with the columns ``session`` and ``history_mean``.
    """
    model = Word2Vec.load(w2v_model_path)
    session_rows = prepare_sessions_4_w2vec(sessions_path)

    pandarallel.initialize(nb_workers=4)
    session_rows['history_mean'] = session_rows.parallel_apply(
        lambda row: agg_mean(row, model),
        axis=1,
    )
    return session_rows[['session', 'history_mean']]

## Counts for cross-validation datasets

In [9]:
# Input paths for the cross-validation run:
#   trunked_sessions_path / trunked_sessions_path2 - the two truncated cross-validation session sets,
#   click_data_cv - the history (full sessions) the per-aid statistics are computed from.
trunked_sessions_path = '/kaggle/input/otto-prepare-cv/cv_inputs.parquet'
trunked_sessions_path2 = '/kaggle/input/otto-prepare-cv/cv_inputs2.parquet'
click_data_cv = '/kaggle/input/otto-prepare-cv/cv_train.parquet'

In [10]:
# Calculate the history w2vec similarity for both cross-validation datasets.
# The same word2vec model is used for both session sets; each result is written
# to a parquet file in the working directory.
w2vec_path_cv = '/kaggle/input/otto-word2vec-exp/word2vec_cv_exp.wordvectors'

df_history_cv1 = calculate_w2vec_history_feature(trunked_sessions_path, w2vec_path_cv)
df_history_cv1.to_parquet('history_change_cv1.parquet')

df_history_cv2 = calculate_w2vec_history_feature(trunked_sessions_path2, w2vec_path_cv)
df_history_cv2.to_parquet('history_change_cv2.parquet')

# The frames are already saved, so release them before the next calculations.
del df_history_cv1, df_history_cv2
gc.collect()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


0

In [11]:
# Calculate average daily carts per aid for both cross-validation datasets and for the last week of full sessions.
# NOTE(review): the helper lives in otto_common; presumably the second argument
# flags truncated sessions and the third is the event type (1 = carts) - confirm there.
df_daily_counts_train = otto_common.create_average_daily_counts(click_data_cv, False, 1)
df_daily_counts_train.to_parquet('daily_counts_carts_train.parquet')

df_daily_counts_cv1 = otto_common.create_average_daily_counts(trunked_sessions_path, True, 1)
df_daily_counts_cv1.to_parquet('daily_counts_carts_cv1.parquet')

df_daily_counts_cv2 = otto_common.create_average_daily_counts(trunked_sessions_path2, True, 1)
df_daily_counts_cv2.to_parquet('daily_counts_carts_cv2.parquet')

In [12]:
# Calculate average daily orders per aid for both cross-validation datasets and for the last week of full sessions.
# The variables of the carts cell are rebound here; the carts results were already saved to parquet.
# NOTE(review): presumably the third argument is the event type (2 = orders) - confirm in otto_common.
df_daily_counts_train = otto_common.create_average_daily_counts(click_data_cv, False, 2)
df_daily_counts_train.to_parquet('daily_counts_orders_train.parquet')

df_daily_counts_cv1 = otto_common.create_average_daily_counts(trunked_sessions_path, True, 2)
df_daily_counts_cv1.to_parquet('daily_counts_orders_cv1.parquet')

df_daily_counts_cv2 = otto_common.create_average_daily_counts(trunked_sessions_path2, True, 2)
df_daily_counts_cv2.to_parquet('daily_counts_orders_cv2.parquet')

In [13]:
# Calculate average clicks before buy per aid on the cross-validation history
# and save the result (indexed by aid) to parquet.
df_clicks_before_buy = clicks_before_buy(click_data_cv)
df_clicks_before_buy.to_parquet('clicks_before_buy_cv.parquet')

In [14]:
# Calculate the general conversion rate per aid (sessions with a cart or order
# relative to all sessions containing the aid) on the cross-validation history.
df_conversions = count_conversion(click_data_cv)
df_conversions.to_parquet('conversions_cv.parquet')

In [15]:
# Calculate click2order, cart2order and order2order conversion rates per aid
# on the cross-validation history and save them to parquet.
df_2order_conversions = count_2order_conversion(click_data_cv)
df_2order_conversions.to_parquet('2order_conversions_cv.parquet')

In [16]:
# Calculate click2cart, cart2cart and order2cart conversion rates per aid
# on the cross-validation history and save them to parquet.
df_2cart_conversions = count_2cart_conversion(click_data_cv)
df_2cart_conversions.to_parquet('2cart_conversions_cv.parquet')

In [17]:
# Release the cross-validation frames that were already written to parquet.
# df_clicks_before_buy and df_2cart_conversions are not deleted here; they stay
# alive until the test section rebinds the same names.
del df_daily_counts_train, df_daily_counts_cv1, df_daily_counts_cv2, df_conversions, df_2order_conversions
gc.collect()

39

## Counts for test dataset

In [18]:
# Input paths for the test run. trunked_sessions_path is rebound from the
# cross-validation sessions to the test sessions; click_data_test points to
# the full training history.
trunked_sessions_path = '/kaggle/input/otto-prepare-cv/test.parquet'
click_data_test = '/kaggle/input/otto-prepare-cv/train_full.parquet'

In [19]:
# Calculate the history w2vec similarity for test dataset.
# A separate word2vec model (the "test" one) is used here instead of the cross-validation model.
w2vec_path_test = '/kaggle/input/otto-word2vec-exp/word2vec_test_exp.wordvectors'

df_history_test = calculate_w2vec_history_feature(trunked_sessions_path, w2vec_path_test)
df_history_test.to_parquet('history_change_test.parquet')

# The frame is already saved, so release it before the next calculations.
del df_history_test
gc.collect()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


0

In [20]:
# Calculate average daily carts per aid for test dataset and for the last week of full sessions.
# NOTE(review): presumably the second argument flags truncated sessions and the
# third is the event type (1 = carts) - confirm in otto_common.
df_daily_counts_test_full = otto_common.create_average_daily_counts(click_data_test, False, 1)
df_daily_counts_test_full.to_parquet('daily_counts_carts_test_full.parquet')

df_daily_counts_test_trunked =otto_common.create_average_daily_counts(trunked_sessions_path, True, 1)
df_daily_counts_test_trunked.to_parquet('daily_counts_carts_test_trunked.parquet')

In [21]:
# Calculate average daily orders per aid for test dataset and for the last week of full sessions.
# The variables of the carts cell are rebound here; the carts results were already saved to parquet.
# NOTE(review): presumably the third argument is the event type (2 = orders) - confirm in otto_common.
df_daily_counts_test_full = otto_common.create_average_daily_counts(click_data_test, False, 2)
df_daily_counts_test_full.to_parquet('daily_counts_orders_test_full.parquet')

df_daily_counts_test_trunked = otto_common.create_average_daily_counts(trunked_sessions_path, True, 2)
df_daily_counts_test_trunked.to_parquet('daily_counts_orders_test_trunked.parquet')

In [22]:
# Calculate average clicks before buy per aid on the full training history
# and save the result (indexed by aid) to parquet.
df_clicks_before_buy = clicks_before_buy(click_data_test)
df_clicks_before_buy.to_parquet('clicks_before_buy_test.parquet')

In [23]:
# Release the daily-count frames of the test run; they were already written to parquet.
del df_daily_counts_test_full, df_daily_counts_test_trunked
gc.collect()

21

In [24]:
# Calculate the general conversion rate per aid (sessions with a cart or order
# relative to all sessions containing the aid) on the full training history.
df_conversions = count_conversion(click_data_test)
df_conversions.to_parquet('conversions_test.parquet')

In [25]:
# Calculate click2order, cart2order and order2order conversion rates per aid
# on the full training history and save them to parquet.
df_2order_conversions = count_2order_conversion(click_data_test)
df_2order_conversions.to_parquet('2order_conversions_test.parquet')

In [26]:
# Calculate click2cart, cart2cart and order2cart conversion rates per aid
# on the full training history and save them to parquet.
df_2cart_conversions = count_2cart_conversion(click_data_test)
df_2cart_conversions.to_parquet('2cart_conversions_test.parquet')