# OTTO common
This notebook contains a number of functions and a class that are used in most notebooks of the project. It exists so that their definitions do not have to be copied between notebooks.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
        
import gc
from datetime import datetime

## General functions and classes

In [2]:
# Split dataframe into chunks, while keeping all records with the same values of some column in a single chunk.
def divide_df_by_column(df, n_splits, i, column_name):
    """Return chunk ``i`` of ``n_splits`` chunks of ``df``, split by value ranges of one column.

    The value range [min, max] of ``column_name`` is cut into ``n_splits``
    equally wide half-open intervals and the rows whose value falls into
    interval ``i`` are returned. Rows sharing the same column value therefore
    always land in the same chunk. Chunks are equally wide in value, not
    necessarily equal in number of rows.

    Args:
        df: DataFrame to split; it is not modified.
        n_splits: Total number of chunks.
        i: Zero-based number of the chunk to return.
        column_name: Name of the numeric column that defines the chunks.

    Returns:
        The rows of ``df`` belonging to chunk ``i`` (original index kept).
    """
    column = df[column_name]
    col_min = column.min()
    col_max = column.max()
    # min()/max() return fixed-width NumPy scalars (int8, int32, ...). Arithmetic on
    # those can silently wrap around, e.g. ``max - min`` for int8 or ``i * range``
    # for int32, so the bounds are computed with plain Python numbers instead.
    if isinstance(col_min, np.generic):
        col_min = col_min.item()
    if isinstance(col_max, np.generic):
        col_max = col_max.item()
    col_range = col_max - col_min

    min_col_value = col_min + i * col_range / n_splits
    if i + 1 == n_splits:
        # The last chunk has to include the maximum value itself.
        max_col_value = col_max + 1
    else:
        # Same expression as the lower bound of chunk i + 1, so chunks never overlap or leave gaps.
        max_col_value = col_min + (i + 1) * col_range / n_splits
    return df.loc[(column >= min_col_value) & (column < max_col_value)]

In [3]:
# Calculate local datetime from timestamp.
def add_datetime(df, ts_col='ts'):
    """Add a 'time' column with the Europe/Berlin local time of each timestamp.

    Args:
        df: DataFrame with a column of Unix timestamps in seconds. It is
            modified in place: the 'time' column is added or overwritten.
        ts_col: Name of the timestamp column.

    Returns:
        The same DataFrame object, now holding the timezone-aware 'time' column.
    """
    # Vectorised conversion; replaces a per-row datetime.utcfromtimestamp()
    # (deprecated since Python 3.12) followed by a round-trip through strings.
    utc_time = pd.to_datetime(df[ts_col], unit='s', utc=True)
    # Fractions of a second are dropped, as the previous '%H:%M:%S' formatting did.
    df['time'] = utc_time.dt.floor('s').dt.tz_convert('Europe/Berlin')
    return df

In [4]:
# This function is used in many notebooks. It selects up to n_max last events from each session if they are within time_frame from the last event in the session.
def filter_by_time_and_n_max(df, time_frame, n_max):
    """Keep up to ``n_max`` of the latest events of each session that are recent enough.

    Args:
        df: DataFrame with at least the columns 'session', 'aid' and 'ts'.
            It is not modified.
        time_frame: An event is kept only if its distance to the latest event
            of its session is strictly below this value (same unit as 'ts').
        n_max: Maximum number of events kept per session.

    Returns:
        DataFrame with the columns 'session', 'aid' and 'n', sorted by session
        and then by descending 'ts'. 'n' is the int16 position of the event
        counted back from the latest event of its session (0 = latest). Note
        that 'n' only fits into int16 while ``n_max`` does not exceed 32768.
    """
    df = df.sort_values(['session', 'ts'], ascending=[True, False])
    by_session = df.groupby('session')
    n = by_session.cumcount()
    # Computed in the native dtype of 'ts': squeezing the session maximum into
    # int32 first would wrap around for e.g. millisecond timestamps.
    time_delta = by_session['ts'].transform('max') - df['ts']
    # Both filters are applied to the full-width values; 'n' is only downcast
    # afterwards, so long sessions cannot wrap around and slip through.
    keep = (time_delta < time_frame) & (n < n_max)
    del time_delta
    gc.collect()
    return df.loc[keep, ['session', 'aid']].assign(n=n[keep].astype(np.int16).to_numpy())

In [5]:
# This function is used in all notebooks that either train or use a w2vec model.
def simple_hash_function(key):
    """Return a deterministic, position-weighted checksum of ``repr(key)``.

    Every character of ``repr(key)`` contributes its code point multiplied by
    its 1-based position in the string.
    """
    checksum = 0
    position = 0
    for character in repr(key):
        position += 1
        checksum += position * ord(character)
    return checksum

## Functions and classes for co-visitation matrices only

In [6]:
# This function is used to build a list of all aids that show up in cross-validation or test period.
# Removing from a co-visitation matrix the aid_x values that do not show up in cross-validation/test data reduces the matrix size without a decrease in performance.
def build_aid_list(trunked_sessions, trunked_sessions2=None):
    """Return the distinct aids found in one or two parquet files.

    Args:
        trunked_sessions: Path of a parquet file with an 'aid' column.
        trunked_sessions2: Optional path of a second parquet file with an
            'aid' column; skipped when empty or None.

    Returns:
        List of the unique aids over all given files, in no particular order.
    """
    # Only the 'aid' column is used, so the other columns are never loaded into memory.
    df_cv = pd.read_parquet(trunked_sessions, columns=['aid'])
    aid_list = list(set(df_cv['aid']))
    if trunked_sessions2:
        df_cv2 = pd.read_parquet(trunked_sessions2, columns=['aid'])
        aid_list2 = list(set(df_cv2['aid']))
        aid_list = list(set(aid_list + aid_list2))
    return aid_list

In [7]:
class CalculateCovisitationMatrix:
    '''
        Class to calculate the co-visitation matrices for the OTTO project. This parent class only holds the
        logic common to all the matrices; child classes supply calculate_weights() and are the ones used
        for the calculation itself.
    '''

    # Number of per-split weight frames that are concatenated before being reduced with a groupby.
    _SPLITS_PER_BATCH = 10

    def __init__(self, n_splits, n_chunks_groupby, aid_list):
        '''
            n_splits: number of session ranges the click data is processed in.
            n_chunks_groupby: number of aid_x ranges used by groupby_in_chunks and get_top_n.
            aid_list: stored as is; not used by this parent class (presumably used by child classes).
        '''
        self.n_splits = n_splits
        self.n_chunks_groupby = n_chunks_groupby
        self.aid_list = aid_list

        # Largest aid of the click data; set by generate_covisitation_matrix.
        self.aid_max = None

    def calculate_weights(self, df):
        '''
            Turn one chunk of the click data into a frame with the columns aid_x, aid_y and wgt.
            Has to be implemented by the child classes.
        '''
        raise NotImplementedError('calculate_weights has to be implemented by a child class')

    @staticmethod
    def groupby_reset_and_reduce(df):
        '''
            Sum wgt over every (aid_x, aid_y) pair and return a flat frame with int32 aid columns.
        '''
        df = df.groupby(['aid_x','aid_y']).wgt.sum()
        df = df.reset_index()
        df['aid_x'] = df['aid_x'].astype(np.int32)
        df['aid_y'] = df['aid_y'].astype(np.int32)
        return df

    def groupby_in_chunks(self, df1, df2):
        '''
            Merge two weight frames and sum the weights of equal (aid_x, aid_y) pairs.
            The groupby is performed chunk by chunk (by aid_x range) to reduce RAM usage.
            Requires self.aid_max to be set; aid_x values are expected to lie in [0, aid_max].
        '''
        reduced_chunks = []
        for j in range(self.n_chunks_groupby):
            aid_x_min = j*(self.aid_max)/self.n_chunks_groupby
            if j + 1 == self.n_chunks_groupby:
                # The last chunk has to include aid_max itself.
                aid_x_max = self.aid_max + 1
            else:
                aid_x_max = (j+1)*(self.aid_max)/self.n_chunks_groupby
            df2_j = df2.loc[(df2['aid_x'] >= aid_x_min) & (df2['aid_x'] < aid_x_max)].copy()
            df1_j = df1.loc[(df1['aid_x'] >= aid_x_min) & (df1['aid_x'] < aid_x_max)].copy()
            df2_j = pd.concat([df2_j, df1_j])
            del df1_j
            gc.collect()
            reduced_chunks.append(self.groupby_reset_and_reduce(df2_j))
            del df2_j
        # A single concat at the end avoids re-copying the growing result in every iteration.
        return pd.concat(reduced_chunks)

    def get_top_n(self, df_matrix, top_n):
        '''
            Reduce the co-visitation matrix to the top_n rows with the highest wgt for each aid_x.
            Works chunk by chunk (by aid_x range) to reduce RAM usage.
        '''
        top_chunks = []
        for j in range(self.n_chunks_groupby):
            df_matrix_chunk = divide_df_by_column(df_matrix, self.n_chunks_groupby, j, 'aid_x')
            df_matrix_chunk = df_matrix_chunk.sort_values(['aid_x','wgt'],ascending=[True,False])
            df_matrix_chunk = df_matrix_chunk.reset_index(drop=True)
            df_matrix_chunk['n'] = df_matrix_chunk.groupby('aid_x').aid_y.cumcount()
            df_matrix_chunk = df_matrix_chunk.loc[df_matrix_chunk.n<top_n].drop('n',axis=1)
            gc.collect()
            top_chunks.append(df_matrix_chunk)
        return pd.concat(top_chunks)

    def _merge_batch(self, batch, df_wgt_all):
        '''
            Reduce a batch of per-split weight frames and merge it into the running total.
            batch is emptied in place; df_wgt_all is the total so far, or None for the first batch.
            Returns the new running total.
        '''
        df_wgt = pd.concat(batch, axis=0)
        # Release the per-split frames before the memory-hungry groupby.
        batch.clear()
        gc.collect()
        df_wgt = self.groupby_reset_and_reduce(df_wgt)
        if df_wgt_all is None:
            return df_wgt
        return self.groupby_in_chunks(df_wgt, df_wgt_all)

    def generate_covisitation_matrix(self, data_path):
        '''
            Main method of the class. Provides the framework for the calculation.

            Reads the click data from the parquet file data_path, processes it in n_splits session
            ranges with calculate_weights() and sums the weights of equal (aid_x, aid_y) pairs.
            Returns the resulting frame (columns aid_x, aid_y, wgt), or None when n_splits is 0.
        '''
        df_click_data = pd.read_parquet(data_path)
        self.aid_max = df_click_data['aid'].max()

        df_wgt_all = None
        batch = []
        for i in range(self.n_splits):
            print(str(i))
            df_i = divide_df_by_column(df_click_data, self.n_splits, i, 'session')
            batch.append(self.calculate_weights(df_i))
            del df_i
            gc.collect()
            # Flush after every full batch AND after the last split, so that n_splits
            # values that are not a multiple of the batch size lose no data.
            if len(batch) == self._SPLITS_PER_BATCH or i + 1 == self.n_splits:
                df_wgt_all = self._merge_batch(batch, df_wgt_all)
                gc.collect()
        return df_wgt_all

## Functions for count_clicks/count_buys notebooks

In [8]:
# Calculate averaged aid counts after selecting data for required period of time.
# Used by the create_average_daily_counts function.
def count_aids(df):
    """Return, for every aid, its share of sessions scaled to 10000.

    Args:
        df: DataFrame with the columns 'aid' and 'session'.

    Returns:
        DataFrame with the columns 'aid' and 'aid_count'. 'aid_count' is the
        number of distinct sessions of the aid divided by the sum of these
        numbers over all aids, multiplied by 10000.
    """
    sessions_per_aid = df.groupby('aid')['session'].nunique()
    # One unit of 'aid_count' corresponds to 1/10000 of all (aid, session) pairs.
    scale = sessions_per_aid.sum() / 10000
    return (sessions_per_aid / scale).rename('aid_count').reset_index()

In [9]:
# Create averaged daily counts for the last 7 days of the full data, or for the cross-validation data.
def create_average_daily_counts(data_path, is_trunked, buy_type=None):
    """Compute normalised aid counts separately for every day of the week.

    Args:
        data_path: Path of a parquet file with the columns 'session', 'aid',
            'ts' and, when ``buy_type`` is given, 'type'.
        is_trunked: When False, only events from the 7 * 24 * 60 * 60
            timestamp units before the latest 'ts' are used (i.e. one week if
            'ts' is in seconds); otherwise the whole file is used.
        buy_type: Optional event type; when given, only rows whose 'type'
            equals it are used.

    Returns:
        DataFrame with the columns 'aid', 'aid_count' and 'day_of_week'
        (0 = Monday), i.e. the output of count_aids for each day, stacked.
    """
    events = pd.read_parquet(data_path)
    if buy_type:
        events = events.loc[events['type'] == buy_type]
    # Explicit comparison kept on purpose: None must not trigger the filter.
    if is_trunked == False:
        week_start_ts = events['ts'].max() - 7 * 24 * 60 * 60
        events = events.loc[events['ts'] > week_start_ts]
    events = add_datetime(events)
    events['day_of_week'] = events['time'].dt.dayofweek.astype(np.int8)

    daily_counts = []
    for weekday in range(7):
        is_weekday = events['day_of_week'] == weekday
        counts = count_aids(events.loc[is_weekday, ['session', 'aid']])
        counts['day_of_week'] = weekday
        daily_counts.append(counts)
    return pd.concat(daily_counts)

## Functions for candidate generation

In [10]:
# Builds a dictionary of items most commonly clicked/carted/ordered during a day.
# If there are free slots left after generating candidates using all types of aids in history, aids from top_dict are used.
def build_top_dict(df, n_candidates, event_type):
    """Map every day of the week to its most frequent aids of one event type.

    Args:
        df: DataFrame with the columns 'type', 'day_of_week', 'aid' and 'session'.
        n_candidates: Maximum number of aids kept per day.
        event_type: Only rows whose 'type' equals this value are counted.

    Returns:
        Dictionary {day_of_week: [aid, ...]} with the aids ordered from the
        most to the least frequent one (frequency = number of rows).
    """
    events = df.loc[df['type'] == event_type]
    counts = events.groupby(['day_of_week', 'aid'])['session'].count().reset_index()
    counts = counts.sort_values(['day_of_week', 'session'], ascending=[True, False])
    # Position of every aid within its day, 0 being the most frequent one.
    rank_in_day = counts.groupby('day_of_week').session.cumcount()
    top_rows = counts.loc[rank_in_day < n_candidates]
    return top_rows.groupby('day_of_week').aid.apply(list).to_dict()

In [11]:
# When generating candidates for cross-validation dataset, it makes sense to leave only sessions with some positive target.
# Most sessions do not have any aid carted or ordered, so keeping only sessions with some item carted/ordered will speed up the calculations
# and will not harm the model performance.
def reduce_df_prepare_answers(main_df, answers_path, col_name):
    """Keep only the sessions of ``main_df`` that have a non-empty answer.

    Args:
        main_df: DataFrame with a 'session' column.
        answers_path: Path of a parquet file with the columns 'session' and
            ``col_name``; every ``col_name`` value must support ``len()``.
        col_name: Name of the answer column.

    Returns:
        Tuple ``(main_df, df_answers)``: the rows of ``main_df`` whose session
        has at least one answer, and the answers frame restricted to these
        non-empty rows with the columns 'session', ``col_name`` and
        ``col_name + '_len'``.
    """
    len_col = col_name + '_len'
    answers = pd.read_parquet(answers_path)[['session', col_name]]
    answers[len_col] = list(map(len, answers[col_name]))
    answers = answers.loc[answers[len_col] > 0]
    # The inner join acts purely as a filter; the answer columns are dropped again.
    reduced = pd.merge(main_df, answers, on='session', how='inner')
    reduced = reduced.drop([col_name, len_col], axis=1)
    return reduced, answers

In [12]:
# Prints stats after candidates for a cross-validation dataset are ready.
# Shows both absolute numbers and percentages of guessed aids.
def print_stats(df_check, col_name):
    """Print absolute numbers and percentages of guessed target aids.

    Args:
        df_check: DataFrame with the columns ``col_name + '_len'``,
            ``col_name + '_len_clipped'``, 'pred_true' and 'buys'.
        col_name: Name of the target; used to build the column names and the
            printed labels.
    """
    len_col = col_name + '_len'
    clipped_col = len_col + '_clipped'
    label = col_name.capitalize()

    def masked_sum(mask, column):
        # Sum of one column over the rows selected by the boolean mask.
        return df_check.loc[mask, column].sum()

    n_target = df_check[len_col].sum()
    n_clipped = df_check[clipped_col].sum()
    n_guessed = df_check['pred_true'].sum()
    print(f"Total {col_name}:  {n_target}")
    print(f"Total {col_name} clipped:  {n_clipped}")
    print(f"Total {col_name} guessed:  {n_guessed}")

    n_clipped_buys = masked_sum(df_check['buys'] > 0, clipped_col)
    n_guessed_buys = masked_sum(df_check['buys'] > 0, 'pred_true')
    print(f"Total {col_name} with buys in history:  {n_clipped_buys}")
    print(f"{label} with buys in history guessed:  {n_guessed_buys}")

    n_clipped_no_buys = masked_sum(df_check['buys'] == 0, clipped_col)
    n_guessed_no_buys = masked_sum(df_check['buys'] == 0, 'pred_true')
    print(f"Total {col_name} with no buys in history:  {n_clipped_no_buys}")
    print(f"{label} with no buys in history guessed:  {n_guessed_no_buys}")

    print(f"Total:  {100*n_guessed/n_clipped:.2f}%")
    print(f"Buys:  {100*n_guessed_buys/n_clipped_buys:.2f}%")
    print(f"No buys:  {100*n_guessed_no_buys/n_clipped_no_buys:.2f}%")

In [13]:
# Join dataframe with generated candidates and dataframe with answers.
# Prepares all the data needed to print the stats.
def calculate_stats(prediction_df, answers_df, unique_session_aids, unique_session_buys, col_name, n_candidates):
    """Join candidates with the answers, print the hit statistics and return the hits per session.

    Args:
        prediction_df: DataFrame with the column 'session' and a column of
            candidate collections named ``col_name[:-1] + '_predictions'``
            (e.g. 'cart_predictions' for ``col_name='carts'``).
        answers_df: DataFrame with the columns 'session', ``col_name`` (a
            list-like of ground-truth aids) and ``col_name + '_len'``.
        unique_session_aids: DataFrame with a 'session' column, left-joined
            onto the per-session statistics.
        unique_session_buys: DataFrame with a 'session' column, left-joined as
            well; presumably the source of the 'buys' column - verify against
            the callers.
        col_name: Name of the target column.
        n_candidates: Upper bound used to clip the number of targets per session.

    Returns:
        DataFrame with the columns 'session', the prediction column,
        ``col_name`` and 'pred_true' (number of ground-truth aids of the
        session that are among its candidates).
    """
    col_name_len = col_name + '_len'
    col_name_clipped = col_name_len + '_clipped'
    col_name_prediction = col_name[:-1] + '_predictions'

    prediction_df = pd.merge(prediction_df, answers_df, on = 'session')
    # One row per ground-truth aid.
    df_check_stats = prediction_df.explode(col_name).reset_index(drop=True)
    # Plain zip instead of a row-wise DataFrame.apply: much faster on large
    # frames, and it also works when the merge result is empty.
    df_check_stats['pred_true'] = np.array(
        [
            truth in candidates
            for truth, candidates in zip(df_check_stats[col_name], df_check_stats[col_name_prediction])
        ],
        dtype=np.int8,
    )
    df_check_stats = df_check_stats.groupby('session').agg({col_name_len: 'max', 'pred_true': 'sum'})
    # No more than n_candidates targets of a session can be guessed.
    df_check_stats[col_name_clipped] = df_check_stats[col_name_len].clip(0,n_candidates)
    df_check_stats = df_check_stats.reset_index()
    df_check_stats = pd.merge(df_check_stats, unique_session_aids, how = 'left', on='session')
    df_check_stats = pd.merge(df_check_stats, unique_session_buys, how = 'left', on='session')
    # Sessions missing from unique_session_buys count as having no buys.
    df_check_stats['buys'] = df_check_stats['buys'].fillna(0)
    print_stats(df_check_stats, col_name)

    # prediction_df already carries this column; drop it to avoid a suffixed duplicate.
    del df_check_stats[col_name_len]
    prediction_df = pd.merge(prediction_df, df_check_stats, left_on = 'session', right_on='session')

    prediction_df = prediction_df[['session', col_name_prediction, col_name, 'pred_true']]
    return prediction_df


In [14]:
# Reduce the co-visitation matrix to n_candidates and turn it from a dataframe to a dictionary.
# Using dictionary speeds up the candidate generation process.
def matrix_to_dict(click2buy_matrix_path, n_candidates):
    """Load a co-visitation matrix and turn it into a dictionary of candidate lists.

    Args:
        click2buy_matrix_path: Path of a parquet file with the columns 'aid_x'
            and 'aid_y'.
        n_candidates: Maximum number of 'aid_y' values kept per 'aid_x'.

    Returns:
        Dictionary {aid_x: [aid_y, ...]} holding, for every aid_x, its first
        ``n_candidates`` rows in file order. The rows are not re-sorted here,
        so the file is expected to be ordered already.
    """
    matrix = pd.read_parquet(click2buy_matrix_path)
    # Position of every row within its aid_x group, in file order.
    position = matrix.groupby('aid_x').aid_y.cumcount()
    trimmed = matrix.loc[position < n_candidates]
    return trimmed.groupby('aid_x').aid_y.apply(list).to_dict()

## Functions for the model notebooks

In [15]:
# Prints some stats after running cross-validation.
# It prints absolute number of aids guessed, percent of aids guessed, and average position of ground truth candidates after re-ranking.
def calculate_recall(df_cv, result_col_name, const, top_n=20):
    """Print how many ground-truth candidates are ranked among the top ones of their session.

    Prints the number of rows with a true 'target' among the ``top_n``
    best-scored rows of their session, that number as a percentage of
    ``const``, and the average 0-based rank of all rows with a true 'target'.

    Args:
        df_cv: DataFrame with the columns 'session', ``result_col_name`` and
            'target'. It is not modified.
        result_col_name: Name of the score column; higher scores rank first.
        const: Denominator of the printed percentage (presumably the total
            number of ground-truth aids - verify against the callers).
        top_n: Number of best-scored candidates per session that count as guessed.
    """
    df_cv = df_cv[['session', result_col_name, 'target']]
    gc.collect()
    df_cv = df_cv.sort_values(['session', result_col_name],ascending=[True,False])
    # The rank is kept at full width: an int8 rank wraps around for sessions with
    # more than 128 candidates, which made low-ranked rows look like top hits.
    rank = df_cv.groupby('session').cumcount()
    is_true = df_cv['target'] == True
    mean_n = rank[is_true].mean()
    total_guessed = int((is_true & (rank < top_n)).sum())
    print('Total_guessed: ' + str(total_guessed))
    percent = total_guessed/const
    print(f"Percent {100*percent:.2f}%")
    print(f"Av_n {mean_n:.2f}")
    del df_cv
    gc.collect()

In [16]:
# Removes a fraction of "negative" (not clicked/carted/ordered) candidates from cross-validation dataset.
# Having too many "negative" candidates actually harms the model's performance and leads to excessive memory consumption.
def remove_frac(train_index, df, frac):
    """Drop a random fraction of the negative rows from a training selection.

    Args:
        train_index: Positions (as used by ``iloc``) of the training rows in ``df``.
        df: DataFrame with a 'target' column.
        frac: Fraction of the selected rows with a False 'target' to remove;
            nothing is removed unless it is greater than 0.

    Returns:
        Index labels of the remaining training rows. Note that the input is
        positional while the result holds labels; the two only coincide for a
        default RangeIndex.
    """
    selected = df.iloc[train_index]
    if not frac > 0:
        return selected.index
    negatives = selected[selected['target'] == False]
    # Fixed seed: the same rows are removed on every run.
    dropped_labels = negatives.sample(frac=frac, random_state=25).index
    return selected.drop(dropped_labels).index

In [17]:
# Function to select top_20 candidates and perform some formatting required to upload the results.
def select_top_20_and_format(df, candidate_col, rating_col):
    """Select the 20 best-rated candidates of each session and format them for upload.

    Args:
        df: DataFrame with the columns 'session', ``candidate_col`` and
            ``rating_col``. It is not modified.
        candidate_col: Name of the column holding the candidates.
        rating_col: Name of the score column; higher scores rank first.

    Returns:
        DataFrame indexed by 'session' with the single column
        ``candidate_col``: the (at most 20) best candidates of the session as
        one space-separated string, best first.
    """
    # Enumerate the results and select top 20 for each session.
    df = df.sort_values(['session', rating_col],ascending=[True,False])
    # The rank is kept at full width: an int8 rank wraps around for sessions with
    # more than 128 candidates, which let more than 20 rows per session through.
    rank = df.groupby('session').cumcount()
    # Explicit copy, so that the formatting below does not write into a slice of the sorted frame.
    top = df.loc[rank < 20, ['session', candidate_col]].copy()

    # Final formatting.
    top[candidate_col] = top[candidate_col].apply(str)
    return top.groupby('session').agg({candidate_col: lambda x: " ".join(x)})