<a href="https://colab.research.google.com/github/Adlucent/ga4-return-prediction/blob/main/4_Customer_Level_Aggregation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 4. Customer-Level Aggregation

In [None]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

# GCP project; passed as `project_id` when the notebook reads from BigQuery.
project_name = 'adl-analytics' #add proj name
region = "US"  # GCP project region
# Fully-qualified input table (output of step 3). It is built from
# `project_name` so that changing the project above is enough to repoint the
# notebook; previously the project id was hard-coded here a second time.
table_name = f'{project_name}.return_prediction_ga4.step_3_final'

In [None]:
# If your notebook does not have pandas_gbq you can install it here:
# ! pip install pandas_gbq

In [None]:
# Google credentials
# Authenticate this runtime before any BigQuery request made further down.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, so this cell would need replacing when run elsewhere - confirm.
from google.colab import auth
auth.authenticate_user()

# # BigQuery Magics
# '''BigQuery magics are used to run BigQuery SQL queries in a python environment.
# These queries can also be run in the BigQuery UI '''

# from google.cloud import bigquery
# from google.cloud.bigquery import magics, Client, QueryJobConfig

# magics.context.project = project_name #update project name
# client = bigquery.Client(project=magics.context.project)

# Interface between Jupyter and BigQuery
import pandas_gbq

# data processing libraries
import pandas as pd
import numpy as np

from functools import reduce

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# apply seaborn's default theme to every plot in the notebook
sns.set()

# silence all warnings so they do not clutter cell output
import warnings
warnings.filterwarnings(action='ignore')

# dataframe display: every column, at most 50 rows, floats with 3 decimals
pd.options.display.max_columns = None
pd.options.display.max_rows = 50
pd.options.display.float_format = lambda value: "%.3f" % value

# Load data

In [None]:
# Pull the whole step-3 table into a DataFrame.
sql = f"""
SELECT *
FROM `{table_name}`;
"""
data = pandas_gbq.read_gbq(
    sql,
    project_id=project_name,
    location=region,
    use_bqstorage_api=True,
)

Downloading: 100%|[32m██████████[0m|


In [None]:
# work on a deep copy so `data` stays as imported and can be restored
# without querying BigQuery again
df = data.copy(deep=True)

In [None]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,user_pseudo_id,transaction_date,transaction_ga_session_number,ecommerce_transaction_id,sum_event_params_engagement_time_msec,sum_event_params_session_engaged,user_ltv_revenue,ecommerce_total_item_quantity,total_return_item_quantity,ecommerce_purchase_revenue_in_usd,ecommerce_refund_value_in_usd,ecommerce_tax_value_in_usd,ecommerce_unique_items,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_item_refund_in_usd,sum_item_refund_quantity,sum_item_revenue_in_usd,sum_event_name_add_to_cart,sum_event_name_begin_checkout,sum_event_name_page_view,sum_event_name_scroll,sum_event_name_select_item,sum_event_name_user_engagement,sum_event_name_view_item,sum_device_category_desktop,sum_device_category_mobile,sum_device_mobile_brand_name_Apple,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Mozilla,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_Chrome,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Edge,sum_device_mobile_model_name_Firefox,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Chrome,sum_device_web_info_browser_Edge,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_geo_country_UnitedStates,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_Apparel,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_CheckoutYourInformation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_param
s_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShopbyBrand,sum_event_params_parent_page_ShoppingCart,sum_event_params_parent_page_Stationery,sum_event_params_child_page_Bags,sum_event_params_child_page_Drinkware,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_MensUnisex,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_SmallGoods,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Apparel,sum_item_parent_category_Collections,sum_item_parent_category_Lifestyle,sum_item_parent_category_New,sum_item_parent_category_Other,sum_item_parent_category_Sale,sum_item_parent_category_ShopbyBrand,sum_item_parent_category_Stationery,sum_item_child_category_Bags,sum_item_child_category_CampusCollection,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_Other,sum_item_child_category_SmallGoods,sum_item_child_category_Womens,sum_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,max_item_quantity,max_item_refund_in_usd,max_item_refund_quantity,max_item_revenue_in_usd,avg_event_params_engagement_time_msec,avg_item_price_in_usd,avg_item_promotions,avg_item_quantity,avg_item_refund_in_usd,avg_item_refund_quantity,avg_item_revenue_in_usd,session_type,pre_nunique_event_p
arams_ga_sessions,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_max_event_params_ga_session_number,pre_max_event_params_engagement_time_msec,pre_avg_event_params_engagement_time_msec,pre_sum_event_params_engagement_time_msec,pre_sum_event_params_session_engaged,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_page_view,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_event_name_user_engagement,pre_sum_event_name_view_item,pre_sum_device_category_desktop,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Apple,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Mozilla,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_Chrome,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Edge,pre_sum_device_mobile_model_name_Firefox,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Chrome,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_geo_country_UnitedStates,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_Apparel,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_CheckoutYourInformation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_
event_params_parent_page_Lifestyle,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShopbyBrand,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_parent_page_Stationery,pre_sum_event_params_child_page_Bags,pre_sum_event_params_child_page_Drinkware,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_MensUnisex,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_SmallGoods,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Apparel,pre_sum_item_parent_category_Collections,pre_sum_item_parent_category_Lifestyle,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_Sale,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_CampusCollection,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Google,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_Other,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_category_Womens,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_transaction
0,10111055.876868386,2020-12-10,1,741471,3324661,225,94.0,3,0,94.0,0.0,10.0,3,94.0,0,3,0.0,0,94.0,48,6,26,16,12,25,84,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,154,0,5,10,0,9,0,2,3,7,0,0,6,0,0,0,0,0,0,154,0,0,0,0,0,0,0,0,153,0,0,0,0,0,0,0,0,0,0,0,0,144,0,0,0,0,0,0,0,0,0,0,275267,48.0,0,1,0.0,0,48.0,14581.846,31.333,0.0,1.0,0.0,0.0,31.333,transaction_session,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
1,1019527.5799124268,2020-12-05,1,2105,2966128,412,44.0,7,0,44.0,0.0,5.0,4,32.0,0,7,0.0,0,44.0,78,8,34,21,0,33,231,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,2,4,7,12,27,44,186,0,3,4,62,0,13,39,90,18,0,0,0,0,0,0,78,2,36,0,3,0,6,24,192,0,3,60,0,33,84,24,12,0,0,0,63,72,0,0,0,0,0,0,0,0,52542,13.0,0,4,0.0,0,16.0,7147.296,8.0,0.0,1.75,0.0,0.0,11.0,transaction_session,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
2,1021887.1151788384,2020-12-26,1,76937,918786,177,21.0,2,0,21.0,0.0,2.0,1,10.0,1,2,0.0,0,21.0,48,9,15,13,0,15,72,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,21,0,6,129,3,0,4,2,0,9,0,0,129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,120,0,0,0,10,0,0,0,120,0,0,0,0,0,0,0,0,0,0,0,0,0,33396,10.0,1,2,0.0,0,21.0,5104.367,10.0,1.0,2.0,0.0,0.0,21.0,transaction_session,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
3,1026932.0858862292,2020-12-01,1,339943,2387211,173,110.0,1,0,55.0,0.0,4.0,1,110.0,2,2,0.0,0,110.0,12,2,28,19,0,28,77,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,8,6,0,24,3,2,9,4,27,62,6,0,0,3,62,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,24,60,0,0,0,0,60,0,0,0,0,0,0,0,0,0,0,0,0,68076,55.0,1,1,0.0,0,55.0,13641.206,55.0,1.0,1.0,0.0,0.0,55.0,transaction_session,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
4,1026932.0858862292,2020-12-08,3,29460,674555,127,214.0,1,0,55.0,0.0,4.0,1,55.0,0,1,0.0,0,55.0,0,4,25,11,27,24,27,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,7,12,0,61,0,0,18,8,2,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,37441,55.0,0,1,0.0,0,55.0,5188.885,55.0,0.0,1.0,0.0,0.0,55.0,transaction_session,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0


In [None]:
# drop session type since it is not relevant to this step
# (errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError because the column has already been removed)
df = df.drop(columns='session_type', errors='ignore')

In [None]:
# (n_rows, n_columns) of the working frame
df.shape

(4466, 224)

In [None]:
# total number of missing cells across the whole frame
df.isna().to_numpy().sum()

0

In [None]:
# order rows by user, then chronologically, with the transaction id as tie-breaker
sort_keys = ['user_pseudo_id', 'transaction_date', 'ecommerce_transaction_id']
df_sorted = df.sort_values(by=sort_keys)

## Split data
Before aggregating the transaction data to the customer level and creating clusters, we need to split the data into training, validation, and test sets to avoid data leakage in the customer clusters. <br>
Because the data is chronological, leakage here would mean aggregating transactions that happened in the future. We therefore use the oldest portion for training, a newer portion for validation, and the most recent portion for testing.

In [None]:
# training set: every transaction dated before 2021-01-01 (~80% of the rows)
is_train = df_sorted['transaction_date'] < '2021-01-01'
train = df_sorted.loc[is_train]
train.shape

(3571, 224)

In [None]:
# validation set: transactions dated 2021-01-01 through 2021-01-16, both
# ends inclusive (~10% of the rows)
is_val = df_sorted['transaction_date'].between('2021-01-01', '2021-01-16')
val = df_sorted.loc[is_val]
val.shape

(419, 224)

In [None]:
# test set: every transaction dated after 2021-01-16 (~10% of the rows)
is_test = df_sorted['transaction_date'] > '2021-01-16'
test = df_sorted.loc[is_test]
test.shape

(476, 224)

In [None]:
# Label every row with the split it belongs to, using the same date boundaries
# as the train / val / test frames. np.select takes the first condition that
# matches: dates before `valid_start` are 'TRAIN', remaining dates on or before
# `valid_end` are 'VALID', and everything later falls through to 'TEST'.
# Vectorised replacement for the former row-by-row apply with a nested ternary.
valid_start = pd.to_datetime('2021-01-01')
valid_end = pd.to_datetime('2021-01-16')
transaction_dates = df_sorted['transaction_date']

df_sorted['split'] = np.select(
    [transaction_dates < valid_start, transaction_dates <= valid_end],
    ['TRAIN', 'VALID'],
    default='TEST',
)

In [None]:
# number of rows per split
df_sorted.loc[:, 'split'].value_counts()

TRAIN    3571
TEST      476
VALID     419
Name: split, dtype: int64

In [None]:
# share of rows per split
df_sorted.loc[:, 'split'].value_counts(normalize=True)

TRAIN   0.800
TEST    0.107
VALID   0.094
Name: split, dtype: float64

In [None]:
# create table schema to prevent errors when uploading data
def _bigquery_type(dtype):
    """Map a pandas/numpy dtype to a BigQuery column type name.

    Returns one of 'BOOLEAN', 'DATETIME', 'INTEGER', 'FLOAT' or 'STRING', or
    None when the dtype has no mapping here. Dtype families are tested with
    pandas.api.types so that e.g. int32 / float32 / nullable Int64 columns are
    recognised, not only the exact dtypes int64 / float64.
    """
    if pd.api.types.is_bool_dtype(dtype):
        return 'BOOLEAN'
    if pd.api.types.is_datetime64_dtype(dtype):  # timezone-naive datetimes only
        return 'DATETIME'
    if pd.api.types.is_integer_dtype(dtype):
        return 'INTEGER'
    if pd.api.types.is_float_dtype(dtype):
        return 'FLOAT'
    if dtype == object or pd.api.types.is_string_dtype(dtype):
        return 'STRING'
    return None


schema = []

for col, dtype in df_sorted.dtypes.items():
    typ = _bigquery_type(dtype)
    if typ is None:
        # Previously this case only printed the dtype and then reused `typ`
        # from the preceding column (or raised NameError on the first column).
        # The column is now reported and left out of the explicit schema.
        # NOTE(review): assumes the uploader infers types for columns missing
        # from `table_schema` - confirm against the pandas-gbq version in use.
        print(f'No BigQuery type mapped for column {col!r} (dtype {dtype}); left out of schema')
        continue
    schema.append({'name': col, 'type': typ})

In [None]:
# df_sorted.to_gbq(f'{project_name}.return_prediction_ga4.step_4_split',
#                 project_id=project_name,
#                 if_exists='replace',
#                 location=region,
#                 chunksize=100_000,
#                 table_schema=schema)

# Aggregating the data

#### Columns to aggregate by initially:
1. 'user_pseudo_id'

#### Transaction session features

**Columns to get nunique:**
- 'ecommerce_transaction_id'

**Columns to get min and max:**
- 'transaction_date'

**Columns to average:**
- 'avg_event_params_engagement_time_msec'
- 'avg_item_price_in_usd'
- 'avg_item_promotions'
- 'avg_item_quantity'
- 'avg_item_refund_in_usd'
- 'avg_item_refund_quantity'
- 'avg_item_revenue_in_usd'
- 'ecommerce_total_item_quantity'     (average quantity order)
- 'total_return_item_quantity'        (average quantity returns)
- 'ecommerce_purchase_revenue_in_usd' (average monetary order)
- 'ecommerce_refund_value_in_usd'     (average monetary return)
- 'ecommerce_tax_value_in_usd'        (average tax)
- 'ecommerce_unique_items'            (average variety of order)
- 'days_first_session_to_transaction'

**Columns to get max:**
- 'user_ltv_revenue'
- 'max_event_params_engagement_time_msec'
- 'max_item_price_in_usd'
- 'max_item_promotions'
- 'max_item_quantity'
- 'max_item_refund_in_usd'
- 'max_item_refund_quantity'
- 'max_item_revenue_in_usd'
- 'ecommerce_total_item_quantity'     (largest quantity order)
- 'total_return_item_quantity'        (largest quantity returns)
- 'ecommerce_purchase_revenue_in_usd' (largest monetary order)
- 'ecommerce_refund_value_in_usd'     (largest monetary return)
- 'ecommerce_tax_value_in_usd'        (largest tax)
- 'ecommerce_unique_items'            (most varied order)
- 'days_first_session_to_transaction'

**Columns to sum:**
- 'sum_event_params_engagement_time_msec'
- 'sum_event_params_session_engaged'
- 'ecommerce_total_item_quantity'
- 'total_return_item_quantity' <--
- 'ecommerce_purchase_revenue_in_usd'
- 'ecommerce_refund_value_in_usd'
- 'ecommerce_tax_value_in_usd'
- 'ecommerce_unique_items'
- 'sum_item_price_in_usd'
- 'sum_item_promotions'
- 'sum_item_quantity'
- 'sum_item_refund_in_usd' <--
- 'sum_item_refund_quantity' <--
- 'sum_item_revenue_in_usd'
- 'sum_event_name_add_to_cart'
- 'sum_event_name_begin_checkout'
- 'sum_event_name_page_view'
- 'sum_event_name_scroll'
- 'sum_event_name_select_item'
- 'sum_event_name_user_engagement'
- 'sum_event_name_view_item'
- 'sum_device_category_desktop'
- 'sum_device_category_mobile'
- 'sum_device_mobile_brand_name_Apple'
- 'sum_device_mobile_brand_name_Google'
- 'sum_device_mobile_brand_name_Huawei'
- 'sum_device_mobile_brand_name_Microsoft'
- 'sum_device_mobile_brand_name_Mozilla'
- 'sum_device_mobile_brand_name_Samsung'
- 'sum_device_mobile_brand_name_Xiaomi'
- 'sum_device_mobile_model_name_Chrome'
- 'sum_device_mobile_model_name_ChromeBook'
- 'sum_device_mobile_model_name_Edge'
- 'sum_device_mobile_model_name_Firefox'
- 'sum_device_mobile_model_name_Safari'
- 'sum_device_mobile_model_name_iPad'
- 'sum_device_mobile_model_name_iPhone'
- 'sum_device_web_info_browser_AndroidWebview'
- 'sum_device_web_info_browser_Chrome'
- 'sum_device_web_info_browser_Edge'
- 'sum_device_web_info_browser_Firefox'
- 'sum_device_web_info_browser_Safari'
- 'sum_geo_country_Canada'
- 'sum_geo_country_France'
- 'sum_geo_country_India'
- 'sum_geo_country_Other'
- 'sum_geo_country_Spain'
- 'sum_geo_country_UnitedKingdom'
- 'sum_geo_country_UnitedStates'
- 'sum_traffic_source_medium_Other'
- 'sum_traffic_source_medium_cpc'
- 'sum_traffic_source_medium_organic'
- 'sum_traffic_source_medium_referral'
- 'sum_event_params_parent_page_Apparel'
- 'sum_event_params_parent_page_CampusCollection'
- 'sum_event_params_parent_page_CheckoutConfirmation'
- 'sum_event_params_parent_page_CheckoutYourInformation'
- 'sum_event_params_parent_page_EcoFriendly'
- 'sum_event_params_parent_page_Home'
- 'sum_event_params_parent_page_Lifestyle'
- 'sum_event_params_parent_page_New'
- 'sum_event_params_parent_page_Other'
- 'sum_event_params_parent_page_PaymentMethod'
- 'sum_event_params_parent_page_Sale'
- 'sum_event_params_parent_page_ShopbyBrand'
- 'sum_event_params_parent_page_ShoppingCart'
- 'sum_event_params_parent_page_Stationery'
- 'sum_event_params_child_page_Bags'
- 'sum_event_params_child_page_Drinkware'
- 'sum_event_params_child_page_Google'
- 'sum_event_params_child_page_Hats'
- 'sum_event_params_child_page_Kids'
- 'sum_event_params_child_page_MensUnisex'
- 'sum_event_params_child_page_Notebooks'
- 'sum_event_params_child_page_Other'
- 'sum_event_params_child_page_SmallGoods'
- 'sum_event_params_child_page_Socks'
- 'sum_event_params_child_page_Stickers'
- 'sum_event_params_child_page_Womens'
- 'sum_event_params_child_page_Writing'
- 'sum_event_params_child_page_YouTube'
- 'sum_item_parent_category_Apparel'
- 'sum_item_parent_category_Collections'
- 'sum_item_parent_category_Lifestyle'
- 'sum_item_parent_category_New'
- 'sum_item_parent_category_Other'
- 'sum_item_parent_category_Sale'
- 'sum_item_parent_category_ShopbyBrand'
- 'sum_item_parent_category_Stationery'
- 'sum_item_child_category_Bags'
- 'sum_item_child_category_CampusCollection'
- 'sum_item_child_category_Drinkware'
- 'sum_item_child_category_Google'
- 'sum_item_child_category_Kids'
- 'sum_item_child_category_MensUnisex'
- 'sum_item_child_category_Other'
- 'sum_item_child_category_SmallGoods'
- 'sum_item_child_category_Womens'
- 'sum_item_child_subcategory_Backpacks'
- 'sum_item_child_subcategory_ElectronicsAccessories'
- 'sum_item_child_subcategory_Infant'
- 'sum_item_child_subcategory_MensTShirts'
- 'sum_item_child_subcategory_MugsTumblers'
- 'sum_item_child_subcategory_Other'
- 'sum_item_child_subcategory_WaterBottles'

#### Pre-transaction session features

**Columns to average:**
- 'pre_avg_event_params_engagement_time_msec'

**Columns to get max:**
- 'pre_max_event_params_ga_session_number'
- 'pre_max_event_params_engagement_time_msec'
- 'pre_stdev_max_event_params_engagement_time_msec'
- 'pre_stdev_avg_event_params_engagement_time_msec'

**Columns to sum:**
- 'pre_nunique_event_params_ga_sessions'
- 'pre_sum_event_params_engagement_time_msec'
- 'pre_sum_event_params_session_engaged'
- 'pre_sum_event_name_add_to_cart'
- 'pre_sum_event_name_begin_checkout'
- 'pre_sum_event_name_page_view'
- 'pre_sum_event_name_scroll'
- 'pre_sum_event_name_select_item'
- 'pre_sum_event_name_user_engagement'
- 'pre_sum_event_name_view_item'
- 'pre_sum_device_category_desktop'
- 'pre_sum_device_category_mobile'
- 'pre_sum_device_mobile_brand_name_Apple'
- 'pre_sum_device_mobile_brand_name_Google'
- 'pre_sum_device_mobile_brand_name_Huawei'
- 'pre_sum_device_mobile_brand_name_Microsoft'
- 'pre_sum_device_mobile_brand_name_Mozilla'
- 'pre_sum_device_mobile_brand_name_Samsung'
- 'pre_sum_device_mobile_brand_name_Xiaomi'
- 'pre_sum_device_mobile_model_name_Chrome'
- 'pre_sum_device_mobile_model_name_ChromeBook'
- 'pre_sum_device_mobile_model_name_Edge'
- 'pre_sum_device_mobile_model_name_Firefox'
- 'pre_sum_device_mobile_model_name_Safari'
- 'pre_sum_device_mobile_model_name_iPad'
- 'pre_sum_device_mobile_model_name_iPhone'
- 'pre_sum_device_web_info_browser_AndroidWebview'
- 'pre_sum_device_web_info_browser_Chrome'
- 'pre_sum_device_web_info_browser_Edge'
- 'pre_sum_device_web_info_browser_Firefox'
- 'pre_sum_device_web_info_browser_Safari'
- 'pre_sum_geo_country_Canada'
- 'pre_sum_geo_country_France'
- 'pre_sum_geo_country_India'
- 'pre_sum_geo_country_Other'
- 'pre_sum_geo_country_Spain'
- 'pre_sum_geo_country_UnitedKingdom'
- 'pre_sum_geo_country_UnitedStates'
- 'pre_sum_traffic_source_medium_Other'
- 'pre_sum_traffic_source_medium_cpc'
- 'pre_sum_traffic_source_medium_organic'
- 'pre_sum_traffic_source_medium_referral'
- 'pre_sum_event_params_parent_page_Apparel'
- 'pre_sum_event_params_parent_page_CampusCollection'
- 'pre_sum_event_params_parent_page_CheckoutConfirmation'
- 'pre_sum_event_params_parent_page_CheckoutYourInformation'
- 'pre_sum_event_params_parent_page_EcoFriendly'
- 'pre_sum_event_params_parent_page_Home'
- 'pre_sum_event_params_parent_page_Lifestyle'
- 'pre_sum_event_params_parent_page_New'
- 'pre_sum_event_params_parent_page_Other'
- 'pre_sum_event_params_parent_page_PaymentMethod'
- 'pre_sum_event_params_parent_page_Sale'
- 'pre_sum_event_params_parent_page_ShopbyBrand'
- 'pre_sum_event_params_parent_page_ShoppingCart'
- 'pre_sum_event_params_parent_page_Stationery'
- 'pre_sum_event_params_child_page_Bags'
- 'pre_sum_event_params_child_page_Drinkware'
- 'pre_sum_event_params_child_page_Google'
- 'pre_sum_event_params_child_page_Hats'
- 'pre_sum_event_params_child_page_Kids'
- 'pre_sum_event_params_child_page_MensUnisex'
- 'pre_sum_event_params_child_page_Notebooks'
- 'pre_sum_event_params_child_page_Other'
- 'pre_sum_event_params_child_page_SmallGoods'
- 'pre_sum_event_params_child_page_Socks'
- 'pre_sum_event_params_child_page_Stickers'
- 'pre_sum_event_params_child_page_Womens'
- 'pre_sum_event_params_child_page_Writing'
- 'pre_sum_event_params_child_page_YouTube'
- 'pre_sum_item_parent_category_Apparel'
- 'pre_sum_item_parent_category_Collections'
- 'pre_sum_item_parent_category_Lifestyle'
- 'pre_sum_item_parent_category_New'
- 'pre_sum_item_parent_category_Other'
- 'pre_sum_item_parent_category_Sale'
- 'pre_sum_item_parent_category_ShopbyBrand'
- 'pre_sum_item_parent_category_Stationery'
- 'pre_sum_item_child_category_Bags'
- 'pre_sum_item_child_category_CampusCollection'
- 'pre_sum_item_child_category_Drinkware'
- 'pre_sum_item_child_category_Google'
- 'pre_sum_item_child_category_Kids'
- 'pre_sum_item_child_category_MensUnisex'
- 'pre_sum_item_child_category_Other'
- 'pre_sum_item_child_category_SmallGoods'
- 'pre_sum_item_child_category_Womens'
- 'pre_sum_item_child_subcategory_Backpacks'
- 'pre_sum_item_child_subcategory_ElectronicsAccessories'
- 'pre_sum_item_child_subcategory_Infant'
- 'pre_sum_item_child_subcategory_MensTShirts'
- 'pre_sum_item_child_subcategory_MugsTumblers'
- 'pre_sum_item_child_subcategory_Other'
- 'pre_sum_item_child_subcategory_WaterBottles'

In [None]:
# Pre-transaction ("historical") features carry the `pre_` prefix. Match on the
# prefix rather than on a substring so that a column which merely contains
# "pre_" somewhere in its name is not misclassified.
historical_cols = [col for col in df_sorted.columns if col.startswith('pre_')]
# Every remaining column except the `split` label belongs to the current
# transaction. Filtering here (instead of list.remove) also avoids a ValueError
# when `split` is absent.
current_cols = [col for col in df_sorted.columns
                if col not in historical_cols and col != 'split']

## Split into pre- and current transaction
To prevent data leakage when aggregating to the customer level and creating customer clusters, transaction-session features are aggregated only over transactions that occurred before the current transaction. Pre-transaction features (the `pre_` columns) can be aggregated up to and including the current transaction.

In [None]:
# identifier columns followed by every pre-transaction feature
id_cols = ['user_pseudo_id', 'transaction_date', 'ecommerce_transaction_id']
df_pre = df_sorted.loc[:, id_cols + historical_cols]
df_pre.shape

(4466, 102)

In [None]:
# columns that describe the current transaction session
df_current = df_sorted.loc[:, current_cols]
df_current.shape

(4466, 125)

### Aggregate pre-transaction

In [None]:
# NOTE(review): `avg_cols` is defined but not used by the aggregation at the
# end of this cell, so 'pre_avg_event_params_engagement_time_msec' is absent
# from df_pre_agg - confirm whether a running mean was intended.
avg_cols = ['pre_avg_event_params_engagement_time_msec']

# columns accumulated as a running maximum per user
max_cols = ['pre_max_event_params_ga_session_number',
            'pre_max_event_params_engagement_time_msec',
            'pre_stdev_max_event_params_engagement_time_msec',
            'pre_stdev_avg_event_params_engagement_time_msec']

# columns accumulated as a running total per user
sum_cols = ['pre_nunique_event_params_ga_sessions',
            'pre_sum_event_params_engagement_time_msec',
            'pre_sum_event_params_session_engaged',
            'pre_sum_event_name_add_to_cart',
            'pre_sum_event_name_begin_checkout',
            'pre_sum_event_name_page_view',
            'pre_sum_event_name_scroll',
            'pre_sum_event_name_select_item',
            'pre_sum_event_name_user_engagement',
            'pre_sum_event_name_view_item',
            'pre_sum_device_category_desktop',
            'pre_sum_device_category_mobile',
            'pre_sum_device_mobile_brand_name_Apple',
            'pre_sum_device_mobile_brand_name_Google',
            'pre_sum_device_mobile_brand_name_Huawei',
            'pre_sum_device_mobile_brand_name_Microsoft',
            'pre_sum_device_mobile_brand_name_Mozilla',
            'pre_sum_device_mobile_brand_name_Samsung',
            'pre_sum_device_mobile_brand_name_Xiaomi',
            'pre_sum_device_mobile_model_name_Chrome',
            'pre_sum_device_mobile_model_name_ChromeBook',
            'pre_sum_device_mobile_model_name_Edge',
            'pre_sum_device_mobile_model_name_Firefox',
            'pre_sum_device_mobile_model_name_Safari',
            'pre_sum_device_mobile_model_name_iPad',
            'pre_sum_device_mobile_model_name_iPhone',
            'pre_sum_device_web_info_browser_AndroidWebview',
            'pre_sum_device_web_info_browser_Chrome',
            'pre_sum_device_web_info_browser_Edge',
            'pre_sum_device_web_info_browser_Firefox',
            'pre_sum_device_web_info_browser_Safari',
            'pre_sum_geo_country_Canada',
            'pre_sum_geo_country_France',
            'pre_sum_geo_country_India',
            'pre_sum_geo_country_Other',
            'pre_sum_geo_country_Spain',
            'pre_sum_geo_country_UnitedKingdom',
            'pre_sum_geo_country_UnitedStates',
            'pre_sum_traffic_source_medium_Other',
            'pre_sum_traffic_source_medium_cpc',
            'pre_sum_traffic_source_medium_organic',
            'pre_sum_traffic_source_medium_referral',
            'pre_sum_event_params_parent_page_Apparel',
            'pre_sum_event_params_parent_page_CampusCollection',
            'pre_sum_event_params_parent_page_CheckoutConfirmation',
            'pre_sum_event_params_parent_page_CheckoutYourInformation',
            'pre_sum_event_params_parent_page_EcoFriendly',
            'pre_sum_event_params_parent_page_Home',
            'pre_sum_event_params_parent_page_Lifestyle',
            'pre_sum_event_params_parent_page_New',
            'pre_sum_event_params_parent_page_Other',
            'pre_sum_event_params_parent_page_PaymentMethod',
            'pre_sum_event_params_parent_page_Sale',
            'pre_sum_event_params_parent_page_ShopbyBrand',
            'pre_sum_event_params_parent_page_ShoppingCart',
            'pre_sum_event_params_parent_page_Stationery',
            'pre_sum_event_params_child_page_Bags',
            'pre_sum_event_params_child_page_Drinkware',
            'pre_sum_event_params_child_page_Google',
            'pre_sum_event_params_child_page_Hats',
            'pre_sum_event_params_child_page_Kids',
            'pre_sum_event_params_child_page_MensUnisex',
            'pre_sum_event_params_child_page_Notebooks',
            'pre_sum_event_params_child_page_Other',
            'pre_sum_event_params_child_page_SmallGoods',
            'pre_sum_event_params_child_page_Socks',
            'pre_sum_event_params_child_page_Stickers',
            'pre_sum_event_params_child_page_Womens',
            'pre_sum_event_params_child_page_Writing',
            'pre_sum_event_params_child_page_YouTube',
            'pre_sum_item_parent_category_Apparel',
            'pre_sum_item_parent_category_Collections',
            'pre_sum_item_parent_category_Lifestyle',
            'pre_sum_item_parent_category_New',
            'pre_sum_item_parent_category_Other',
            'pre_sum_item_parent_category_Sale',
            'pre_sum_item_parent_category_ShopbyBrand',
            'pre_sum_item_parent_category_Stationery',
            'pre_sum_item_child_category_Bags',
            'pre_sum_item_child_category_CampusCollection',
            'pre_sum_item_child_category_Drinkware',
            'pre_sum_item_child_category_Google',
            'pre_sum_item_child_category_Kids',
            'pre_sum_item_child_category_MensUnisex',
            'pre_sum_item_child_category_Other',
            'pre_sum_item_child_category_SmallGoods',
            'pre_sum_item_child_category_Womens',
            'pre_sum_item_child_subcategory_Backpacks',
            'pre_sum_item_child_subcategory_ElectronicsAccessories',
            'pre_sum_item_child_subcategory_Infant',
            'pre_sum_item_child_subcategory_MensTShirts',
            'pre_sum_item_child_subcategory_MugsTumblers',
            'pre_sum_item_child_subcategory_Other',
            'pre_sum_item_child_subcategory_WaterBottles']

# Running totals / running maxima per user. The result keeps one row per input
# row (same index as df_pre), with sum_cols first and max_cols after them.
# Each value accumulates over the user's rows in df_pre's row order, which comes
# from df_sorted (sorted by user, transaction date, transaction id).
# The cumulative groupby methods are called directly instead of routing the
# transformation kernels 'cumsum' / 'cummax' through .agg().
pre_by_user = df_pre.groupby(['user_pseudo_id'])
df_pre_agg = pd.concat(
    [pre_by_user[sum_cols].cumsum(), pre_by_user[max_cols].cummax()],
    axis=1,
)

In [None]:
# Attach the identifier columns back onto the cumulative features; both frames
# share df_pre's row index, so the merge is index-on-index.
id_cols = ['user_pseudo_id', 'transaction_date', 'ecommerce_transaction_id']
df_pre_agg = pd.merge(df_pre[id_cols],
                      # This cell overwrites df_pre_agg with its own output. Drop
                      # identifiers left by an earlier run so that re-running the
                      # cell does not create *_x / *_y duplicate columns.
                      df_pre_agg.drop(columns=id_cols, errors='ignore'),
                      how='left',
                      left_index=True,
                      right_index=True)
df_pre_agg.head()

Unnamed: 0,user_pseudo_id,transaction_date,ecommerce_transaction_id,pre_nunique_event_params_ga_sessions,pre_sum_event_params_engagement_time_msec,pre_sum_event_params_session_engaged,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_page_view,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_event_name_user_engagement,pre_sum_event_name_view_item,pre_sum_device_category_desktop,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Apple,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Mozilla,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_Chrome,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Edge,pre_sum_device_mobile_model_name_Firefox,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Chrome,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_geo_country_UnitedStates,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_Apparel,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_CheckoutYourInformation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Lifestyle,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_event_params_
parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShopbyBrand,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_parent_page_Stationery,pre_sum_event_params_child_page_Bags,pre_sum_event_params_child_page_Drinkware,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_MensUnisex,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_SmallGoods,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Apparel,pre_sum_item_parent_category_Collections,pre_sum_item_parent_category_Lifestyle,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_Sale,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_CampusCollection,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Google,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_Other,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_category_Womens,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,pre_max_event_params_ga_session_number,pre_max_event_params_engagement_time_msec,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec
3022,10092926.37863064,2021-01-22,719410,1.0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0
0,10111055.876868386,2020-12-10,741471,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
2748,1016446.8237887674,2020-12-21,983645,1.0,2813178,339,72,0,29,14,24,26,174,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,81,3,0,0,0,5,209,2,0,0,2,0,13,6,91,89,0,0,0,81,3,0,15,0,0,0,3,0,72,0,192,0,0,0,0,0,84,0,84,0,0,72,0,12,0,0,0,0,0,0,0,0,1,62804,0.0,0.0
3023,10172849.537529336,2020-12-09,406646,1.0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0
1,1019527.5799124268,2020-12-05,2105,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0


Pandas groupby objects have no built-in cumulative mean (there is no `cummean()`), so we'll calculate it per user as `cumsum()` divided by the running row count (`cumcount() + 1`).

In [None]:
# Cumulative mean per user = running sum / running row count.
# cumcount() is zero-based, hence the + 1.
# The two intermediate series are combined directly, so no temporary
# sum/count columns have to be added to and then dropped from df_pre_agg.
df_pre_agg = df_pre_agg.assign(
    pre_avg_event_params_engagement_time_msec=(
        df_pre.groupby(['user_pseudo_id'])['pre_avg_event_params_engagement_time_msec'].cumsum()
        / (df_pre.groupby(['user_pseudo_id']).cumcount() + 1)
    )
)

In [None]:
# preview: pre_avg_event_params_engagement_time_msec should now be the last column
df_pre_agg.head()

Unnamed: 0,user_pseudo_id,transaction_date,ecommerce_transaction_id,pre_nunique_event_params_ga_sessions,pre_sum_event_params_engagement_time_msec,pre_sum_event_params_session_engaged,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_page_view,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_event_name_user_engagement,pre_sum_event_name_view_item,pre_sum_device_category_desktop,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Apple,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Mozilla,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_Chrome,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Edge,pre_sum_device_mobile_model_name_Firefox,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Chrome,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_geo_country_UnitedStates,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_Apparel,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_CheckoutYourInformation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Lifestyle,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_event_params_
parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShopbyBrand,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_parent_page_Stationery,pre_sum_event_params_child_page_Bags,pre_sum_event_params_child_page_Drinkware,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_MensUnisex,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_SmallGoods,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Apparel,pre_sum_item_parent_category_Collections,pre_sum_item_parent_category_Lifestyle,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_Sale,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_CampusCollection,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Google,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_Other,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_category_Womens,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,pre_max_event_params_ga_session_number,pre_max_event_params_engagement_time_msec,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_avg_event_params_engagement_time_msec
3022,10092926.37863064,2021-01-22,719410,1.0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0,0.0
0,10111055.876868386,2020-12-10,741471,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2748,1016446.8237887674,2020-12-21,983645,1.0,2813178,339,72,0,29,14,24,26,174,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,81,3,0,0,0,5,209,2,0,0,2,0,13,6,91,89,0,0,0,81,3,0,15,0,0,0,3,0,72,0,192,0,0,0,0,0,84,0,84,0,0,72,0,12,0,0,0,0,0,0,0,0,1,62804,0.0,0.0,8225.667
3023,10172849.537529336,2020-12-09,406646,1.0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0,0.0
1,1019527.5799124268,2020-12-05,2105,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0


In [None]:
# Create an explicit BigQuery table schema from the dataframe dtypes to prevent
# type-inference errors when uploading data.
# Each entry has the form {'name': <column name>, 'type': <BigQuery type>}.
schema = []

for col, dtype in df_pre_agg.dtypes.items():
    if dtype == object:
        typ = 'STRING'
    elif dtype == bool:
        typ = 'BOOLEAN'
    elif dtype == 'datetime64[ns]':
        typ = 'DATETIME'
    elif dtype in (int, 'Int64', 'uint8'):
        typ = 'INTEGER'
    elif dtype == float:
        typ = 'FLOAT'
    else:
        # Unmapped dtype. Previously `typ` was left untouched here, so the
        # column silently inherited the PREVIOUS column's type (or raised
        # NameError when it was the first column). Report it and leave the
        # column out of the explicit schema instead; pandas-gbq infers the
        # type of any column that is missing from `table_schema`.
        print(f'{col}: no BigQuery type mapped for dtype {dtype}; left to pandas-gbq inference')
        continue

    col_dict = {'name': col, 'type': typ}
    schema.append(col_dict)

In [None]:
# df_pre_agg.to_gbq(f'{project_name}.return_prediction_ga4.step_4_customer_agg_pre',
#                     project_id=project_name,
#                     if_exists='replace',
#                     location=region,
#                     chunksize=100_000,
#                     table_schema=schema)

### Aggregate previous transactions
If a customer has had a transaction prior to the current one, aggregate the previous transaction values.

In [None]:
# Column groups for the previous-transaction aggregation in the next cell.
# Each list is mapped to one aggregation there:
#   count_cols -> 'nunique', min_max_cols -> ['min', 'max'],
#   avg_cols -> 'mean', max_cols -> 'max', sum_cols -> 'sum'.
#
# NOTE: the lists overlap. The aggregation spec is a single merged dict, so a
# column listed more than once keeps only the LAST aggregation assigned to it:
#   * 'ecommerce_total_item_quantity' ... 'ecommerce_unique_items' appear in
#     avg_cols, max_cols and sum_cols -> only the SUM is produced.
#   * 'days_first_session_to_transaction' appears in avg_cols and max_cols
#     -> only the MAX is produced.
# The rename cell further down names those outputs accordingly
# (sum_ecommerce_*, max_days_first_session_to_transaction).

# columns counted as number of distinct values
count_cols = ['ecommerce_transaction_id']

# columns for which both the earliest and the latest value are kept
min_max_cols = ['transaction_date']

# columns averaged over the previous transactions (see overlap note above)
avg_cols = ['avg_event_params_engagement_time_msec',
            'avg_item_price_in_usd',
            'avg_item_promotions',
            'avg_item_quantity',
            'avg_item_refund_in_usd',
            'avg_item_refund_quantity',
            'avg_item_revenue_in_usd',
            'ecommerce_total_item_quantity',
            'total_return_item_quantity',
            'ecommerce_purchase_revenue_in_usd',
            'ecommerce_refund_value_in_usd',
            'ecommerce_tax_value_in_usd',
            'ecommerce_unique_items',
            'days_first_session_to_transaction']

# columns for which the maximum over the previous transactions is kept
# (see overlap note above)
max_cols = ['user_ltv_revenue',
            'max_event_params_engagement_time_msec',
            'max_item_price_in_usd',
            'max_item_promotions',
            'max_item_quantity',
            'max_item_refund_in_usd',
            'max_item_refund_quantity',
            'max_item_revenue_in_usd',
            'ecommerce_total_item_quantity',
            'total_return_item_quantity',
            'ecommerce_purchase_revenue_in_usd',
            'ecommerce_refund_value_in_usd',
            'ecommerce_tax_value_in_usd',
            'ecommerce_unique_items',
            'days_first_session_to_transaction']

# Columns summed over the previous transactions.
# The six ecommerce_* / total_return_* entries are also listed in avg_cols and
# max_cols; because sum_cols is merged into the aggregation spec last, 'sum'
# is the aggregation that is actually applied to them.
sum_cols = ['sum_event_params_engagement_time_msec',
            'sum_event_params_session_engaged',
            'ecommerce_total_item_quantity',
            'total_return_item_quantity',
            'ecommerce_purchase_revenue_in_usd',
            'ecommerce_refund_value_in_usd',
            'ecommerce_tax_value_in_usd',
            'ecommerce_unique_items',
            'sum_item_price_in_usd',
            'sum_item_promotions',
            'sum_item_quantity',
            'sum_item_refund_in_usd',
            'sum_item_refund_quantity',
            'sum_item_revenue_in_usd',
            'sum_event_name_add_to_cart',
            'sum_event_name_begin_checkout',
            'sum_event_name_page_view',
            'sum_event_name_scroll',
            'sum_event_name_select_item',
            'sum_event_name_user_engagement',
            'sum_event_name_view_item',
            'sum_device_category_desktop',
            'sum_device_category_mobile',
            'sum_device_mobile_brand_name_Apple',
            'sum_device_mobile_brand_name_Google',
            'sum_device_mobile_brand_name_Huawei',
            'sum_device_mobile_brand_name_Microsoft',
            'sum_device_mobile_brand_name_Mozilla',
            'sum_device_mobile_brand_name_Samsung',
            'sum_device_mobile_brand_name_Xiaomi',
            'sum_device_mobile_model_name_Chrome',
            'sum_device_mobile_model_name_ChromeBook',
            'sum_device_mobile_model_name_Edge',
            'sum_device_mobile_model_name_Firefox',
            'sum_device_mobile_model_name_Safari',
            'sum_device_mobile_model_name_iPad',
            'sum_device_mobile_model_name_iPhone',
            'sum_device_web_info_browser_AndroidWebview',
            'sum_device_web_info_browser_Chrome',
            'sum_device_web_info_browser_Edge',
            'sum_device_web_info_browser_Firefox',
            'sum_device_web_info_browser_Safari',
            'sum_geo_country_Canada',
            'sum_geo_country_France',
            'sum_geo_country_India',
            'sum_geo_country_Other',
            'sum_geo_country_Spain',
            'sum_geo_country_UnitedKingdom',
            'sum_geo_country_UnitedStates',
            'sum_traffic_source_medium_Other',
            'sum_traffic_source_medium_cpc',
            'sum_traffic_source_medium_organic',
            'sum_traffic_source_medium_referral',
            'sum_event_params_parent_page_Apparel',
            'sum_event_params_parent_page_CampusCollection',
            'sum_event_params_parent_page_CheckoutConfirmation',
            'sum_event_params_parent_page_CheckoutYourInformation',
            'sum_event_params_parent_page_EcoFriendly',
            'sum_event_params_parent_page_Home',
            'sum_event_params_parent_page_Lifestyle',
            'sum_event_params_parent_page_New',
            'sum_event_params_parent_page_Other',
            'sum_event_params_parent_page_PaymentMethod',
            'sum_event_params_parent_page_Sale',
            'sum_event_params_parent_page_ShopbyBrand',
            'sum_event_params_parent_page_ShoppingCart',
            'sum_event_params_parent_page_Stationery',
            'sum_event_params_child_page_Bags',
            'sum_event_params_child_page_Drinkware',
            'sum_event_params_child_page_Google',
            'sum_event_params_child_page_Hats',
            'sum_event_params_child_page_Kids',
            'sum_event_params_child_page_MensUnisex',
            'sum_event_params_child_page_Notebooks',
            'sum_event_params_child_page_Other',
            'sum_event_params_child_page_SmallGoods',
            'sum_event_params_child_page_Socks',
            'sum_event_params_child_page_Stickers',
            'sum_event_params_child_page_Womens',
            'sum_event_params_child_page_Writing',
            'sum_event_params_child_page_YouTube',
            'sum_item_parent_category_Apparel',
            'sum_item_parent_category_Collections',
            'sum_item_parent_category_Lifestyle',
            'sum_item_parent_category_New',
            'sum_item_parent_category_Other',
            'sum_item_parent_category_Sale',
            'sum_item_parent_category_ShopbyBrand',
            'sum_item_parent_category_Stationery',
            'sum_item_child_category_Bags',
            'sum_item_child_category_CampusCollection',
            'sum_item_child_category_Drinkware',
            'sum_item_child_category_Google',
            'sum_item_child_category_Kids',
            'sum_item_child_category_MensUnisex',
            'sum_item_child_category_Other',
            'sum_item_child_category_SmallGoods',
            'sum_item_child_category_Womens',
            'sum_item_child_subcategory_Backpacks',
            'sum_item_child_subcategory_ElectronicsAccessories',
            'sum_item_child_subcategory_Infant',
            'sum_item_child_subcategory_MensTShirts',
            'sum_item_child_subcategory_MugsTumblers',
            'sum_item_child_subcategory_Other',
            'sum_item_child_subcategory_WaterBottles']

In [None]:
# For every transaction, aggregate the values of that user's EARLIER
# transactions (strictly smaller transaction_date). Transactions without any
# earlier transaction still get a row, with NaN/NaT in every aggregated column.
#
# Why this version is faster than growing the result inside the loop:
#   * the per-row frames are collected in a list and concatenated ONCE at the
#     end (re-concatenating the growing result on every row copies it each
#     time, which is quadratic in the number of transactions);
#   * users are visited with a single groupby pass instead of re-scanning the
#     whole frame with a boolean mask for each user;
#   * the aggregation spec is built once, outside the loops.

# One aggregation per source column. avg_cols, max_cols and sum_cols overlap:
# when a column is listed in more than one of them, the LAST entry in this
# merged dict wins (sum over max over mean).
agg_spec = {
    **{col: 'nunique' for col in count_cols},
    **{col: ['min', 'max'] for col in min_max_cols},
    **{col: 'mean' for col in avg_cols},
    **{col: 'max' for col in max_cols},
    **{col: 'sum' for col in sum_cols},
}
# de-duplicated column selection, in the same order as the spec
agg_input_cols = list(agg_spec)

agg_frames = []

# sort=False visits users in order of first appearance, like .unique() would
for _, user_transactions in df_current.groupby('user_pseudo_id', sort=False):
    for _, row in user_transactions.iterrows():
        # look at the transactions that occurred before the current one
        previous_transactions = user_transactions[user_transactions['transaction_date'] < row['transaction_date']]

        # aggregate transaction values (empty result if there is no earlier transaction)
        cumulative_transaction = previous_transactions.groupby('user_pseudo_id')[agg_input_cols].agg(agg_spec)

        # flatten the (column, function) labels into 'function_column' names
        cumulative_transaction.columns = ['_'.join(col[::-1]).strip('_') for col in cumulative_transaction.columns.values]

        # attach identifying columns for combining pre-transaction features
        df_id = pd.DataFrame([(row['user_pseudo_id'], row['transaction_date'], row['ecommerce_transaction_id'])],
                             columns=['user_pseudo_id', 'transaction_date', 'ecommerce_transaction_id'])

        # put aggregated transaction features and id features together
        if not cumulative_transaction.empty:
            # align the single aggregated row with the id row, then place them side by side
            cumulative_transaction_id = pd.concat([df_id, cumulative_transaction.set_index(df_id.index)], axis=1)
        else:
            # no earlier transactions: keep the id row; aggregated columns become NaN
            cumulative_transaction_id = pd.concat([df_id, cumulative_transaction])

        agg_frames.append(cumulative_transaction_id)

# put all aggregated transactions together in one pass
# (pd.concat raises on an empty list, so fall back to an empty frame)
df_current_agg = pd.concat(agg_frames) if agg_frames else pd.DataFrame([])

In [None]:
# rename columns
# NOTE: this assignment is POSITIONAL - the list must match the column order
# produced by the aggregation cell above exactly (3 id columns, then the
# aggregated columns in the order of the merged aggregation dict).
# Columns that are listed in several of avg_cols / max_cols / sum_cols only
# yield the last aggregation, which is why the ecommerce_* / total_return_*
# columns are named sum_* and days_first_session_to_transaction is named max_*.
df_current_agg.columns = ['user_pseudo_id',
                          'transaction_date',
                          'ecommerce_transaction_id',
                          'nunique_ecommerce_transactions',
                          'min_transaction_date',
                          'max_transaction_date',
                          'avg_event_params_engagement_time_msec',
                          'avg_item_price_in_usd',
                          'avg_item_promotions',
                          'avg_item_quantity',
                          'avg_item_refund_in_usd',
                          'avg_item_refund_quantity',
                          'avg_item_revenue_in_usd',
                          'sum_ecommerce_total_item_quantity',
                          'sum_total_return_item_quantity',
                          'sum_ecommerce_purchase_revenue_in_usd',
                          'sum_ecommerce_refund_value_in_usd',
                          'sum_ecommerce_tax_value_in_usd',
                          'sum_ecommerce_unique_items',
                          'max_days_first_session_to_transaction',
                          'max_user_ltv_revenue',
                          'max_event_params_engagement_time_msec',
                          'max_item_price_in_usd',
                          'max_item_promotions',
                          'max_item_quantity',
                          'max_item_refund_in_usd',
                          'max_item_refund_quantity',
                          'max_item_revenue_in_usd',
                          'sum_event_params_engagement_time_msec',
                          'sum_event_params_session_engaged',
                          'sum_item_price_in_usd',
                          'sum_item_promotions',
                          'sum_item_quantity',
                          'sum_item_refund_in_usd',
                          'sum_item_refund_quantity',
                          'sum_item_revenue_in_usd',
                          'sum_event_name_add_to_cart',
                          'sum_event_name_begin_checkout',
                          'sum_event_name_page_view',
                          'sum_event_name_scroll',
                          'sum_event_name_select_item',
                          'sum_event_name_user_engagement',
                          'sum_event_name_view_item',
                          'sum_device_category_desktop',
                          'sum_device_category_mobile',
                          'sum_device_mobile_brand_name_Apple',
                          'sum_device_mobile_brand_name_Google',
                          'sum_device_mobile_brand_name_Huawei',
                          'sum_device_mobile_brand_name_Microsoft',
                          'sum_device_mobile_brand_name_Mozilla',
                          'sum_device_mobile_brand_name_Samsung',
                          'sum_device_mobile_brand_name_Xiaomi',
                          'sum_device_mobile_model_name_Chrome',
                          'sum_device_mobile_model_name_ChromeBook',
                          'sum_device_mobile_model_name_Edge',
                          'sum_device_mobile_model_name_Firefox',
                          'sum_device_mobile_model_name_Safari',
                          'sum_device_mobile_model_name_iPad',
                          'sum_device_mobile_model_name_iPhone',
                          'sum_device_web_info_browser_AndroidWebview',
                          'sum_device_web_info_browser_Chrome',
                          'sum_device_web_info_browser_Edge',
                          'sum_device_web_info_browser_Firefox',
                          'sum_device_web_info_browser_Safari',
                          'sum_geo_country_Canada',
                          'sum_geo_country_France',
                          'sum_geo_country_India',
                          'sum_geo_country_Other',
                          'sum_geo_country_Spain',
                          'sum_geo_country_UnitedKingdom',
                          'sum_geo_country_UnitedStates',
                          'sum_traffic_source_medium_Other',
                          'sum_traffic_source_medium_cpc',
                          'sum_traffic_source_medium_organic',
                          'sum_traffic_source_medium_referral',
                          'sum_event_params_parent_page_Apparel',
                          'sum_event_params_parent_page_CampusCollection',
                          'sum_event_params_parent_page_CheckoutConfirmation',
                          'sum_event_params_parent_page_CheckoutYourInformation',
                          'sum_event_params_parent_page_EcoFriendly',
                          'sum_event_params_parent_page_Home',
                          'sum_event_params_parent_page_Lifestyle',
                          'sum_event_params_parent_page_New',
                          'sum_event_params_parent_page_Other',
                          'sum_event_params_parent_page_PaymentMethod',
                          'sum_event_params_parent_page_Sale',
                          'sum_event_params_parent_page_ShopbyBrand',
                          'sum_event_params_parent_page_ShoppingCart',
                          'sum_event_params_parent_page_Stationery',
                          'sum_event_params_child_page_Bags',
                          'sum_event_params_child_page_Drinkware',
                          'sum_event_params_child_page_Google',
                          'sum_event_params_child_page_Hats',
                          'sum_event_params_child_page_Kids',
                          'sum_event_params_child_page_MensUnisex',
                          'sum_event_params_child_page_Notebooks',
                          'sum_event_params_child_page_Other',
                          'sum_event_params_child_page_SmallGoods',
                          'sum_event_params_child_page_Socks',
                          'sum_event_params_child_page_Stickers',
                          'sum_event_params_child_page_Womens',
                          'sum_event_params_child_page_Writing',
                          'sum_event_params_child_page_YouTube',
                          'sum_item_parent_category_Apparel',
                          'sum_item_parent_category_Collections',
                          'sum_item_parent_category_Lifestyle',
                          'sum_item_parent_category_New',
                          'sum_item_parent_category_Other',
                          'sum_item_parent_category_Sale',
                          'sum_item_parent_category_ShopbyBrand',
                          'sum_item_parent_category_Stationery',
                          'sum_item_child_category_Bags',
                          'sum_item_child_category_CampusCollection',
                          'sum_item_child_category_Drinkware',
                          'sum_item_child_category_Google',
                          'sum_item_child_category_Kids',
                          'sum_item_child_category_MensUnisex',
                          'sum_item_child_category_Other',
                          'sum_item_child_category_SmallGoods',
                          'sum_item_child_category_Womens',
                          'sum_item_child_subcategory_Backpacks',
                          'sum_item_child_subcategory_ElectronicsAccessories',
                          'sum_item_child_subcategory_Infant',
                          'sum_item_child_subcategory_MensTShirts',
                          'sum_item_child_subcategory_MugsTumblers',
                          'sum_item_child_subcategory_Other',
                          'sum_item_child_subcategory_WaterBottles']

In [None]:
# sanity check: one row per transaction processed by the aggregation loop;
# the column count must equal the length of the rename list above
df_current_agg.shape

(4466, 127)

In [None]:
# preview: transactions with no earlier transaction show NaN/NaT in every aggregated column
df_current_agg.head()

Unnamed: 0,user_pseudo_id,transaction_date,ecommerce_transaction_id,nunique_ecommerce_transactions,min_transaction_date,max_transaction_date,avg_event_params_engagement_time_msec,avg_item_price_in_usd,avg_item_promotions,avg_item_quantity,avg_item_refund_in_usd,avg_item_refund_quantity,avg_item_revenue_in_usd,sum_ecommerce_total_item_quantity,sum_total_return_item_quantity,sum_ecommerce_purchase_revenue_in_usd,sum_ecommerce_refund_value_in_usd,sum_ecommerce_tax_value_in_usd,sum_ecommerce_unique_items,max_days_first_session_to_transaction,max_user_ltv_revenue,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,max_item_quantity,max_item_refund_in_usd,max_item_refund_quantity,max_item_revenue_in_usd,sum_event_params_engagement_time_msec,sum_event_params_session_engaged,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_item_refund_in_usd,sum_item_refund_quantity,sum_item_revenue_in_usd,sum_event_name_add_to_cart,sum_event_name_begin_checkout,sum_event_name_page_view,sum_event_name_scroll,sum_event_name_select_item,sum_event_name_user_engagement,sum_event_name_view_item,sum_device_category_desktop,sum_device_category_mobile,sum_device_mobile_brand_name_Apple,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Mozilla,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_Chrome,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Edge,sum_device_mobile_model_name_Firefox,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Chrome,sum_device_web_info_browser_Edge,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_U
nitedKingdom,sum_geo_country_UnitedStates,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_Apparel,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_CheckoutYourInformation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShopbyBrand,sum_event_params_parent_page_ShoppingCart,sum_event_params_parent_page_Stationery,sum_event_params_child_page_Bags,sum_event_params_child_page_Drinkware,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_MensUnisex,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_SmallGoods,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Apparel,sum_item_parent_category_Collections,sum_item_parent_category_Lifestyle,sum_item_parent_category_New,sum_item_parent_category_Other,sum_item_parent_category_Sale,sum_item_parent_category_ShopbyBrand,sum_item_parent_category_Stationery,sum_item_child_category_Bags,sum_item_child_category_CampusCollection,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_Other,sum_item_child_category_SmallGoods,sum_item_child_category_Womens,sum_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumbl
ers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles
0,10092926.37863064,2021-01-22,719410,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,10111055.876868386,2020-12-10,741471,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,1016446.8237887674,2020-12-21,983645,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,10172849.537529336,2020-12-09,406646,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,1019527.5799124268,2020-12-05,2105,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Build an explicit BigQuery schema from the DataFrame dtypes so the upload does
# not depend on type inference for the columns we can classify ourselves.
# Each entry has the form {'name': <column>, 'type': <BigQuery type>}.
schema = []

for col, dtype in df_current_agg.dtypes.items():
    if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
        typ = 'STRING'
    elif pd.api.types.is_bool_dtype(dtype):
        typ = 'BOOLEAN'
    elif pd.api.types.is_datetime64_dtype(dtype):
        # timezone-naive datetimes only; tz-aware columns fall through to the else branch
        typ = 'DATETIME'
    elif pd.api.types.is_integer_dtype(dtype):
        # covers numpy ints of any width as well as pandas nullable ints (e.g. Int64)
        typ = 'INTEGER'
    elif pd.api.types.is_float_dtype(dtype):
        typ = 'FLOAT'
    else:
        # No mapping for this dtype: report it and leave the column out of the
        # schema so pandas-gbq infers its type. (Before, `typ` kept the value of
        # the previous column here, or was undefined on the first column.)
        print(col, dtype)
        continue

    schema.append({'name': col, 'type': typ})

In [None]:
# Optional upload (disabled): uncomment to write df_current_agg to its own table,
# using the `schema` list built in the previous cell.
# NOTE(review): DataFrame.to_gbq is deprecated in pandas >= 2.2 - if this cell is
# re-enabled, consider calling pandas_gbq.to_gbq(df_current_agg, ...) instead.
# df_current_agg.to_gbq(f'{project_name}.return_prediction_ga4.step_4_customer_agg_transaction',
#                         project_id=project_name,
#                         if_exists='replace',
#                         location=region,
#                         chunksize=100_000,
#                         table_schema=schema)

## Join customer aggregated dfs

In [None]:
# check shape of each first to make sure nothing is unexpected
print(df_pre_agg.shape)
print(df_current_agg.shape)

# the two aggregates are joined row-for-row below, so their row counts must match;
# fail loudly here instead of relying on someone reading the printed shapes
assert len(df_pre_agg) == len(df_current_agg), (
    f"row counts differ: df_pre_agg={len(df_pre_agg)}, df_current_agg={len(df_current_agg)}"
)

(4466, 102)
(4466, 127)


In [None]:
# Left join keeps every row of df_current_agg. validate='many_to_one' makes pandas
# raise a MergeError if a key combination is repeated in df_pre_agg, which would
# otherwise silently duplicate rows in the merged frame.
df_merged = pd.merge(
    df_current_agg,
    df_pre_agg,
    how='left',
    on=['user_pseudo_id', 'ecommerce_transaction_id', 'transaction_date'],
    validate='many_to_one',
)

In [None]:
# sanity check: the left join should leave the row count equal to df_current_agg's
df_merged.shape

(4466, 226)

In [None]:
# preview the merged frame; the display.max_rows option set at the top of the
# notebook truncates the rendering to the first and last rows
df_merged

Unnamed: 0,user_pseudo_id,transaction_date,ecommerce_transaction_id,nunique_ecommerce_transactions,min_transaction_date,max_transaction_date,avg_event_params_engagement_time_msec,avg_item_price_in_usd,avg_item_promotions,avg_item_quantity,avg_item_refund_in_usd,avg_item_refund_quantity,avg_item_revenue_in_usd,sum_ecommerce_total_item_quantity,sum_total_return_item_quantity,sum_ecommerce_purchase_revenue_in_usd,sum_ecommerce_refund_value_in_usd,sum_ecommerce_tax_value_in_usd,sum_ecommerce_unique_items,max_days_first_session_to_transaction,max_user_ltv_revenue,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,max_item_quantity,max_item_refund_in_usd,max_item_refund_quantity,max_item_revenue_in_usd,sum_event_params_engagement_time_msec,sum_event_params_session_engaged,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_item_refund_in_usd,sum_item_refund_quantity,sum_item_revenue_in_usd,sum_event_name_add_to_cart,sum_event_name_begin_checkout,sum_event_name_page_view,sum_event_name_scroll,sum_event_name_select_item,sum_event_name_user_engagement,sum_event_name_view_item,sum_device_category_desktop,sum_device_category_mobile,sum_device_mobile_brand_name_Apple,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Mozilla,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_Chrome,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Edge,sum_device_mobile_model_name_Firefox,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Chrome,sum_device_web_info_browser_Edge,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_U
nitedKingdom,sum_geo_country_UnitedStates,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_Apparel,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_CheckoutYourInformation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShopbyBrand,sum_event_params_parent_page_ShoppingCart,sum_event_params_parent_page_Stationery,sum_event_params_child_page_Bags,sum_event_params_child_page_Drinkware,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_MensUnisex,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_SmallGoods,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Apparel,sum_item_parent_category_Collections,sum_item_parent_category_Lifestyle,sum_item_parent_category_New,sum_item_parent_category_Other,sum_item_parent_category_Sale,sum_item_parent_category_ShopbyBrand,sum_item_parent_category_Stationery,sum_item_child_category_Bags,sum_item_child_category_CampusCollection,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_Other,sum_item_child_category_SmallGoods,sum_item_child_category_Womens,sum_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumbl
ers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,pre_nunique_event_params_ga_sessions,pre_sum_event_params_engagement_time_msec,pre_sum_event_params_session_engaged,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_page_view,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_event_name_user_engagement,pre_sum_event_name_view_item,pre_sum_device_category_desktop,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Apple,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Mozilla,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_Chrome,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Edge,pre_sum_device_mobile_model_name_Firefox,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Chrome,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_geo_country_UnitedStates,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_Apparel,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_CheckoutYourInformation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Lifestyle,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_even
t_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShopbyBrand,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_parent_page_Stationery,pre_sum_event_params_child_page_Bags,pre_sum_event_params_child_page_Drinkware,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_MensUnisex,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_SmallGoods,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Apparel,pre_sum_item_parent_category_Collections,pre_sum_item_parent_category_Lifestyle,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_Sale,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_CampusCollection,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Google,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_Other,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_category_Womens,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,pre_max_event_params_ga_session_number,pre_max_event_params_engagement_time_msec,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_avg_event_params_engagement_time_msec
0,10092926.3786306416,2021-01-22,719410,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.000,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.000,0.000,0.000
1,10111055.8768683862,2020-12-10,741471,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000
2,1016446.8237887674,2020-12-21,983645,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.000,2813178,339,72,0,29,14,24,26,174,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,81,3,0,0,0,5,209,2,0,0,2,0,13,6,91,89,0,0,0,81,3,0,15,0,0,0,3,0,72,0,192,0,0,0,0,0,84,0,84,0,0,72,0,12,0,0,0,0,0,0,0,0,1,62804,0.000,0.000,8225.667
3,10172849.5375293351,2020-12-09,406646,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.000,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.000,0.000,0.000
4,1019527.5799124267,2020-12-05,2105,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4461,9963542.3978980993,2020-11-12,949022,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.000,1878906,127,0,0,3,2,0,1,120,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,122,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,120,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,47278,0.000,0.000,14565.163
4462,9963542.3978980993,2020-12-10,221127,1.000,2020-11-12,2020-11-12,10473.966,3.077,0.000,2.615,6.538,2.615,6.308,34,34,82.000,82.000,7.000,13,0.000,82.000,54241,6.000,0,8,12.000,8,11.000,2178585,207,40.000,0,34,85.000,34,82.000,0,4,19,14,0,20,133,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,16,12,0,0,38,0,3,7,101,0,25,0,0,0,0,0,0,0,0,0,38,0,0,0,0,0,3,0,39,2,2,96,0,3,0,0,1,0,0,0,8,36,0,0,0,0,0,0,0,0,1.000,1878906,127,0,0,3,2,0,1,120,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,122,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,120,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,47278,0.000,0.000,7282.581
4463,99826321.7020196151,2021-01-25,533525,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.000,3594931,491,92,0,40,23,12,35,284,4,0,3,1,0,0,0,0,0,1,0,0,0,3,0,0,0,4,0,0,0,0,0,0,4,0,0,0,0,0,2,0,107,0,0,0,3,345,5,0,0,0,3,3,10,0,0,5,3,3,0,36,0,30,0,0,0,0,0,0,368,0,20,0,0,0,0,0,0,0,20,0,0,332,0,0,0,0,0,0,24,0,0,0,5,71736,28808.841,3160.067,5159.230
4464,9992779.7021507317,2020-11-30,466106,,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000


Since we won't use datetime columns in our final features, derive features from `min_transaction_date` and `max_transaction_date` instead.

In [None]:
# recency: whole days between min_transaction_date and max_transaction_date
# (the result is NaN wherever either date is NaT)
transaction_span = df_merged['max_transaction_date'] - df_merged['min_transaction_date']
df_merged['recency'] = transaction_span.dt.days

In [None]:
# describe() only counts non-null values, i.e. rows where both dates were present
df_merged['recency'].describe()

count   324.000
mean      0.796
std       3.110
min       0.000
25%       0.000
50%       0.000
75%       0.000
max      22.000
Name: recency, dtype: float64

In [None]:
# customer age: days from the customer's first transaction date to the end of the
# training period. When min_transaction_date is missing (NaT), the row's own
# transaction_date is used as the first transaction date instead.
# Rows dated after the cutoff therefore get a negative age.
training_period_end = pd.to_datetime('2020-12-31')

# vectorized equivalent of the former row-wise apply: take min_transaction_date
# where present, otherwise fall back to transaction_date
first_transaction_date = pd.to_datetime(df_merged['min_transaction_date']).fillna(
    pd.to_datetime(df_merged['transaction_date'])
)
df_merged['age'] = (training_period_end - first_transaction_date).dt.days

In [None]:
# negative ages come from rows whose first transaction date is after the 2020-12-31 cutoff
df_merged['age'].describe()

count   4466.000
mean      18.325
std       20.173
min      -30.000
25%       10.000
50%       21.000
75%       33.000
max       50.000
Name: age, dtype: float64

In [None]:
# maintain split column for future use
train_cutoff = pd.to_datetime('2021-01-01')  # dates before this are TRAIN
valid_cutoff = pd.to_datetime('2021-01-16')  # last date that still counts as VALID


def _label_split(txn_date):
    """Return 'TRAIN', 'VALID' or 'TEST' for a single transaction date."""
    if txn_date < train_cutoff:
        return 'TRAIN'
    if txn_date >= train_cutoff and txn_date <= valid_cutoff:
        return 'VALID'
    return 'TEST'


df_merged['split'] = df_merged['transaction_date'].apply(_label_split)

In [None]:
# the date columns have been turned into recency / age / split above, so remove them
date_columns = ['transaction_date', 'min_transaction_date', 'max_transaction_date']
df_merged = df_merged.drop(columns=date_columns)

In [None]:
# three date columns dropped and three features (recency, age, split) added,
# so the column count should be unchanged from the merge
df_merged.shape

(4466, 226)

**Fill missing values** <br>
In cases where the current transaction is the first transaction, fill all aggregated previous transaction values with zero.

In [None]:
# count missing values per column before filling them
df_merged.isna().sum()

user_pseudo_id                                        0
ecommerce_transaction_id                              0
nunique_ecommerce_transactions                     4142
avg_event_params_engagement_time_msec              4142
avg_item_price_in_usd                              4142
                                                   ... 
pre_stdev_avg_event_params_engagement_time_msec       0
pre_avg_event_params_engagement_time_msec             0
recency                                            4142
age                                                   0
split                                                 0
Length: 226, dtype: int64

In [None]:
# replace every remaining NaN with 0 (fillna is applied to all columns of df_merged)
df_merged = df_merged.fillna(0)
df_merged.head()

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,nunique_ecommerce_transactions,avg_event_params_engagement_time_msec,avg_item_price_in_usd,avg_item_promotions,avg_item_quantity,avg_item_refund_in_usd,avg_item_refund_quantity,avg_item_revenue_in_usd,sum_ecommerce_total_item_quantity,sum_total_return_item_quantity,sum_ecommerce_purchase_revenue_in_usd,sum_ecommerce_refund_value_in_usd,sum_ecommerce_tax_value_in_usd,sum_ecommerce_unique_items,max_days_first_session_to_transaction,max_user_ltv_revenue,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,max_item_quantity,max_item_refund_in_usd,max_item_refund_quantity,max_item_revenue_in_usd,sum_event_params_engagement_time_msec,sum_event_params_session_engaged,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_item_refund_in_usd,sum_item_refund_quantity,sum_item_revenue_in_usd,sum_event_name_add_to_cart,sum_event_name_begin_checkout,sum_event_name_page_view,sum_event_name_scroll,sum_event_name_select_item,sum_event_name_user_engagement,sum_event_name_view_item,sum_device_category_desktop,sum_device_category_mobile,sum_device_mobile_brand_name_Apple,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Mozilla,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_Chrome,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Edge,sum_device_mobile_model_name_Firefox,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Chrome,sum_device_web_info_browser_Edge,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_geo_country_UnitedStates,sum_traffic_sourc
e_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_Apparel,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_CheckoutYourInformation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShopbyBrand,sum_event_params_parent_page_ShoppingCart,sum_event_params_parent_page_Stationery,sum_event_params_child_page_Bags,sum_event_params_child_page_Drinkware,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_MensUnisex,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_SmallGoods,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Apparel,sum_item_parent_category_Collections,sum_item_parent_category_Lifestyle,sum_item_parent_category_New,sum_item_parent_category_Other,sum_item_parent_category_Sale,sum_item_parent_category_ShopbyBrand,sum_item_parent_category_Stationery,sum_item_child_category_Bags,sum_item_child_category_CampusCollection,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_Other,sum_item_child_category_SmallGoods,sum_item_child_category_Womens,sum_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcate
gory_WaterBottles,pre_nunique_event_params_ga_sessions,pre_sum_event_params_engagement_time_msec,pre_sum_event_params_session_engaged,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_page_view,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_event_name_user_engagement,pre_sum_event_name_view_item,pre_sum_device_category_desktop,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Apple,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Mozilla,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_Chrome,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Edge,pre_sum_device_mobile_model_name_Firefox,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Chrome,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_geo_country_UnitedStates,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_Apparel,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_CheckoutYourInformation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Lifestyle,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_par
ent_page_Sale,pre_sum_event_params_parent_page_ShopbyBrand,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_parent_page_Stationery,pre_sum_event_params_child_page_Bags,pre_sum_event_params_child_page_Drinkware,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_MensUnisex,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_SmallGoods,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Apparel,pre_sum_item_parent_category_Collections,pre_sum_item_parent_category_Lifestyle,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_Sale,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_CampusCollection,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Google,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_Other,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_category_Womens,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,pre_max_event_params_ga_session_number,pre_max_event_params_engagement_time_msec,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_avg_event_params_engagement_time_msec,recency,age,split
0,10092926.37863064,719410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0,0.0,0.0,-22,TEST
1,10111055.876868386,741471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,21,TRAIN
2,1016446.8237887674,983645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,2813178,339,72,0,29,14,24,26,174,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,81,3,0,0,0,5,209,2,0,0,2,0,13,6,91,89,0,0,0,81,3,0,15,0,0,0,3,0,72,0,192,0,0,0,0,0,84,0,84,0,0,72,0,12,0,0,0,0,0,0,0,0,1,62804,0.0,0.0,8225.667,0.0,10,TRAIN
3,10172849.537529336,406646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0,0.0,0.0,22,TRAIN
4,1019527.5799124268,2105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,26,TRAIN


In [None]:
# Build an explicit BigQuery schema from the DataFrame dtypes so the upload does
# not depend on type inference for the columns we can classify ourselves.
# Each entry has the form {'name': <column>, 'type': <BigQuery type>}.
schema = []

for col, dtype in df_merged.dtypes.items():
    if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
        typ = 'STRING'
    elif pd.api.types.is_bool_dtype(dtype):
        typ = 'BOOLEAN'
    elif pd.api.types.is_datetime64_dtype(dtype):
        # timezone-naive datetimes only; tz-aware columns fall through to the else branch
        typ = 'DATETIME'
    elif pd.api.types.is_integer_dtype(dtype):
        # covers numpy ints of any width as well as pandas nullable ints (e.g. Int64)
        typ = 'INTEGER'
    elif pd.api.types.is_float_dtype(dtype):
        typ = 'FLOAT'
    else:
        # No mapping for this dtype: report it and leave the column out of the
        # schema so pandas-gbq infers its type. (Before, `typ` kept the value of
        # the previous column here, or was undefined on the first column.)
        print(col, dtype)
        continue

    schema.append({'name': col, 'type': typ})

In [None]:
# Upload the customer-level table, replacing any existing copy.
# DataFrame.to_gbq is deprecated in pandas >= 2.2, so call pandas_gbq.to_gbq
# directly - the same library used to read the data at the top of the notebook.
pandas_gbq.to_gbq(df_merged,
                  f'{project_name}.return_prediction_ga4.step_4_customer_agg',
                  project_id=project_name,
                  if_exists='replace',
                  location=region,
                  chunksize=100_000,
                  table_schema=schema)

100%|██████████| 1/1 [00:00<00:00, 11949.58it/s]
