<a href="https://colab.research.google.com/github/Adlucent/ga4-return-prediction/blob/main/6_Scale_Data_and_Customer_Segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 6. Scale Data and Customer Segmentation

In [None]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

# Passed as `project_id` to pandas_gbq and as the project of the BigQuery client.
project_name = 'adl-analytics' #add proj name
# Passed as the BigQuery `location` argument when the tables are read.
region = "US"  # GCP project region
# Derived from `project_name` so that changing the project above is enough to
# repoint this table as well (the default value is unchanged).
table_name = f'{project_name}.return_prediction_ga4.step_5_merged_final'
# Table queried for (user_pseudo_id, customer_segment) pairs.
# NOTE(review): this points at the `public_sample_data` dataset, presumably a shared
# sample table, so it is intentionally NOT derived from `project_name`; update it if
# your segments live elsewhere.
customer_segment_table = 'adl-analytics.public_sample_data.synthetic_returns_ga4'

In [None]:
# If pandas_gbq is not installed in your notebook environment, uncomment and run the line below:
# %pip install pandas_gbq

In [None]:
# Setup cell: authenticates the user, creates a BigQuery client, imports the
# libraries used in this notebook and sets display options.

# Google credentials
# NOTE: `google.colab` is only importable inside a Google Colab runtime.
from google.colab import auth
auth.authenticate_user()

# BigQuery Magics
# (The bare string below is used as a comment; it is evaluated and discarded.)
'''BigQuery magics are used to run BigQuery SQL queries in a python environment.
These queries can also be run in the BigQuery UI '''

from google.cloud import bigquery
from google.cloud.bigquery import magics, Client, QueryJobConfig

# `project_name` comes from the configuration cell at the top of the notebook.
magics.context.project = project_name #update project name
# NOTE(review): `client` is not referenced by the cells visible in this notebook
# section; the tables are read through pandas_gbq instead.
client = bigquery.Client(project=magics.context.project)

# Interface between Jupyter and BigQuery
import pandas_gbq

# data processing libraries + ML tools
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# apply seaborn's default plot theme
sns.set()

# suppress notebook warnings
# NOTE: this silences ALL Python warnings for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# dataframe formatting
pd.set_option("display.max_columns", None)  # never truncate columns when displaying
pd.set_option("display.max_rows", 50)
pd.set_option("display.float_format", lambda x: "%.3f" % x)  # show floats with 3 decimals

# Load data

In [None]:
# Query every column of the table configured in `table_name` ...
sql = f"""
SELECT *
FROM `{table_name}`;
"""
# ... and download the result into a pandas DataFrame via pandas_gbq.
data = pandas_gbq.read_gbq(
    sql,
    project_id=project_name,
    location=region,
    use_bqstorage_api=True,
)

Downloading: 100%|[32m██████████[0m|


In [None]:
# Work on a separate (deep) copy so that `data` stays as imported and we can go
# back to it without querying BigQuery again.
df = data.copy(deep=True)

In [None]:
# preview the first five rows of the working copy
df.head(n=5)

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,transaction_ga_session_number,user_ltv_revenue,ecommerce_tax_value_in_usd,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_event_name_begin_checkout,sum_event_name_select_item,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShoppingCart,sum_event_params_child_page_Bags,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Other,sum_item_parent_category_Stationery,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_MensUnisex,sum_item_child_category_SmallGoods,sum_item_child_subcategory_Back
packs,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_promotions,avg_event_params_engagement_time_msec,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_max_event_params_ga_session_number,pre_avg_event_params_engagement_time_msec,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_geo_country_Canada,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_New,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_trans
action,historical_avg_event_params_engagement_time_msec,historical_avg_item_price_in_usd,historical_avg_item_promotions,historical_avg_item_refund_in_usd,historical_sum_total_return_item_quantity,historical_sum_ecommerce_refund_value_in_usd,historical_max_days_first_session_to_transaction,historical_max_event_params_engagement_time_msec,historical_max_item_promotions,historical_max_item_quantity,historical_sum_item_promotions,historical_sum_item_quantity,historical_sum_device_category_mobile,historical_sum_device_mobile_brand_name_Google,historical_sum_device_mobile_brand_name_Samsung,historical_sum_device_mobile_brand_name_Xiaomi,historical_sum_device_mobile_model_name_ChromeBook,historical_sum_device_web_info_browser_AndroidWebview,historical_sum_device_web_info_browser_Edge,historical_sum_device_web_info_browser_Safari,historical_sum_geo_country_Canada,historical_sum_geo_country_France,historical_sum_geo_country_India,historical_sum_geo_country_Spain,historical_sum_geo_country_UnitedKingdom,historical_sum_geo_country_UnitedStates,historical_sum_traffic_source_medium_Other,historical_sum_traffic_source_medium_cpc,historical_sum_traffic_source_medium_organic,historical_sum_traffic_source_medium_referral,historical_sum_event_params_parent_page_CampusCollection,historical_sum_event_params_parent_page_EcoFriendly,historical_sum_event_params_parent_page_Home,historical_sum_event_params_parent_page_New,historical_sum_event_params_parent_page_Other,historical_sum_event_params_parent_page_Sale,historical_sum_event_params_parent_page_Stationery,historical_sum_event_params_child_page_Google,historical_sum_event_params_child_page_Hats,historical_sum_event_params_child_page_Notebooks,historical_sum_event_params_child_page_Other,historical_sum_event_params_child_page_Socks,historical_sum_event_params_child_page_Stickers,historical_sum_event_params_child_page_Writing,historical_sum_event_params_child_page_YouTube,historical_sum_item_parent_category_New,historical_sum_item_paren
t_category_Other,historical_sum_item_child_category_Google,historical_sum_item_child_category_Kids,historical_sum_item_child_category_MensUnisex,historical_sum_item_child_category_Womens,historical_sum_item_child_subcategory_Backpacks,historical_sum_item_child_subcategory_ElectronicsAccessories,historical_sum_item_child_subcategory_Infant,historical_sum_item_child_subcategory_MensTShirts,historical_sum_item_child_subcategory_MugsTumblers,historical_sum_item_child_subcategory_Other,historical_sum_item_child_subcategory_WaterBottles,historical_pre_sum_event_name_add_to_cart,historical_pre_sum_device_mobile_brand_name_Microsoft,historical_pre_sum_device_mobile_model_name_iPhone,historical_pre_sum_event_params_parent_page_CheckoutConfirmation,historical_pre_sum_event_params_parent_page_New,historical_pre_sum_item_child_subcategory_Backpacks,historical_pre_sum_item_child_subcategory_ElectronicsAccessories,historical_pre_sum_item_child_subcategory_WaterBottles,historical_recency,historical_age,split,ecommerce_refund_value_in_usd
0,1051154.074017205,283445,1.0,55.0,5.0,55.0,0.0,1.0,3.0,36.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0,18.0,3.0,5.0,56.0,0.0,4.0,43.0,15.0,0.0,83.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,162395.0,0.0,12827.119,0.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,TRAIN,0.0
1,1111007.3250339243,684695,1.0,55.0,5.0,55.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55082.0,0.0,10435.233,0.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,TRAIN,0.0
2,1158637.0383647084,72777,5.0,136.0,4.0,44.0,0.0,1.0,3.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,4.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,213638.0,0.0,4449.619,0.0,1.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12176.876,30.667,0.333,0.0,0.0,0.0,23.0,67227.0,1.0,1.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,192.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0,VALID,0.0
3,1207697.7765216064,251826,1.0,17.0,2.0,17.0,0.0,1.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0,5.0,3.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,5.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67503.0,0.0,7103.134,0.0,1.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,TRAIN,0.0
4,14107430.813614637,422916,1.0,3.0,1.0,3.0,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,6.0,10.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41749.0,0.0,6682.712,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,TRAIN,3.0


In [None]:
# (number of rows, number of columns) of the imported table
df.shape

(4466, 185)

In [None]:
# Count missing values per column, then add them up: 0 means the table has no NaNs.
missing_per_column = df.isna().sum()
missing_per_column.sum()

0

# Customer segmentation
As an additional feature for the model, it may be useful to cluster customers based on their history. <br>
This can be achieved by running a clustering algorithm, such as KMeans, on your customer history data. <br>
In this case, customer segmentation has already been performed outside of this notebook, and we pull in the results here. <br>
**TODO: add more detail here on how the segmentation was produced.**

In [None]:
# Fetch the distinct (user, segment) pairs from the customer segment table ...
sql = f"""
SELECT DISTINCT user_pseudo_id, customer_segment
FROM `{customer_segment_table}`;
"""
# ... and download them into a pandas DataFrame via pandas_gbq.
segments = pandas_gbq.read_gbq(
    sql,
    project_id=project_name,
    location=region,
    use_bqstorage_api=True,
)

Downloading: 100%|[32m██████████[0m|


In [None]:
# keep the downloaded `segments` untouched and work on a separate (deep) copy
customer_segments = segments.copy(deep=True)

In [None]:
# preview the first five (user_pseudo_id, customer_segment) rows
customer_segments.head(n=5)

Unnamed: 0,user_pseudo_id,customer_segment
0,4268698.489339352,9
1,22300841.99098763,0
2,4853857.318151629,0
3,50121199.432344526,9
4,14107430.813614637,9


In [None]:
# Add the customer segment to our existing dataset.
# - Any existing `customer_segment` column is dropped first so this cell is idempotent:
#   re-running it would otherwise produce `customer_segment_x` / `customer_segment_y`
#   and break the column selection in the scaling section.
# - validate='many_to_one' makes pandas raise a MergeError if a user appears with more
#   than one segment, instead of silently duplicating transaction rows.
df = pd.merge(
    df.drop(columns=['customer_segment'], errors='ignore'),
    customer_segments,
    how='left',
    on='user_pseudo_id',
    validate='many_to_one',
)

# A left join leaves NaN for users that have no segment. Report it with print()
# because Python warnings are suppressed in this notebook.
n_missing_segments = int(df['customer_segment'].isna().sum())
if n_missing_segments:
    print(f"{n_missing_segments} rows have no matching customer_segment")

df.head()

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,transaction_ga_session_number,user_ltv_revenue,ecommerce_tax_value_in_usd,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_event_name_begin_checkout,sum_event_name_select_item,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShoppingCart,sum_event_params_child_page_Bags,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Other,sum_item_parent_category_Stationery,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_MensUnisex,sum_item_child_category_SmallGoods,sum_item_child_subcategory_Back
packs,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_promotions,avg_event_params_engagement_time_msec,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_max_event_params_ga_session_number,pre_avg_event_params_engagement_time_msec,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_geo_country_Canada,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_New,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_trans
action,historical_avg_event_params_engagement_time_msec,historical_avg_item_price_in_usd,historical_avg_item_promotions,historical_avg_item_refund_in_usd,historical_sum_total_return_item_quantity,historical_sum_ecommerce_refund_value_in_usd,historical_max_days_first_session_to_transaction,historical_max_event_params_engagement_time_msec,historical_max_item_promotions,historical_max_item_quantity,historical_sum_item_promotions,historical_sum_item_quantity,historical_sum_device_category_mobile,historical_sum_device_mobile_brand_name_Google,historical_sum_device_mobile_brand_name_Samsung,historical_sum_device_mobile_brand_name_Xiaomi,historical_sum_device_mobile_model_name_ChromeBook,historical_sum_device_web_info_browser_AndroidWebview,historical_sum_device_web_info_browser_Edge,historical_sum_device_web_info_browser_Safari,historical_sum_geo_country_Canada,historical_sum_geo_country_France,historical_sum_geo_country_India,historical_sum_geo_country_Spain,historical_sum_geo_country_UnitedKingdom,historical_sum_geo_country_UnitedStates,historical_sum_traffic_source_medium_Other,historical_sum_traffic_source_medium_cpc,historical_sum_traffic_source_medium_organic,historical_sum_traffic_source_medium_referral,historical_sum_event_params_parent_page_CampusCollection,historical_sum_event_params_parent_page_EcoFriendly,historical_sum_event_params_parent_page_Home,historical_sum_event_params_parent_page_New,historical_sum_event_params_parent_page_Other,historical_sum_event_params_parent_page_Sale,historical_sum_event_params_parent_page_Stationery,historical_sum_event_params_child_page_Google,historical_sum_event_params_child_page_Hats,historical_sum_event_params_child_page_Notebooks,historical_sum_event_params_child_page_Other,historical_sum_event_params_child_page_Socks,historical_sum_event_params_child_page_Stickers,historical_sum_event_params_child_page_Writing,historical_sum_event_params_child_page_YouTube,historical_sum_item_parent_category_New,historical_sum_item_paren
t_category_Other,historical_sum_item_child_category_Google,historical_sum_item_child_category_Kids,historical_sum_item_child_category_MensUnisex,historical_sum_item_child_category_Womens,historical_sum_item_child_subcategory_Backpacks,historical_sum_item_child_subcategory_ElectronicsAccessories,historical_sum_item_child_subcategory_Infant,historical_sum_item_child_subcategory_MensTShirts,historical_sum_item_child_subcategory_MugsTumblers,historical_sum_item_child_subcategory_Other,historical_sum_item_child_subcategory_WaterBottles,historical_pre_sum_event_name_add_to_cart,historical_pre_sum_device_mobile_brand_name_Microsoft,historical_pre_sum_device_mobile_model_name_iPhone,historical_pre_sum_event_params_parent_page_CheckoutConfirmation,historical_pre_sum_event_params_parent_page_New,historical_pre_sum_item_child_subcategory_Backpacks,historical_pre_sum_item_child_subcategory_ElectronicsAccessories,historical_pre_sum_item_child_subcategory_WaterBottles,historical_recency,historical_age,split,ecommerce_refund_value_in_usd,customer_segment
0,1051154.074017205,283445,1.0,55.0,5.0,55.0,0.0,1.0,3.0,36.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0,18.0,3.0,5.0,56.0,0.0,4.0,43.0,15.0,0.0,83.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,162395.0,0.0,12827.119,0.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,TRAIN,0.0,0
1,1111007.3250339243,684695,1.0,55.0,5.0,55.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55082.0,0.0,10435.233,0.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,TRAIN,0.0,0
2,1158637.0383647084,72777,5.0,136.0,4.0,44.0,0.0,1.0,3.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,4.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,213638.0,0.0,4449.619,0.0,1.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12176.876,30.667,0.333,0.0,0.0,0.0,23.0,67227.0,1.0,1.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,192.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0,VALID,0.0,6
3,1207697.7765216064,251826,1.0,17.0,2.0,17.0,0.0,1.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0,5.0,3.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,5.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67503.0,0.0,7103.134,0.0,1.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,TRAIN,0.0,9
4,14107430.813614637,422916,1.0,3.0,1.0,3.0,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,6.0,10.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41749.0,0.0,6682.712,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,TRAIN,3.0,9


# Scale data
To help our algorithm converge faster and to put all features on the same relative scale, we'll apply a standard scaler transformation to the data. The scaler is fitted on the `TRAIN` rows only and then applied to every row.

In [None]:
# Select only the features: everything except the ID columns, the split column and
# the target. Columns are excluded by name rather than by position, so the feature
# set stays correct even if the column layout of `df` changes (the original column
# order is preserved).
non_feature_cols = [
    'user_pseudo_id',                 # ID column
    'ecommerce_transaction_id',       # ID column
    'split',                          # split column
    'ecommerce_refund_value_in_usd',  # target
]
feature_cols = [col for col in df.columns if col not in non_feature_cols]
X = df[feature_cols]
X.shape

(4466, 182)

In [None]:
# Learn the scaling statistics from the rows whose split is 'TRAIN' only,
# then apply that fitted scaler to every row of X.
train_mask = df['split'] == 'TRAIN'
scaler = StandardScaler().fit(X.loc[train_mask])
X_scaled = scaler.transform(X)

In [None]:
# Wrap the scaled array back into a DataFrame. Column names and the row index are
# taken from `X` itself, so they always match the data that was scaled and the
# concat below lines up row-for-row with `df`.
df_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Re-attach the unscaled ID columns in front and the split / target columns at the end.
df_scaled = pd.concat(
    [
        df[['user_pseudo_id', 'ecommerce_transaction_id']],
        df_scaled,
        df[['split', 'ecommerce_refund_value_in_usd']],
    ],
    axis=1,
)
df_scaled.head()

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,transaction_ga_session_number,user_ltv_revenue,ecommerce_tax_value_in_usd,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_event_name_begin_checkout,sum_event_name_select_item,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShoppingCart,sum_event_params_child_page_Bags,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Other,sum_item_parent_category_Stationery,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_MensUnisex,sum_item_child_category_SmallGoods,sum_item_child_subcategory_Back
packs,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_promotions,avg_event_params_engagement_time_msec,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_max_event_params_ga_session_number,pre_avg_event_params_engagement_time_msec,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_geo_country_Canada,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_New,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_trans
action,historical_avg_event_params_engagement_time_msec,historical_avg_item_price_in_usd,historical_avg_item_promotions,historical_avg_item_refund_in_usd,historical_sum_total_return_item_quantity,historical_sum_ecommerce_refund_value_in_usd,historical_max_days_first_session_to_transaction,historical_max_event_params_engagement_time_msec,historical_max_item_promotions,historical_max_item_quantity,historical_sum_item_promotions,historical_sum_item_quantity,historical_sum_device_category_mobile,historical_sum_device_mobile_brand_name_Google,historical_sum_device_mobile_brand_name_Samsung,historical_sum_device_mobile_brand_name_Xiaomi,historical_sum_device_mobile_model_name_ChromeBook,historical_sum_device_web_info_browser_AndroidWebview,historical_sum_device_web_info_browser_Edge,historical_sum_device_web_info_browser_Safari,historical_sum_geo_country_Canada,historical_sum_geo_country_France,historical_sum_geo_country_India,historical_sum_geo_country_Spain,historical_sum_geo_country_UnitedKingdom,historical_sum_geo_country_UnitedStates,historical_sum_traffic_source_medium_Other,historical_sum_traffic_source_medium_cpc,historical_sum_traffic_source_medium_organic,historical_sum_traffic_source_medium_referral,historical_sum_event_params_parent_page_CampusCollection,historical_sum_event_params_parent_page_EcoFriendly,historical_sum_event_params_parent_page_Home,historical_sum_event_params_parent_page_New,historical_sum_event_params_parent_page_Other,historical_sum_event_params_parent_page_Sale,historical_sum_event_params_parent_page_Stationery,historical_sum_event_params_child_page_Google,historical_sum_event_params_child_page_Hats,historical_sum_event_params_child_page_Notebooks,historical_sum_event_params_child_page_Other,historical_sum_event_params_child_page_Socks,historical_sum_event_params_child_page_Stickers,historical_sum_event_params_child_page_Writing,historical_sum_event_params_child_page_YouTube,historical_sum_item_parent_category_New,historical_sum_item_paren
t_category_Other,historical_sum_item_child_category_Google,historical_sum_item_child_category_Kids,historical_sum_item_child_category_MensUnisex,historical_sum_item_child_category_Womens,historical_sum_item_child_subcategory_Backpacks,historical_sum_item_child_subcategory_ElectronicsAccessories,historical_sum_item_child_subcategory_Infant,historical_sum_item_child_subcategory_MensTShirts,historical_sum_item_child_subcategory_MugsTumblers,historical_sum_item_child_subcategory_Other,historical_sum_item_child_subcategory_WaterBottles,historical_pre_sum_event_name_add_to_cart,historical_pre_sum_device_mobile_brand_name_Microsoft,historical_pre_sum_device_mobile_model_name_iPhone,historical_pre_sum_event_params_parent_page_CheckoutConfirmation,historical_pre_sum_event_params_parent_page_New,historical_pre_sum_item_child_subcategory_Backpacks,historical_pre_sum_item_child_subcategory_ElectronicsAccessories,historical_pre_sum_item_child_subcategory_WaterBottles,historical_recency,historical_age,customer_segment,split,ecommerce_refund_value_in_usd
0,1051154.074017205,283445,-0.676,-0.383,-0.132,-0.141,-0.516,-0.35,-0.508,0.54,1.209,-0.74,7.232,-0.124,-0.286,-0.131,-0.24,-0.51,-0.102,-0.524,-0.125,-0.118,-0.569,-0.292,-0.146,-0.33,-0.666,-0.145,-0.174,-0.342,-0.17,1.619,-0.533,-0.149,-0.794,0.748,-0.389,-0.47,2.718,-0.653,-0.632,0.232,0.016,-0.3,2.184,-0.196,-0.235,-0.203,-0.097,0.187,-0.209,-0.322,-0.187,-0.137,-0.254,-0.321,-0.389,1.644,-0.207,-0.295,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,0.128,-0.976,0.187,-0.776,-0.127,0.917,-0.601,-0.48,-0.518,-0.246,-0.388,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,-0.361,-0.092,-0.112,-0.334,-0.185,-0.533,-0.472,-0.12,-0.234,-0.179,-0.287,-0.205,-0.241,-0.171,-0.159,-0.117,-0.217,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,-0.5,-0.24,-0.229,-0.179,-0.154,-0.125,-0.129,-0.157,-0.191,-0.212,-0.19,-0.128,-0.181,-0.153,-0.109,-0.057,-0.04,-0.039,-0.035,-0.024,-0.114,-0.054,-0.041,-0.077,-0.048,-0.034,-0.14,-0.079,-0.037,-0.113,-0.128,-0.054,-0.072,-0.116,-0.09,-0.089,-0.084,-0.099,-0.066,-0.071,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,-0.078,-0.069,-0.064,-0.059,-0.131,-0.105,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,-0.274,-0.102,-0.313,-0.238,-0.26,0.0,-0.024,0.0,-0.069,-1.037,-1.489,TRAIN,0.0
1,1111007.3250339243,684695,-0.676,-0.383,-0.132,-0.141,-0.516,-0.35,-0.46,-0.47,-0.827,-0.74,-0.138,-0.124,-0.286,-0.131,-0.24,1.959,-0.102,-0.524,-0.125,-0.118,-0.569,-0.292,-0.146,3.029,-0.666,-0.145,-0.174,-0.342,-0.17,-0.618,1.878,-0.207,-0.794,-0.239,-0.389,-0.539,-0.319,-0.653,-0.75,-0.358,-0.785,-0.3,-0.175,-0.315,-0.235,-0.203,-0.097,-0.26,-0.209,-0.322,-0.187,-0.137,-0.254,-0.321,-0.389,-0.188,-0.207,-0.295,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,-0.396,-0.976,0.02,-0.776,-0.127,0.917,-0.601,-0.48,-0.518,-0.246,-0.388,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,-0.361,-0.092,-0.112,-0.334,-0.185,-0.533,-0.472,-0.12,-0.234,-0.179,-0.287,-0.205,-0.241,-0.171,-0.159,-0.117,-0.217,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,-0.5,-0.24,-0.229,-0.179,-0.154,-0.125,-0.129,-0.157,-0.191,-0.212,-0.19,-0.128,-0.181,-0.153,-0.109,-0.057,-0.04,-0.039,-0.035,-0.024,-0.114,-0.054,-0.041,-0.077,-0.048,-0.034,-0.14,-0.079,-0.037,-0.113,-0.128,-0.054,-0.072,-0.116,-0.09,-0.089,-0.084,-0.099,-0.066,-0.071,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,-0.078,-0.069,-0.064,-0.059,-0.131,-0.105,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,-0.274,-0.102,-0.313,-0.238,-0.26,0.0,-0.024,0.0,-0.069,-0.538,-1.489,TRAIN,0.0
2,1158637.0383647084,72777,0.328,0.23,-0.28,-0.323,-0.516,-0.35,-0.508,-0.133,1.209,-0.74,-0.138,-0.124,-0.286,-0.131,-0.24,-0.51,-0.102,1.907,-0.125,-0.118,1.758,-0.292,-0.146,-0.33,-0.666,-0.145,-0.174,-0.342,-0.17,-0.618,1.878,-0.207,-0.794,-0.239,-0.365,-0.539,-0.319,-0.653,-0.632,-0.358,-0.421,-0.3,-0.175,-0.315,-0.235,-0.203,-0.097,-0.26,-0.209,1.782,-0.187,-0.137,-0.254,-0.321,-0.389,-0.188,-0.471,-0.295,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,0.378,-0.976,-0.397,-0.776,-0.127,0.539,-0.601,-0.48,-0.518,-0.246,-0.388,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,-0.361,-0.092,-0.112,-0.334,-0.185,-0.533,-0.472,-0.12,-0.234,-0.179,-0.287,-0.205,-0.241,-0.171,-0.159,-0.117,-0.217,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,-0.5,4.01,3.524,2.235,-0.154,-0.125,-0.129,7.428,1.193,4.727,1.428,0.931,0.996,3.895,-0.109,-0.057,-0.04,-0.039,-0.035,-0.024,5.121,-0.054,-0.041,-0.077,-0.048,-0.034,3.409,-0.079,-0.037,-0.113,-0.128,-0.054,-0.072,0.118,-0.09,-0.089,-0.084,-0.099,-0.066,1.529,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,-0.078,-0.069,-0.064,-0.059,-0.131,9.617,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,3.485,-0.102,-0.313,-0.238,-0.26,0.0,-0.024,0.0,-0.069,-2.533,0.056,VALID,0.0
3,1207697.7765216064,251826,-0.676,-0.671,-0.576,-0.769,-0.516,-0.35,-0.365,-0.47,-0.827,1.351,-0.138,-0.124,-0.286,-0.131,-0.24,-0.51,-0.102,-0.524,-0.125,-0.118,-0.569,-0.292,-0.146,-0.33,1.501,-0.145,-0.174,-0.342,-0.17,1.619,-0.533,-0.207,-0.794,-0.239,-0.377,-0.539,-0.319,-0.122,-0.75,-0.358,-0.348,-0.3,-0.175,-0.315,-0.235,-0.203,0.029,1.079,-0.209,-0.322,-0.187,-0.137,-0.254,-0.321,-0.389,-0.188,-0.471,-0.295,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,-0.335,-0.976,-0.212,-0.776,-0.127,-0.39,-0.601,-0.48,-0.518,-0.246,-0.388,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,-0.361,-0.092,-0.112,-0.334,-0.185,-0.533,-0.472,-0.12,-0.234,-0.179,-0.287,-0.205,-0.241,-0.171,-0.159,-0.117,-0.217,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,-0.5,-0.24,-0.229,-0.179,-0.154,-0.125,-0.129,-0.157,-0.191,-0.212,-0.19,-0.128,-0.181,-0.153,-0.109,-0.057,-0.04,-0.039,-0.035,-0.024,-0.114,-0.054,-0.041,-0.077,-0.048,-0.034,-0.14,-0.079,-0.037,-0.113,-0.128,-0.054,-0.072,-0.116,-0.09,-0.089,-0.084,-0.099,-0.066,-0.071,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,-0.078,-0.069,-0.064,-0.059,-0.131,-0.105,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,-0.274,-0.102,-0.313,-0.238,-0.26,0.0,-0.024,0.0,-0.069,1.458,0.828,TRAIN,0.0
4,14107430.813614637,422916,-0.676,-0.777,-0.724,-1.001,-0.516,-0.35,-0.365,-0.47,-0.827,-0.74,-0.138,-0.124,-0.286,-0.131,-0.24,1.959,-0.102,-0.524,-0.125,-0.118,-0.569,-0.292,-0.146,-0.33,-0.666,-0.145,5.745,2.928,-0.17,-0.618,-0.533,-0.207,-0.794,-0.239,-0.389,-0.539,-0.319,-0.016,0.074,-0.358,-0.275,-0.3,-0.175,-0.315,-0.235,-0.203,-0.097,-0.26,1.619,-0.322,-0.187,-0.137,-0.254,-0.064,-0.389,-0.188,-0.471,-0.295,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,-0.461,-0.976,-0.242,-0.776,-0.127,-0.872,-0.601,-0.48,-0.518,-0.246,-0.388,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,-0.361,-0.092,-0.112,-0.334,-0.185,-0.533,-0.472,-0.12,-0.234,-0.179,-0.287,-0.205,-0.241,-0.171,-0.159,-0.117,-0.217,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,-0.5,-0.24,-0.229,-0.179,-0.154,-0.125,-0.129,-0.157,-0.191,-0.212,-0.19,-0.128,-0.181,-0.153,-0.109,-0.057,-0.04,-0.039,-0.035,-0.024,-0.114,-0.054,-0.041,-0.077,-0.048,-0.034,-0.14,-0.079,-0.037,-0.113,-0.128,-0.054,-0.072,-0.116,-0.09,-0.089,-0.084,-0.099,-0.066,-0.071,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,-0.078,-0.069,-0.064,-0.059,-0.131,-0.105,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,-0.274,-0.102,-0.313,-0.238,-0.26,0.0,-0.024,0.0,-0.069,0.128,0.828,TRAIN,3.0


In [None]:
# Build an explicit BigQuery table schema (one {'name': ..., 'type': ...} dict per
# column of df_scaled) to prevent type-inference errors when uploading data.
def _bigquery_type(col, dtype):
    """Return the BigQuery column type name for a pandas/numpy dtype.

    Parameters
    ----------
    col : column label, used only to build the error message.
    dtype : the pandas or numpy dtype of that column.

    Returns
    -------
    str
        One of 'BOOLEAN', 'TIMESTAMP', 'DATETIME', 'INTEGER', 'FLOAT', 'STRING'.

    Raises
    ------
    TypeError
        If the dtype has no mapping. Previously this case only printed the dtype
        and then reused the previous column's type (or raised NameError on the
        first column), producing a silently wrong schema.
    """
    types = pd.api.types
    # bool must be tested before the numeric checks so it is never treated as a number
    if types.is_bool_dtype(dtype):
        return 'BOOLEAN'
    # timezone-aware datetimes are absolute instants -> TIMESTAMP; naive ones -> DATETIME
    if isinstance(dtype, pd.DatetimeTZDtype):
        return 'TIMESTAMP'
    if types.is_datetime64_dtype(dtype):
        return 'DATETIME'
    # covers every signed/unsigned width as well as the nullable 'Int64' family
    if types.is_integer_dtype(dtype):
        return 'INTEGER'
    if types.is_float_dtype(dtype):
        return 'FLOAT'
    # object columns and pandas string columns
    if types.is_string_dtype(dtype):
        return 'STRING'
    raise TypeError(f"No BigQuery type mapping for column {col!r} with dtype {dtype}")


schema = [
    {'name': col, 'type': _bigquery_type(col, dtype)}
    for col, dtype in df_scaled.dtypes.items()
]

In [None]:
# Optional upload step, intentionally left commented out.
# Uncomment to write df_scaled to the BigQuery table
# `<project_name>.return_prediction_ga4.step_6_merged_scaled`, passing the `schema`
# list as the table schema.
# NOTE: if_exists='replace' is passed, so an existing table with that name is overwritten.
# df_scaled.to_gbq(f'{project_name}.return_prediction_ga4.step_6_merged_scaled',
#                     project_id=project_name,
#                     if_exists='replace',
#                     location=region,
#                     chunksize=100_000,
#                     table_schema=schema)

# Class imbalance
Another important check to perform on your data is for class imbalance. <br>
In this case we have a continuous target, but we can artificially create classes of "return" or "no return" based on the target value.

In [None]:
# Binarize the continuous refund target: True when the refund value is non-zero
# ("return"), False when it is exactly 0 ("no return").
# Vectorised comparison instead of a row-wise .apply(lambda ...); same result.
df['return_class'] = df['ecommerce_refund_value_in_usd'].ne(0)
df['return_class'].value_counts()

False    3434
True     1032
Name: return_class, dtype: int64

In [None]:
# Same class counts as above, expressed as a fraction of all rows
df.return_class.value_counts(normalize=True)

False   0.769
True    0.231
Name: return_class, dtype: float64

In this dataset the class imbalance between return and no return is not very severe (roughly 23% vs. 77%). However, if your dataset is highly imbalanced, you may want to correct that imbalance with class-balancing techniques such as over-sampling, under-sampling, or SMOTE.
**TODO: add links to further reading on these techniques.**