<a href="https://colab.research.google.com/github/Adlucent/ga4-return-prediction/blob/main/5_Multicollinearity_Check_and_Joining_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 5. Multicollinearity Check and Join Data
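At this point, we should check our features for multicollinearity: features that are (nearly) linear combinations of one another carry redundant information and make model coefficients unstable. We measure it with the variance inflation factor (VIF) and remove features until every remaining VIF is acceptably low.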

In [None]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

# Only `project_name` (and, if needed, `region` / `dataset_name`) has to be edited:
# the fully qualified table names are derived from it, so the tables that are
# queried always live in the same project that is passed as `project_id`.
project_name = 'adl-analytics'  # GCP project that hosts the BigQuery dataset
region = "US"  # GCP project region
dataset_name = 'return_prediction_ga4'  # BigQuery dataset holding the step-4 tables

# Fully qualified `project.dataset.table` identifiers of the step-4 outputs.
customer_table_name = f'{project_name}.{dataset_name}.step_4_customer_agg'
transaction_table_name = f'{project_name}.{dataset_name}.step_4_split'

In [None]:
# If your notebook environment does not have pandas_gbq, uncomment the line below to install it
# (%pip installs into the environment of the running kernel):
# %pip install pandas_gbq

In [None]:
# Google credentials: authenticate the current user before any BigQuery access.
# NOTE(review): `google.colab` is presumably only available inside a Colab runtime;
# confirm an alternative auth path if this notebook is run elsewhere.
from google.colab import auth
auth.authenticate_user()

# Optional alternative (disabled, kept for reference): BigQuery magics.
# BigQuery magics are used to run BigQuery SQL queries in a Python environment;
# the same queries can also be run in the BigQuery UI.

# from google.cloud import bigquery
# from google.cloud.bigquery import magics, Client, QueryJobConfig

# magics.context.project = project_name  # update project name
# client = bigquery.Client(project=magics.context.project)

# Interface between Jupyter and BigQuery (used to read query results into DataFrames)
import pandas_gbq

# data processing libraries
import pandas as pd
import numpy as np

# multicollinearity tools: variance inflation factor (VIF) of one column of a matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# apply seaborn's default plot styling
sns.set()

# suppress notebook warnings
# NOTE: with no category/module filter this silences every warning for the rest of
# the session (including numerical RuntimeWarnings), so problems raised in later
# cells will not be reported.
import warnings
warnings.filterwarnings('ignore')

# dataframe formatting: never truncate columns or rows when displaying a frame,
# and render floats with three decimal places
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# Load customer data

In [None]:
# Read the whole customer-level aggregate table from BigQuery into a DataFrame.
sql = f"""
SELECT *
FROM `{customer_table_name}`;
"""
customer_data = pandas_gbq.read_gbq(
    sql,
    project_id=project_name,
    location=region,
    use_bqstorage_api=True,
)

Downloading: 100%|[32m██████████[0m|


In [None]:
# create a copy of the imported data to avoid re-importing if we need to revert to original table
df_customer = customer_data.copy()

In [None]:
df_customer.head()

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,nunique_ecommerce_transactions,avg_event_params_engagement_time_msec,avg_item_price_in_usd,avg_item_promotions,avg_item_quantity,avg_item_refund_in_usd,avg_item_refund_quantity,avg_item_revenue_in_usd,sum_ecommerce_total_item_quantity,sum_total_return_item_quantity,sum_ecommerce_purchase_revenue_in_usd,sum_ecommerce_refund_value_in_usd,sum_ecommerce_tax_value_in_usd,sum_ecommerce_unique_items,max_days_first_session_to_transaction,max_user_ltv_revenue,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,max_item_quantity,max_item_refund_in_usd,max_item_refund_quantity,max_item_revenue_in_usd,sum_event_params_engagement_time_msec,sum_event_params_session_engaged,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_item_refund_in_usd,sum_item_refund_quantity,sum_item_revenue_in_usd,sum_event_name_add_to_cart,sum_event_name_begin_checkout,sum_event_name_page_view,sum_event_name_scroll,sum_event_name_select_item,sum_event_name_user_engagement,sum_event_name_view_item,sum_device_category_desktop,sum_device_category_mobile,sum_device_mobile_brand_name_Apple,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Mozilla,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_Chrome,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Edge,sum_device_mobile_model_name_Firefox,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Chrome,sum_device_web_info_browser_Edge,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_geo_country_UnitedStates,sum_traffic_sourc
e_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_Apparel,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_CheckoutYourInformation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShopbyBrand,sum_event_params_parent_page_ShoppingCart,sum_event_params_parent_page_Stationery,sum_event_params_child_page_Bags,sum_event_params_child_page_Drinkware,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_MensUnisex,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_SmallGoods,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Apparel,sum_item_parent_category_Collections,sum_item_parent_category_Lifestyle,sum_item_parent_category_New,sum_item_parent_category_Other,sum_item_parent_category_Sale,sum_item_parent_category_ShopbyBrand,sum_item_parent_category_Stationery,sum_item_child_category_Bags,sum_item_child_category_CampusCollection,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_Other,sum_item_child_category_SmallGoods,sum_item_child_category_Womens,sum_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcate
gory_WaterBottles,pre_nunique_event_params_ga_sessions,pre_sum_event_params_engagement_time_msec,pre_sum_event_params_session_engaged,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_page_view,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_event_name_user_engagement,pre_sum_event_name_view_item,pre_sum_device_category_desktop,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Apple,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Mozilla,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_Chrome,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Edge,pre_sum_device_mobile_model_name_Firefox,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Chrome,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_geo_country_UnitedStates,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_Apparel,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_CheckoutYourInformation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Lifestyle,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_par
ent_page_Sale,pre_sum_event_params_parent_page_ShopbyBrand,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_parent_page_Stationery,pre_sum_event_params_child_page_Bags,pre_sum_event_params_child_page_Drinkware,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_MensUnisex,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_SmallGoods,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Apparel,pre_sum_item_parent_category_Collections,pre_sum_item_parent_category_Lifestyle,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_Sale,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_CampusCollection,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Google,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_Other,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_category_Womens,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,pre_max_event_params_ga_session_number,pre_max_event_params_engagement_time_msec,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_avg_event_params_engagement_time_msec,recency,age,split
0,1219636.7573278528,775080,2.0,3754.163,8.625,0.292,1.25,10.125,1.25,10.125,9,9,70.0,70.0,9.0,7,0.0,71.0,95696,20.0,1,2,20.0,2,21.0,3048380,810,58.0,2,9,70.0,9,70.0,284,92,58,48,0,60,236,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,154,0,26,140,0,0,160,0,12,24,0,0,32,258,0,156,0,100,54,0,6,0,4,0,100,0,152,0,172,0,200,0,0,0,0,232,0,0,144,0,48,0,356,0,0,0,0,0,0,0,0,0,6.0,8907190,909,323,8,63,35,24,57,398,0,6,6,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,6,0,0,0,6,0,0,0,0,0,1,2,441,0,0,12,0,9,180,3,0,0,41,0,35,171,42,95,0,26,29,306,5,0,5,0,25,5,139,0,400,0,160,0,0,36,0,154,36,0,84,0,24,288,182,0,0,0,0,0,0,0,0,0,7,75139,37223.89,5462.979,2004.067,0.0,26,TRAIN
1,12291446.381533444,205482,1.0,9148.439,28.857,0.0,1.0,28.857,1.0,28.857,7,7,203.0,203.0,19.0,7,10.0,824.0,186421,44.0,0,1,44.0,1,44.0,3732563,407,202.0,0,7,202.0,7,202.0,84,4,24,18,0,25,240,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,286,0,9,12,0,3,68,0,3,8,0,0,16,0,2,39,0,30,0,219,0,0,27,2,0,32,0,0,269,0,60,2,0,0,0,0,0,0,36,0,0,216,26,24,24,0,0,0,0,0,0,0,4.0,16584106,1109,0,4,104,62,0,103,809,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,4,0,0,2,0,637,0,22,20,0,18,234,0,9,23,39,3,45,3,30,28,3,15,106,168,3,0,176,0,0,348,0,0,587,0,192,0,0,36,0,0,24,0,12,0,96,144,12,156,324,0,0,0,0,0,0,0,5,480208,216251.418,64359.977,28782.001,0.0,44,TRAIN
2,12291446.381533444,597612,2.0,10078.185,18.529,0.0,1.0,18.529,1.0,18.529,12,12,244.0,244.0,24.0,12,10.0,865.0,307177,44.0,0,1,44.0,1,44.0,7464252,745,243.0,0,12,243.0,12,243.0,167,8,51,35,0,53,409,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,343,0,16,24,0,8,208,0,6,15,0,0,34,81,41,140,0,81,0,222,29,0,27,5,25,32,27,0,319,0,195,2,0,0,0,70,37,0,133,0,0,216,146,24,24,0,0,0,0,0,0,0,4.0,16584106,1109,0,4,104,62,0,103,809,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,4,0,0,2,0,637,0,22,20,0,18,234,0,9,23,39,3,45,3,30,28,3,15,106,168,3,0,176,0,0,348,0,0,587,0,192,0,0,36,0,0,24,0,12,0,96,144,12,156,324,0,0,0,0,0,0,0,5,480208,216251.418,64359.977,19188.0,1.0,44,TRAIN
3,13756133.77624912,462859,2.0,8643.41,19.0,1.0,9.5,38.0,2.0,182.5,19,4,365.0,76.0,36.0,2,4.0,365.0,332905,19.0,1,10,76.0,4,192.0,4511860,518,38.0,2,19,76.0,4,365.0,0,12,84,48,72,78,204,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,14,36,0,6,270,0,24,36,0,32,34,0,0,270,0,0,0,0,0,0,0,0,0,0,0,0,0,0,256,0,0,0,24,0,0,0,256,0,0,0,0,0,0,0,0,0,0,0,0,0,5.0,9105711,820,0,4,93,34,168,79,460,0,5,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,5,0,0,0,1,343,0,2,12,0,126,168,17,0,0,76,0,13,0,0,168,0,0,0,234,0,0,0,0,0,95,0,0,347,0,156,16,0,96,0,0,0,0,156,0,0,263,0,0,72,0,0,0,0,0,0,0,6,108084,41875.413,5056.73,2135.782,0.0,16,TRAIN
4,1415699.235686598,484939,1.0,4372.63,29.0,1.0,1.444,0.0,0.0,29.444,13,0,265.0,0.0,23.0,9,0.0,265.0,58902,92.0,1,3,0.0,0,92.0,1512930,345,261.0,9,13,0.0,0,265.0,128,9,22,20,0,24,128,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,180,0,12,13,0,11,0,0,3,4,0,0,29,91,0,0,0,2,0,87,0,0,0,0,91,91,0,0,180,0,0,0,0,0,4,90,0,0,0,2,0,84,88,0,84,0,0,0,0,0,0,0,1.0,4905218,417,0,9,62,36,36,55,215,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,272,0,0,13,0,8,3,0,8,4,3,0,39,25,0,3,0,0,3,154,0,0,0,0,25,115,0,0,234,0,0,0,0,0,1,23,0,0,0,1,0,132,22,0,96,0,0,0,0,0,0,0,1,206158,0.0,0.0,5839.545,0.0,36,TRAIN


In [None]:
df_customer.shape

(4466, 226)

In [None]:
# check for missing values
df_customer.isna().sum().sum()

0

# Customer VIF analysis

In [None]:
# Remove the id and split columns from the analysis.
# They are dropped by name rather than by position, so a change in the column
# order of the source table raises a KeyError instead of silently letting an
# identifier column into (or a real feature out of) the VIF analysis.
non_feature_columns = ['user_pseudo_id', 'ecommerce_transaction_id', 'split']
customer_features = df_customer.drop(columns=non_feature_columns)
customer_features.head()

Unnamed: 0,nunique_ecommerce_transactions,avg_event_params_engagement_time_msec,avg_item_price_in_usd,avg_item_promotions,avg_item_quantity,avg_item_refund_in_usd,avg_item_refund_quantity,avg_item_revenue_in_usd,sum_ecommerce_total_item_quantity,sum_total_return_item_quantity,sum_ecommerce_purchase_revenue_in_usd,sum_ecommerce_refund_value_in_usd,sum_ecommerce_tax_value_in_usd,sum_ecommerce_unique_items,max_days_first_session_to_transaction,max_user_ltv_revenue,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,max_item_quantity,max_item_refund_in_usd,max_item_refund_quantity,max_item_revenue_in_usd,sum_event_params_engagement_time_msec,sum_event_params_session_engaged,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_item_refund_in_usd,sum_item_refund_quantity,sum_item_revenue_in_usd,sum_event_name_add_to_cart,sum_event_name_begin_checkout,sum_event_name_page_view,sum_event_name_scroll,sum_event_name_select_item,sum_event_name_user_engagement,sum_event_name_view_item,sum_device_category_desktop,sum_device_category_mobile,sum_device_mobile_brand_name_Apple,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Mozilla,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_Chrome,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Edge,sum_device_mobile_model_name_Firefox,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Chrome,sum_device_web_info_browser_Edge,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_geo_country_UnitedStates,sum_traffic_source_medium_Other,sum_traffic_source_medium
_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_Apparel,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_CheckoutYourInformation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShopbyBrand,sum_event_params_parent_page_ShoppingCart,sum_event_params_parent_page_Stationery,sum_event_params_child_page_Bags,sum_event_params_child_page_Drinkware,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_MensUnisex,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_SmallGoods,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Apparel,sum_item_parent_category_Collections,sum_item_parent_category_Lifestyle,sum_item_parent_category_New,sum_item_parent_category_Other,sum_item_parent_category_Sale,sum_item_parent_category_ShopbyBrand,sum_item_parent_category_Stationery,sum_item_child_category_Bags,sum_item_child_category_CampusCollection,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_Other,sum_item_child_category_SmallGoods,sum_item_child_category_Womens,sum_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,pre_nunique_event_para
ms_ga_sessions,pre_sum_event_params_engagement_time_msec,pre_sum_event_params_session_engaged,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_page_view,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_event_name_user_engagement,pre_sum_event_name_view_item,pre_sum_device_category_desktop,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Apple,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Mozilla,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_Chrome,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Edge,pre_sum_device_mobile_model_name_Firefox,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Chrome,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_geo_country_UnitedStates,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_Apparel,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_CheckoutYourInformation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Lifestyle,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_paren
t_page_ShopbyBrand,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_parent_page_Stationery,pre_sum_event_params_child_page_Bags,pre_sum_event_params_child_page_Drinkware,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_MensUnisex,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_SmallGoods,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Apparel,pre_sum_item_parent_category_Collections,pre_sum_item_parent_category_Lifestyle,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_Sale,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_CampusCollection,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Google,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_Other,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_category_Womens,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,pre_max_event_params_ga_session_number,pre_max_event_params_engagement_time_msec,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_avg_event_params_engagement_time_msec,recency,age
0,2.0,3754.163,8.625,0.292,1.25,10.125,1.25,10.125,9,9,70.0,70.0,9.0,7,0.0,71.0,95696,20.0,1,2,20.0,2,21.0,3048380,810,58.0,2,9,70.0,9,70.0,284,92,58,48,0,60,236,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,154,0,26,140,0,0,160,0,12,24,0,0,32,258,0,156,0,100,54,0,6,0,4,0,100,0,152,0,172,0,200,0,0,0,0,232,0,0,144,0,48,0,356,0,0,0,0,0,0,0,0,0,6.0,8907190,909,323,8,63,35,24,57,398,0,6,6,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,6,0,0,0,6,0,0,0,0,0,1,2,441,0,0,12,0,9,180,3,0,0,41,0,35,171,42,95,0,26,29,306,5,0,5,0,25,5,139,0,400,0,160,0,0,36,0,154,36,0,84,0,24,288,182,0,0,0,0,0,0,0,0,0,7,75139,37223.89,5462.979,2004.067,0.0,26
1,1.0,9148.439,28.857,0.0,1.0,28.857,1.0,28.857,7,7,203.0,203.0,19.0,7,10.0,824.0,186421,44.0,0,1,44.0,1,44.0,3732563,407,202.0,0,7,202.0,7,202.0,84,4,24,18,0,25,240,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,286,0,9,12,0,3,68,0,3,8,0,0,16,0,2,39,0,30,0,219,0,0,27,2,0,32,0,0,269,0,60,2,0,0,0,0,0,0,36,0,0,216,26,24,24,0,0,0,0,0,0,0,4.0,16584106,1109,0,4,104,62,0,103,809,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,4,0,0,2,0,637,0,22,20,0,18,234,0,9,23,39,3,45,3,30,28,3,15,106,168,3,0,176,0,0,348,0,0,587,0,192,0,0,36,0,0,24,0,12,0,96,144,12,156,324,0,0,0,0,0,0,0,5,480208,216251.418,64359.977,28782.001,0.0,44
2,2.0,10078.185,18.529,0.0,1.0,18.529,1.0,18.529,12,12,244.0,244.0,24.0,12,10.0,865.0,307177,44.0,0,1,44.0,1,44.0,7464252,745,243.0,0,12,243.0,12,243.0,167,8,51,35,0,53,409,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,343,0,16,24,0,8,208,0,6,15,0,0,34,81,41,140,0,81,0,222,29,0,27,5,25,32,27,0,319,0,195,2,0,0,0,70,37,0,133,0,0,216,146,24,24,0,0,0,0,0,0,0,4.0,16584106,1109,0,4,104,62,0,103,809,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,4,0,0,2,0,637,0,22,20,0,18,234,0,9,23,39,3,45,3,30,28,3,15,106,168,3,0,176,0,0,348,0,0,587,0,192,0,0,36,0,0,24,0,12,0,96,144,12,156,324,0,0,0,0,0,0,0,5,480208,216251.418,64359.977,19188.0,1.0,44
3,2.0,8643.41,19.0,1.0,9.5,38.0,2.0,182.5,19,4,365.0,76.0,36.0,2,4.0,365.0,332905,19.0,1,10,76.0,4,192.0,4511860,518,38.0,2,19,76.0,4,365.0,0,12,84,48,72,78,204,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,14,36,0,6,270,0,24,36,0,32,34,0,0,270,0,0,0,0,0,0,0,0,0,0,0,0,0,0,256,0,0,0,24,0,0,0,256,0,0,0,0,0,0,0,0,0,0,0,0,0,5.0,9105711,820,0,4,93,34,168,79,460,0,5,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,5,0,0,0,1,343,0,2,12,0,126,168,17,0,0,76,0,13,0,0,168,0,0,0,234,0,0,0,0,0,95,0,0,347,0,156,16,0,96,0,0,0,0,156,0,0,263,0,0,72,0,0,0,0,0,0,0,6,108084,41875.413,5056.73,2135.782,0.0,16
4,1.0,4372.63,29.0,1.0,1.444,0.0,0.0,29.444,13,0,265.0,0.0,23.0,9,0.0,265.0,58902,92.0,1,3,0.0,0,92.0,1512930,345,261.0,9,13,0.0,0,265.0,128,9,22,20,0,24,128,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,180,0,12,13,0,11,0,0,3,4,0,0,29,91,0,0,0,2,0,87,0,0,0,0,91,91,0,0,180,0,0,0,0,0,4,90,0,0,0,2,0,84,88,0,84,0,0,0,0,0,0,0,1.0,4905218,417,0,9,62,36,36,55,215,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,272,0,0,13,0,8,3,0,8,4,3,0,39,25,0,3,0,0,3,154,0,0,0,0,25,115,0,0,234,0,0,0,0,0,1,23,0,0,0,1,0,132,22,0,96,0,0,0,0,0,0,0,1,206158,0.0,0.0,5839.545,0.0,36


In [None]:
# Cast every feature column to float64 so that `customer_features.values` is a
# homogeneous float matrix when it is passed to `variance_inflation_factor`.
customer_features = customer_features.astype('float64')

In [None]:
# Baseline: VIF score of every original feature in the customer dataset,
# computed before any feature has been removed (one row per feature).
vif_before = pd.DataFrame({
    'Feature': customer_features.columns,
    'VIF': np.array(
        [
            variance_inflation_factor(customer_features.values, col_idx)
            for col_idx in range(customer_features.shape[1])
        ],
        dtype=float,
    ),
})

In [None]:
print('Before feature selection:')
vif_before

Before feature selection:


Unnamed: 0,Feature,VIF
0,nunique_ecommerce_transactions,5053.236
1,avg_event_params_engagement_time_msec,26.31
2,avg_item_price_in_usd,103.844
3,avg_item_promotions,12.451
4,avg_item_quantity,62.263
5,avg_item_refund_in_usd,32.489
6,avg_item_refund_quantity,30.697
7,avg_item_revenue_in_usd,118.084
8,sum_ecommerce_total_item_quantity,579.435
9,sum_total_return_item_quantity,223.094


In [None]:
vif_before[vif_before['VIF'] > 5].shape[0]

183

Since so many of our features have such high VIF values, let's start by dropping the features with a VIF value over 1000 and then recalculate the VIF of the remaining features.

In [None]:
# drop the 40 features whose baseline VIF is above 1000
customer_features = customer_features.drop(
    columns=vif_before.loc[vif_before['VIF'] > 1000, 'Feature'].tolist()
)
customer_features.shape

(4466, 183)

In [None]:
# re-examine VIF values after dropping the extremely high VIF features
vif_trimmed = pd.DataFrame({
    'Feature': customer_features.columns,
    'VIF': np.array(
        [
            variance_inflation_factor(customer_features.values, col_idx)
            for col_idx in range(customer_features.shape[1])
        ],
        dtype=float,
    ),
})

In [None]:
print('After trimming extremely high VIF features:')
vif_trimmed

After trimming extremely high VIF features:


Unnamed: 0,Feature,VIF
0,avg_event_params_engagement_time_msec,16.883
1,avg_item_price_in_usd,86.707
2,avg_item_promotions,10.286
3,avg_item_quantity,45.344
4,avg_item_refund_in_usd,25.289
5,avg_item_refund_quantity,24.457
6,avg_item_revenue_in_usd,98.243
7,sum_ecommerce_total_item_quantity,448.286
8,sum_total_return_item_quantity,199.463
9,sum_ecommerce_purchase_revenue_in_usd,270.271


In [None]:
vif_trimmed[vif_trimmed['VIF'] > 500].shape[0]

1

After removing the extremely high VIF features, only 1 feature still has a VIF value over 500. <br>
Next, let's automatically remove features one at a time (always the one with the highest VIF) until every remaining feature has a VIF of at most 10.

In [None]:
def vif_and_drop(X, thresh=5.0):
    """Iteratively drop the feature with the highest VIF until none exceeds ``thresh``.

    On every pass the variance inflation factor of each remaining column is
    recomputed, and the column with the largest VIF is removed if that VIF is
    greater than ``thresh``. The loop ends when no VIF exceeds ``thresh`` or
    when fewer than two columns remain (a VIF needs at least one other column
    to regress on). Each removal is reported with ``print``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix (one column per feature). It is not modified.
    thresh : float, default 5.0
        Largest VIF a retained feature may have.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` that survived, in their original order.

    Notes
    -----
    ``variance_inflation_factor`` can return NaN (for instance for an all-zero
    column). NaN scores are ignored when looking for the maximum, so such a
    column is never dropped and, unlike with the built-in ``max``, its position
    cannot stop the pruning of the other columns.
    """
    # Convert once up front; every pass then only slices this matrix by position.
    values = X.to_numpy(dtype=float)
    keep = list(range(X.shape[1]))  # positional indices of the surviving columns

    while len(keep) > 1:
        current = values[:, keep]
        vif = np.array(
            [variance_inflation_factor(current, ix) for ix in range(current.shape[1])],
            dtype=float,
        )

        # nanargmax raises on an all-NaN input, and there is nothing to rank anyway.
        if np.isnan(vif).all():
            break

        worst = int(np.nanargmax(vif))  # first position of the largest non-NaN VIF
        if not vif[worst] > thresh:
            break

        print(f"dropping '{X.columns[keep[worst]]}' at index: {worst}")
        del keep[worst]

    # Positional selection keeps this correct even if column labels are duplicated.
    return X.iloc[:, np.asarray(keep, dtype=int)]

In [None]:
# call the function and update customer_features
customer_features = vif_and_drop(customer_features, thresh=10)

dropping 'sum_item_revenue_in_usd' at index: 28
dropping 'sum_ecommerce_total_item_quantity' at index: 7
dropping 'sum_device_mobile_model_name_Chrome' at index: 33
dropping 'pre_sum_event_name_page_view' at index: 92
dropping 'sum_item_child_category_Other' at index: 79
dropping 'pre_nunique_event_params_ga_sessions' at index: 87
dropping 'sum_event_params_parent_page_PaymentMethod' at index: 55
dropping 'sum_item_parent_category_ShopbyBrand' at index: 73
dropping 'sum_item_refund_quantity' at index: 26
dropping 'sum_ecommerce_purchase_revenue_in_usd' at index: 8
dropping 'pre_sum_device_mobile_brand_name_Apple' at index: 91
dropping 'sum_item_refund_in_usd' at index: 24
dropping 'pre_sum_event_params_parent_page_ShopbyBrand' at index: 128
dropping 'sum_item_parent_category_Stationery' at index: 70
dropping 'sum_event_params_child_page_Kids' at index: 58
dropping 'sum_event_params_parent_page_CheckoutConfirmation' at index: 47
dropping 'pre_sum_device_category_desktop' at index: 85
dr

In [None]:
# check the final VIF value of every feature that survived the pruning
vif_reduced = pd.DataFrame({
    'Feature': customer_features.columns,
    'VIF': np.array(
        [
            variance_inflation_factor(customer_features.values, col_idx)
            for col_idx in range(customer_features.shape[1])
        ],
        dtype=float,
    ),
})

In [None]:
print('All features VIF < 10:')
vif_reduced

All features VIF < 10:


Unnamed: 0,Feature,VIF
0,avg_event_params_engagement_time_msec,7.136
1,avg_item_price_in_usd,5.159
2,avg_item_promotions,7.04
3,avg_item_refund_in_usd,3.208
4,sum_total_return_item_quantity,5.154
5,sum_ecommerce_refund_value_in_usd,6.96
6,max_days_first_session_to_transaction,1.957
7,max_event_params_engagement_time_msec,3.938
8,max_item_promotions,7.376
9,max_item_quantity,6.165


In [None]:
customer_features.shape

(4466, 125)

In [None]:
# Re-attach the identifier columns and the `split` column to the retained features.
# The columns are selected by name (not by position) so the result stays correct
# even if the column order of the source table changes; a missing column raises
# a KeyError instead of silently picking up a different one.
df_customer_reduced = pd.concat(
    [
        df_customer[['user_pseudo_id', 'ecommerce_transaction_id']],
        customer_features,
        df_customer['split'],
    ],
    axis=1,
)
df_customer_reduced.head()

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,avg_event_params_engagement_time_msec,avg_item_price_in_usd,avg_item_promotions,avg_item_refund_in_usd,sum_total_return_item_quantity,sum_ecommerce_refund_value_in_usd,max_days_first_session_to_transaction,max_event_params_engagement_time_msec,max_item_promotions,max_item_quantity,sum_item_promotions,sum_item_quantity,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Edge,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_geo_country_UnitedStates,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_Sale,sum_event_params_parent_page_Stationery,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_New,sum_item_parent_category_Other,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_Womens,sum_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,pre_sum_event_params_
engagement_time_msec,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_select_item,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_SmallGoo
ds,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,pre_max_event_params_ga_session_number,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_avg_event_params_engagement_time_msec,recency,age,split
0,1219636.7573278528,775080,3754.163,8.625,0.292,10.125,9.0,70.0,0.0,95696.0,1.0,2.0,2.0,9.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,258.0,0.0,100.0,6.0,0.0,0.0,100.0,152.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8907190.0,323.0,8.0,24.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,9.0,3.0,0.0,0.0,41.0,35.0,0.0,26.0,29.0,5.0,0.0,0.0,25.0,5.0,139.0,0.0,0.0,0.0,154.0,36.0,84.0,288.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,37223.89,5462.979,2004.067,0.0,26.0,TRAIN
1,12291446.381533444,205482,9148.439,28.857,0.0,28.857,7.0,203.0,10.0,186421.0,0.0,1.0,0.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,30.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,216.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16584106.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,22.0,0.0,18.0,0.0,9.0,23.0,39.0,45.0,3.0,15.0,106.0,3.0,0.0,0.0,0.0,348.0,0.0,0.0,0.0,0.0,0.0,24.0,12.0,144.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,216251.418,64359.977,28782.001,0.0,44.0,TRAIN
2,12291446.381533444,597612,10078.185,18.529,0.0,18.529,12.0,244.0,10.0,307177.0,0.0,1.0,0.0,12.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0,6.0,0.0,81.0,0.0,81.0,29.0,0.0,5.0,25.0,27.0,0.0,2.0,0.0,0.0,0.0,216.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16584106.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,22.0,0.0,18.0,0.0,9.0,23.0,39.0,45.0,3.0,15.0,106.0,3.0,0.0,0.0,0.0,348.0,0.0,0.0,0.0,0.0,0.0,24.0,12.0,144.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,216251.418,64359.977,19188.0,1.0,44.0,TRAIN
3,13756133.77624912,462859,8643.41,19.0,1.0,38.0,4.0,76.0,4.0,332905.0,1.0,10.0,2.0,19.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,6.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9105711.0,0.0,4.0,168.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,126.0,17.0,0.0,0.0,76.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,156.0,263.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,41875.413,5056.73,2135.782,0.0,16.0,TRAIN
4,1415699.235686598,484939,4372.63,29.0,1.0,0.0,0.0,0.0,0.0,58902.0,1.0,3.0,9.0,13.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11.0,0.0,3.0,0.0,91.0,0.0,2.0,0.0,0.0,0.0,91.0,0.0,0.0,0.0,0.0,2.0,0.0,84.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4905218.0,0.0,9.0,36.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,8.0,4.0,3.0,39.0,0.0,0.0,3.0,0.0,0.0,0.0,25.0,115.0,0.0,0.0,0.0,1.0,23.0,0.0,0.0,132.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5839.545,0.0,36.0,TRAIN


We've reduced the number of columns from 223 to 125. All scores are now less than 10, so we can move on to the transaction features.

In [None]:
# Build an explicit BigQuery table schema from the DataFrame dtypes to prevent
# type errors when uploading data.
# Each entry has the form {'name': <column name>, 'type': <BigQuery type>}.
schema = []

for col, dtype in df_customer_reduced.dtypes.items():
    if dtype == object:
        typ = 'STRING'
    elif dtype == bool:
        typ = 'BOOLEAN'
    elif dtype == 'datetime64[ns]':
        typ = 'DATETIME'
    elif dtype in (int, 'Int64', 'uint8'):
        typ = 'INTEGER'
    elif dtype == float:
        typ = 'FLOAT'
    else:
        # No mapping for this dtype. Previously `typ` was left unset here, which
        # raised NameError on the first column or silently reused the previous
        # column's type. Leave the column out of the explicit schema instead so
        # pandas_gbq infers its type from the DataFrame dtype.
        print(f"No explicit BigQuery type for column '{col}' (dtype {dtype}); its type will be inferred")
        continue

    schema.append({'name': col, 'type': typ})

In [None]:
# df_customer_reduced.to_gbq(f'{project_name}.return_prediction_ga4.step_5_customer_reduced',
#                             project_id=project_name,
#                             if_exists='replace',
#                             location=region,
#                             chunksize=100_000,
#                             table_schema=schema)

# Load transaction data

In [None]:
# Query text: every row and column of the configured transaction table.
sql = f"""
SELECT *
FROM `{transaction_table_name}`;
"""

# Run the query through pandas_gbq and load the result into a DataFrame.
transaction_data = pandas_gbq.read_gbq(
    sql,
    project_id=project_name,
    location=region,
    use_bqstorage_api=True,
)

Downloading: 100%|[32m██████████[0m|


In [None]:
# Work on an independent copy so the downloaded table stays untouched and can be
# restored without querying BigQuery again.
df_transaction = transaction_data.copy(deep=True)

In [None]:
# Preview the first rows of the working copy of the transaction data.
df_transaction.head()

Unnamed: 0,user_pseudo_id,transaction_date,transaction_ga_session_number,ecommerce_transaction_id,sum_event_params_engagement_time_msec,sum_event_params_session_engaged,user_ltv_revenue,ecommerce_total_item_quantity,total_return_item_quantity,ecommerce_purchase_revenue_in_usd,ecommerce_refund_value_in_usd,ecommerce_tax_value_in_usd,ecommerce_unique_items,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_item_refund_in_usd,sum_item_refund_quantity,sum_item_revenue_in_usd,sum_event_name_add_to_cart,sum_event_name_begin_checkout,sum_event_name_page_view,sum_event_name_scroll,sum_event_name_select_item,sum_event_name_user_engagement,sum_event_name_view_item,sum_device_category_desktop,sum_device_category_mobile,sum_device_mobile_brand_name_Apple,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Mozilla,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_Chrome,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Edge,sum_device_mobile_model_name_Firefox,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Chrome,sum_device_web_info_browser_Edge,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_geo_country_UnitedStates,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_Apparel,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_CheckoutYourInformation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_param
s_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShopbyBrand,sum_event_params_parent_page_ShoppingCart,sum_event_params_parent_page_Stationery,sum_event_params_child_page_Bags,sum_event_params_child_page_Drinkware,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_MensUnisex,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_SmallGoods,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Apparel,sum_item_parent_category_Collections,sum_item_parent_category_Lifestyle,sum_item_parent_category_New,sum_item_parent_category_Other,sum_item_parent_category_Sale,sum_item_parent_category_ShopbyBrand,sum_item_parent_category_Stationery,sum_item_child_category_Bags,sum_item_child_category_CampusCollection,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_Other,sum_item_child_category_SmallGoods,sum_item_child_category_Womens,sum_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,max_item_quantity,max_item_refund_in_usd,max_item_refund_quantity,max_item_revenue_in_usd,avg_event_params_engagement_time_msec,avg_item_price_in_usd,avg_item_promotions,avg_item_quantity,avg_item_refund_in_usd,avg_item_refund_quantity,avg_item_revenue_in_usd,pre_nunique_event_params_ga_sess
ions,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_max_event_params_ga_session_number,pre_max_event_params_engagement_time_msec,pre_avg_event_params_engagement_time_msec,pre_sum_event_params_engagement_time_msec,pre_sum_event_params_session_engaged,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_page_view,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_event_name_user_engagement,pre_sum_event_name_view_item,pre_sum_device_category_desktop,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Apple,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Mozilla,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_Chrome,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Edge,pre_sum_device_mobile_model_name_Firefox,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Chrome,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_geo_country_UnitedStates,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_Apparel,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_CheckoutYourInformation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_
parent_page_Lifestyle,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShopbyBrand,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_parent_page_Stationery,pre_sum_event_params_child_page_Bags,pre_sum_event_params_child_page_Drinkware,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_MensUnisex,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_SmallGoods,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Apparel,pre_sum_item_parent_category_Collections,pre_sum_item_parent_category_Lifestyle,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_Sale,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_CampusCollection,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Google,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_Other,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_category_Womens,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_transaction,split
0,10111055.876868386,2020-12-10,1,741471,3324661,225,94.0,3,0,94.0,0.0,10.0,3,94.0,0,3,0.0,0,94.0,48,6,26,16,12,25,84,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,154,0,5,10,0,9,0,2,3,7,0,0,6,0,0,0,0,0,0,154,0,0,0,0,0,0,0,0,153,0,0,0,0,0,0,0,0,0,0,0,0,144,0,0,0,0,0,0,0,0,0,0,275267,48.0,0,1,0.0,0,48.0,14581.846,31.333,0.0,1.0,0.0,0.0,31.333,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,TRAIN
1,1019527.5799124268,2020-12-05,1,2105,2966128,412,44.0,7,0,44.0,0.0,5.0,4,32.0,0,7,0.0,0,44.0,78,8,34,21,0,33,231,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,2,4,7,12,27,44,186,0,3,4,62,0,13,39,90,18,0,0,0,0,0,0,78,2,36,0,3,0,6,24,192,0,3,60,0,33,84,24,12,0,0,0,63,72,0,0,0,0,0,0,0,0,52542,13.0,0,4,0.0,0,16.0,7147.296,8.0,0.0,1.75,0.0,0.0,11.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,TRAIN
2,1021887.1151788384,2020-12-26,1,76937,918786,177,21.0,2,0,21.0,0.0,2.0,1,10.0,1,2,0.0,0,21.0,48,9,15,13,0,15,72,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,21,0,6,129,3,0,4,2,0,9,0,0,129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,120,0,0,0,10,0,0,0,120,0,0,0,0,0,0,0,0,0,0,0,0,0,33396,10.0,1,2,0.0,0,21.0,5104.367,10.0,1.0,2.0,0.0,0.0,21.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,TRAIN
3,1026932.0858862292,2020-12-01,1,339943,2387211,173,110.0,1,0,55.0,0.0,4.0,1,110.0,2,2,0.0,0,110.0,12,2,28,19,0,28,77,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,8,6,0,24,3,2,9,4,27,62,6,0,0,3,62,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,24,60,0,0,0,0,60,0,0,0,0,0,0,0,0,0,0,0,0,68076,55.0,1,1,0.0,0,55.0,13641.206,55.0,1.0,1.0,0.0,0.0,55.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,TRAIN
4,1026932.0858862292,2020-12-08,3,29460,674555,127,214.0,1,0,55.0,0.0,4.0,1,55.0,0,1,0.0,0,55.0,0,4,25,11,27,24,27,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,7,12,0,61,0,0,18,8,2,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,37441,55.0,0,1,0.0,0,55.0,5188.885,55.0,0.0,1.0,0.0,0.0,55.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,TRAIN


In [None]:
# (rows, columns) of the transaction table before any columns are removed.
df_transaction.shape

(4466, 225)

In [None]:
# check for missing values: total number of missing cells across all columns
df_transaction.isna().sum().sum()

0

# Transaction VIF analysis

In [None]:
# Columns mentioning "refund" or "return" describe the prediction target, so
# they are excluded from the feature set used for the VIF analysis.
transaction_features = df_transaction.drop(
    columns=[
        name
        for name in df_transaction.columns
        if any(keyword in name for keyword in ('refund', 'return'))
    ]
)
transaction_features.shape

(4466, 217)

In [None]:
# Identifier, date and split-label columns are not features; leave them out of
# the VIF analysis.
transaction_features = transaction_features.drop(
    columns=['user_pseudo_id', 'transaction_date', 'ecommerce_transaction_id', 'split']
)
transaction_features.head()

Unnamed: 0,transaction_ga_session_number,sum_event_params_engagement_time_msec,sum_event_params_session_engaged,user_ltv_revenue,ecommerce_total_item_quantity,ecommerce_purchase_revenue_in_usd,ecommerce_tax_value_in_usd,ecommerce_unique_items,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_item_revenue_in_usd,sum_event_name_add_to_cart,sum_event_name_begin_checkout,sum_event_name_page_view,sum_event_name_scroll,sum_event_name_select_item,sum_event_name_user_engagement,sum_event_name_view_item,sum_device_category_desktop,sum_device_category_mobile,sum_device_mobile_brand_name_Apple,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Mozilla,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_Chrome,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Edge,sum_device_mobile_model_name_Firefox,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Chrome,sum_device_web_info_browser_Edge,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_geo_country_UnitedStates,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_Apparel,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_CheckoutYourInformation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_pag
e_Sale,sum_event_params_parent_page_ShopbyBrand,sum_event_params_parent_page_ShoppingCart,sum_event_params_parent_page_Stationery,sum_event_params_child_page_Bags,sum_event_params_child_page_Drinkware,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_MensUnisex,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_SmallGoods,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Apparel,sum_item_parent_category_Collections,sum_item_parent_category_Lifestyle,sum_item_parent_category_New,sum_item_parent_category_Other,sum_item_parent_category_Sale,sum_item_parent_category_ShopbyBrand,sum_item_parent_category_Stationery,sum_item_child_category_Bags,sum_item_child_category_CampusCollection,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_Other,sum_item_child_category_SmallGoods,sum_item_child_category_Womens,sum_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,max_item_quantity,max_item_revenue_in_usd,avg_event_params_engagement_time_msec,avg_item_price_in_usd,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_nunique_event_params_ga_sessions,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_max_event_params_ga_session_number,pre_max_event_params_engagement_time_msec,pre_avg_event_params_engagement_time_msec,pre_sum_event_params_engagement_ti
me_msec,pre_sum_event_params_session_engaged,pre_sum_event_name_add_to_cart,pre_sum_event_name_begin_checkout,pre_sum_event_name_page_view,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_event_name_user_engagement,pre_sum_event_name_view_item,pre_sum_device_category_desktop,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Apple,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Microsoft,pre_sum_device_mobile_brand_name_Mozilla,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_Chrome,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Edge,pre_sum_device_mobile_model_name_Firefox,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_mobile_model_name_iPhone,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Chrome,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_geo_country_UnitedStates,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_Apparel,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_CheckoutYourInformation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Lifestyle,pre_sum_event_params_parent_page_New,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShopbyBrand,pre_sum_event_params_parent_pa
ge_ShoppingCart,pre_sum_event_params_parent_page_Stationery,pre_sum_event_params_child_page_Bags,pre_sum_event_params_child_page_Drinkware,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Kids,pre_sum_event_params_child_page_MensUnisex,pre_sum_event_params_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_SmallGoods,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_Apparel,pre_sum_item_parent_category_Collections,pre_sum_item_parent_category_Lifestyle,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_Sale,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_CampusCollection,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Google,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_Other,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_category_Womens,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_transaction
0,1,3324661,225,94.0,3,94.0,10.0,3,94.0,0,3,94.0,48,6,26,16,12,25,84,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,154,0,5,10,0,9,0,2,3,7,0,0,6,0,0,0,0,0,0,154,0,0,0,0,0,0,0,0,153,0,0,0,0,0,0,0,0,0,0,0,0,144,0,0,0,0,0,0,0,0,0,0,275267,48.0,0,1,48.0,14581.846,31.333,0.0,1.0,31.333,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
1,1,2966128,412,44.0,7,44.0,5.0,4,32.0,0,7,44.0,78,8,34,21,0,33,231,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,2,4,7,12,27,44,186,0,3,4,62,0,13,39,90,18,0,0,0,0,0,0,78,2,36,0,3,0,6,24,192,0,3,60,0,33,84,24,12,0,0,0,63,72,0,0,0,0,0,0,0,0,52542,13.0,0,4,16.0,7147.296,8.0,0.0,1.75,11.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
2,1,918786,177,21.0,2,21.0,2.0,1,10.0,1,2,21.0,48,9,15,13,0,15,72,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,21,0,6,129,3,0,4,2,0,9,0,0,129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,120,0,0,0,10,0,0,0,120,0,0,0,0,0,0,0,0,0,0,0,0,0,33396,10.0,1,2,21.0,5104.367,10.0,1.0,2.0,21.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
3,1,2387211,173,110.0,1,55.0,4.0,1,110.0,2,2,110.0,12,2,28,19,0,28,77,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,8,6,0,24,3,2,9,4,27,62,6,0,0,3,62,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,24,60,0,0,0,0,60,0,0,0,0,0,0,0,0,0,0,0,0,68076,55.0,1,1,55.0,13641.206,55.0,1.0,1.0,55.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
4,3,674555,127,214.0,1,55.0,4.0,1,55.0,0,1,55.0,0,4,25,11,27,24,27,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,7,12,0,61,0,0,18,8,2,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,37441,55.0,0,1,55.0,5188.885,55.0,0.0,1.0,55.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0


In [None]:
# (rows, columns) of the feature set after removing the target and id columns.
transaction_features.shape

(4466, 213)

In [None]:
# Cast every feature column to float64 before computing VIF scores.
transaction_features = transaction_features.astype(np.float64)

In [None]:
# Baseline: VIF score of every original feature in the transaction dataset,
# before any features are removed. Result has one row per feature with the
# columns 'Feature' and 'VIF'.
vif_before = (
    pd.Series(
        [
            variance_inflation_factor(transaction_features.values, col_idx)
            for col_idx in range(transaction_features.shape[1])
        ],
        index=transaction_features.columns,
        dtype=float,
    )
    .rename_axis('Feature')
    .reset_index(name='VIF')
)

In [None]:
# Display the baseline VIF table computed before any feature removal.
print('Before feature selection:')
vif_before

Before feature selection:


Unnamed: 0,Feature,VIF
0,transaction_ga_session_number,6.085
1,sum_event_params_engagement_time_msec,18.369
2,sum_event_params_session_engaged,10889.288
3,user_ltv_revenue,5.226
4,ecommerce_total_item_quantity,78.908
5,ecommerce_purchase_revenue_in_usd,60.065
6,ecommerce_tax_value_in_usd,10.44
7,ecommerce_unique_items,17.525
8,sum_item_price_in_usd,44.492
9,sum_item_promotions,2.978


In [None]:
# Number of features whose baseline VIF exceeds 5.
int((vif_before['VIF'] > 5).sum())

167

Again, since so many of our features have such high VIF values, let's start by dropping the features with a VIF value over 500 and recalculate VIF.

In [None]:
# Remove every feature whose baseline VIF is above 500 (22 features for the
# current data), then report the remaining dimensions.
transaction_features = transaction_features.drop(
    columns=vif_before.loc[vif_before['VIF'] > 500, 'Feature'].tolist()
)
transaction_features.shape

(4466, 191)

In [None]:
# Recompute VIF for the features that remain after dropping the extremely high
# VIF features. Result has one row per feature with the columns 'Feature' and 'VIF'.
vif_trimmed = (
    pd.Series(
        [
            variance_inflation_factor(transaction_features.values, col_idx)
            for col_idx in range(transaction_features.shape[1])
        ],
        index=transaction_features.columns,
        dtype=float,
    )
    .rename_axis('Feature')
    .reset_index(name='VIF')
)

In [None]:
# Display the VIF table recomputed after the VIF > 500 features were dropped.
print('After trimming extremely high VIF features:')
vif_trimmed

After trimming extremely high VIF features:


Unnamed: 0,Feature,VIF
0,transaction_ga_session_number,5.994
1,sum_event_params_engagement_time_msec,17.155
2,user_ltv_revenue,5.175
3,ecommerce_total_item_quantity,78.232
4,ecommerce_purchase_revenue_in_usd,59.434
5,ecommerce_tax_value_in_usd,10.403
6,ecommerce_unique_items,16.9
7,sum_item_price_in_usd,43.652
8,sum_item_promotions,2.942
9,sum_item_quantity,79.052


In [None]:
# Number of remaining features whose recomputed VIF still exceeds 200.
int((vif_trimmed['VIF'] > 200).sum())

3

After removing extremely high VIF features, only 3 features still have a VIF value over 200. <br>
Let's start by automatically removing features to get all features to a max VIF value of 10.

In [None]:
# call the function and update transaction_features
# NOTE(review): vif_and_drop is defined outside this section; judging by its
# printed output it drops one feature at a time until the remaining VIF values
# are within `thresh` - confirm against its definition.
transaction_features = vif_and_drop(transaction_features, thresh=10)

dropping 'pre_sum_event_name_page_view' at index: 109
dropping 'pre_nunique_event_params_ga_sessions' at index: 101
dropping 'pre_sum_device_mobile_brand_name_Apple' at index: 113
dropping 'pre_sum_device_category_desktop' at index: 111
dropping 'sum_item_child_category_Other' at index: 81
dropping 'pre_sum_event_params_parent_page_ShopbyBrand' at index: 149
dropping 'sum_item_revenue_in_usd' at index: 10
dropping 'sum_event_params_parent_page_CheckoutYourInformation' at index: 46
dropping 'sum_device_category_desktop' at index: 14
dropping 'sum_device_mobile_model_name_Edge' at index: 22
dropping 'pre_sum_item_child_category_Other' at index: 170
dropping 'sum_event_params_parent_page_ShopbyBrand' at index: 51
dropping 'ecommerce_total_item_quantity' at index: 3
dropping 'pre_sum_device_mobile_model_name_Edge' at index: 112
dropping 'pre_sum_device_mobile_model_name_Chrome' at index: 110
dropping 'pre_sum_event_params_child_page_SmallGoods' at index: 151
dropping 'pre_sum_event_params_

In [None]:
# Final check: recompute VIF for the features that survived the automatic
# removal step. Result has one row per feature with the columns 'Feature' and 'VIF'.
vif_reduced = (
    pd.Series(
        [
            variance_inflation_factor(transaction_features.values, col_idx)
            for col_idx in range(transaction_features.shape[1])
        ],
        index=transaction_features.columns,
        dtype=float,
    )
    .rename_axis('Feature')
    .reset_index(name='VIF')
)

In [None]:
# Print a label, then display the final VIF table for the reduced
# transaction feature set.
print('All features VIF < 10:')
vif_reduced

All features VIF < 10:


Unnamed: 0,Feature,VIF
0,transaction_ga_session_number,5.783
1,user_ltv_revenue,4.541
2,ecommerce_tax_value_in_usd,6.158
3,sum_item_price_in_usd,8.343
4,sum_item_promotions,2.613
5,sum_item_quantity,3.816
6,sum_event_name_begin_checkout,4.143
7,sum_event_name_select_item,2.224
8,sum_device_category_mobile,7.963
9,sum_device_mobile_brand_name_Google,4.632


In [None]:
# (rows, columns) of the transaction feature matrix after VIF-based reduction
transaction_features.shape

(4466, 137)

In [None]:
# Reassemble the transaction table: identifier columns first, then the
# VIF-reduced features, then the last column of df_transaction (the `split`
# column in the output below).
id_columns = df_transaction.loc[:, ['user_pseudo_id', 'ecommerce_transaction_id']]
last_column = df_transaction.iloc[:, -1]
df_transaction_reduced = pd.concat(
    [id_columns, transaction_features, last_column],
    axis=1,
)
df_transaction_reduced.head()

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,transaction_ga_session_number,user_ltv_revenue,ecommerce_tax_value_in_usd,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_event_name_begin_checkout,sum_event_name_select_item,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShoppingCart,sum_event_params_child_page_Bags,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Other,sum_item_parent_category_Stationery,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_SmallGoods,su
m_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,avg_event_params_engagement_time_msec,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_max_event_params_ga_session_number,pre_avg_event_params_engagement_time_msec,pre_sum_event_params_engagement_time_msec,pre_sum_event_name_begin_checkout,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_pa
rams_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_transaction,split
0,10111055.876868386,741471,1.0,94.0,10.0,94.0,0.0,3.0,6.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,9.0,0.0,2.0,3.0,7.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,275267.0,48.0,0.0,14581.846,0.0,1.0,31.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TRAIN
1,1019527.5799124268,2105,1.0,44.0,5.0,32.0,0.0,7.0,8.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,7.0,27.0,44.0,186.0,0.0,3.0,4.0,62.0,13.0,90.0,0.0,0.0,0.0,0.0,0.0,2.0,36.0,0.0,3.0,0.0,3.0,33.0,12.0,0.0,0.0,0.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52542.0,13.0,0.0,7147.296,0.0,1.75,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TRAIN
2,1021887.1151788384,76937,1.0,21.0,2.0,10.0,1.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,6.0,129.0,3.0,0.0,4.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33396.0,10.0,1.0,5104.367,1.0,2.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TRAIN
3,1026932.0858862292,339943,1.0,110.0,4.0,110.0,2.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,24.0,3.0,2.0,9.0,4.0,27.0,6.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68076.0,55.0,1.0,13641.206,1.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TRAIN
4,1026932.0858862292,29460,3.0,214.0,4.0,55.0,0.0,1.0,4.0,27.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0,61.0,0.0,0.0,18.0,8.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37441.0,55.0,0.0,5188.885,0.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TRAIN


We've reduced the number of columns from 213 to 137. All VIF scores are now less than 10, so we can move on to joining the transaction and customer data.

In [None]:
# Create an explicit BigQuery table schema from the DataFrame dtypes to prevent
# errors when uploading data. Each entry is a {'name': ..., 'type': ...} dict.
schema = []

for col, dtype in df_transaction_reduced.dtypes.items():
    if dtype == object:
        typ = 'STRING'
    elif dtype == bool:
        typ = 'BOOLEAN'
    elif dtype == 'datetime64[ns]':
        typ = 'DATETIME'
    elif dtype in (int, 'Int64', 'uint8'):
        typ = 'INTEGER'
    elif dtype == float:
        typ = 'FLOAT'
    else:
        # Unmapped dtype: report it and leave the column out of the schema.
        # Previously `typ` was not assigned here, so the column either raised
        # NameError (first column) or silently inherited the previous column's
        # type. pandas-gbq infers the type of columns missing from
        # `table_schema` from the DataFrame dtype.
        print(dtype)
        continue

    col_dict = {'name': col, 'type': typ}
    schema.append(col_dict)

In [None]:
# NOTE: the upload below is commented out, so running this cell does nothing.
# Uncomment it to write the reduced transaction table to BigQuery.
# df_transaction_reduced.to_gbq(f'{project_name}.return_prediction_ga4.step_5_transaction_reduced',
#                                 project_id=project_name,
#                                 if_exists='replace',
#                                 location=region,
#                                 chunksize=100_000,
#                                 table_schema=schema)

# Join transaction and customer data

In [None]:
# Prefix every customer-level column with 'historical_' so the names do not
# collide with the transaction-level columns when the two tables are merged.
# Columns that already carry the prefix are left unchanged, so re-running this
# cell no longer produces 'historical_historical_*' names (which would break
# the merge keys used in the next cell).
df_customer_reduced.columns = [
    col if str(col).startswith('historical_') else f'historical_{col}'
    for col in df_customer_reduced.columns
]

In [None]:
# Left-join the customer-level ("historical_") features onto every transaction.
# The last column of df_transaction_reduced is left out of the left side
# (iloc[:, :-1]).
transaction_keys = ['user_pseudo_id', 'ecommerce_transaction_id']
customer_keys = ['historical_user_pseudo_id', 'historical_ecommerce_transaction_id']

transactions_without_last_column = df_transaction_reduced.iloc[:, :-1]
df_merge = transactions_without_last_column.merge(
    df_customer_reduced,
    how='left',
    left_on=transaction_keys,
    right_on=customer_keys,
)
df_merge.head()

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,transaction_ga_session_number,user_ltv_revenue,ecommerce_tax_value_in_usd,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_event_name_begin_checkout,sum_event_name_select_item,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShoppingCart,sum_event_params_child_page_Bags,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Other,sum_item_parent_category_Stationery,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_SmallGoods,su
m_item_child_subcategory_Backpacks,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,avg_event_params_engagement_time_msec,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_max_event_params_ga_session_number,pre_avg_event_params_engagement_time_msec,pre_sum_event_params_engagement_time_msec,pre_sum_event_name_begin_checkout,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_pa
rams_child_page_Notebooks,pre_sum_event_params_child_page_Other,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_transaction,historical_user_pseudo_id,historical_ecommerce_transaction_id,historical_avg_event_params_engagement_time_msec,historical_avg_item_price_in_usd,historical_avg_item_promotions,historical_avg_item_refund_in_usd,historical_sum_total_return_item_quantity,historical_sum_ecommerce_refund_value_in_usd,historical_max_days_first_session_to_transaction,historical_max_event_params_engagement_time_msec,historical_max_item_promotions,historical_max_item_quantity,historical_sum_item_promotions,historical_sum_item_quantity,historical_sum_device_category_mobile,historical_sum_device_mobile_brand_name_Google,historical_sum_device_mobile_brand_name_Huawei,historical_sum_device_mobile_brand_name_Samsung,historical_sum_device_mobile_brand_name_Xiaomi,historical_sum_device_mobile_model_name_ChromeBook,historical_sum_device_web_info_browser_AndroidWebview,historical_sum_device_web_info_browser_Edge,historical_sum_device_web_info_browser_Safari,historical_sum_geo_country_Canada,historical_sum_geo_country_France,historical_sum_geo_country_India,historical_sum_geo_country_Spain,hi
storical_sum_geo_country_UnitedKingdom,historical_sum_geo_country_UnitedStates,historical_sum_traffic_source_medium_Other,historical_sum_traffic_source_medium_cpc,historical_sum_traffic_source_medium_organic,historical_sum_traffic_source_medium_referral,historical_sum_event_params_parent_page_CampusCollection,historical_sum_event_params_parent_page_EcoFriendly,historical_sum_event_params_parent_page_Home,historical_sum_event_params_parent_page_New,historical_sum_event_params_parent_page_Other,historical_sum_event_params_parent_page_Sale,historical_sum_event_params_parent_page_Stationery,historical_sum_event_params_child_page_Google,historical_sum_event_params_child_page_Hats,historical_sum_event_params_child_page_Notebooks,historical_sum_event_params_child_page_Other,historical_sum_event_params_child_page_Socks,historical_sum_event_params_child_page_Stickers,historical_sum_event_params_child_page_Writing,historical_sum_event_params_child_page_YouTube,historical_sum_item_parent_category_New,historical_sum_item_parent_category_Other,historical_sum_item_child_category_Google,historical_sum_item_child_category_Kids,historical_sum_item_child_category_MensUnisex,historical_sum_item_child_category_Womens,historical_sum_item_child_subcategory_Backpacks,historical_sum_item_child_subcategory_ElectronicsAccessories,historical_sum_item_child_subcategory_Infant,historical_sum_item_child_subcategory_MensTShirts,historical_sum_item_child_subcategory_MugsTumblers,historical_sum_item_child_subcategory_Other,historical_sum_item_child_subcategory_WaterBottles,historical_pre_sum_event_params_engagement_time_msec,historical_pre_sum_event_name_add_to_cart,historical_pre_sum_event_name_begin_checkout,historical_pre_sum_event_name_select_item,historical_pre_sum_device_category_mobile,historical_pre_sum_device_mobile_brand_name_Google,historical_pre_sum_device_mobile_brand_name_Huawei,historical_pre_sum_device_mobile_brand_name_Microsoft,historical_pre_sum_device_mobile_brand_name_Samsung,h
istorical_pre_sum_device_mobile_brand_name_Xiaomi,historical_pre_sum_device_mobile_model_name_ChromeBook,historical_pre_sum_device_mobile_model_name_Safari,historical_pre_sum_device_mobile_model_name_iPad,historical_pre_sum_device_mobile_model_name_iPhone,historical_pre_sum_device_web_info_browser_AndroidWebview,historical_pre_sum_device_web_info_browser_Firefox,historical_pre_sum_device_web_info_browser_Safari,historical_pre_sum_geo_country_Canada,historical_pre_sum_geo_country_France,historical_pre_sum_geo_country_India,historical_pre_sum_geo_country_Other,historical_pre_sum_geo_country_Spain,historical_pre_sum_geo_country_UnitedKingdom,historical_pre_sum_traffic_source_medium_Other,historical_pre_sum_traffic_source_medium_cpc,historical_pre_sum_traffic_source_medium_organic,historical_pre_sum_traffic_source_medium_referral,historical_pre_sum_event_params_parent_page_CampusCollection,historical_pre_sum_event_params_parent_page_CheckoutConfirmation,historical_pre_sum_event_params_parent_page_EcoFriendly,historical_pre_sum_event_params_parent_page_Home,historical_pre_sum_event_params_parent_page_New,historical_pre_sum_event_params_parent_page_Other,historical_pre_sum_event_params_parent_page_PaymentMethod,historical_pre_sum_event_params_parent_page_Sale,historical_pre_sum_event_params_parent_page_ShoppingCart,historical_pre_sum_event_params_child_page_Google,historical_pre_sum_event_params_child_page_Hats,historical_pre_sum_event_params_child_page_Kids,historical_pre_sum_event_params_child_page_Notebooks,historical_pre_sum_event_params_child_page_Other,historical_pre_sum_event_params_child_page_Socks,historical_pre_sum_event_params_child_page_Stickers,historical_pre_sum_event_params_child_page_Womens,historical_pre_sum_event_params_child_page_Writing,historical_pre_sum_event_params_child_page_YouTube,historical_pre_sum_item_parent_category_Other,historical_pre_sum_item_parent_category_ShopbyBrand,historical_pre_sum_item_parent_category_Stationery,historical_pre_sum_
item_child_category_Bags,historical_pre_sum_item_child_category_Drinkware,historical_pre_sum_item_child_category_MensUnisex,historical_pre_sum_item_child_category_SmallGoods,historical_pre_sum_item_child_subcategory_Backpacks,historical_pre_sum_item_child_subcategory_ElectronicsAccessories,historical_pre_sum_item_child_subcategory_Infant,historical_pre_sum_item_child_subcategory_MensTShirts,historical_pre_sum_item_child_subcategory_MugsTumblers,historical_pre_sum_item_child_subcategory_Other,historical_pre_sum_item_child_subcategory_WaterBottles,historical_pre_max_event_params_ga_session_number,historical_pre_stdev_max_event_params_engagement_time_msec,historical_pre_stdev_avg_event_params_engagement_time_msec,historical_pre_avg_event_params_engagement_time_msec,historical_recency,historical_age,historical_split
0,10111055.876868386,741471,1.0,94.0,10.0,94.0,0.0,3.0,6.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,9.0,0.0,2.0,3.0,7.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,275267.0,48.0,0.0,14581.846,0.0,1.0,31.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10111055.876868386,741471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,TRAIN
1,1019527.5799124268,2105,1.0,44.0,5.0,32.0,0.0,7.0,8.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,7.0,27.0,44.0,186.0,0.0,3.0,4.0,62.0,13.0,90.0,0.0,0.0,0.0,0.0,0.0,2.0,36.0,0.0,3.0,0.0,3.0,33.0,12.0,0.0,0.0,0.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52542.0,13.0,0.0,7147.296,0.0,1.75,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1019527.5799124268,2105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,TRAIN
2,1021887.1151788384,76937,1.0,21.0,2.0,10.0,1.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,6.0,129.0,3.0,0.0,4.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33396.0,10.0,1.0,5104.367,1.0,2.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1021887.1151788384,76937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,TRAIN
3,1026932.0858862292,339943,1.0,110.0,4.0,110.0,2.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,24.0,3.0,2.0,9.0,4.0,27.0,6.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68076.0,55.0,1.0,13641.206,1.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1026932.0858862292,339943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,TRAIN
4,1026932.0858862292,29460,3.0,214.0,4.0,55.0,0.0,1.0,4.0,27.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0,61.0,0.0,0.0,18.0,8.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37441.0,55.0,0.0,5188.885,0.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1026932.0858862292,29460,13641.206,55.0,1.0,0.0,0.0,0.0,0.0,68076.0,1.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,24.0,2.0,9.0,27.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,TRAIN


In [None]:
# drop repeated columns
df_merge = df_merge.drop(['historical_user_pseudo_id', 'historical_ecommerce_transaction_id'], axis=1)
df_merge = df_merge.rename({'historical_split': 'split'}, axis=1)
df_merge.shape

(4466, 265)

# Joint VIF analysis

In [None]:
# remove id columns from analysis
merged_features = df_merge.drop(['user_pseudo_id', 'ecommerce_transaction_id', 'split'], axis=1)
merged_features.head()

Unnamed: 0,transaction_ga_session_number,user_ltv_revenue,ecommerce_tax_value_in_usd,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_event_name_begin_checkout,sum_event_name_select_item,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShoppingCart,sum_event_params_child_page_Bags,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Other,sum_item_parent_category_Stationery,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_Kids,sum_item_child_category_MensUnisex,sum_item_child_category_SmallGoods,sum_item_child_subcategory_Backpacks,sum_i
tem_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_price_in_usd,max_item_promotions,avg_event_params_engagement_time_msec,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_stdev_max_event_params_engagement_time_msec,pre_stdev_avg_event_params_engagement_time_msec,pre_max_event_params_ga_session_number,pre_avg_event_params_engagement_time_msec,pre_sum_event_params_engagement_time_msec,pre_sum_event_name_begin_checkout,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_device_category_mobile,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Huawei,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_device_web_info_browser_Safari,pre_sum_geo_country_Canada,pre_sum_geo_country_France,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_Other,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_parent_page_ShoppingCart,pre_sum_event_params_child_page_Google,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Notebooks,pre_sum_event_
params_child_page_Other,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_Writing,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_New,pre_sum_item_parent_category_Other,pre_sum_item_parent_category_ShopbyBrand,pre_sum_item_parent_category_Stationery,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_category_SmallGoods,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_Infant,pre_sum_item_child_subcategory_MensTShirts,pre_sum_item_child_subcategory_MugsTumblers,pre_sum_item_child_subcategory_Other,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_transaction,historical_avg_event_params_engagement_time_msec,historical_avg_item_price_in_usd,historical_avg_item_promotions,historical_avg_item_refund_in_usd,historical_sum_total_return_item_quantity,historical_sum_ecommerce_refund_value_in_usd,historical_max_days_first_session_to_transaction,historical_max_event_params_engagement_time_msec,historical_max_item_promotions,historical_max_item_quantity,historical_sum_item_promotions,historical_sum_item_quantity,historical_sum_device_category_mobile,historical_sum_device_mobile_brand_name_Google,historical_sum_device_mobile_brand_name_Huawei,historical_sum_device_mobile_brand_name_Samsung,historical_sum_device_mobile_brand_name_Xiaomi,historical_sum_device_mobile_model_name_ChromeBook,historical_sum_device_web_info_browser_AndroidWebview,historical_sum_device_web_info_browser_Edge,historical_sum_device_web_info_browser_Safari,historical_sum_geo_country_Canada,historical_sum_geo_country_France,historical_sum_geo_country_India,historical_sum_geo_country_Spain,historical_sum_geo_country_UnitedKingdom,historical_sum_geo_country_UnitedStates,historical_sum_traffic_
source_medium_Other,historical_sum_traffic_source_medium_cpc,historical_sum_traffic_source_medium_organic,historical_sum_traffic_source_medium_referral,historical_sum_event_params_parent_page_CampusCollection,historical_sum_event_params_parent_page_EcoFriendly,historical_sum_event_params_parent_page_Home,historical_sum_event_params_parent_page_New,historical_sum_event_params_parent_page_Other,historical_sum_event_params_parent_page_Sale,historical_sum_event_params_parent_page_Stationery,historical_sum_event_params_child_page_Google,historical_sum_event_params_child_page_Hats,historical_sum_event_params_child_page_Notebooks,historical_sum_event_params_child_page_Other,historical_sum_event_params_child_page_Socks,historical_sum_event_params_child_page_Stickers,historical_sum_event_params_child_page_Writing,historical_sum_event_params_child_page_YouTube,historical_sum_item_parent_category_New,historical_sum_item_parent_category_Other,historical_sum_item_child_category_Google,historical_sum_item_child_category_Kids,historical_sum_item_child_category_MensUnisex,historical_sum_item_child_category_Womens,historical_sum_item_child_subcategory_Backpacks,historical_sum_item_child_subcategory_ElectronicsAccessories,historical_sum_item_child_subcategory_Infant,historical_sum_item_child_subcategory_MensTShirts,historical_sum_item_child_subcategory_MugsTumblers,historical_sum_item_child_subcategory_Other,historical_sum_item_child_subcategory_WaterBottles,historical_pre_sum_event_params_engagement_time_msec,historical_pre_sum_event_name_add_to_cart,historical_pre_sum_event_name_begin_checkout,historical_pre_sum_event_name_select_item,historical_pre_sum_device_category_mobile,historical_pre_sum_device_mobile_brand_name_Google,historical_pre_sum_device_mobile_brand_name_Huawei,historical_pre_sum_device_mobile_brand_name_Microsoft,historical_pre_sum_device_mobile_brand_name_Samsung,historical_pre_sum_device_mobile_brand_name_Xiaomi,historical_pre_sum_device_mobile_model_name_ChromeBo
ok,historical_pre_sum_device_mobile_model_name_Safari,historical_pre_sum_device_mobile_model_name_iPad,historical_pre_sum_device_mobile_model_name_iPhone,historical_pre_sum_device_web_info_browser_AndroidWebview,historical_pre_sum_device_web_info_browser_Firefox,historical_pre_sum_device_web_info_browser_Safari,historical_pre_sum_geo_country_Canada,historical_pre_sum_geo_country_France,historical_pre_sum_geo_country_India,historical_pre_sum_geo_country_Other,historical_pre_sum_geo_country_Spain,historical_pre_sum_geo_country_UnitedKingdom,historical_pre_sum_traffic_source_medium_Other,historical_pre_sum_traffic_source_medium_cpc,historical_pre_sum_traffic_source_medium_organic,historical_pre_sum_traffic_source_medium_referral,historical_pre_sum_event_params_parent_page_CampusCollection,historical_pre_sum_event_params_parent_page_CheckoutConfirmation,historical_pre_sum_event_params_parent_page_EcoFriendly,historical_pre_sum_event_params_parent_page_Home,historical_pre_sum_event_params_parent_page_New,historical_pre_sum_event_params_parent_page_Other,historical_pre_sum_event_params_parent_page_PaymentMethod,historical_pre_sum_event_params_parent_page_Sale,historical_pre_sum_event_params_parent_page_ShoppingCart,historical_pre_sum_event_params_child_page_Google,historical_pre_sum_event_params_child_page_Hats,historical_pre_sum_event_params_child_page_Kids,historical_pre_sum_event_params_child_page_Notebooks,historical_pre_sum_event_params_child_page_Other,historical_pre_sum_event_params_child_page_Socks,historical_pre_sum_event_params_child_page_Stickers,historical_pre_sum_event_params_child_page_Womens,historical_pre_sum_event_params_child_page_Writing,historical_pre_sum_event_params_child_page_YouTube,historical_pre_sum_item_parent_category_Other,historical_pre_sum_item_parent_category_ShopbyBrand,historical_pre_sum_item_parent_category_Stationery,historical_pre_sum_item_child_category_Bags,historical_pre_sum_item_child_category_Drinkware,historical_pre_sum_item_chil
d_category_MensUnisex,historical_pre_sum_item_child_category_SmallGoods,historical_pre_sum_item_child_subcategory_Backpacks,historical_pre_sum_item_child_subcategory_ElectronicsAccessories,historical_pre_sum_item_child_subcategory_Infant,historical_pre_sum_item_child_subcategory_MensTShirts,historical_pre_sum_item_child_subcategory_MugsTumblers,historical_pre_sum_item_child_subcategory_Other,historical_pre_sum_item_child_subcategory_WaterBottles,historical_pre_max_event_params_ga_session_number,historical_pre_stdev_max_event_params_engagement_time_msec,historical_pre_stdev_avg_event_params_engagement_time_msec,historical_pre_avg_event_params_engagement_time_msec,historical_recency,historical_age
0,1.0,94.0,10.0,94.0,0.0,3.0,6.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,9.0,0.0,2.0,3.0,7.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,275267.0,48.0,0.0,14581.846,0.0,1.0,31.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0
1,1.0,44.0,5.0,32.0,0.0,7.0,8.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,7.0,27.0,44.0,186.0,0.0,3.0,4.0,62.0,13.0,90.0,0.0,0.0,0.0,0.0,0.0,2.0,36.0,0.0,3.0,0.0,3.0,33.0,12.0,0.0,0.0,0.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52542.0,13.0,0.0,7147.296,0.0,1.75,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0
2,1.0,21.0,2.0,10.0,1.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,6.0,129.0,3.0,0.0,4.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33396.0,10.0,1.0,5104.367,1.0,2.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,1.0,110.0,4.0,110.0,2.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,24.0,3.0,2.0,9.0,4.0,27.0,6.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68076.0,55.0,1.0,13641.206,1.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
4,3.0,214.0,4.0,55.0,0.0,1.0,4.0,27.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0,61.0,0.0,0.0,18.0,8.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37441.0,55.0,0.0,5188.885,0.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13641.206,55.0,1.0,0.0,0.0,0.0,0.0,68076.0,1.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,24.0,2.0,9.0,27.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0


In [None]:
# (rows, columns) of merged_features going into the combined VIF check below
merged_features.shape

(4466, 262)

In [None]:
# Baseline VIF for every column of the merged (customer + transaction) feature set.
# Result: one row per feature, with columns 'Feature' and 'VIF'.
# NOTE(review): the matrix is passed to statsmodels as-is; if it carries no intercept
# column the VIFs are uncentered -- confirm this matches how vif_and_drop computes VIF.
vif_before = (
    pd.Series(
        [
            variance_inflation_factor(merged_features.values, col_idx)
            for col_idx in range(len(merged_features.columns))
        ],
        index=merged_features.columns,
        dtype=float,
    )
    .rename_axis('Feature')
    .reset_index(name='VIF')
)

In [None]:
# label the output, then display the per-feature VIF table held in vif_before
print('Before feature selection:')
vif_before

Before feature selection:


Unnamed: 0,Feature,VIF
0,transaction_ga_session_number,7.423
1,user_ltv_revenue,6.918
2,ecommerce_tax_value_in_usd,6.405
3,sum_item_price_in_usd,8.993
4,sum_item_promotions,2.683
5,sum_item_quantity,3.948
6,sum_event_name_begin_checkout,4.5
7,sum_event_name_select_item,2.346
8,sum_device_category_mobile,9.046
9,sum_device_mobile_brand_name_Google,5.025


In [None]:
# number of features whose VIF is above 5
int((vif_before['VIF'] > 5).sum())

176

Again, since so many of our features have such high VIF values, let's start by dropping the features with a VIF value over 100 and then recalculating VIF.

In [None]:
# Drop every feature whose VIF exceeds 100 (48 features in this run: 262 -> 214 columns).
# errors='ignore' keeps the cell safe to re-run: on a second execution the flagged
# columns are already gone from merged_features, and a plain drop would raise KeyError.
merged_features = merged_features.drop(
    columns=vif_before.loc[vif_before['VIF'] > 100, 'Feature'].tolist(),
    errors='ignore',
)
merged_features.shape

(4466, 214)

In [None]:
# Re-examine VIF values after dropping the extremely high VIF features.
# Result: one row per remaining feature, with columns 'Feature' and 'VIF'.
vif_trimmed = (
    pd.Series(
        [
            variance_inflation_factor(merged_features.values, col_idx)
            for col_idx in range(len(merged_features.columns))
        ],
        index=merged_features.columns,
        dtype=float,
    )
    .rename_axis('Feature')
    .reset_index(name='VIF')
)

In [None]:
# label the output, then display the VIF table recomputed after the >100 cut
print('After trimming extremely high VIF features:')
vif_trimmed

After trimming extremely high VIF features:


Unnamed: 0,Feature,VIF
0,transaction_ga_session_number,7.36
1,user_ltv_revenue,6.537
2,ecommerce_tax_value_in_usd,6.244
3,sum_item_price_in_usd,8.94
4,sum_item_promotions,2.663
5,sum_item_quantity,3.911
6,sum_event_name_begin_checkout,4.327
7,sum_event_name_select_item,2.32
8,sum_device_category_mobile,7.344
9,sum_device_mobile_brand_name_Google,4.909


In [None]:
# number of remaining features whose VIF is still above 50
int((vif_trimmed['VIF'] > 50).sum())

15

After removing extremely high VIF features, 15 features still have a VIF value over 50. <br>
Let's start by automatically removing features to get all features to a max VIF value of 10.

In [None]:
# Call vif_and_drop (defined earlier in this notebook) and rebind merged_features to
# its result. The cell output shows it logging each dropped feature name and index.
# NOTE(review): thresh=10 is presumably the maximum VIF allowed to remain -- confirm
# against the vif_and_drop definition.
merged_features = vif_and_drop(merged_features, thresh=10)

dropping 'historical_pre_sum_event_params_child_page_Socks' at index: 201
dropping 'historical_pre_sum_device_mobile_model_name_ChromeBook' at index: 178
dropping 'historical_pre_sum_event_params_child_page_Hats' at index: 198
dropping 'historical_pre_sum_geo_country_Other' at index: 184
dropping 'historical_pre_sum_item_child_category_Bags' at index: 201
dropping 'historical_pre_sum_geo_country_India' at index: 183
dropping 'historical_pre_sum_event_params_parent_page_Sale' at index: 195
dropping 'historical_pre_max_event_params_ga_session_number' at index: 203
dropping 'historical_pre_sum_event_params_parent_page_EcoFriendly' at index: 191
dropping 'historical_pre_sum_event_params_parent_page_PaymentMethod' at index: 193
dropping 'historical_pre_sum_traffic_source_medium_referral' at index: 188
dropping 'historical_pre_sum_device_mobile_brand_name_Google' at index: 174
dropping 'historical_pre_sum_device_mobile_model_name_iPad' at index: 177
dropping 'historical_pre_sum_geo_country_C

In [None]:
# Final check: recompute VIF for every feature that survived vif_and_drop.
# Result: one row per remaining feature, with columns 'Feature' and 'VIF'.
vif_reduced = (
    pd.Series(
        [
            variance_inflation_factor(merged_features.values, col_idx)
            for col_idx in range(len(merged_features.columns))
        ],
        index=merged_features.columns,
        dtype=float,
    )
    .rename_axis('Feature')
    .reset_index(name='VIF')
)

In [None]:
# label the output, then display the final per-feature VIF table
print('All features VIF < 10:')
vif_reduced

All features VIF < 10:


Unnamed: 0,Feature,VIF
0,transaction_ga_session_number,6.359
1,user_ltv_revenue,6.384
2,ecommerce_tax_value_in_usd,6.141
3,sum_item_price_in_usd,5.886
4,sum_item_promotions,2.644
5,sum_item_quantity,3.881
6,sum_event_name_begin_checkout,4.303
7,sum_event_name_select_item,2.258
8,sum_device_category_mobile,7.194
9,sum_device_mobile_brand_name_Google,4.69


In [None]:
# (rows, columns) of merged_features after the vif_and_drop pass
merged_features.shape

(4466, 181)

In [None]:
# Re-attach the two identifier columns and the split label to the VIF-reduced features.
# The split label is selected by name instead of by position (previously
# df_merge.iloc[:, -1]) so a change in df_merge's column order cannot silently
# attach the wrong column.
# pd.concat(axis=1) aligns rows on the index; this assumes merged_features still
# shares df_merge's index -- TODO confirm.
df_merge_reduced = pd.concat(
    [
        df_merge[['user_pseudo_id', 'ecommerce_transaction_id']],
        merged_features,
        df_merge['split'],
    ],
    axis=1,
)
df_merge_reduced.head()

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,transaction_ga_session_number,user_ltv_revenue,ecommerce_tax_value_in_usd,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_event_name_begin_checkout,sum_event_name_select_item,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShoppingCart,sum_event_params_child_page_Bags,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Other,sum_item_parent_category_Stationery,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_MensUnisex,sum_item_child_category_SmallGoods,sum_item_child_subcategory_Back
packs,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_promotions,avg_event_params_engagement_time_msec,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_max_event_params_ga_session_number,pre_avg_event_params_engagement_time_msec,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_geo_country_Canada,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_New,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_trans
action,historical_avg_event_params_engagement_time_msec,historical_avg_item_price_in_usd,historical_avg_item_promotions,historical_avg_item_refund_in_usd,historical_sum_total_return_item_quantity,historical_sum_ecommerce_refund_value_in_usd,historical_max_days_first_session_to_transaction,historical_max_event_params_engagement_time_msec,historical_max_item_promotions,historical_max_item_quantity,historical_sum_item_promotions,historical_sum_item_quantity,historical_sum_device_category_mobile,historical_sum_device_mobile_brand_name_Google,historical_sum_device_mobile_brand_name_Samsung,historical_sum_device_mobile_brand_name_Xiaomi,historical_sum_device_mobile_model_name_ChromeBook,historical_sum_device_web_info_browser_AndroidWebview,historical_sum_device_web_info_browser_Edge,historical_sum_device_web_info_browser_Safari,historical_sum_geo_country_Canada,historical_sum_geo_country_France,historical_sum_geo_country_India,historical_sum_geo_country_Spain,historical_sum_geo_country_UnitedKingdom,historical_sum_geo_country_UnitedStates,historical_sum_traffic_source_medium_Other,historical_sum_traffic_source_medium_cpc,historical_sum_traffic_source_medium_organic,historical_sum_traffic_source_medium_referral,historical_sum_event_params_parent_page_CampusCollection,historical_sum_event_params_parent_page_EcoFriendly,historical_sum_event_params_parent_page_Home,historical_sum_event_params_parent_page_New,historical_sum_event_params_parent_page_Other,historical_sum_event_params_parent_page_Sale,historical_sum_event_params_parent_page_Stationery,historical_sum_event_params_child_page_Google,historical_sum_event_params_child_page_Hats,historical_sum_event_params_child_page_Notebooks,historical_sum_event_params_child_page_Other,historical_sum_event_params_child_page_Socks,historical_sum_event_params_child_page_Stickers,historical_sum_event_params_child_page_Writing,historical_sum_event_params_child_page_YouTube,historical_sum_item_parent_category_New,historical_sum_item_paren
t_category_Other,historical_sum_item_child_category_Google,historical_sum_item_child_category_Kids,historical_sum_item_child_category_MensUnisex,historical_sum_item_child_category_Womens,historical_sum_item_child_subcategory_Backpacks,historical_sum_item_child_subcategory_ElectronicsAccessories,historical_sum_item_child_subcategory_Infant,historical_sum_item_child_subcategory_MensTShirts,historical_sum_item_child_subcategory_MugsTumblers,historical_sum_item_child_subcategory_Other,historical_sum_item_child_subcategory_WaterBottles,historical_pre_sum_event_name_add_to_cart,historical_pre_sum_device_mobile_brand_name_Microsoft,historical_pre_sum_device_mobile_model_name_iPhone,historical_pre_sum_event_params_parent_page_CheckoutConfirmation,historical_pre_sum_event_params_parent_page_New,historical_pre_sum_item_child_subcategory_Backpacks,historical_pre_sum_item_child_subcategory_ElectronicsAccessories,historical_pre_sum_item_child_subcategory_WaterBottles,historical_recency,historical_age,split
0,10111055.876868386,741471,1.0,94.0,10.0,94.0,0.0,3.0,6.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,9.0,0.0,2.0,3.0,7.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,275267.0,0.0,14581.846,0.0,1.0,31.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,TRAIN
1,1019527.5799124268,2105,1.0,44.0,5.0,32.0,0.0,7.0,8.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,7.0,27.0,44.0,186.0,0.0,3.0,4.0,62.0,13.0,90.0,0.0,0.0,0.0,0.0,0.0,2.0,36.0,0.0,3.0,0.0,3.0,33.0,12.0,0.0,0.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52542.0,0.0,7147.296,0.0,1.75,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,TRAIN
2,1021887.1151788384,76937,1.0,21.0,2.0,10.0,1.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,6.0,129.0,3.0,0.0,4.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33396.0,1.0,5104.367,1.0,2.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,TRAIN
3,1026932.0858862292,339943,1.0,110.0,4.0,110.0,2.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,24.0,3.0,2.0,9.0,4.0,27.0,6.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68076.0,1.0,13641.206,1.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,TRAIN
4,1026932.0858862292,29460,3.0,214.0,4.0,55.0,0.0,1.0,4.0,27.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0,61.0,0.0,0.0,18.0,8.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37441.0,0.0,5188.885,0.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13641.206,55.0,1.0,0.0,0.0,0.0,0.0,68076.0,1.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,24.0,2.0,9.0,27.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,TRAIN


We've reduced the number of columns from 262 to 181. All scores are now less than 10, so we can move on to the next step. <br>
At this point, we should also re-attach our target column, `ecommerce_refund_value_in_usd`.

In [None]:
# Left-join the target column back onto the reduced feature table, matching on the
# (user_pseudo_id, ecommerce_transaction_id) pair so every feature row is kept.
# NOTE(review): assumes df_transaction holds at most one row per key pair; duplicate
# keys there would duplicate feature rows -- confirm.
df_merge_final = df_merge_reduced.merge(
    df_transaction.loc[:, ['user_pseudo_id', 'ecommerce_transaction_id', 'ecommerce_refund_value_in_usd']],
    on=['user_pseudo_id', 'ecommerce_transaction_id'],
    how='left',
)
df_merge_final.head()

Unnamed: 0,user_pseudo_id,ecommerce_transaction_id,transaction_ga_session_number,user_ltv_revenue,ecommerce_tax_value_in_usd,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_event_name_begin_checkout,sum_event_name_select_item,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShoppingCart,sum_event_params_child_page_Bags,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Other,sum_item_parent_category_Stationery,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_MensUnisex,sum_item_child_category_SmallGoods,sum_item_child_subcategory_Back
packs,sum_item_child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_promotions,avg_event_params_engagement_time_msec,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_max_event_params_ga_session_number,pre_avg_event_params_engagement_time_msec,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_geo_country_Canada,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_New,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_trans
action,historical_avg_event_params_engagement_time_msec,historical_avg_item_price_in_usd,historical_avg_item_promotions,historical_avg_item_refund_in_usd,historical_sum_total_return_item_quantity,historical_sum_ecommerce_refund_value_in_usd,historical_max_days_first_session_to_transaction,historical_max_event_params_engagement_time_msec,historical_max_item_promotions,historical_max_item_quantity,historical_sum_item_promotions,historical_sum_item_quantity,historical_sum_device_category_mobile,historical_sum_device_mobile_brand_name_Google,historical_sum_device_mobile_brand_name_Samsung,historical_sum_device_mobile_brand_name_Xiaomi,historical_sum_device_mobile_model_name_ChromeBook,historical_sum_device_web_info_browser_AndroidWebview,historical_sum_device_web_info_browser_Edge,historical_sum_device_web_info_browser_Safari,historical_sum_geo_country_Canada,historical_sum_geo_country_France,historical_sum_geo_country_India,historical_sum_geo_country_Spain,historical_sum_geo_country_UnitedKingdom,historical_sum_geo_country_UnitedStates,historical_sum_traffic_source_medium_Other,historical_sum_traffic_source_medium_cpc,historical_sum_traffic_source_medium_organic,historical_sum_traffic_source_medium_referral,historical_sum_event_params_parent_page_CampusCollection,historical_sum_event_params_parent_page_EcoFriendly,historical_sum_event_params_parent_page_Home,historical_sum_event_params_parent_page_New,historical_sum_event_params_parent_page_Other,historical_sum_event_params_parent_page_Sale,historical_sum_event_params_parent_page_Stationery,historical_sum_event_params_child_page_Google,historical_sum_event_params_child_page_Hats,historical_sum_event_params_child_page_Notebooks,historical_sum_event_params_child_page_Other,historical_sum_event_params_child_page_Socks,historical_sum_event_params_child_page_Stickers,historical_sum_event_params_child_page_Writing,historical_sum_event_params_child_page_YouTube,historical_sum_item_parent_category_New,historical_sum_item_paren
t_category_Other,historical_sum_item_child_category_Google,historical_sum_item_child_category_Kids,historical_sum_item_child_category_MensUnisex,historical_sum_item_child_category_Womens,historical_sum_item_child_subcategory_Backpacks,historical_sum_item_child_subcategory_ElectronicsAccessories,historical_sum_item_child_subcategory_Infant,historical_sum_item_child_subcategory_MensTShirts,historical_sum_item_child_subcategory_MugsTumblers,historical_sum_item_child_subcategory_Other,historical_sum_item_child_subcategory_WaterBottles,historical_pre_sum_event_name_add_to_cart,historical_pre_sum_device_mobile_brand_name_Microsoft,historical_pre_sum_device_mobile_model_name_iPhone,historical_pre_sum_event_params_parent_page_CheckoutConfirmation,historical_pre_sum_event_params_parent_page_New,historical_pre_sum_item_child_subcategory_Backpacks,historical_pre_sum_item_child_subcategory_ElectronicsAccessories,historical_pre_sum_item_child_subcategory_WaterBottles,historical_recency,historical_age,split,ecommerce_refund_value_in_usd
0,10111055.876868386,741471,1.0,94.0,10.0,94.0,0.0,3.0,6.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,9.0,0.0,2.0,3.0,7.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,275267.0,0.0,14581.846,0.0,1.0,31.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,TRAIN,0.0
1,1019527.5799124268,2105,1.0,44.0,5.0,32.0,0.0,7.0,8.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,7.0,27.0,44.0,186.0,0.0,3.0,4.0,62.0,13.0,90.0,0.0,0.0,0.0,0.0,0.0,2.0,36.0,0.0,3.0,0.0,3.0,33.0,12.0,0.0,0.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52542.0,0.0,7147.296,0.0,1.75,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,TRAIN,0.0
2,1021887.1151788384,76937,1.0,21.0,2.0,10.0,1.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,6.0,129.0,3.0,0.0,4.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33396.0,1.0,5104.367,1.0,2.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,TRAIN,0.0
3,1026932.0858862292,339943,1.0,110.0,4.0,110.0,2.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,24.0,3.0,2.0,9.0,4.0,27.0,6.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68076.0,1.0,13641.206,1.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,TRAIN,0.0
4,1026932.0858862292,29460,3.0,214.0,4.0,55.0,0.0,1.0,4.0,27.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0,61.0,0.0,0.0,18.0,8.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37441.0,0.0,5188.885,0.0,1.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13641.206,55.0,1.0,0.0,0.0,0.0,0.0,68076.0,1.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,24.0,2.0,9.0,27.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,TRAIN,0.0


In [None]:
# Create an explicit BigQuery table schema to prevent type errors when uploading
# the merged data. `schema` is a list of {'name': <column>, 'type': <BQ type>}
# dicts, one per column of `df_merge_final` whose dtype has a known mapping.


def _bigquery_type(dtype):
    """Return the BigQuery type name for a pandas/numpy dtype.

    Parameters
    ----------
    dtype : numpy.dtype or pandas extension dtype
        The dtype of a single DataFrame column.

    Returns
    -------
    str or None
        One of 'BOOLEAN', 'INTEGER', 'FLOAT', 'DATETIME', 'TIMESTAMP',
        'STRING'; None when the dtype has no mapping here.
    """
    types = pd.api.types
    if types.is_bool_dtype(dtype):
        return 'BOOLEAN'
    # Covers every integer width (int8..int64, uint8..uint64) and the nullable
    # pandas integers such as 'Int64', not just the platform default `int`.
    if types.is_integer_dtype(dtype):
        return 'INTEGER'
    # Covers float16/32/64 and the nullable pandas floats.
    if types.is_float_dtype(dtype):
        return 'FLOAT'
    # Timezone-naive datetimes map to DATETIME; timezone-aware ones to TIMESTAMP.
    if types.is_datetime64_dtype(dtype):
        return 'DATETIME'
    if isinstance(dtype, pd.DatetimeTZDtype):
        return 'TIMESTAMP'
    if types.is_object_dtype(dtype) or types.is_string_dtype(dtype):
        return 'STRING'
    return None


schema = []

for col, dtype in df_merge_final.dtypes.items():
    typ = _bigquery_type(dtype)

    if typ is None:
        # Previously this branch only printed the dtype and then reused the
        # type left over from the prior column (or raised NameError on the
        # first column). Report the column and leave it out of the schema so
        # pandas_gbq infers its type from the DataFrame instead.
        print(f"No BigQuery type mapping for column '{col}' (dtype {dtype}); "
              "leaving it out of the explicit schema")
        continue

    schema.append({'name': col, 'type': typ})

In [None]:
# df_merge_final.to_gbq(f'{project_name}.return_prediction_ga4.step_5_merged_final',
#                         project_id=project_name,
#                         if_exists='replace',
#                         location=region,
#                         chunksize=100_000,
#                         table_schema=schema)