<a href="https://colab.research.google.com/github/Adlucent/ga4-return-prediction/blob/main/7_Model_Building_and_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 7. Model Building

In [None]:
# ------------------------------------------------------------------------------
# Notebook configuration -- change the BigQuery project name below before running
# ------------------------------------------------------------------------------

project_name = "adl-analytics"  # BigQuery project used by every query in this notebook
region = "US"  # GCP project region

In [None]:
# pandas_gbq is imported in the setup cell below. If your runtime does not
# have it, uncomment the next line to install it into the kernel's environment:
# %pip install pandas_gbq

In [None]:
# Google credentials: authenticate the Colab user before any BigQuery client is created
from google.colab import auth
auth.authenticate_user()

# BigQuery Magics
'''BigQuery magics are used to run BigQuery SQL queries in a python environment.
These queries can also be run in the BigQuery UI '''

from google.cloud import bigquery
from google.cloud.bigquery import magics, Client, QueryJobConfig

# Use the project configured in `project_name` for the magics context and the client
magics.context.project = project_name #update project name
client = bigquery.Client(project=magics.context.project)

# Interface between Jupyter and BigQuery
import pandas_gbq

# data processing libraries + ML tools
import pandas as pd
import numpy as np

# ML model evaluation metrics (used to score the final model's test-set predictions)
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    median_absolute_error,
    r2_score
)

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# apply seaborn's default plot styling
sns.set()

# suppress notebook warnings
# NOTE(review): 'ignore' silences every warning category, including deprecation notices
import warnings
warnings.filterwarnings('ignore')

# dataframe formatting: no column/row truncation, floats rendered with 3 decimals
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# Build Models
Using BigQuery ML

## Linear Regression Model
See hyperparameter details and model options [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-glm). <BR>
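**Hyperparameters being tuned:** `L1_REG` and `L2_REG` (the L1 and L2 regularization strengths), each searched over the range 0–10 across 20 trials. The tuning objective is mean absolute error (MAE).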

In [None]:
%%bigquery lr_model --project $project_name
-- Train a linear regression with BigQuery ML, tuning the L1/L2 regularization
-- strengths over NUM_TRIALS trials with mean absolute error as the objective.
CREATE OR REPLACE MODEL `return_prediction_ga4.lr_model`
OPTIONS (
    MODEL_TYPE = 'LINEAR_REG',
    FIT_INTERCEPT = TRUE,
    L1_REG = HPARAM_RANGE(0, 10.0),
    L2_REG = HPARAM_RANGE(0, 10.0),
    MAX_ITERATIONS = 50,
    EARLY_STOP = FALSE,
    DATA_SPLIT_METHOD = 'CUSTOM',
    DATA_SPLIT_COL = 'SPLIT_COL',
    NUM_TRIALS = 20,
    HPARAM_TUNING_ALGORITHM = 'VIZIER_DEFAULT',
    HPARAM_TUNING_OBJECTIVES = ['MEAN_ABSOLUTE_ERROR']
) AS
SELECT
    * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
    ecommerce_refund_value_in_usd AS label,
    -- The source table tags validation rows 'VALID'; the split column names them 'EVAL'.
    CASE split WHEN 'VALID' THEN 'EVAL' ELSE split END AS split_col
FROM
    `return_prediction_ga4.step_6_merged_scaled`
WHERE
    split IN ('TRAIN', 'VALID', 'TEST');

Query is running:   0%|          |

## RandomForest Model
See hyperparameter details and model options [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-random-forest). <br>
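**Hyperparameters being tuned:** `NUM_PARALLEL_TREE` (number of trees, 2–200), `MAX_TREE_DEPTH` (1–20), `MIN_TREE_CHILD_WEIGHT` (2–20), `COLSAMPLE_BYNODE` (fraction of features sampled at each split, 0.25–0.75) and `SUBSAMPLE` (fraction of training rows sampled, 0.5–0.9), searched across 50 trials with mean absolute error (MAE) as the tuning objective.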

In [None]:
%%bigquery rf_model --project $project_name
-- Train a random forest regressor with BigQuery ML, tuning tree count, depth,
-- child weight and sampling ratios over NUM_TRIALS trials (objective: MAE).
CREATE OR REPLACE MODEL `return_prediction_ga4.rf_model`
OPTIONS (
    MODEL_TYPE = 'RANDOM_FOREST_REGRESSOR',
    NUM_PARALLEL_TREE = HPARAM_RANGE(2, 200),
    MAX_TREE_DEPTH = HPARAM_RANGE(1, 20),
    MIN_TREE_CHILD_WEIGHT = HPARAM_RANGE(2, 20),
    COLSAMPLE_BYNODE = HPARAM_RANGE(0.25, 0.75),
    SUBSAMPLE = HPARAM_RANGE(0.5, 0.9),
    EARLY_STOP = TRUE,
    MIN_REL_PROGRESS = 0.0001,
    DATA_SPLIT_METHOD = 'CUSTOM',
    DATA_SPLIT_COL = 'SPLIT_COL',
    NUM_TRIALS = 50,
    HPARAM_TUNING_ALGORITHM = 'VIZIER_DEFAULT',
    HPARAM_TUNING_OBJECTIVES = ['MEAN_ABSOLUTE_ERROR']
) AS
SELECT
    * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
    ecommerce_refund_value_in_usd AS label,
    -- The source table tags validation rows 'VALID'; the split column names them 'EVAL'.
    CASE split WHEN 'VALID' THEN 'EVAL' ELSE split END AS split_col
FROM
    `return_prediction_ga4.step_6_merged_scaled`
WHERE
    split IN ('TRAIN', 'VALID', 'TEST');

Query is running:   0%|          |

## XGBoost Model
See hyperparameter details and model options [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-boosted-tree). <br>
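**Hyperparameters being tuned:** `BOOSTER_TYPE` (`GBTREE` or `DART`), `DART_NORMALIZE_TYPE` (`TREE` or `FOREST`), `NUM_PARALLEL_TREE` (2–100), `MAX_TREE_DEPTH` (1–20), `DROPOUT` (0–0.5), `LEARN_RATE` (0.1–0.5), `MIN_TREE_CHILD_WEIGHT` (2–20), `COLSAMPLE_BYNODE` (0.25–0.75) and `SUBSAMPLE` (0.5–0.9), searched across 90 trials with mean absolute error (MAE) as the tuning objective.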

In [None]:
%%bigquery xgb_model --project $project_name
-- Train a boosted tree regressor with BigQuery ML, tuning booster type, DART
-- settings, tree shape, learning rate and sampling ratios (objective: MAE).
CREATE OR REPLACE MODEL `return_prediction_ga4.xgb_model`
OPTIONS (
    MODEL_TYPE = 'BOOSTED_TREE_REGRESSOR',
    BOOSTER_TYPE = HPARAM_CANDIDATES(['GBTREE', 'DART']),
    DART_NORMALIZE_TYPE = HPARAM_CANDIDATES(['TREE', 'FOREST']),
    NUM_PARALLEL_TREE = HPARAM_RANGE(2, 100),
    MAX_TREE_DEPTH = HPARAM_RANGE(1, 20),
    DROPOUT = HPARAM_RANGE(0, 0.5),
    LEARN_RATE = HPARAM_RANGE(0.1, 0.5),
    MIN_TREE_CHILD_WEIGHT = HPARAM_RANGE(2, 20),
    COLSAMPLE_BYNODE = HPARAM_RANGE(0.25, 0.75),
    SUBSAMPLE = HPARAM_RANGE(0.5, 0.9),
    EARLY_STOP = TRUE,
    MIN_REL_PROGRESS = 0.0001,
    MAX_ITERATIONS = 50,
    DATA_SPLIT_METHOD = 'CUSTOM',
    DATA_SPLIT_COL = 'SPLIT_COL',
    NUM_TRIALS = 90,
    HPARAM_TUNING_ALGORITHM = 'VIZIER_DEFAULT',
    HPARAM_TUNING_OBJECTIVES = ['MEAN_ABSOLUTE_ERROR']
) AS
SELECT
    * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
    ecommerce_refund_value_in_usd AS label,
    -- The source table tags validation rows 'VALID'; the split column names them 'EVAL'.
    CASE split WHEN 'VALID' THEN 'EVAL' ELSE split END AS split_col
FROM
    `return_prediction_ga4.step_6_merged_scaled`
WHERE
    split IN ('TRAIN', 'VALID', 'TEST');

Query is running:   0%|          |

# Best Models
Compare the best model of each model type we tested, i.e., the hyperparameter combination that performed best on our evaluation metric, mean absolute error (MAE). <br>
BigQuery automatically tells us which hyperparameter combination performed the best as 'is_optimal' = True.

### Linear Regression

In [None]:
%%bigquery lr_model_performance --project $project_name
-- One row per hyperparameter-tuning trial of the linear regression model.
-- The model is referenced by the name it is created under in this notebook
-- (`lr_model`) and without a project prefix, so it resolves against the
-- project passed to the magic via --project.
SELECT
  *
FROM
  ML.TRIAL_INFO(MODEL `return_prediction_ga4.lr_model`);

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
# Display every linear regression tuning trial: its hyperparameters, tuning metric,
# training/eval loss, status and whether BigQuery flagged it as optimal
lr_model_performance

Unnamed: 0,trial_id,hyperparameters,hparam_tuning_evaluation_metrics,training_loss,eval_loss,status,error_message,is_optimal
0,1,"{'l1_reg': 1e-14, 'l2_reg': 1e-14}",{'mean_absolute_error': 20.464286656654963},2632.698,888.989,SUCCEEDED,,False
1,2,"{'l1_reg': 1.991669131404525e-11, 'l2_reg': 1....",{'mean_absolute_error': 20.46428665665614},2632.698,888.989,SUCCEEDED,,False
2,3,"{'l1_reg': 2.8692991443926807e-13, 'l2_reg': 1...",{'mean_absolute_error': 20.464286656654963},2632.698,888.989,SUCCEEDED,,False
3,4,"{'l1_reg': 1.5818032333206806e-07, 'l2_reg': 1...",{'mean_absolute_error': 20.464286656585877},2632.698,888.989,SUCCEEDED,,False
4,5,"{'l1_reg': 0.002506451730963958, 'l2_reg': 1e-14}",{'mean_absolute_error': 20.4642855618118},2632.698,888.989,SUCCEEDED,,False
5,6,"{'l1_reg': 9.99999999999999, 'l2_reg': 1e-14}",{'mean_absolute_error': 20.45994517691412},2632.61,888.86,SUCCEEDED,,True
6,7,"{'l1_reg': 9.99999999999999, 'l2_reg': 6.30945...",{'mean_absolute_error': 20.459945176951855},2632.61,888.86,SUCCEEDED,,False
7,8,"{'l1_reg': 9.99999999999999, 'l2_reg': 7.94313...",{'mean_absolute_error': 20.45994992343326},2632.61,888.86,SUCCEEDED,,False
8,9,"{'l1_reg': 9.99999999999999, 'l2_reg': 9.99999...",{'mean_absolute_error': 21.05898792354632},2623.497,912.302,SUCCEEDED,,False
9,10,"{'l1_reg': 0.015728685259771075, 'l2_reg': 1.3...",{'mean_absolute_error': 20.4642798671829},2632.698,888.989,SUCCEEDED,,False


In [None]:
# Hyperparameters of the linear regression trial flagged as optimal
lr_model_performance.loc[
    lr_model_performance['is_optimal'].eq(True), 'hyperparameters'
].tolist()

[{'l1_reg': 9.99999999999999, 'l2_reg': 1e-14}]

### RandomForest

In [None]:
%%bigquery rf_model_performance --project $project_name
-- One row per hyperparameter-tuning trial of the random forest model.
-- No project prefix: the model resolves against the project passed via --project,
-- so the notebook keeps working when `project_name` is changed.
SELECT
  *
FROM
  ML.TRIAL_INFO(MODEL `return_prediction_ga4.rf_model`);

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
# Display every random forest tuning trial: its hyperparameters, tuning metric,
# training/eval loss, status and whether BigQuery flagged it as optimal
rf_model_performance

Unnamed: 0,trial_id,hyperparameters,hparam_tuning_evaluation_metrics,training_loss,eval_loss,status,error_message,is_optimal
0,1,"{'max_tree_depth': 15, 'subsample': 0.8, 'num_...",{'mean_absolute_error': 16.297838421311745},24.327,35.077,SUCCEEDED,,False
1,2,"{'max_tree_depth': 12, 'subsample': 0.71609730...",{'mean_absolute_error': 16.269731785915337},31.549,33.719,SUCCEEDED,,False
2,3,"{'max_tree_depth': 7, 'subsample': 0.628120700...",{'mean_absolute_error': 16.53163167969424},39.306,33.089,SUCCEEDED,,False
3,4,"{'max_tree_depth': 17, 'subsample': 0.78884810...",{'mean_absolute_error': 16.342993546641818},37.124,33.241,SUCCEEDED,,False
4,5,"{'max_tree_depth': 15, 'subsample': 0.62010212...",{'mean_absolute_error': 16.322093485932378},28.978,33.35,SUCCEEDED,,False
5,6,"{'max_tree_depth': 8, 'subsample': 0.728855685...",{'mean_absolute_error': 16.6232696973623},30.709,34.66,SUCCEEDED,,False
6,7,"{'max_tree_depth': 12, 'subsample': 0.73402620...",{'mean_absolute_error': 16.334606689598793},38.266,33.147,SUCCEEDED,,False
7,8,"{'max_tree_depth': 12, 'subsample': 0.82409124...",{'mean_absolute_error': 16.38094464833527},26.146,35.543,SUCCEEDED,,False
8,9,"{'max_tree_depth': 14, 'subsample': 0.67460912...",{'mean_absolute_error': 16.33072949451591},35.196,33.553,SUCCEEDED,,False
9,10,"{'max_tree_depth': 14, 'subsample': 0.81632796...",{'mean_absolute_error': 16.369853013881926},27.782,35.222,SUCCEEDED,,False


In [None]:
# Hyperparameters of the random forest trial flagged as optimal
rf_model_performance.loc[
    rf_model_performance['is_optimal'].eq(True), 'hyperparameters'
].tolist()

[{'max_tree_depth': 20,
  'subsample': 0.5,
  'num_parallel_tree': 3,
  'min_tree_child_weight': 2,
  'colsample_bynode': 0.25000000000000216}]

### XGBoost

In [None]:
%%bigquery xgb_model_performance --project $project_name
-- One row per hyperparameter-tuning trial of the boosted tree (XGBoost) model.
-- No project prefix: the model resolves against the project passed via --project,
-- so the notebook keeps working when `project_name` is changed.
SELECT
  *
FROM
  ML.TRIAL_INFO(MODEL `return_prediction_ga4.xgb_model`);

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
# Display every XGBoost tuning trial: its hyperparameters, tuning metric,
# training/eval loss, status and whether BigQuery flagged it as optimal
xgb_model_performance

Unnamed: 0,trial_id,hyperparameters,hparam_tuning_evaluation_metrics,training_loss,eval_loss,status,error_message,is_optimal
0,1,"{'learn_rate': 0.3, 'dropout': None, 'max_tree...",{'mean_absolute_error': 15.041342495164894},37.667,30.252,SUCCEEDED,,False
1,2,"{'learn_rate': 0.4978354418496759, 'dropout': ...",{'mean_absolute_error': 14.668630244908302},52.632,28.772,SUCCEEDED,,False
2,3,"{'learn_rate': 0.3833610902957896, 'dropout': ...",{'mean_absolute_error': 14.559347383731295},38.943,30.581,SUCCEEDED,,False
3,4,"{'learn_rate': 0.46779875400958404, 'dropout':...",{'mean_absolute_error': 15.014962318546727},40.254,30.185,SUCCEEDED,,False
4,5,"{'learn_rate': 0.43142461413494737, 'dropout':...",{'mean_absolute_error': 15.220470170132586},39.556,30.933,SUCCEEDED,,False
5,6,"{'learn_rate': 0.49788202544016025, 'dropout':...",{'mean_absolute_error': 16.896481102007964},47.51,29.643,SUCCEEDED,,False
6,7,"{'learn_rate': 0.289677670546588, 'dropout': N...",{'mean_absolute_error': 14.078641446222836},44.755,30.214,SUCCEEDED,,False
7,8,"{'learn_rate': 0.4126402492841693, 'dropout': ...",{'mean_absolute_error': 13.807240351287733},51.48,29.009,SUCCEEDED,,False
8,9,"{'learn_rate': 0.19269102223466822, 'dropout':...",{'mean_absolute_error': 13.845215746781715},47.086,29.02,SUCCEEDED,,False
9,10,"{'learn_rate': 0.2932046182675494, 'dropout': ...",{'mean_absolute_error': 13.961714331745252},46.756,28.875,SUCCEEDED,,False


In [None]:
# Hyperparameters of the XGBoost trial flagged as optimal
xgb_model_performance.loc[
    xgb_model_performance['is_optimal'].eq(True), 'hyperparameters'
].tolist()

[{'learn_rate': 0.1,
  'dropout': 0.5,
  'max_tree_depth': 20,
  'subsample': 0.5,
  'booster_type': 'dart',
  'num_parallel_tree': 46,
  'dart_normalize_type': 'tree',
  'min_tree_child_weight': 2,
  'colsample_bynode': 0.42820566106224267}]

## Model Selection
These three models with the best hyperparameters for each model type will be evaluated against each other on the validation set to choose the final model. <br>
The model with the best performance on validation will be chosen as the final model. Lastly, we can use our final model to predict on the test set.

**Best model hyperparameters for Linear Regression:** <br>
- L1 regularization: 10, <br>
- L2 regularization: 0 <br>

**Best model hyperparameters for RandomForest Regressor:** <br>
- Maximum tree depth: 20, <br>
- Subsample: 0.5 <br>
- Number of parallel trees: 3 <br>
- Minimum tree child weight: 2 <br>
- Column sample by node: 0.25 <br>

**Best model hyperparameters for XGBoost (Boosted Tree) Regressor:** <br>
- Learning rate: 0.1, <br>
- Dropout: 0.5 <br>
- Maximum tree depth: 20 <br>
- Subsample: 0.5 <br>
- Booster type: 'dart' <br>
- Number of parallel trees: 46 <br>
- Dart normalization type: 'tree' <br>
- Minimum tree child weight: 2 <br>
- Column sample by node: 0.4282 <br>

In [None]:
%%bigquery lr_model_eval --project $project_name
-- Evaluate the selected linear regression trial on the validation split.
-- The model is referenced by the name it is created under in this notebook
-- (`lr_model`) and without a project prefix, so it resolves against --project.
-- trial_id 6 is the trial flagged is_optimal in the recorded tuning run;
-- re-check it against ML.TRIAL_INFO after re-training.
SELECT
    *
FROM
    ML.EVALUATE(MODEL `return_prediction_ga4.lr_model`,
        (
            SELECT
                * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
                ecommerce_refund_value_in_usd AS label
            FROM
                `return_prediction_ga4.step_6_merged_scaled`
            WHERE
                split = 'VALID'
        ),
            STRUCT(6 AS trial_id));

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
# Validation-set metrics for the selected linear regression trial
lr_model_eval

Unnamed: 0,trial_id,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,6,20.46,888.86,5.256,14.782,-0.136,0.041


In [None]:
%%bigquery rf_model_eval --project $project_name
-- Evaluate the selected random forest trial on the validation split.
-- No project prefix: the model resolves against the project passed via --project.
-- NOTE(review): trial_id 33 is presumably the is_optimal trial of the recorded
-- tuning run; re-check it against ML.TRIAL_INFO after re-training.
SELECT
    *
FROM
    ML.EVALUATE(MODEL `return_prediction_ga4.rf_model`,
        (
            SELECT
                * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
                ecommerce_refund_value_in_usd AS label
            FROM
                `return_prediction_ga4.step_6_merged_scaled`
            WHERE
                split = 'VALID'
        ),
            STRUCT(33 AS trial_id));

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
# Validation-set metrics for the selected random forest trial
rf_model_eval

Unnamed: 0,trial_id,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,33,13.897,1025.901,3.363,3.491,-0.312,-0.302


In [None]:
%%bigquery xgb_model_eval --project $project_name
-- Evaluate the selected XGBoost trial on the validation split.
-- No project prefix: the model resolves against the project passed via --project.
-- NOTE(review): trial_id 55 is presumably the is_optimal trial of the recorded
-- tuning run; re-check it against ML.TRIAL_INFO after re-training.
SELECT
    *
FROM
    ML.EVALUATE(MODEL `return_prediction_ga4.xgb_model`,
        (
            SELECT
                * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
                ecommerce_refund_value_in_usd AS label
            FROM
                `return_prediction_ga4.step_6_merged_scaled`
            WHERE
                split = 'VALID'
        ),
            STRUCT(55 AS trial_id));

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
# Validation-set metrics for the selected XGBoost trial
xgb_model_eval

Unnamed: 0,trial_id,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,55,11.984,859.5,2.277,1.217,-0.099,0.025


In [None]:
# Stack the three validation results, tagging each row with the model it came from,
# then rank the models by mean absolute error (lowest first).
best_model_evals = pd.concat(
    [
        lr_model_eval.assign(model_type='linear_regression'),
        rf_model_eval.assign(model_type='random_forest'),
        xgb_model_eval.assign(model_type='xgboost'),
    ]
)
best_model_evals.sort_values('mean_absolute_error')

Unnamed: 0,trial_id,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance,model_type
0,55,11.984,859.5,2.277,1.217,-0.099,0.025,xgboost
0,33,13.897,1025.901,3.363,3.491,-0.312,-0.302,random_forest
0,6,20.46,888.86,5.256,14.782,-0.136,0.041,linear_regression


Based on the performance of the three best models on the validation set, we'd choose XGBoost as our final model. Of the three models, XGBoost has the lowest MAE. <br>
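It also has the lowest MSE, MSLE and median absolute error. Note, however, that R² is negative for all three models, i.e. on squared error each one performs worse than always predicting the mean refund value.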

# Final Best Model
Now, using the best model of the three models we compared, let's get its performance on the test set.

In [None]:
%%bigquery xgb_model_test --project $project_name
-- Score the test split with the selected XGBoost trial. The `label` column is
-- included in the input so the predictions can be compared with the actual values.
-- No project prefix: the model resolves against the project passed via --project.
-- NOTE(review): trial_id 55 should match the trial evaluated on the validation
-- split; re-check it against ML.TRIAL_INFO after re-training.
SELECT
    *
FROM
    ML.PREDICT(MODEL `return_prediction_ga4.xgb_model`,
        (
            SELECT
                * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
                ecommerce_refund_value_in_usd AS label
            FROM
                `return_prediction_ga4.step_6_merged_scaled`
            WHERE
                split = 'TEST'
        ),
            STRUCT(55 AS trial_id));

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
# First rows of the test-set predictions: trial_id and predicted_label, followed by
# the input feature columns and the actual label
xgb_model_test.head()

Unnamed: 0,trial_id,predicted_label,transaction_ga_session_number,user_ltv_revenue,ecommerce_tax_value_in_usd,sum_item_price_in_usd,sum_item_promotions,sum_item_quantity,sum_event_name_begin_checkout,sum_event_name_select_item,sum_device_category_mobile,sum_device_mobile_brand_name_Google,sum_device_mobile_brand_name_Huawei,sum_device_mobile_brand_name_Microsoft,sum_device_mobile_brand_name_Samsung,sum_device_mobile_brand_name_Xiaomi,sum_device_mobile_model_name_ChromeBook,sum_device_mobile_model_name_Safari,sum_device_mobile_model_name_iPad,sum_device_mobile_model_name_iPhone,sum_device_web_info_browser_AndroidWebview,sum_device_web_info_browser_Firefox,sum_device_web_info_browser_Safari,sum_geo_country_Canada,sum_geo_country_France,sum_geo_country_India,sum_geo_country_Other,sum_geo_country_Spain,sum_geo_country_UnitedKingdom,sum_traffic_source_medium_Other,sum_traffic_source_medium_cpc,sum_traffic_source_medium_organic,sum_traffic_source_medium_referral,sum_event_params_parent_page_CampusCollection,sum_event_params_parent_page_CheckoutConfirmation,sum_event_params_parent_page_EcoFriendly,sum_event_params_parent_page_Home,sum_event_params_parent_page_Lifestyle,sum_event_params_parent_page_New,sum_event_params_parent_page_Other,sum_event_params_parent_page_PaymentMethod,sum_event_params_parent_page_Sale,sum_event_params_parent_page_ShoppingCart,sum_event_params_child_page_Bags,sum_event_params_child_page_Google,sum_event_params_child_page_Hats,sum_event_params_child_page_Kids,sum_event_params_child_page_Notebooks,sum_event_params_child_page_Other,sum_event_params_child_page_Socks,sum_event_params_child_page_Stickers,sum_event_params_child_page_Womens,sum_event_params_child_page_Writing,sum_event_params_child_page_YouTube,sum_item_parent_category_Other,sum_item_parent_category_Stationery,sum_item_child_category_Drinkware,sum_item_child_category_Google,sum_item_child_category_MensUnisex,sum_item_child_category_SmallGoods,sum_item_child_subcategory_Backpacks,sum_item_
child_subcategory_ElectronicsAccessories,sum_item_child_subcategory_Infant,sum_item_child_subcategory_MensTShirts,sum_item_child_subcategory_MugsTumblers,sum_item_child_subcategory_Other,sum_item_child_subcategory_WaterBottles,max_event_params_engagement_time_msec,max_item_promotions,avg_event_params_engagement_time_msec,avg_item_promotions,avg_item_quantity,avg_item_revenue_in_usd,pre_max_event_params_ga_session_number,pre_avg_event_params_engagement_time_msec,pre_sum_event_name_scroll,pre_sum_event_name_select_item,pre_sum_device_mobile_brand_name_Google,pre_sum_device_mobile_brand_name_Samsung,pre_sum_device_mobile_brand_name_Xiaomi,pre_sum_device_mobile_model_name_ChromeBook,pre_sum_device_mobile_model_name_Safari,pre_sum_device_mobile_model_name_iPad,pre_sum_device_web_info_browser_AndroidWebview,pre_sum_device_web_info_browser_Edge,pre_sum_device_web_info_browser_Firefox,pre_sum_geo_country_Canada,pre_sum_geo_country_India,pre_sum_geo_country_Other,pre_sum_geo_country_Spain,pre_sum_geo_country_UnitedKingdom,pre_sum_traffic_source_medium_Other,pre_sum_traffic_source_medium_cpc,pre_sum_traffic_source_medium_organic,pre_sum_traffic_source_medium_referral,pre_sum_event_params_parent_page_CampusCollection,pre_sum_event_params_parent_page_CheckoutConfirmation,pre_sum_event_params_parent_page_EcoFriendly,pre_sum_event_params_parent_page_Home,pre_sum_event_params_parent_page_PaymentMethod,pre_sum_event_params_parent_page_Sale,pre_sum_event_params_child_page_Hats,pre_sum_event_params_child_page_Socks,pre_sum_event_params_child_page_Stickers,pre_sum_event_params_child_page_Womens,pre_sum_event_params_child_page_YouTube,pre_sum_item_parent_category_New,pre_sum_item_child_category_Bags,pre_sum_item_child_category_Drinkware,pre_sum_item_child_category_Kids,pre_sum_item_child_category_MensUnisex,pre_sum_item_child_subcategory_Backpacks,pre_sum_item_child_subcategory_ElectronicsAccessories,pre_sum_item_child_subcategory_WaterBottles,days_first_session_to_transaction,historic
al_avg_event_params_engagement_time_msec,historical_avg_item_price_in_usd,historical_avg_item_promotions,historical_avg_item_refund_in_usd,historical_sum_total_return_item_quantity,historical_sum_ecommerce_refund_value_in_usd,historical_max_days_first_session_to_transaction,historical_max_event_params_engagement_time_msec,historical_max_item_promotions,historical_max_item_quantity,historical_sum_item_promotions,historical_sum_item_quantity,historical_sum_device_category_mobile,historical_sum_device_mobile_brand_name_Google,historical_sum_device_mobile_brand_name_Samsung,historical_sum_device_mobile_brand_name_Xiaomi,historical_sum_device_mobile_model_name_ChromeBook,historical_sum_device_web_info_browser_AndroidWebview,historical_sum_device_web_info_browser_Edge,historical_sum_device_web_info_browser_Safari,historical_sum_geo_country_Canada,historical_sum_geo_country_France,historical_sum_geo_country_India,historical_sum_geo_country_Spain,historical_sum_geo_country_UnitedKingdom,historical_sum_geo_country_UnitedStates,historical_sum_traffic_source_medium_Other,historical_sum_traffic_source_medium_cpc,historical_sum_traffic_source_medium_organic,historical_sum_traffic_source_medium_referral,historical_sum_event_params_parent_page_CampusCollection,historical_sum_event_params_parent_page_EcoFriendly,historical_sum_event_params_parent_page_Home,historical_sum_event_params_parent_page_New,historical_sum_event_params_parent_page_Other,historical_sum_event_params_parent_page_Sale,historical_sum_event_params_parent_page_Stationery,historical_sum_event_params_child_page_Google,historical_sum_event_params_child_page_Hats,historical_sum_event_params_child_page_Notebooks,historical_sum_event_params_child_page_Other,historical_sum_event_params_child_page_Socks,historical_sum_event_params_child_page_Stickers,historical_sum_event_params_child_page_Writing,historical_sum_event_params_child_page_YouTube,historical_sum_item_parent_category_New,historical_sum_item_parent_category_Othe
r,historical_sum_item_child_category_Google,historical_sum_item_child_category_Kids,historical_sum_item_child_category_MensUnisex,historical_sum_item_child_category_Womens,historical_sum_item_child_subcategory_Backpacks,historical_sum_item_child_subcategory_ElectronicsAccessories,historical_sum_item_child_subcategory_Infant,historical_sum_item_child_subcategory_MensTShirts,historical_sum_item_child_subcategory_MugsTumblers,historical_sum_item_child_subcategory_Other,historical_sum_item_child_subcategory_WaterBottles,historical_pre_sum_event_name_add_to_cart,historical_pre_sum_device_mobile_brand_name_Microsoft,historical_pre_sum_device_mobile_model_name_iPhone,historical_pre_sum_event_params_parent_page_CheckoutConfirmation,historical_pre_sum_event_params_parent_page_New,historical_pre_sum_item_child_subcategory_Backpacks,historical_pre_sum_item_child_subcategory_ElectronicsAccessories,historical_pre_sum_item_child_subcategory_WaterBottles,historical_recency,historical_age,customer_segment,label
0,55,0.974,0.328,-0.171,-0.576,-0.703,0.091,-0.35,-0.508,-0.47,1.209,-0.74,-0.138,-0.124,-0.286,-0.131,-0.24,-0.51,-0.102,1.907,-0.125,-0.118,1.758,-0.292,-0.146,-0.33,1.501,-0.145,-0.174,2.928,-0.17,-0.618,-0.533,-0.207,-0.794,-0.239,-0.426,-0.539,-0.319,-0.653,-0.75,-0.358,-0.931,-0.3,-0.175,-0.315,-0.235,-0.203,-0.097,-0.26,-0.209,-0.322,-0.187,-0.137,-0.254,-0.321,-0.389,-0.188,-0.471,-0.295,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,-0.572,1.025,-0.567,1.685,-0.127,-0.252,-0.601,-0.48,-0.518,-0.246,-0.388,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,-0.361,-0.092,-0.112,-0.334,-0.185,-0.533,-0.472,-0.12,-0.234,-0.179,-0.287,-0.205,-0.241,-0.171,-0.159,-0.117,-0.217,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,-0.5,0.716,2.341,4.65,-0.154,-0.125,-0.129,0.173,0.515,4.727,1.428,1.989,0.996,11.992,-0.109,-0.057,-0.04,-0.039,-0.035,-0.024,15.591,-0.054,-0.041,-0.077,-0.048,-0.034,-0.14,-0.079,-0.037,-0.113,10.281,-0.054,-0.072,-0.116,-0.09,-0.089,-0.084,-0.099,-0.066,-0.071,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,-0.078,-0.069,-0.064,2.56,-0.131,-0.105,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,-0.274,-0.102,1.578,-0.238,-0.26,0.0,-0.024,0.0,-0.069,-4.03,0.056,21.0
1,55,0.782,0.328,-0.572,-0.724,-0.836,-0.516,-0.35,-0.508,-0.47,1.209,-0.74,-0.138,-0.124,-0.286,-0.131,-0.24,-0.51,-0.102,1.907,-0.125,-0.118,1.758,-0.292,-0.146,-0.33,1.501,-0.145,-0.174,-0.342,-0.17,-0.618,-0.533,-0.207,-0.65,-0.239,-0.254,-0.539,-0.319,-0.653,-0.632,-0.358,-0.494,-0.3,-0.175,0.751,-0.235,-0.203,-0.097,-0.26,-0.209,-0.322,-0.187,-0.137,-0.254,-0.321,-0.389,-0.188,-0.35,-0.295,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,-0.401,-0.976,-0.407,-0.776,-0.127,-0.528,1.018,-0.253,-0.518,-0.246,-0.388,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,1.23,-0.092,-0.112,-0.334,-0.185,-0.533,-0.472,-0.12,1.475,-0.179,-0.287,-0.205,-0.241,1.401,-0.159,-0.117,-0.217,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,2.775,3.068,1.851,7.065,-0.154,-0.125,-0.129,0.173,4.564,4.727,1.428,0.931,0.211,-0.153,-0.109,-0.057,-0.04,-0.039,-0.035,42.243,-0.114,-0.054,-0.041,-0.077,-0.048,-0.034,-0.14,-0.079,-0.037,5.141,-0.128,-0.054,-0.072,0.071,-0.09,0.126,0.009,-0.099,-0.066,1.47,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,-0.078,-0.069,-0.064,-0.059,-0.131,-0.105,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,-0.039,6.178,1.578,2.798,-0.26,0.0,-0.024,0.0,-0.069,0.377,-1.232,13.0
2,55,1.078,0.328,-0.133,-0.132,-0.323,-0.516,-0.35,-0.508,-0.47,1.209,-0.74,-0.138,-0.124,-0.286,-0.131,-0.24,-0.51,-0.102,1.907,-0.125,-0.118,1.758,-0.292,-0.146,-0.33,-0.666,-0.145,-0.174,-0.342,-0.17,1.619,-0.533,-0.207,-0.507,-0.239,-0.328,-0.539,-0.156,-0.653,-0.279,-0.358,-0.057,-0.3,-0.175,-0.315,-0.235,-0.203,-0.097,-0.26,-0.209,-0.322,-0.187,-0.137,-0.254,-0.321,-0.389,-0.188,-0.471,-0.295,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,-0.367,-0.976,-0.287,-0.776,-0.127,0.539,0.479,0.156,0.373,-0.246,1.214,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,-0.361,-0.092,-0.112,-0.334,-0.185,-0.533,-0.472,-0.12,-0.234,-0.179,0.068,-0.205,-0.241,-0.171,-0.159,-0.117,1.217,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,3.001,1.887,5.155,7.065,8.398,0.62,1.465,-0.157,1.012,4.727,1.428,0.931,0.211,-0.153,1.869,-0.057,-0.04,-0.039,-0.035,-0.024,-0.114,-0.054,-0.041,-0.077,-0.048,-0.034,3.409,-0.079,-0.037,-0.113,-0.128,0.331,-0.072,0.398,-0.09,0.126,-0.084,-0.099,-0.066,-0.071,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,-0.078,-0.069,-0.064,-0.059,-0.131,-0.105,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,-0.039,-0.102,0.632,-0.238,-0.26,0.0,-0.024,0.0,-0.069,-0.621,-1.489,0.0
3,55,3.125,1.582,1.404,-0.428,-0.323,0.091,-0.35,-0.222,-0.47,1.209,-0.74,-0.138,-0.124,-0.286,-0.131,-0.24,-0.51,-0.102,1.907,-0.125,-0.118,1.758,-0.292,-0.146,-0.33,-0.666,-0.145,-0.174,-0.342,-0.17,-0.618,-0.533,-0.207,-0.507,-0.239,-0.279,-0.539,-0.21,-0.653,0.192,-0.358,-0.421,-0.3,-0.175,-0.315,-0.235,-0.203,-0.097,-0.26,-0.209,-0.28,-0.187,-0.137,-0.254,-0.321,-0.389,-0.188,-0.471,-0.295,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,-0.511,1.025,-0.496,1.685,-0.127,0.539,1.828,0.728,0.017,0.266,-0.388,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,-0.361,-0.092,-0.112,2.486,-0.185,-0.533,-0.472,-0.009,-0.234,-0.179,0.107,-0.205,-0.182,-0.171,-0.159,-0.117,2.162,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,1.985,2.179,1.759,4.65,1.514,8.07,4.69,0.832,3.968,4.727,1.428,12.579,5.31,11.992,-0.109,-0.057,-0.04,-0.039,-0.035,-0.024,15.591,-0.054,-0.041,-0.077,-0.048,-0.034,10.508,-0.079,-0.037,5.141,10.281,19.77,-0.072,6.569,1.355,1.632,4.202,-0.099,0.261,-0.071,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,0.475,-0.069,-0.064,-0.059,-0.131,10.59,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,2.545,-0.102,3.469,-0.238,1.793,0.0,-0.024,0.0,26.719,0.46,0.313,44.0
4,55,1.39,0.829,0.086,0.461,0.47,-0.516,-0.259,-0.365,-0.47,1.209,-0.74,-0.138,-0.124,-0.286,-0.131,-0.24,-0.51,-0.102,1.907,-0.125,-0.118,1.758,-0.292,-0.146,-0.33,-0.666,-0.145,-0.174,-0.342,-0.17,-0.618,-0.533,-0.149,-0.364,0.583,-0.279,0.408,-0.319,-0.653,-0.161,-0.358,-0.494,-0.189,-0.09,-0.315,-0.235,-0.203,-0.097,-0.26,-0.209,-0.322,-0.187,-0.137,-0.254,-0.321,-0.389,-0.188,-0.471,1.873,-0.017,-0.07,-0.036,-0.041,-0.04,-0.024,-0.024,-0.469,-0.976,-0.303,-0.776,-0.127,0.608,1.018,-0.065,-0.25,-0.246,-0.388,-0.173,-0.085,-0.15,-0.28,-0.071,-0.081,-0.097,-0.079,-0.166,-0.193,-0.361,-0.092,-0.112,-0.334,-0.185,-0.533,-0.472,-0.12,-0.234,0.055,-0.149,-0.205,-0.182,-0.171,-0.159,-0.117,-0.217,-0.083,-0.21,-0.181,-0.205,-0.147,-0.3,0.0,-0.017,0.0,-0.5,0.809,1.239,-0.179,-0.154,-0.125,-0.129,0.502,1.147,-0.212,1.428,-0.128,0.604,-0.153,1.869,-0.057,-0.04,-0.039,-0.035,-0.024,-0.114,-0.054,-0.041,-0.077,-0.048,-0.034,3.409,-0.079,-0.037,-0.113,-0.128,-0.054,-0.072,2.549,-0.09,-0.089,-0.084,-0.099,-0.066,-0.071,-0.071,-0.029,-0.064,-0.055,-0.042,-0.038,-0.078,-0.069,-0.064,-0.059,-0.131,-0.105,0.0,-0.019,0.0,0.0,0.0,0.0,0.0,0.196,-0.102,0.632,-0.238,-0.26,0.0,-0.024,0.0,-0.069,-1.037,1.086,0.0


### Evaluate model predictions

In [None]:
# Mean absolute error of the test-set predictions
mae = mean_absolute_error(
    y_true=xgb_model_test['label'],
    y_pred=xgb_model_test['predicted_label'],
)
mae

10.086418785968748

In [None]:
# Mean squared error of the test-set predictions
mse = mean_squared_error(
    y_true=xgb_model_test['label'],
    y_pred=xgb_model_test['predicted_label'],
)
mse

896.4449986712499

In [None]:
# Mean squared logarithmic error of the test-set predictions
msle = mean_squared_log_error(
    y_true=xgb_model_test['label'],
    y_pred=xgb_model_test['predicted_label'],
)
msle

2.103934618922581

In [None]:
# Median absolute error of the test-set predictions
mmae = median_absolute_error(
    y_true=xgb_model_test['label'],
    y_pred=xgb_model_test['predicted_label'],
)
mmae

1.3851211667060852

In [None]:
# R-squared (coefficient of determination) of the test-set predictions
r2 = r2_score(
    y_true=xgb_model_test['label'],
    y_pred=xgb_model_test['predicted_label'],
)
r2

-0.055215712263408934

In [None]:
# Explained variance score of the test-set predictions
ev = explained_variance_score(
    y_true=xgb_model_test['label'],
    y_pred=xgb_model_test['predicted_label'],
)
ev

0.012488355839422538

In [None]:
# Collect the test-set metrics into a one-row frame laid out like xgb_model_eval
# (trial_id followed by the six metrics, in the same column order).
# The trial id is read from the predictions rather than hard-coded, so it always
# reflects the trial that actually produced them.
xgb_model_test_perf = pd.DataFrame(
    [(xgb_model_test['trial_id'].iloc[0], mae, mse, msle, mmae, r2, ev)],
    columns=xgb_model_eval.columns,
)

In [None]:
# Put the validation and test metrics of the final model side by side,
# tagging each row with the dataset it was computed on.
xgb_model_perf = pd.concat(
    [
        xgb_model_eval.assign(dataset='VALIDATION'),
        xgb_model_test_perf.assign(dataset='TEST'),
    ]
)
xgb_model_perf

Unnamed: 0,trial_id,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance,dataset
0,55,11.984,859.5,2.277,1.217,-0.099,0.025,VALIDATION
0,55,10.086,896.445,2.104,1.385,-0.055,0.012,TEST


Comparing the performance between validation and test, the model seems to generalize well, since its performance on the two sets is similar.

# Feature Importance
If you are interested in knowing which features your model relied most heavily on to make predictions, you can use the option `ENABLE_GLOBAL_EXPLAIN = TRUE` when training your model with BQML.

# Conclusion