# VII. Model Building

In [1]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

# GCP project used for the BigQuery client and the %%bigquery cells in this notebook.
project_name = 'adl-analytics' #add proj name
region = "US"  # GCP project region
# Fully-qualified source table. Built from project_name so that changing the
# project above is the only edit needed; a hard-coded project here would keep
# pointing at the old project after project_name is changed.
table_name = f'{project_name}.return_prediction_ga4.step_6_merged_scaled'

In [2]:
# If pandas_gbq is not installed in this kernel's environment, install it here
# (%pip, unlike !pip, installs into the environment the kernel is running in):
# %pip install pandas_gbq

In [3]:
# ---------------------------------------------------------------------------
# Imports
# ---------------------------------------------------------------------------
import warnings

# BigQuery client and notebook magics.
# BigQuery magics are used to run BigQuery SQL queries in a python environment.
# These queries can also be run in the BigQuery UI.
from google.cloud import bigquery
from google.cloud.bigquery import magics, Client, QueryJobConfig

# Interface between Jupyter and BigQuery
import pandas_gbq

# data processing libraries + ML tools
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------------------------
# Google credentials
# ---------------------------------------------------------------------------
# from google.colab import auth
# auth.authenticate_user()

# ---------------------------------------------------------------------------
# BigQuery session: the magics and the client share the same project
# ---------------------------------------------------------------------------
magics.context.project = project_name  # update project name
client = bigquery.Client(project=magics.context.project)

# ---------------------------------------------------------------------------
# Notebook display setup
# ---------------------------------------------------------------------------
sns.set()

# suppress notebook warnings (applies to everything executed after this point)
warnings.filterwarnings('ignore')

# dataframe formatting: show every column, cap rows at 50, floats to 3 decimals
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 50)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# Build Models
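The models in this section are trained with BigQuery ML, using its built-in hyperparameter tuning. Each cell below creates (or replaces) one model in the `return_prediction_ga4` dataset.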

## Linear Regression Model
See hyperparameter details and model options [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-glm).

In [4]:
%%bigquery lr_model --project $project_name
-- Linear regression predicting ecommerce_refund_value_in_usd (exposed as `label`).
-- L1_REG and L2_REG are each searched over [0, 10] across 20 Vizier trials,
-- with mean absolute error as the tuning objective.
CREATE OR REPLACE MODEL
  `return_prediction_ga4.lr_model`
OPTIONS
  ( MODEL_TYPE='LINEAR_REG',
    FIT_INTERCEPT = TRUE,
    L1_REG = HPARAM_RANGE(0, 10.0),
    L2_REG = HPARAM_RANGE(0, 10.0),
    MAX_ITERATIONS = 50,
    EARLY_STOP = FALSE,
    DATA_SPLIT_METHOD='CUSTOM',
    DATA_SPLIT_COL='SPLIT_COL',
    NUM_TRIALS = 20,
    HPARAM_TUNING_ALGORITHM = 'VIZIER_DEFAULT',
    HPARAM_TUNING_OBJECTIVES = ['MEAN_ABSOLUTE_ERROR']) AS
SELECT
    -- The two id columns, the source split flag and the raw target column are
    -- kept out of the star expansion; the target is re-added as `label`.
    * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
    ecommerce_refund_value_in_usd AS label,
    -- The table's VALID rows are handed to the model as EVAL; TRAIN and TEST
    -- keep their names. One SELECT replaces three UNION ALL branches so the
    -- column list is written only once.
    IF(split = 'VALID', 'EVAL', split) AS split_col
FROM
    `return_prediction_ga4.step_6_merged_scaled`
WHERE
    -- Same rows as before: anything outside these three values is dropped.
    split IN ('TRAIN', 'VALID', 'TEST');

Query is running:   0%|          |

## RandomForest Model
See hyperparameter details and model options [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-random-forest).

In [6]:
%%bigquery rf_model --project $project_name
-- Random forest regressor predicting ecommerce_refund_value_in_usd (exposed as `label`).
-- Five hyperparameters (tree count, depth, min child weight, per-node column
-- sampling, row subsampling) are searched across 50 Vizier trials, with mean
-- absolute error as the tuning objective.
CREATE OR REPLACE MODEL
    `return_prediction_ga4.rf_model`
OPTIONS
  ( MODEL_TYPE='RANDOM_FOREST_REGRESSOR',
    NUM_PARALLEL_TREE = HPARAM_RANGE(2, 200),
    MAX_TREE_DEPTH = HPARAM_RANGE(1, 20),
    MIN_TREE_CHILD_WEIGHT = HPARAM_RANGE(2, 20),
    COLSAMPLE_BYNODE = HPARAM_RANGE(0.25, 0.75),
    SUBSAMPLE = HPARAM_RANGE(0.5, 0.9),
    EARLY_STOP = TRUE,
    MIN_REL_PROGRESS = 0.0001,
    DATA_SPLIT_METHOD='CUSTOM',
    DATA_SPLIT_COL='SPLIT_COL',
    NUM_TRIALS = 50,
    HPARAM_TUNING_ALGORITHM = 'VIZIER_DEFAULT',
    HPARAM_TUNING_OBJECTIVES = ['MEAN_ABSOLUTE_ERROR'] ) AS
SELECT
    -- The two id columns, the source split flag and the raw target column are
    -- kept out of the star expansion; the target is re-added as `label`.
    * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
    ecommerce_refund_value_in_usd AS label,
    -- The table's VALID rows are handed to the model as EVAL; TRAIN and TEST
    -- keep their names. One SELECT replaces three UNION ALL branches so the
    -- column list is written only once.
    IF(split = 'VALID', 'EVAL', split) AS split_col
FROM
    `return_prediction_ga4.step_6_merged_scaled`
WHERE
    -- Same rows as before: anything outside these three values is dropped.
    split IN ('TRAIN', 'VALID', 'TEST');

Query is running:   0%|          |

## XGBoost Model
See hyperparameter details and model options [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-boosted-tree).

In [9]:
%%bigquery xgb_model --project $project_name
-- Boosted-tree regressor predicting ecommerce_refund_value_in_usd (exposed as `label`).
-- Nine hyperparameters (booster type, DART normalisation, tree count, depth,
-- dropout, learning rate, min child weight, per-node column sampling, row
-- subsampling) are searched across 90 Vizier trials, with mean absolute error
-- as the tuning objective.
CREATE OR REPLACE MODEL
    `return_prediction_ga4.xgb_model`
OPTIONS
  ( MODEL_TYPE='BOOSTED_TREE_REGRESSOR',
    BOOSTER_TYPE = HPARAM_CANDIDATES(['GBTREE', 'DART']),
    DART_NORMALIZE_TYPE = HPARAM_CANDIDATES(['TREE', 'FOREST']),
    NUM_PARALLEL_TREE = HPARAM_RANGE(2, 100),
    MAX_TREE_DEPTH = HPARAM_RANGE(1, 20),
    DROPOUT = HPARAM_RANGE(0, 0.5),
    LEARN_RATE = HPARAM_RANGE(0.1, 0.5),
    MIN_TREE_CHILD_WEIGHT = HPARAM_RANGE(2, 20),
    COLSAMPLE_BYNODE = HPARAM_RANGE(0.25, 0.75),
    SUBSAMPLE = HPARAM_RANGE(0.5, 0.9),
    EARLY_STOP = TRUE,
    MIN_REL_PROGRESS = 0.0001,
    MAX_ITERATIONS = 50,
    DATA_SPLIT_METHOD='CUSTOM',
    DATA_SPLIT_COL='SPLIT_COL',
    NUM_TRIALS = 90,
    HPARAM_TUNING_ALGORITHM = 'VIZIER_DEFAULT',
    HPARAM_TUNING_OBJECTIVES = ['MEAN_ABSOLUTE_ERROR'] ) AS
SELECT
    -- The two id columns, the source split flag and the raw target column are
    -- kept out of the star expansion; the target is re-added as `label`.
    * EXCEPT(user_pseudo_id, ecommerce_transaction_id, split, ecommerce_refund_value_in_usd),
    ecommerce_refund_value_in_usd AS label,
    -- The table's VALID rows are handed to the model as EVAL; TRAIN and TEST
    -- keep their names. One SELECT replaces three UNION ALL branches so the
    -- column list is written only once.
    IF(split = 'VALID', 'EVAL', split) AS split_col
FROM
    `return_prediction_ga4.step_6_merged_scaled`
WHERE
    -- Same rows as before: anything outside these three values is dropped.
    split IN ('TRAIN', 'VALID', 'TEST');

Query is running:   0%|          |