In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna

from category_encoders.target_encoder import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import DMatrix, XGBRegressor

In [2]:
# Set print options
np.set_printoptions(suppress=True, precision=4, edgeitems = 7)
pd.options.display.float_format = '{:.4f}'.format
pd.set_option('display.max_columns', None)

# Preprocessing

In [3]:
random_state = 1923

In [4]:
# Read data
df = pd.read_csv("./InputData/full_data.csv")

In [5]:
# Reindex data from 0:N
df = df.reset_index(drop = True)

In [6]:
# Enforce variable types
df["market_id"] = df["market_id"].astype(str)
df["store_id"] = df["store_id"].astype(str)
df["order_protocol"] = df["order_protocol"].astype(str)

In [7]:
# Split features and target, drop non-feature columns
X = df.drop(["created_at", "actual_delivery_time", "duration"], axis = 1)
y = df.duration

In [8]:
train_end = int(len(df) * 0.6)

In [9]:
val_end = train_end + int(len(df) * 0.2)

In [10]:
# Train - val - test split, 60 - 20 - 20
X_train, X_val, X_test = X[:train_end], X[train_end:val_end], X[val_end:]
y_train, y_val, y_test = y[:train_end], X[train_end:val_end], X[val_end:]

In [11]:
# Create target encoders

# store_id encoder with hierarchy, top level market_id
hierarchy = pd.DataFrame(X["market_id"]).rename({"market_id": "HIER_store_id_1"}, axis = 1)
encoder_storeid = TargetEncoder(cols = ["store_id"], hierarchy = hierarchy)

# Encoder for remaining categoricals, without hierarchy
encoder = TargetEncoder(cols = ["market_id", "store_primary_category", "order_protocol"])

pipeline = Pipeline([
    ("encoder_storeid", encoder_storeid),
    ("encoder", encoder)
])

In [12]:
pipeline

In [14]:
# Preprocess data
X_train = pipeline.fit_transform(X_train, y_train)
X_val = pipeline.transform(X_val)
X_test = pipeline.transform(X_test)

In [27]:
# Convert all features to float
X_train = X_train.astype(float)
X_val = X_val.astype(float)
X_test = X_test .astype(float)

# Hyperparameter tuning

In [29]:
# Objective function
def objective_xgb(trial):

    # Define hyperparameter space
    learning_rate = trial.suggest_float("learning_rate", 0.05, 0.3)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 20, log = True)
    gamma = trial.suggest_float("gamma", 5e-5, 0.5, log = True)
    reg_alpha = trial.suggest_float("l1_reg", 5e-5, 1, log = True)
    reg_lambda = trial.suggest_float("l2_reg", 0, 2)
    subsample = trial.suggest_float("subsample", 0.5, 1)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1)

    # Create model
    callback_pruner = [optuna.integration.XGBoostPruningCallback(
        trial, "validation_0-mean_squared_error")]
    
    model = XGBRegressor(
        device = "cuda",
        objective = "reg:squarederror",
        callbacks = callback_pruner,
        verbosity = 0,
        random_state = random_state,
        n_estimators = 5000,
        early_stopping_rounds = 50,
        eval_metric = mean_squared_error,
        max_depth = max_depth,
        learning_rate = learning_rate,
        min_child_weight = min_child_weight,
        gamma = gamma,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        subsample = subsample,
        colsample_bytree = colsample_bytree
    )

    # Train model with early stopping
    
    model.fit(
        X = X_train, 
        y = y_train, 
        eval_set = [(X_val, y_val)], 
        verbose = False)

    # Report best number of rounds
    trial.set_user_attr("n_rounds", (model.best_iteration + 1))
    
    return model.best_score
    

In [30]:
# Create study
study_xgb = optuna.create_study(
  sampler = optuna.samplers.TPESampler(seed = random_state),
  pruner = optuna.pruners.HyperbandPruner(),
  study_name = "tune_xgb",
  direction = "minimize"
)

[I 2023-11-15 15:48:27,901] A new study created in memory with name: tune_xgb


In [31]:
# Perform study
study_xgb.optimize(
  objective_xgb, 
  n_trials = 1000,
  show_progress_bar = True)

  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

[W 2023-11-15 15:48:31,188] Trial 0 failed with parameters: {'learning_rate': 0.15646013714400925, 'max_depth': 8, 'min_child_weight': 6, 'gamma': 0.002578358095420434, 'l1_reg': 6.845326871860957e-05, 'l2_reg': 0.9949814801609338, 'subsample': 0.8131521629681355, 'colsample_bytree': 0.8518475372099853} because of the following error: ValueError('DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:market_id: object, store_id: object, store_primary_category: object, order_protocol: object').
Traceback (most recent call last):
  File "C:\Users\PC\Documents\WorkLocal\DataScience\GitHub\MixedEffectsRegressionDeliveryTimes\venv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\PC\AppData\Local\Temp\ipykernel_30352\1400355184.py", line 39, in objective_xgb
    model.fit(
  File "C:\




ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:market_id: object, store_id: object, store_primary_category: object, order_protocol: object

In [None]:
# Save tuning log
trials_xgb = study_xgb.trials_dataframe().sort_values("value", ascending = True)
trials_xgb.to_csv("./ModifiedData/trials_xgb.csv", index = False)

# Testing & diagnostics

In [None]:
# Combine train & validation data

In [None]:
# Preprocess data

In [None]:
# Train final model

In [None]:
# Make predictions on test data

In [None]:
# Calculate performance metrics

In [None]:
# Plot predicted vs. actual

In [None]:
# Time plot of predicted & actual

In [None]:
# Residual distribution

In [None]:
# Time plot of residuals