In [34]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import pickle
import json
from pathlib import Path
import os
from sklearn.model_selection import RandomizedSearchCV
import optuna

In [35]:
script_dir = os.getcwd()
data_dir = Path(script_dir).parent / "data"

In [36]:
# Load the preprocessed data
X_train = np.load(f'{data_dir}/X_train_final.npy')
X_val = np.load(f'{data_dir}/X_val_final.npy')
X_test = np.load(f'{data_dir}/X_test_final.npy')

y_train_log = np.load(f'{data_dir}/y_train_log.npy')  # Log-transformed version!
y_val_log = np.load(f'{data_dir}/y_val_log.npy')
y_test_log = np.load(f'{data_dir}/y_test_log.npy')

In [37]:
max(np.expm1(y_test_log))  # To get back to original scale

np.float64(66512.99999999997)

In [38]:
model = xgb.XGBRegressor(
    n_estimators=1000,           # Number of trees
    learning_rate=0.1,          # Step size shrinkage (eta)
    max_depth=7,                 # Maximum tree depth
    min_child_weight=5,          # Minimum sum of instance weight in a child
    subsample=0.8,               # Fraction of samples for each tree
    colsample_bytree=1.0,        # Fraction of features for each tree
    gamma=0.1,                     # Minimum loss reduction for split
    reg_alpha=0.1,               # L1 regularization
    reg_lambda=1.0,              # L2 regularization
    random_state=42,
    n_jobs=-1,  
    device='cuda',                # Use all CPU cores
    #early_stopping_rounds=50,    # Stop if no improvement for 50 rounds
    eval_metric='mphe'          # Evaluation metric
)

In [39]:

model.fit(
    X_train, y_train_log,
    eval_set=[(X_train, y_train_log), (X_val, y_val_log)],
    verbose=50  # Print every 50 iterations
)

[0]	validation_0-mphe:0.37157	validation_1-mphe:0.37175
[50]	validation_0-mphe:0.01759	validation_1-mphe:0.01823
[100]	validation_0-mphe:0.01424	validation_1-mphe:0.01507
[150]	validation_0-mphe:0.01329	validation_1-mphe:0.01449
[200]	validation_0-mphe:0.01267	validation_1-mphe:0.01436
[250]	validation_0-mphe:0.01215	validation_1-mphe:0.01450
[300]	validation_0-mphe:0.01187	validation_1-mphe:0.01460
[350]	validation_0-mphe:0.01163	validation_1-mphe:0.01469
[400]	validation_0-mphe:0.01147	validation_1-mphe:0.01476
[450]	validation_0-mphe:0.01131	validation_1-mphe:0.01482
[500]	validation_0-mphe:0.01118	validation_1-mphe:0.01492
[550]	validation_0-mphe:0.01107	validation_1-mphe:0.01499
[600]	validation_0-mphe:0.01099	validation_1-mphe:0.01503
[650]	validation_0-mphe:0.01089	validation_1-mphe:0.01507
[700]	validation_0-mphe:0.01082	validation_1-mphe:0.01512
[750]	validation_0-mphe:0.01074	validation_1-mphe:0.01515
[800]	validation_0-mphe:0.01067	validation_1-mphe:0.01522
[850]	validation_

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,1.0
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",'cuda'
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [40]:
y_train_pred_log = model.predict(X_train)
y_val_pred_log = model.predict(X_val)
y_test_pred_log = model.predict(X_test)

# Convert back to original price scale
y_train_actual = np.expm1(y_train_log)
y_train_pred = np.expm1(y_train_pred_log)

y_val_actual = np.expm1(y_val_log)
y_val_pred = np.expm1(y_val_pred_log)

In [41]:
def calculate_metrics(y_true, y_pred, dataset_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    
    print(f"\n{dataset_name} Metrics:")
    print(f"  RMSE: ${rmse:,.2f}")
    print(f"  MAE:  ${mae:,.2f}")
    print(f"  R²:   {r2:.4f}")
    print(f"  MAPE: {mape:.2f}%")
    
    return {'rmse': rmse, 'mae': mae, 'r2': r2, 'mape': mape}

train_metrics = calculate_metrics(y_train_actual, y_train_pred, "Training")
val_metrics = calculate_metrics(y_val_actual, y_val_pred, "Validation")


Training Metrics:
  RMSE: $1,934.45
  MAE:  $587.13
  R²:   0.9464
  MAPE: 9.66%

Validation Metrics:
  RMSE: $2,267.27
  MAE:  $665.59
  R²:   0.9258
  MAPE: 11.23%


In [25]:
param_dist = {
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 0.1, 0.5, 1],
    'n_estimators': [500, 1000]
}

# Initialize the model
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)

# Set up the search
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,           # Number of combinations to try
    scoring='neg_mean_squared_error',
    cv=3,                # 3-fold cross-validation
    verbose=1,
    random_state=42
)

# Fit the search
random_search.fit(X_train, y_train_log)

print(f"Best Parameters: {random_search.best_params_}")
best_model = random_search.best_estimator_

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'subsample': 1.0, 'n_estimators': 1000, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 1.0}


In [42]:
def objective(trial):
    # 1. Define the Expanded Parameter Space
    param = {
        # GPU Acceleration
        'tree_method': 'hist',
        'device': 'cuda',  # Use 'gpu' if using an older version of XGBoost
        
        # Core Hyperparameters
        'n_estimators': 2000, # Set high, let early stopping handle the rest
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        
        # Regularization (To fix your overfitting)
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        
        # Sampling
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        
        'random_state': 42,
        'early_stopping_rounds': 50
    }

    # 2. Initialize and Fit
    model = xgb.XGBRegressor(**param, eval_metric='mphe')
    
    # We use the log-transformed data directly for tuning speed
    model.fit(
        X_train, y_train_log,
        eval_set=[(X_val, y_val_log)],
        verbose=False
    )

    # 3. Evaluate on Validation Set
    preds = model.predict(X_val)
    preds = np.expm1(preds)  # Convert back to original scale
    y_val = np.expm1(y_val_log)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    
    return rmse

# 4. Create and Run the Study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, show_progress_bar=True) # Start with 100 trials

print("\n--- Optimization Finished ---")
print(f"Best RMSE: {study.best_value:.4f}")
print(f"Best Params: {study.best_params}")

[I 2026-01-16 19:51:22,693] A new study created in memory with name: no-name-f093abd9-b1e3-4b7a-90c2-007c167d8cc1


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2026-01-16 19:51:28,028] Trial 0 finished with value: 2233.924251873164 and parameters: {'learning_rate': 0.015029602876362444, 'max_depth': 9, 'min_child_weight': 2, 'gamma': 3.9220358537999276e-07, 'reg_alpha': 8.009004288691503e-08, 'reg_lambda': 0.0011988746382583686, 'subsample': 0.5915533933723962, 'colsample_bytree': 0.502294383897068, 'colsample_bylevel': 0.7996105145036367}. Best is trial 0 with value: 2233.924251873164.
[I 2026-01-16 19:51:30,049] Trial 1 finished with value: 2312.184580740944 and parameters: {'learning_rate': 0.018113995420535624, 'max_depth': 14, 'min_child_weight': 19, 'gamma': 0.9182979142632723, 'reg_alpha': 0.2165283746202694, 'reg_lambda': 5.761458269119316, 'subsample': 0.7741423270647365, 'colsample_bytree': 0.9437264507064214, 'colsample_bylevel': 0.5243007493225795}. Best is trial 0 with value: 2233.924251873164.
[I 2026-01-16 19:51:31,336] Trial 2 finished with value: 2230.636841373205 and parameters: {'learning_rate': 0.06018222257517117, 'max

In [43]:
# Train the final model with best parameters
best_params = study.best_params
# Remember to add the non-tuned parameters back in
best_params.update({'tree_method': 'hist', 'device': 'cuda', 'n_estimators': 2000})

final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_train, y_train_log, eval_set=[(X_val, y_val_log)], verbose=100)

# Now run your original calculate_metrics function on this final_model

[0]	validation_0-rmse:1.10312
[100]	validation_0-rmse:0.36465
[200]	validation_0-rmse:0.21092
[300]	validation_0-rmse:0.18754
[400]	validation_0-rmse:0.18348
[500]	validation_0-rmse:0.18263
[600]	validation_0-rmse:0.18269
[700]	validation_0-rmse:0.18299
[800]	validation_0-rmse:0.18361
[900]	validation_0-rmse:0.18421
[1000]	validation_0-rmse:0.18485
[1100]	validation_0-rmse:0.18554
[1200]	validation_0-rmse:0.18611
[1300]	validation_0-rmse:0.18670
[1400]	validation_0-rmse:0.18733
[1500]	validation_0-rmse:0.18780
[1600]	validation_0-rmse:0.18836
[1700]	validation_0-rmse:0.18886
[1800]	validation_0-rmse:0.18937
[1900]	validation_0-rmse:0.18986
[1999]	validation_0-rmse:0.19033


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,0.6107605025348347
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.9046189573755721
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",'cuda'
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [44]:
y_train_pred_log = final_model.predict(X_train)
y_val_pred_log = final_model.predict(X_val)
y_test_pred_log = final_model.predict(X_test)

# Convert back to original price scale
y_train_actual = np.expm1(y_train_log)
y_train_pred = np.expm1(y_train_pred_log)

y_val_actual = np.expm1(y_val_log)
y_val_pred = np.expm1(y_val_pred_log)

In [45]:
train_metrics = calculate_metrics(y_train_actual, y_train_pred, "Training")
val_metrics = calculate_metrics(y_val_actual, y_val_pred, "Validation")


Training Metrics:
  RMSE: $1,788.74
  MAE:  $518.19
  R²:   0.9542
  MAPE: 8.83%

Validation Metrics:
  RMSE: $2,223.92
  MAE:  $640.14
  R²:   0.9286
  MAPE: 10.83%
