In [2]:
import xgboost as xgb
import sys
print(xgb.__version__)
print(xgb.__file__)
print(sys.executable)

3.1.2
/Users/aaditya.paliwal/Desktop/regression_ml_end2end/.venv/lib/python3.13/site-packages/xgboost/__init__.py
/Users/aaditya.paliwal/Desktop/regression_ml_end2end/.venv/bin/python


In [3]:
# ==============================================
# 1. Imports
# ==============================================
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ==============================================
# 2. Loading Processed Datasets
# ==============================================

train_df = pd.read_csv("/Users/aaditya.paliwal/Desktop/regression_ml_end2end/data/processed/feature_engineered_train.csv", index_col=0)
dev_df = pd.read_csv("/Users/aaditya.paliwal/Desktop/regression_ml_end2end/data/processed/feature_engineered_dev.csv", index_col=0)

# Define target + features
target = "price"
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_dev = dev_df.drop(columns=[target])
y_dev = dev_df[target]

print(X_train.shape)
print(X_dev.shape)


(578916, 38)
(148697, 38)


In [8]:
# ==============================================
# 3. Define Optuna objective function with MLFlow
# ==============================================

def objective(trial):

    # Define params
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    # Start MLFlow Run
    with mlflow.start_run(nested=True):
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_dev)
        rmse = float(np.sqrt(mean_squared_error(y_dev, y_pred)))
        mae = float(mean_absolute_error(y_dev, y_pred))
        r2 = float(r2_score(y_dev, y_pred))

        # Logging hyperparameters and metrics
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    return rmse


In [9]:
# ==============================================
# 4. Run Optuna study with MLflow
# ==============================================

# Force MLflow to always use the root project mlruns folder

mlflow.set_tracking_uri("/Users/aaditya.paliwal/Desktop/regression_ml_end2end/mlruns")
mlflow.set_experiment("xgboost_optuna_housing")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=15)

print("Number of finished trials: ", len(study.trials))
print("Best params:", study.best_trial.params)

[I 2026-01-05 18:28:07,173] A new study created in memory with name: no-name-70928930-b8e7-4a91-9283-4d5c319947fd
[I 2026-01-05 18:28:08,776] Trial 0 finished with value: 68253.64975099656 and parameters: {'n_estimators': 114, 'max_depth': 8, 'learning_rate': 0.0891818286833785, 'subsample': 0.8522179757500077, 'colsample_bytree': 0.5515047064944292, 'min_child_weight': 8, 'gamma': 3.289194818927856, 'reg_alpha': 0.001481796247474645, 'reg_lambda': 0.9511116273334357, 'booster': 'gbtree'}. Best is trial 0 with value: 68253.64975099656.
[I 2026-01-05 18:28:12,264] Trial 1 finished with value: 74226.45637656005 and parameters: {'n_estimators': 337, 'max_depth': 7, 'learning_rate': 0.026997726710675702, 'subsample': 0.7046831400621236, 'colsample_bytree': 0.9627887949005418, 'min_child_weight': 9, 'gamma': 4.294250301741921, 'reg_alpha': 2.8459009486911034e-07, 'reg_lambda': 2.4368425520816053, 'booster': 'gbtree'}. Best is trial 0 with value: 68253.64975099656.
[I 2026-01-05 18:28:17,221

Number of finished trials:  15
Best params: {'n_estimators': 516, 'max_depth': 8, 'learning_rate': 0.04339896173750895, 'subsample': 0.8702954761680367, 'colsample_bytree': 0.6317478812010022, 'min_child_weight': 10, 'gamma': 2.2005211813104624, 'reg_alpha': 1.1004855582527181e-05, 'reg_lambda': 0.03112611143481403, 'booster': 'gbtree'}


In [14]:
# ==============================================
# 5. Train final model with best params and log to MLflow
# ==============================================
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_dev)

mae = mean_absolute_error(y_dev, y_pred)
rmse = np.sqrt(mean_squared_error(y_dev, y_pred))
r2 = r2_score(y_dev, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

# Log final model
with mlflow.start_run(run_name="best_xgboost_model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.sklearn.log_model(
        sk_model=best_model,
        name="model"
    )


Final tuned model performance:
MAE: 30985.11416698783
RMSE: 67908.66968863898
R²: 0.9643543050601973




In [15]:
print(type(best_model))
print(hasattr(best_model, "_estimator_type"))


<class 'xgboost.sklearn.XGBRegressor'>
False
