# Tuning Hyperparameters


## 1. Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. Data Loading and Preprocessing

Load the training data and apply initial filtering to remove invalid entries.

In [None]:
# Read the training set and keep only rows whose x, y and z are strictly
# positive and whose y is at most 10 (the invalid dimensions found in the EDA).
df = pd.read_csv("../data/CW1_train.csv").loc[
    lambda raw: (raw["x"] > 0) & (raw["y"] > 0) & (raw["z"] > 0) & (raw["y"] <= 10)
]

Separate features and target variable.

In [None]:
# The "outcome" column is the regression target; every other column is a feature.
Y = df["outcome"]
X = df.drop(columns="outcome")

## 3. Feature Engineering

Create derived features: volume (x × y × z) and log-transformed price and carat. The raw x, y, z, carat and price columns are dropped afterwards.

In [None]:
from sklearn.preprocessing import FunctionTransformer

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with derived features in place of the raw measurements.

    Adds ``volume`` (``x * y * z``), ``log_price`` (``log1p(price)``) and
    ``log_carat`` (``log1p(carat)``), then drops the ``x``, ``y``, ``z``,
    ``carat`` and ``price`` columns they were computed from. The input frame
    is not modified.
    """
    raw_columns = ["x", "y", "z", "carat", "price"]

    # assign() works on a copy, so the caller's frame is left untouched.
    enriched = df.assign(
        volume=df["x"] * df["y"] * df["z"],
        log_price=np.log1p(df["price"]),
        log_carat=np.log1p(df["carat"]),
    )
    return enriched.drop(columns=raw_columns)

# Wrap feature_engineering so it can be used as a pipeline step. Input
# validation is switched off (validate=False) and no inverse check is run.
feature_engineering_transformer = FunctionTransformer(
    func=feature_engineering, check_inverse=False, validate=False
)

## 4. Preprocessing Pipeline

Build a preprocessor that standardizes numerical features and encodes categorical features using ordinal encoding with predefined orderings.

In [None]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_selector

# Category orderings handed to OrdinalEncoder: the position of each label in
# its list becomes the integer code for that label (first entry -> 0).
cut_order = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
colour_order = ["J", "I", "H", "G", "F", "E", "D"]
clarity_order = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

def build_preprocessor() -> ColumnTransformer:
    """Build a fresh, unfitted column-wise preprocessor.

    Numeric columns (selected by dtype) are standardized; the ``cut``,
    ``color`` and ``clarity`` columns are ordinal-encoded using the
    module-level orderings above.
    """
    scale_numeric = (
        "num",
        StandardScaler(),
        make_column_selector(dtype_include=np.number),
    )
    encode_categorical = (
        "cat",
        # One ordering per column, in the same order as the column list below.
        OrdinalEncoder(categories=[cut_order, colour_order, clarity_order]),
        ["cut", "color", "clarity"],
    )
    return ColumnTransformer(transformers=[scale_numeric, encode_categorical])


Create a pipeline that combines feature engineering, preprocessing, and the model.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator

def build_pipeline(model: BaseEstimator) -> Pipeline:
    """Chain feature engineering, preprocessing and ``model`` into one pipeline.

    The final step is named ``"model"``, so its hyperparameters are addressed
    with the ``model__`` prefix in search spaces.
    """
    stages = [
        ("feature_engineering", feature_engineering_transformer),
        ("preprocessor", build_preprocessor()),
        ("model", model),
    ]
    return Pipeline(steps=stages)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor


In [None]:
# Shared 5-fold splitter used by both the randomized and the grid searches;
# shuffling with a fixed seed makes the fold assignment reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Broad search spaces sampled by the randomized search.
# Keys carry the "model__" prefix so they target the "model" step of the pipeline.

# Random forest search space.
rf_params = {
    "model__n_estimators": np.arange(100, 600, 50),  # 100, 150, ..., 550
    "model__max_depth": np.arange(3, 20),  # 3 .. 19
    "model__min_samples_split": np.arange(2, 10),  # 2 .. 9
    "model__min_samples_leaf": np.arange(1, 6),  # 1 .. 5
}

# Gradient boosting search space.
gb_params = {
    "model__n_estimators": np.arange(100, 600, 50),  # 100, 150, ..., 550
    "model__learning_rate": np.linspace(0.01, 0.3, 20),  # 20 evenly spaced values, endpoints included
    "model__max_depth": np.arange(2, 10),  # 2 .. 9
    "model__subsample": np.linspace(0.6, 1.0, 10),  # 10 evenly spaced values, endpoints included
}

# XGBoost search space.
xgb_params = {
    "model__n_estimators": np.arange(100, 600, 50),  # 100, 150, ..., 550
    "model__learning_rate": np.linspace(0.01, 0.3, 20),  # 20 evenly spaced values, endpoints included
    "model__max_depth": np.arange(3, 12),  # 3 .. 11
    "model__subsample": np.linspace(0.6, 1.0, 10),  # 10 evenly spaced values, endpoints included
    "model__colsample_bytree": np.linspace(0.6, 1.0, 10),  # 10 evenly spaced values, endpoints included
}

In [None]:
def run_random_search(model, param_dist, name):
    """Run the broad randomized hyperparameter search for one model.

    Wraps ``model`` in the full pipeline, samples 40 candidates from
    ``param_dist``, scores each by cross-validated R² on the module-level
    ``X`` / ``Y`` with the shared ``cv`` splitter, and prints the best score
    and parameters under the label ``name``.

    Returns the fitted ``RandomizedSearchCV`` object.
    """
    search_options = {
        "param_distributions": param_dist,
        "n_iter": 40,
        "scoring": "r2",
        "cv": cv,
        "random_state": 42,
        "n_jobs": -1,
        "verbose": 1,
    }
    searcher = RandomizedSearchCV(build_pipeline(model), **search_options)
    searcher.fit(X, Y)

    print(f"\n{name} Random Search Best R²: {searcher.best_score_:.4f}")
    print(f"{name} Best Params: {searcher.best_params_}")

    return searcher


In [None]:
import numbers

# Valid (lower, upper) limits for hyperparameters tuned in this notebook, keyed
# by the estimator parameter name without its pipeline-step prefix.
# None means "no limit on that side".
_PARAM_BOUNDS = {
    "min_samples_split": (2, None),
    "subsample": (None, 1.0),
    "colsample_bytree": (None, 1.0),
}


def build_local_grid(best_params):
    """Build a small grid of neighbouring values around each best parameter.

    Parameters
    ----------
    best_params : dict
        Mapping of parameter name to its best value, e.g. the
        ``best_params_`` of a fitted randomized search.

    Returns
    -------
    dict
        Mapping of each parameter name to a list of candidate values:

        * integers: the value and its neighbours at roughly ±20 % (at least
          ±1), with the lower neighbour never below 1 (or the parameter's own
          lower limit in ``_PARAM_BOUNDS``);
        * floats: the value and its neighbours at ×0.8 and ×1.2 (rounded to
          3 decimals), with neighbours clipped to the limits in
          ``_PARAM_BOUNDS``;
        * anything else (including booleans): the value on its own.

        Duplicate candidates produced by clipping are removed, and numeric
        candidates are returned as plain Python ``int`` / ``float``.
    """
    grid = {}

    for param, value in best_params.items():
        # "model__max_depth" -> "max_depth", so limits apply whatever the step name is.
        lower, upper = _PARAM_BOUNDS.get(param.rsplit("__", 1)[-1], (None, None))

        if isinstance(value, bool):
            # bool is a subclass of int; arithmetic neighbours make no sense for it.
            candidates = [value]
        elif isinstance(value, numbers.Integral):
            # numbers.Integral also matches NumPy integers (values drawn from
            # np.arange are np.int64, which is NOT an instance of int).
            centre = int(value)
            step = max(1, abs(centre) // 5)
            floor = 1 if lower is None else lower
            below = max(floor, centre - step)
            above = centre + step if upper is None else min(upper, centre + step)
            candidates = [below, centre, above]
        elif isinstance(value, numbers.Real):
            centre = float(value)
            below = round(centre * 0.8, 3)
            above = round(centre * 1.2, 3)
            if lower is not None:
                below = max(lower, below)
            if upper is not None:
                above = min(upper, above)
            candidates = [below, centre, above]
        else:
            candidates = [value]

        # Clipping can collapse neighbours onto the centre; drop repeats while
        # keeping the original order so no candidate is fitted twice.
        grid[param] = list(dict.fromkeys(candidates))

    return grid


In [None]:
def run_grid_search(random_search, name):
    """Refine a finished randomized search with an exhaustive local grid search.

    Builds a grid around ``random_search.best_params_``, prints it, then
    evaluates every combination on the same base estimator that was given to
    the randomized search, using cross-validated R² on the module-level
    ``X`` / ``Y`` with the shared ``cv`` splitter. The best score and
    parameters are printed under the label ``name``.

    Returns the fitted ``GridSearchCV`` object.
    """
    neighbourhood = build_local_grid(random_search.best_params_)
    print(f"\n{name} Local Grid:")
    print(neighbourhood)

    refined = GridSearchCV(
        estimator=random_search.estimator,
        param_grid=neighbourhood,
        scoring="r2",
        cv=cv,
        verbose=1,
        n_jobs=-1,
    )
    refined.fit(X, Y)

    print(f"\n{name} Grid Search Best R²: {refined.best_score_:.4f}")
    print(f"{name} Fine Params: {refined.best_params_}")

    return refined


In [None]:
# Stage 1: broad randomized search for each model family, using the search
# spaces defined earlier. Each model is seeded with random_state=42.
rf_random = run_random_search(RandomForestRegressor(random_state=42), rf_params, "Random Forest")
gb_random = run_random_search(GradientBoostingRegressor(random_state=42), gb_params, "Gradient Boosting")
xgb_random = run_random_search(XGBRegressor(random_state=42, verbosity=0), xgb_params, "XGBoost")

# Stage 2: exhaustive grid search in the neighbourhood of each stage-1 optimum.
rf_grid = run_grid_search(rf_random, "Random Forest")
gb_grid = run_grid_search(gb_random, "Gradient Boosting")
xgb_grid = run_grid_search(xgb_random, "XGBoost")


In [None]:
# One row per model with its best cross-validated R² from the grid search,
# ordered from best to worst.
final_results = pd.DataFrame(
    [
        ("Random Forest", rf_grid.best_score_),
        ("Gradient Boosting", gb_grid.best_score_),
        ("XGBoost", xgb_grid.best_score_),
    ],
    columns=["Model", "Fine-tuned R²"],
).sort_values(by="Fine-tuned R²", ascending=False)

final_results


In [None]:
# Horizontal bar chart comparing the tuned models' cross-validated R².
plt.figure(figsize=(10,6))
plt.barh(final_results["Model"], final_results["Fine-tuned R²"])
plt.xlabel("Fine-tuned Cross-Validated R²")
plt.title("Final Tuned Model Comparison")
# barh draws the first row at the bottom; flip the axis so the first row of
# final_results (sorted by descending score) appears at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig("../figures/final_tuned_models.png", dpi=300)
plt.show()
