In [None]:
# -------------------------------------------
# 1. LOAD LIBRARIES AND CONFIG
# -------------------------------------------

import pandas as pd
import time


# Import config paths so they are accessible to module.
# The project directory is pushed to the front of sys.path so that
# `import config` resolves to the project's config.py before any other
# module named `config` on the search path.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not find config.py on another machine unless the path is adjusted.
import sys
sys.path.insert(0, "C:/Misc/binary_eval")
import config
import importlib
importlib.reload(config)    # Reload config to ensure latest edits are active

# The from-import must come after the reload so the names below are bound to
# the values of the freshly reloaded module.
from config import DATASETS_DIR, TUNED_MODELS_DIR, PROCESSED_DATA_DIR, OUTPUTS_DIR

In [8867]:
# -------------------------------------------
# 2. LOAD MODELS
# -------------------------------------------

import json


# Read the model registry (AVAILABLE_MODELS) that an earlier notebook wrote to
# OUTPUTS_DIR/pre_tuned_models/available_models.json
# (the original comment names 04_prepare_model_baselines.ipynb as the producer).
models_path = OUTPUTS_DIR.joinpath("pre_tuned_models", "available_models.json")

with models_path.open("r") as models_file:
    AVAILABLE_MODELS = json.load(models_file)


In [None]:
# -------------------------------------------
# 3. OBTAIN MODEL NAME AND DATASET NAME
# -------------------------------------------

# Read back the model and dataset choice persisted by the previous notebook
# in OUTPUTS_DIR/last_selection.json.
# NOTE(review): the original comment names 05_prepare_model_baselines.ipynb as
# the producer, while the model-loading cell refers to
# 04_prepare_model_baselines.ipynb -- confirm which notebook number is correct.
selection_path = OUTPUTS_DIR / "last_selection.json"
with selection_path.open("r") as selection_file:
    selection = json.load(selection_file)

selected_model = selection["selected_model"]
dataset_name = selection["dataset_name"]

print(
    f"Selected model: {selected_model}",
    f"Selected dataset: {dataset_name}",
    sep="\n",
)

In [None]:
# -------------------------------------------
# 4. LOAD DATASET AND SPLIT FEATURES/TARGET
# -------------------------------------------

# Name of the label column; per the project convention every dataset uses "target".
target_col = "target"

# Locate and read the transformed dataset for the selected dataset name.
transformed_path = PROCESSED_DATA_DIR / f"{dataset_name}_transformed.parquet"
df = pd.read_parquet(transformed_path)

# Features are every column except the label; the label is kept as its own Series.
X = df.drop(target_col, axis="columns")
y = df[target_col]

print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    sep="\n",
)

In [None]:
# -------------------------------------------
# 4a. SANITIZE COLUMN NAMES BECAUSE ONE-HOT ENCODING CAN CREATE PROBLEMS
# -------------------------------------------


# Sanitize column names after one-hot encoding to prevent model crashes.
# LightGBM and similar libraries require feature names to be ASCII, printable, and free of control characters.
# One-hot encoding creates new columns using raw categorical values, which may contain spaces, punctuation,
# Unicode, or invisible/control characters. Sanitizing ensures robust, error-free model training.

# Problematic characters in column names that could cause model crashes include:
#  1. Invisible Unicode characters (e.g., zero-width space, non-breaking space)
#  2. Non-ASCII symbols
#  3. Hidden formatting or control characters


import re

def sanitize_column_names(columns):
    """Return ASCII-safe, unique column names derived from ``columns``.

    Every character that is not an ASCII letter, ASCII digit, or underscore is
    replaced with an underscore. An explicit ASCII character class is used
    because ``\\w`` is Unicode-aware for ``str`` patterns and would let
    non-ASCII letters and digits (e.g. ``é``) through unchanged.

    Different raw names can collapse to the same sanitized name (``"a b"`` and
    ``"a-b"`` both become ``"a_b"``). To keep the resulting names unique, a
    later collision gets a numeric suffix (``_1``, ``_2``, ...); the first
    occurrence keeps the plain sanitized name.

    Parameters
    ----------
    columns : iterable
        Column labels; each one is converted with ``str()`` before sanitizing.

    Returns
    -------
    list of str
        Sanitized names, in the same order and of the same length as the input.
    """
    sanitized = []
    seen = set()
    for col in columns:
        base = re.sub(r'[^A-Za-z0-9_]', '_', str(col))
        name = base
        suffix = 0
        # Bump the suffix until the candidate is unused, so a generated name
        # can never clash with an earlier real or generated one.
        while name in seen:
            suffix += 1
            name = f"{base}_{suffix}"
        seen.add(name)
        sanitized.append(name)
    return sanitized

# Apply to your features
X.columns = sanitize_column_names(X.columns)
print("Sanitized columns:", X.columns.tolist())

# Sanity check: report every column whose name still contains a character
# outside printable ASCII (code points 32-126). Each offending column is
# reported once, no matter how many such characters it contains.
for col in X.columns:
    if any(ord(c) < 32 or ord(c) > 126 for c in col):
        print(f"Non-printable character found in column: {col!r}")

In [None]:
# -------------------------------------------
# 5. SET UP HYPERPARAMETER SEARCH (GridSearchCV)
# -------------------------------------------

from sklearn.model_selection import GridSearchCV
import importlib
from sklearn.model_selection import ParameterGrid

# Look up the registry entry for the selected model and its search space.
model_info = AVAILABLE_MODELS[selected_model]
param_grid = model_info["param_grid"]

# Resolve the estimator class from the module path and class name stored in
# the registry, then build the base estimator from its default parameters.
model_module = importlib.import_module(model_info["module"])
ModelClass = getattr(model_module, model_info["class"])
model = ModelClass(**model_info["default_params"])

# Number of distinct parameter settings the grid expands to.
n_param_combinations = len(ParameterGrid(param_grid))

# Search configuration:
#   cv=5       -> 5-fold cross-validation
#   scoring    -> taken from the registry entry, falling back to "accuracy"
#   n_jobs=-1  -> use all available cores; the original note says some models
#                 (e.g., LR) need this set to 1 instead
#   verbose=2  -> per-fit progress messages
grid_search_options = dict(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=model_info.get("scoring", "accuracy"),
    n_jobs=-1,
    verbose=2,
)
grid_search = GridSearchCV(**grid_search_options)

print("GridSearchCV is set up and ready to run.")
print(f"Number of parameter combinations: {n_param_combinations:,}")

In [None]:
# -------------------------------------------
# 6. RUN HYPERPARAMETER TUNING
# -------------------------------------------

# Show model being tuned.
print(f"Evaluating this model: {model}")
print()


# Upper bound on the number of rows used for tuning. Lower it if RAM is
# insufficient, or set it to None to tune on the entire dataset as-is.
MAX_TUNING_ROWS = 30000

if MAX_TUNING_ROWS is None:
    # Use the entire dataset, in its original row order.
    X_sample = X
    y_sample = y
    sample_size = len(X_sample)
else:
    # Draw a reproducible random sample of at most MAX_TUNING_ROWS rows and
    # pick the matching target values by index label.
    sample_size = min(MAX_TUNING_ROWS, len(X))
    X_sample = X.sample(n=sample_size, random_state=42)
    y_sample = y.loc[X_sample.index]

print(f"\nUsing {sample_size:,} rows for tuning.\n")

# Determine best params and measure time to do that.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments during a long search.
start_time = time.perf_counter()
grid_search.fit(X_sample, y_sample)
end_time = time.perf_counter()
grid_runtime = end_time - start_time

print(f"GridSearchCV runtime: {grid_runtime:.2f} seconds")
print()


# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_estimator = grid_search.best_estimator_

print(best_params)
print(f"Best CV Score: {best_score:.4f}")

In [None]:
# -------------------------------------------
# 7. DISPLAY HYPERPARAMETERS AND TUNING STATUS
# -------------------------------------------

# Get all hyperparameters from the best estimator
all_params = best_estimator.get_params()

# Collect the names of the hyperparameters that were part of the search.
# A parameter grid may be a single dict or a list of dicts, so normalise to a
# list and take the union of all keys.
param_grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)
tuned_keys = set().union(*param_grids)

# Status markers are written as escapes so they survive file re-encoding:
# U+1F7E2 is a large green circle, U+26AA a medium white circle.
TUNED_STATUS = "\U0001F7E2 Tuned"
DEFAULT_STATUS = "\u26AA Default"

print(f"The Hyperparameters and Tuning Values for: {selected_model}")
print()
for param, value in all_params.items():
    status = TUNED_STATUS if param in tuned_keys else DEFAULT_STATUS
    print(f"{param}: {value}   {status}")

In [None]:
# -------------------------------------------
# 8. EXPORT TUNING RESULTS AND METADATA TO JSON
# -------------------------------------------

# Export the tuning results and best parameters to a JSON file in
# the tuned_models directory. This file includes all metadata
# needed for downstream modules to build and instantiate
# the now-tuned model. 


import json
from datetime import datetime

# Prepare results dictionary which is used to populate the JSON results file.
tuning_results = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model_name": selected_model,
    "model_module": model_info["module"],
    "dataset": dataset_name,
    "best_params": best_params,
    "best_score": best_score,
    "param_grid": param_grid,
    "all_params": best_estimator.get_params(),
    "n_features": X.shape[1],
    "n_samples": X.shape[0],
    "grid_runtime": grid_runtime,
    "param_combinations": n_param_combinations,
    "hyper_tune_sample_size": sample_size
}

# Save to outputs/tuned_models for use by 07_run_model_evaluation.ipynb
results_path = TUNED_MODELS_DIR / f"{selected_model}_{dataset_name}_tuning_results.json"
results_path.parent.mkdir(parents=True, exist_ok=True)  # Ensure directory exists

# default=str is a fallback for parameter values that json cannot encode
# natively (e.g. estimator objects or callables). Without it, one such value
# raises TypeError after the search has finished and leaves a truncated file.
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(tuning_results, f, indent=2, default=str)

print(f"Tuning results and metadata saved to: {results_path}")

# Report the parameters of the tuned estimator. `model` is only the base
# estimator handed to GridSearchCV, which fits clones of it, so it still
# carries the untuned defaults.
print("Here are tuned parameters")
print(best_estimator.get_params())