In [None]:
# -------------------------------------------
# 1. LOAD LIBRARIES AND CONFIG
# -------------------------------------------

import pandas as pd

import json
from pathlib import Path
from datetime import datetime

# For model evaluation
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import importlib


# Make the project's `config` module importable from this notebook and pull
# in the directory constants used by the cells below.
# NOTE(review): the project root is a hard-coded absolute path; confirm it
# matches the machine this notebook runs on.
import sys

PROJECT_ROOT = "C:/Misc/binary_eval"
# Guard the insert so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import config
import importlib
importlib.reload(config)    # Reload config to ensure latest edits are active

from config import RESULTS_DIR, PROCESSED_DATA_DIR, OUTPUTS_DIR, TUNED_MODELS_DIR

In [None]:
# -------------------------------------------
# 2. SELECT AND LOAD TUNING METADATA
# -------------------------------------------

# Read the model/dataset selection that a previous module persisted to disk.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform default.
with open(OUTPUTS_DIR / "last_selection.json", "r", encoding="utf-8") as f:
    selection = json.load(f)
selected_model = selection["selected_model"]
dataset_name = selection["dataset_name"]

results_path = TUNED_MODELS_DIR / f"{selected_model}_{dataset_name}_tuning_results.json"

# Load the JSON file containing best parameters and metadata
with open(results_path, "r", encoding="utf-8") as f:
    tuning_metadata = json.load(f)

# Print the tuned parameters - merely for confirmation
best_params = tuning_metadata["best_params"]
print("Parameters used to instantiate model:")
print(best_params)

# Merge all_params with best_params so all required parameters are present.
# best_params wins on conflicts. Without the merge, any parameter missing from
# best_params would silently fall back to the library default (the original
# author's note: the solver would become lbfgs even though liblinear was the
# intended default).
params = {**tuning_metadata.get("all_params", {}), **best_params}

# Resolve the estimator class named in the metadata. The fallback module
# matches the one used in section 5, so metadata files without a
# "model_module" entry behave the same in both cells.
model_module = tuning_metadata.get("model_module", "sklearn.ensemble")
ModelClass = getattr(importlib.import_module(model_module), tuning_metadata["model_name"])
model = ModelClass(**params)
print("Model parameters after instantiation:")
print(model.get_params())


# Grid-search bookkeeping that is written to the results sheet in section 10
grid_runtime = tuning_metadata["grid_runtime"]
param_combinations = tuning_metadata["param_combinations"]

print(f"Loaded tuning metadata from: {results_path}")
print("Model name:", tuning_metadata["model_name"] )
print("Best parameters:", tuning_metadata["best_params"] )
print(f"Grid search runtime: {grid_runtime:,.4f} seconds")
print(f"Number of parameter combinations: {param_combinations:,}")

# get_params() returns the full parameter set of the instantiated model,
# i.e. the merged values, not only the tuned ones.
print("Here are tuned parameters")
print(model.get_params())

In [None]:
# -------------------------------------------
# 3. CONFIRM PATHS
# -------------------------------------------

# Show where processed data is expected and whether the parquet file for the
# selected dataset is actually there.
print("PROCESSED_DATA_DIR:", PROCESSED_DATA_DIR)

parquet_name = f"{dataset_name}_transformed.parquet"
dataset_path = PROCESSED_DATA_DIR / parquet_name
for label, value in (
    ("Dataset path:", dataset_path),
    ("Exists:", dataset_path.exists()),
):
    print(label, value)

In [None]:
# -------------------------------------------
# 4. SELECT AND LOAD DATASET
# -------------------------------------------


# Locate the processed parquet file for the selected dataset
parquet_name = f"{dataset_name}_transformed.parquet"
dataset_path = PROCESSED_DATA_DIR / parquet_name

# Read it into a DataFrame
df = pd.read_parquet(dataset_path)

# Summarize what was loaded
print(
    f"Loaded dataset: {dataset_path}",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

In [None]:
# -------------------------------------------
# 5. INSTANTIATE MODEL WITH TUNED PARAMETERS - newest ver
# -------------------------------------------

# Rebuild the estimator described by the tuning_metadata dictionary, which
# was read from the tuned-model JSON file in section
# "2. SELECT AND LOAD TUNING METADATA".

# Where the estimator class lives and what it is called. Metadata without a
# "model_module" entry falls back to sklearn.ensemble.
model_module = tuning_metadata.get("model_module", "sklearn.ensemble")
model_class_name = tuning_metadata["model_name"]
best_params = tuning_metadata["best_params"]

# Start from the full recorded parameter set (when present) and overlay the
# tuned values, so that every required parameter is given explicitly.
params = dict(tuning_metadata.get("all_params", {}))
params.update(best_params)

# Resolve the class by name at runtime
estimator_module = importlib.import_module(model_module)
ModelClass = getattr(estimator_module, model_class_name)

# Instantiate with the merged parameters. Passing best_params alone would let
# scikit-learn fill in its own defaults for anything missing (the original
# author's note: the solver would default to lbfgs, which is not wanted).
model = ModelClass(**params)

print(f"Instantiated {model_class_name} with all these parameters:")
print(params)
print()
print("Tuned parameters (best_params):")
print(best_params)

In [None]:
# -------------------------------------------
# 6. SPLIT FEATURES AND TARGET
# -------------------------------------------

# Every dataset in this benchmark stores its label in a column named "target".
target_col = "target"

# Pull out the label series, then keep everything else as the feature matrix
y = df[target_col]
X = df.drop(columns=[target_col])

print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Target value counts:\n{y.value_counts()}",
    sep="\n",
)

In [None]:
# -------------------------------------------
# 6a. SANITIZE COLUMN NAMES BECAUSE ONE-HOT ENCODING CAN CREATE PROBLEMS
# -------------------------------------------

# Sanitize column names after one-hot encoding to prevent model crashes caused by
# the column names themselves.
# This code was added only because LGBMClassifier crashed on one dataset.

# LightGBM and similar libraries require feature names to be ASCII, printable, and free of control characters.
# One-hot encoding creates new columns using raw categorical values, which may contain spaces, punctuation,
# Unicode, or invisible/control characters. Sanitizing the names ensures robust, error-free model training.

# Problematic characters in column names that could cause model crashes include:
#  1. Invisible Unicode characters (e.g., zero-width space, non-breaking space)
#  2. Non-ASCII symbols
#  3. Hidden formatting or control characters


import re

def sanitize_column_names(columns):
    """Return ASCII-only, unique column names derived from ``columns``.

    Each name is converted with ``str()`` and every character that is not an
    ASCII letter, ASCII digit, or underscore is replaced by ``_``.

    Parameters
    ----------
    columns : iterable
        Column labels (any objects; each is passed through ``str()``).

    Returns
    -------
    list of str
        One sanitized name per input label, in the same order. If two labels
        sanitize to the same text, later ones get a numeric suffix
        (``_1``, ``_2``, ...) so that the result contains no duplicates.
    """
    # An explicit ASCII character class is used instead of \w: on str patterns
    # \w is Unicode-aware and would let non-ASCII letters/digits through.
    cleaned = [re.sub(r'[^A-Za-z0-9_]', '_', str(col)) for col in columns]

    # Different raw labels (e.g. "a b" and "a-b") can collapse to the same
    # sanitized name; make each result unique.
    taken = set()
    next_suffix = {}
    unique_names = []
    for name in cleaned:
        candidate = name
        while candidate in taken:
            next_suffix[name] = next_suffix.get(name, 0) + 1
            candidate = f"{name}_{next_suffix[name]}"
        taken.add(candidate)
        unique_names.append(candidate)
    return unique_names

# Replace the feature column labels with their sanitized versions
X.columns = sanitize_column_names(X.columns)
print("Sanitized columns:", X.columns.tolist())

# Sanity check: flag any column whose name still contains a character outside
# printable ASCII (code points 32-126). Each offending column is reported
# once, no matter how many such characters its name contains.
for col in X.columns:
    if any(ord(c) < 32 or ord(c) > 126 for c in col):
        print(f"Non-printable character found in column: {col!r}")

In [None]:
# -------------------------------------------
# 7. EVALUATE MODEL: CROSS-VALIDATION
# -------------------------------------------

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

# 5-fold stratified CV; shuffling with a fixed seed makes the folds repeatable
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Result key -> scikit-learn scorer name.
# NOTE(review): "roc_auc_ovr" is the one-vs-rest AUC scorer; confirm it is the
# intended choice for this benchmark's targets.
metrics = {
    "accuracy": "accuracy",
    "f1_weighted": "f1_weighted",
    "roc_auc": "roc_auc_ovr"
}

# Score all metrics in one pass. cross_validate fits the model once per fold
# and applies every scorer to that fit, instead of refitting the model for
# each metric separately.
cv_output = cross_validate(model, X, y, cv=cv, scoring=metrics, n_jobs=-1)

# Per-fold scores are returned under "test_<metric key>"
results = {}
for metric_name in metrics:
    scores = cv_output[f"test_{metric_name}"]
    results[metric_name] = {
        "mean": scores.mean(),
        "std": scores.std(),
        "all_scores": scores.tolist()
    }

# Print results
for metric_name, stats in results.items():
    print(f"{metric_name.capitalize()} - Mean: {stats['mean']:.4f}, Std: {stats['std']:.4f}")

In [None]:
# -------------------------------------------
# 8. AUTOMATE PARAMS METADATA COLUMN
# -------------------------------------------

# Build the parameter summary that is stringified into the "params" field of
# this model's row in the benchmark Excel file (one row per model-dataset pair).

# Names to report: the tuning grid's keys when the metadata carries a
# "param_grid", otherwise the keys of best_params.
key_source = tuning_metadata["param_grid"] if "param_grid" in tuning_metadata else best_params
param_grid_keys = list(key_source.keys())

# Look up the tuned value for each name; names absent from best_params map to None
params_metadata = {name: best_params.get(name) for name in param_grid_keys}

print("Params metadata for benchmark run:")
print(params_metadata)

In [None]:
# -------------------------------------------
# 9. MEASURE PEAK RAM USAGE AND THROUGHPUT FOR PREDICTION ONLY
# -------------------------------------------

import time
import psutil
from sklearn.model_selection import train_test_split

# Show model being evaluated.
print(f"Evaluating this model: {model}")
print()

n_loops = 5                 # number of timed predict() passes
max_ram_mb = 0              # highest RSS sample seen, in MB
total_predictions = 0
total_pred_runtime = 0.0    # seconds spent inside predict() across all loops

process = psutil.Process()

# Split data for prediction benchmarking
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model once before prediction loops; only predict() is timed
model.fit(X_train, y_train)

# The number of rows predicted is the same on every pass
n_preds = len(X_test)

for _ in range(n_loops):
    start_ram = process.memory_info().rss / (1024 ** 2)  # MB
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() can be too coarse for a fast predict() call
    # and report an elapsed time of zero.
    pred_start = time.perf_counter()

    y_pred = model.predict(X_test)

    pred_end = time.perf_counter()
    ram_after_pred = process.memory_info().rss / (1024 ** 2)  # MB
    # RSS is sampled just before and just after predict(), so this is the
    # largest observed sample rather than a true in-call peak.
    max_ram_mb = max(max_ram_mb, start_ram, ram_after_pred)
    total_predictions += n_preds
    loop_pred_runtime = pred_end - pred_start
    total_pred_runtime += loop_pred_runtime

# Calculate throughput for prediction only
throughput = total_predictions / total_pred_runtime if total_pred_runtime > 0 else 0

print(f"Peak RAM usage during prediction over {n_loops} loops: {max_ram_mb:.2f} MB")
print(f"Total predictions made: {total_predictions:,.2f}")
print(f"Total cumulative prediction runtime: {total_pred_runtime:.4f} seconds")
print(f"Prediction throughput (predictions/sec): {throughput:,.2f}")

In [None]:
# -------------------------------------------
# 10. RECORD RESULTS TO EXCEL
# -------------------------------------------


# Gather everything that goes into this run's row of the benchmark sheet.
n_rows, n_cols = X.shape
acc_stats = results["accuracy"]
f1_stats = results["f1_weighted"]
auc_stats = results["roc_auc"]

# Identification of the run. "fit_loops" stores n_loops, the loop count set
# in section 9.
run_info = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model_name": model_class_name,
    "dataset": dataset_name,
    "n_samples": n_rows,
    "n_features": n_cols,
    "fit_loops": n_loops,
    "params": str(params_metadata),
}

# Cross-validation scores from section 7
score_info = {
    "accuracy_mean": acc_stats["mean"],
    "accuracy_std": acc_stats["std"],
    "f1_weighted_mean": f1_stats["mean"],
    "f1_weighted_std": f1_stats["std"],
    "auc_mean": auc_stats["mean"],
    "auc_std": auc_stats["std"],
}

# Prediction benchmark figures and grid-search bookkeeping
cost_info = {
    "runtime_total": total_pred_runtime,
    "peak_ram_mb": max_ram_mb,
    "throughput": throughput,
    "grid_runtime": grid_runtime,
    "param_combinations": param_combinations,
}

# Merge in this order so the column order of a freshly created sheet is fixed
results_row = {**run_info, **score_info, **cost_info}

# The workbook holds one row per model-dataset pair; make sure its folder exists
excel_path = RESULTS_DIR / "model_results_benchmark.xlsx"
excel_path.parent.mkdir(parents=True, exist_ok=True)

# Append to the existing sheet when there is one, otherwise start a new one
new_row_frame = pd.DataFrame([results_row])
if excel_path.exists():
    existing = pd.read_excel(excel_path)
    df_results = pd.concat([existing, new_row_frame], ignore_index=True)
else:
    df_results = new_row_frame

df_results.to_excel(excel_path, index=False)

print(f"Results saved to: {excel_path}")