In [1]:
# -------------------------------------------
# 1. LOAD LIBRARIES AND CONFIG
# -------------------------------------------
# Brings in the third-party/stdlib modules used by later cells and the
# project's `config` module, from which the output/data directories are read.

import pandas as pd
from pathlib import Path
import json


import os

# Import config paths so they are accessible to module.
import sys
# NOTE(review): this is an absolute, machine-specific location. The notebook
# can only import `config` where the project lives at exactly this path.
# It must be on sys.path *before* `import config` below.
sys.path.insert(0, "C:/Misc/binary_eval")
import config
import importlib
importlib.reload(config)    # Reload config to ensure latest edits are active

# Bind the names after the reload so they come from the reloaded module.
# The three names are combined with `/` in later cells, so they are presumably
# pathlib.Path objects -- confirm in config.py.
from config import  OUTPUTS_DIR, PROCESSED_DATA_DIR, BASELINE_RESULTS_DIR

This cell defines the AVAILABLE_MODELS dictionary, which lists all machine learning classifiers used for baseline evaluation. Each entry includes the model's class, module, label requirements, hyperparameter search type, scoring metric, default parameters, and grid search options.

This structure enables modular, reproducible benchmarking across diverse datasets and supports automated model selection and tuning.

In [None]:
# -------------------------------------------
# 2. INSTANTIATE MODELS
# -------------------------------------------


def _model_spec(name, module, default_params, param_grid, requires_numeric_labels=False):
    """Build one AVAILABLE_MODELS entry.

    Every entry in this notebook declares search_type "grid" and scoring
    "accuracy", so only the fields that differ between models are passed in.

    Args:
        name: Classifier class name; also used as the registry key.
        module: Importable module path that provides the class.
        default_params: Keyword arguments for the baseline (untuned) model.
        param_grid: Hyperparameter name -> list of candidate values.
        requires_numeric_labels: Value stored under "requires_numeric_labels".

    Returns:
        dict with the keys, in this order: "class", "module",
        "requires_numeric_labels", "search_type", "scoring",
        "default_params", "param_grid".
    """
    return {
        "class": name,
        "module": module,
        "requires_numeric_labels": requires_numeric_labels,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": default_params,
        "param_grid": param_grid,
    }


# Registry of every classifier available for baseline evaluation, keyed by
# class name. Insertion order is the order shown in the printed model list.
AVAILABLE_MODELS = {
    spec["class"]: spec
    for spec in (
        # ---- Tree-based models ----
        _model_spec(
            "DecisionTreeClassifier",
            "sklearn.tree",
            default_params={"random_state": 42},
            param_grid={
                "criterion": ["gini", "entropy"],
                "max_depth": [None, 10, 20],
                "min_samples_split": [2, 5],
                "min_samples_leaf": [1, 3],
                "max_features": [None],
            },
        ),
        _model_spec(
            "RandomForestClassifier",
            "sklearn.ensemble",
            default_params={"random_state": 42, "n_jobs": -1},
            param_grid={
                "n_estimators": [100, 300],
                "max_depth": [None, 30],
                "min_samples_split": [2, 10],
                "min_samples_leaf": [1, 5],
                "max_features": ["sqrt"],
            },
        ),
        _model_spec(
            "ExtraTreesClassifier",
            "sklearn.ensemble",
            default_params={"random_state": 42, "n_jobs": -1},
            param_grid={
                "n_estimators": [100, 300],
                "max_depth": [None, 10, 30],
                "min_samples_split": [2, 10],
                "min_samples_leaf": [1, 5],
                "max_features": ["sqrt"],
            },
        ),
        # ---- Boosting models ----
        _model_spec(
            "GradientBoostingClassifier",
            "sklearn.ensemble",
            default_params={
                "random_state": 42,
                "subsample": 0.8,
                "max_depth": 5,
                "max_features": "sqrt",
            },
            param_grid={"n_estimators": [100]},
        ),
        _model_spec(
            "LGBMClassifier",
            "lightgbm",
            requires_numeric_labels=True,
            default_params={"random_state": 42, "n_jobs": -1, "verbose": -1},
            param_grid={
                "n_estimators": [100],
                "num_leaves": [15, 31],
            },
        ),
        _model_spec(
            "XGBClassifier",
            "xgboost",
            requires_numeric_labels=True,
            default_params={
                "random_state": 42,
                "use_label_encoder": False,
                "eval_metric": "mlogloss",
                "n_jobs": -1,
                "verbosity": 0,
            },
            param_grid={
                "n_estimators": [100, 300],
                "learning_rate": [0.01, 0.1],
                "max_depth": [3, 7],
                "subsample": [0.8, 1.0],
            },
        ),
        # ---- Linear models ----
        _model_spec(
            "LogisticRegression",
            "sklearn.linear_model",
            default_params={"random_state": 42, "solver": "liblinear"},
            param_grid={
                "penalty": ["l1", "l2"],
                "C": [0.01, 0.1, 1, 10, 100],
            },
        ),
        _model_spec(
            "SGDClassifier",
            "sklearn.linear_model",
            default_params={"random_state": 42, "n_jobs": -1, "max_iter": 1000},
            param_grid={
                "loss": ["hinge", "log_loss"],
                "penalty": ["l2", "elasticnet"],
                "alpha": [0.0001, 0.001],
                "max_iter": [1000, 2000],
            },
        ),
        # ---- Support vector machines ----
        _model_spec(
            "LinearSVC",
            "sklearn.svm",
            default_params={"random_state": 42, "max_iter": 1000},
            param_grid={
                "C": [0.1, 1, 10],
                "loss": ["hinge", "squared_hinge"],
            },
        ),
        _model_spec(
            "SVC",
            "sklearn.svm",
            default_params={"random_state": 42},
            param_grid={
                "C": [0.1, 1, 10],
                "kernel": ["linear", "rbf"],
                "gamma": ["scale", "auto"],
                "max_iter": [-1],
            },
        ),
        # ---- Instance-based / probabilistic models ----
        _model_spec(
            "KNeighborsClassifier",
            "sklearn.neighbors",
            default_params={},
            param_grid={
                "n_neighbors": [3, 5, 7],
                "weights": ["uniform", "distance"],
                "metric": ["euclidean", "manhattan"],
                "p": [1, 2],
            },
        ),
        _model_spec(
            "GaussianNB",
            "sklearn.naive_bayes",
            default_params={},
            param_grid={"var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]},
        ),
        # ---- Discriminant analysis ----
        _model_spec(
            "LinearDiscriminantAnalysis",
            "sklearn.discriminant_analysis",
            default_params={},  # svd w/ shrinkage will crash
            param_grid={
                "solver": ["lsqr", "eigen"],
                "shrinkage": ["auto"],  # Only used with 'lsqr' and 'eigen'
            },
        ),
        _model_spec(
            "QuadraticDiscriminantAnalysis",
            "sklearn.discriminant_analysis",
            default_params={},
            param_grid={
                "reg_param": [0.0, 0.01, 0.1, 0.5],
                "store_covariance": [False],  # Set to True only if you need access to covariances
            },
        ),
        # ---- Neural network ----
        _model_spec(
            "MLPClassifier",
            "sklearn.neural_network",
            default_params={"random_state": 42, "max_iter": 1000},
            param_grid={
                "hidden_layer_sizes": [(50,), (100,)],
                "activation": ["relu", "tanh"],
                "solver": ["adam", "sgd"],
                "learning_rate_init": [0.001, 0.01],
            },
        ),
    )
}




# Persist the registry as JSON at outputs/pre_tuned_models/available_models.json.
models_path = OUTPUTS_DIR / "pre_tuned_models" / "available_models.json"
models_path.parent.mkdir(parents=True, exist_ok=True)  # create the folder on first run
models_path.write_text(json.dumps(AVAILABLE_MODELS, indent=2))

# Numbered menu of model names; one of these is pasted into `selected_model` below.
print("Available Models:")
for position, name in enumerate(AVAILABLE_MODELS, start=1):
    print(f"{position}. {name}")

In [None]:
# -------------------------------------------
# 3. SELECT MODEL TO EVALUATE
# -------------------------------------------

# Modules 05, 06, and 07 are run sequentially to evaluate one model on one dataset.
# This approach was chosen because some models took hours to run on some datasets.

# ---------------- CHOOSE MODEL TO EVALUATE ----------------
selected_model = "MLPClassifier"   # <-- Copy/paste from printed list above
#-----------------------------------------------------------
#  CHANGE MODEL FOR EACH ROUND: one round is executing the 05, 06, and 07 modules.

# Fail fast on a typo: without this check a misspelled name only surfaces
# later, when the cross-validation cell looks it up in AVAILABLE_MODELS.
if selected_model not in AVAILABLE_MODELS:
    raise KeyError(
        f"Unknown model {selected_model!r}. "
        f"Choose one of: {', '.join(AVAILABLE_MODELS)}"
    )

In [None]:
# -------------------------------------------
# 4. LOAD TRANSFORMED DATASET
# -------------------------------------------
# -------------------- SELECT THE DATASET --------------------
# Base name only: the "_transformed.parquet" suffix is appended below,
# so do not include ".csv" (or any other extension) here.
dataset_name = "dataset1"  # <-- Replace with actual dataset name. Exclude extension ".csv"

# Location of the transformed parquet file inside the processed-data folder.
transformed_path = PROCESSED_DATA_DIR / f"{dataset_name}_transformed.parquet"

# Read the whole transformed dataset into memory.
df = pd.read_parquet(transformed_path)

print(
    f"Loaded transformed dataset: {transformed_path}",
    f"Shape: {df.shape}",
    sep="\n",
)

In [None]:
# -------------------------------------------
# 5. SPLIT FEATURES AND TARGET
# -------------------------------------------

# Name of the label column inside `df`.
target_col = "target"  # <-- Replace with actual target column name if needed

# y is the label column; X is every remaining column (df itself is left untouched).
y = df[target_col]
X = df.drop(columns=[target_col])

# Diagnostics: quick look at the labels, then the shapes of both parts.
for caption, detail in (
    ("Target preview:", y.head()),
    ("Unique values:", y.unique()),
    ("Target dtype:", y.dtype),
):
    print(caption, detail)
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# -------------------------------------------
# 5a. SANITIZE COLUMN NAMES BECAUSE ONE-HOT ENCODING CAN CREATE PROBLEMS
# -------------------------------------------


# Sanitize column names after one-hot encoding to prevent model crashes.
# LightGBM and similar libraries require feature names to be ASCII, printable, and free of control characters.
# One-hot encoding creates new columns using raw categorical values, which may contain spaces, punctuation,
# Unicode, or invisible/control characters. Sanitizing ensures robust, error-free model training.

# Problematic characters in column names that could cause model crashes include:
#  1. Invisible Unicode characters (e.g., zero-width space, non-breaking space)
#  2. Non-ASCII symbols
#  3. Hidden formatting or control characters


import re

def sanitize_column_names(columns):
    """Return ASCII-safe, unique feature names for ``columns``.

    Each name is converted with ``str()`` and every character outside
    ``A-Z``, ``a-z``, ``0-9`` and ``_`` is replaced by an underscore.
    An explicit ASCII class is used because ``\\w`` is Unicode-aware for
    ``str`` patterns in Python 3 and would let non-ASCII letters through.

    Different raw names can collapse to the same sanitized name (for example
    ``"a b"`` and ``"a-b"``). Later duplicates get a numeric suffix
    (``_1``, ``_2``, ...) so the result never contains repeated names.

    Args:
        columns: Iterable of column labels (any type accepted by ``str()``).

    Returns:
        list[str]: Sanitized names, same length and order as ``columns``.
    """
    sanitized = []
    taken = set()
    for col in columns:
        base = re.sub(r'[^A-Za-z0-9_]', '_', str(col))
        candidate = base
        suffix = 0
        # Keep bumping the suffix until the name is unused; this also covers
        # the case where "<base>_<n>" is itself already taken.
        while candidate in taken:
            suffix += 1
            candidate = f"{base}_{suffix}"
        taken.add(candidate)
        sanitized.append(candidate)
    return sanitized

# Rename the feature columns in place with their sanitized versions.
X.columns = sanitize_column_names(X.columns)
print("Sanitized columns:", X.columns.tolist())

# Post-check: report any character outside printable ASCII (codes 32..126)
# that is still present in a column name, one message per offending character.
for col in X.columns:
    leftovers = [ch for ch in col if not 32 <= ord(ch) <= 126]
    for _ in leftovers:
        print(f"Non-printable character found in column: {col!r}")

In [None]:
# -------------------------------------------
# 6. CROSS-VALIDATION: ACCURACY, F1, ROC AUC
# -------------------------------------------

from sklearn.model_selection import StratifiedKFold, cross_validate
import importlib

# ---- Instantiate the selected model ----
# The registry stores the class and module as strings, so the class is
# resolved dynamically and built with its baseline (untuned) parameters.
model_info = AVAILABLE_MODELS[selected_model]
model_module = importlib.import_module(model_info["module"])
ModelClass = getattr(model_module, model_info["class"])
model = ModelClass(**model_info["default_params"])

# Show model being evaluated.
print(f"Evaluating this model: {model}")
print()

# ---- Set up StratifiedKFold ----
# Fixed seed: the same five folds are produced on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---- Choose the ROC AUC scorer ----
# "roc_auc_ovr" needs predict_proba, which LinearSVC, SVC (probability=False)
# and SGDClassifier (hinge loss) do not provide, so their AUC could only come
# back as NaN. For a two-class target, "roc_auc" is used instead: it accepts
# decision_function scores as well as probabilities. Targets with more than
# two classes keep the one-vs-rest scorer.
auc_scoring = "roc_auc" if y.nunique() == 2 else "roc_auc_ovr"

# ---- Compute metrics ----
# Result name -> scikit-learn scoring string.
metrics = {
    "accuracy": "accuracy",
    "f1_weighted": "f1_weighted",
    "roc_auc": auc_scoring
}

# One cross_validate call scores all metrics on the same fitted fold models,
# so the model is trained 5 times instead of 15 (3 metrics x 5 folds).
cv_output = cross_validate(model, X, y, cv=cv, scoring=metrics, n_jobs=-1)

results = {}
for metric_name in metrics:
    scores = cv_output[f"test_{metric_name}"]
    results[metric_name] = {
        "mean": scores.mean(),
        "std": scores.std(),
        "all_scores": scores.tolist()
    }

# ---- Print results ----
for metric_name, stats in results.items():
    print(f"{metric_name.capitalize()} - Mean: {stats['mean']:.4f}, Std: {stats['std']:.4f}")

In [None]:
# -------------------------------------------
# 7 CONSTRUCT PARAMS METADATA COLUMN
# -------------------------------------------

# Builds the "params" metadata written to model_results_baseline.xlsx.
#
# Only the hyperparameters that appear in the model's param_grid are
# recorded. Their values are read from model.get_params(), so the record
# shows what the baseline (untuned) model actually uses, including library
# defaults that were never set explicitly in default_params. A key that the
# model does not report is stored as None.

model_info = AVAILABLE_MODELS[selected_model]
default_params = model_info["default_params"]
param_grid_keys = list(model_info["param_grid"])

# Snapshot the model's parameters once, then pick out the tuned keys.
effective_params = model.get_params()
params_metadata = {}
for key in param_grid_keys:
    params_metadata[key] = effective_params.get(key)

print("Params metadata for baseline run:")
print(params_metadata)

In [None]:
# -------------------------------------------
# 8. MEASURE PEAK RAM USAGE AND THROUGHPUT FOR PREDICTION ONLY
# -------------------------------------------

# Turn off all other apps to reduce interruptions and make
# the benchmarking results more stable and reproducible.

import time
import psutil

# Show model being evaluated.
print(f"Evaluating this model: {model}")
print()


n_loops = 5                 # number of timed predict() passes over X_test
max_ram_mb = 0              # largest RSS sample seen so far, in MB
total_predictions = 0       # rows predicted, summed over all passes
total_pred_runtime = 0.0    # seconds spent inside predict(), summed over all passes

process = psutil.Process()  # handle to the current (kernel) process

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model once before prediction loops
model.fit(X_train, y_train)

for i in range(n_loops):
    start_ram = process.memory_info().rss / (1024 ** 2)  # MB

    # perf_counter() is the high-resolution clock intended for timing short
    # durations. time.time() can be too coarse for a fast predict() call,
    # which would leave the runtime at 0 and the reported throughput at 0.
    pred_start = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_end = time.perf_counter()

    ram_after_pred = process.memory_info().rss / (1024 ** 2)  # MB
    # RSS is sampled only before and after predict(), so a short-lived spike
    # inside the call is not captured; this is the largest sampled value.
    max_ram_mb = max(max_ram_mb, start_ram, ram_after_pred)
    n_preds = len(X_test)
    total_predictions += n_preds
    loop_pred_runtime = pred_end - pred_start    # runtime of this pass only
    total_pred_runtime += loop_pred_runtime


# Calculate throughput for prediction only (guard against a zero runtime)
throughput = total_predictions / total_pred_runtime if total_pred_runtime > 0 else 0

print(f"Peak RAM usage during prediction over {n_loops} loops: {max_ram_mb:.2f} MB")
print(f"Total predictions made: {total_predictions:,}")  # integer count, no decimals
print(f"Total cumulative prediction runtime: {total_pred_runtime:.4f} seconds")
print(f"Prediction throughput (predictions/sec): {throughput:,.2f}")

In [None]:
# -------------------------------------------
# 8a. DISPLAY ACTUAL HYPERPARAMETERS
# -------------------------------------------


# List every hyperparameter reported by the model (explicit settings and
# library defaults alike), one "name: value" line each.
reported_params = model.get_params()
print(f"Actual hyperparameters for {selected_model}:")
for param in reported_params:
    print(f"{param}: {reported_params[param]}")

In [None]:
# -------------------------------------------
# 9. EXPORT BASELINE (PRE-TUNING) RESULTS TO EXCEL
# -------------------------------------------

import pandas as pd
from datetime import datetime

# --- Assemble the single results row (insertion order = column order) ---
n_rows, n_cols = X.shape
results_row = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model_name": selected_model,
    "dataset": dataset_name,
    "n_samples": n_rows,
    "n_features": n_cols,
    "fit_loops": n_loops,
    "params": str(params_metadata),
}
# Cross-validation summaries: (key in `results`, column prefix in the sheet).
for results_key, column_prefix in (
    ("accuracy", "accuracy"),
    ("f1_weighted", "f1_weighted"),
    ("roc_auc", "auc"),
):
    results_row[f"{column_prefix}_mean"] = results[results_key]["mean"]
    results_row[f"{column_prefix}_std"] = results[results_key]["std"]
# Prediction benchmark figures from the RAM/throughput cell.
results_row["runtime_total"] = total_pred_runtime
results_row["peak_ram_mb"] = max_ram_mb
results_row["throughput"] = throughput

# --- Append the row to outputs/baseline_results/model_results_baseline.xlsx ---
excel_path = BASELINE_RESULTS_DIR / "model_results_baseline.xlsx"
excel_path.parent.mkdir(parents=True, exist_ok=True)  # create the folder on first run
new_row_df = pd.DataFrame([results_row])
if excel_path.exists():
    # Workbook already there: keep its rows and add the new one at the end.
    existing = pd.read_excel(excel_path)
    df_results = pd.concat([existing, new_row_df], ignore_index=True)
else:
    df_results = new_row_df
df_results.to_excel(excel_path, index=False)

print(f"Results saved to: {excel_path}")

In [None]:
# -------------------------------------------
# 10. SAVE MODEL AND DATASET SELECTION
# -------------------------------------------

# Record which model and dataset this round used, so the
# 06_tune_hyperparameters module can pick up the same selection.
selection_path = OUTPUTS_DIR / "last_selection.json"
selection = dict(selected_model=selected_model, dataset_name=dataset_name)
selection_path.write_text(json.dumps(selection))

print(f"Model name and file name were saved to: {selection_path}")