# Quick Sample Flow - All Models Demo

This section demonstrates a quick sample run of all implemented model classes with minimal data and iterations to verify everything works correctly.

## Quick Setup - Load Minimal Data

In [None]:
import os
import sys
import numpy as np
from PIL import Image
from copy import deepcopy

# Make the parent directory importable so the `models` package can be found
sys.path.append(os.path.join(os.path.curdir, ".."))

# Cache locations of the preprocessed datasets (relative to the working directory)
DATASET_DIR = os.path.join(os.path.curdir, "..", ".cache", "processed_datasets")
CIFAR10_PATH, MNIST_PATH = (
    os.path.join(DATASET_DIR, subdir) for subdir in ("cifar10", "mnist")
)

# Report the resolved locations, one per line
print(
    "Quick sample flow setup completed!",
    f"Dataset directory: {DATASET_DIR}",
    f"CIFAR-10 path: {CIFAR10_PATH}",
    f"MNIST path: {MNIST_PATH}",
    sep="\n",
)

In [None]:
# Load datasets (small sample for quick demo)
import warnings

# Install a blanket "ignore" filter so library warnings do not clutter the
# notebook output.
warnings.filterwarnings("ignore")

from datasets import load_from_disk

try:
    # Load datasets from cache
    cifar10_dataset = load_from_disk(CIFAR10_PATH)
    mnist_dataset = load_from_disk(MNIST_PATH)

    # Take small samples for quick demo (100 samples each)
    SAMPLE_SIZE = 100

    # Sample from CIFAR-10
    # (shuffling with a fixed seed before selecting keeps the subset reproducible)
    cifar10_sample = (
        cifar10_dataset["train"].shuffle(seed=42).select(range(SAMPLE_SIZE))
    )
    # The test subset size (50) is hard-coded and independent of SAMPLE_SIZE.
    cifar10_test_sample = cifar10_dataset["test"].shuffle(seed=42).select(range(50))

    # Sample from MNIST
    mnist_sample = mnist_dataset["train"].shuffle(seed=42).select(range(SAMPLE_SIZE))
    mnist_test_sample = mnist_dataset["test"].shuffle(seed=42).select(range(50))

    print("Sample datasets loaded successfully!")
    print(
        f"CIFAR-10 sample: {len(cifar10_sample)} train, {len(cifar10_test_sample)} test"
    )
    print(f"MNIST sample: {len(mnist_sample)} train, {len(mnist_test_sample)} test")

except Exception as e:
    # Best-effort: the failure is only reported, not re-raised. The dataset and
    # sample variables stay undefined in that case, so cells that use them
    # afterwards will fail.
    print(f"Error loading datasets: {e}")
    print("Please make sure datasets are downloaded and processed first.")

In [None]:
# Data extraction utility function
def extract_features_and_labels(dataset, flatten_images=True, max_samples=None):
    """Extract features (X) and labels (y) from a HuggingFace Dataset.

    Args:
        dataset: Object exposing "image" and "label" columns through
            ``dataset[...]`` (e.g. a HuggingFace ``Dataset``).
        flatten_images: If True, every image is flattened into a 1-D vector.
        max_samples: Optional cap; only the first ``max_samples`` rows are used.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays built from the images and the labels.
    """
    images = dataset["image"]
    labels = dataset["label"]

    # Limit samples if specified
    if max_samples is not None:
        images = images[:max_samples]
        labels = labels[:max_samples]

    features = []
    for img in images:
        # np.asarray handles PIL images, ndarrays and nested lists alike, so
        # list-encoded images no longer fail on `.flatten()`.
        img_array = np.asarray(img)
        if flatten_images:
            img_array = img_array.flatten()
        features.append(img_array)

    return np.array(features), np.array(labels)


# Extract sample data
# Every split is extracted with flatten_images=True, i.e. each image is
# flattened into a single feature vector.
X_cifar, y_cifar = extract_features_and_labels(cifar10_sample, flatten_images=True)
X_cifar_test, y_cifar_test = extract_features_and_labels(
    cifar10_test_sample, flatten_images=True
)

X_mnist, y_mnist = extract_features_and_labels(mnist_sample, flatten_images=True)
X_mnist_test, y_mnist_test = extract_features_and_labels(
    mnist_test_sample, flatten_images=True
)

# Sanity-check output: array shapes and the pixel value range of the training
# samples (no rescaling is applied in this cell).
print(f"CIFAR-10 sample shapes: X={X_cifar.shape}, y={y_cifar.shape}")
print(f"MNIST sample shapes: X={X_mnist.shape}, y={y_mnist.shape}")
print(
    f"Pixel value ranges - CIFAR-10: [{X_cifar.min()}, {X_cifar.max()}], MNIST: [{X_mnist.min()}, {X_mnist.max()}]"
)

## Load All Model Classes

In [None]:
# Silence warnings to avoid printing full paths
import warnings

warnings.filterwarnings("ignore")

# Import all model classes
from models.decision_tree import DecisionTreeModel
from models.knn import KNNModel
from models.logistic_regression import LogisticRegressionModel
from models.mlp import MLPModel
from models.cnn import CNNModel


def create_sample_models():
    """Build one ready-to-use instance of every model wrapper.

    Returns a dict that maps a display name to the wrapper instance, in the
    order the wrappers are created. The CNN wrapper is optional: if building
    it raises, the error is printed and the entry is left out.
    """
    registry = {}

    # Wrappers that must build successfully; a failure here propagates.
    mandatory = (
        ("Decision Tree", DecisionTreeModel),
        ("K-Nearest Neighbors", KNNModel),
        ("Logistic Regression", LogisticRegressionModel),
        ("Multi-Layer Perceptron", MLPModel),
    )
    for display_name, wrapper_cls in mandatory:
        wrapper = wrapper_cls()
        wrapper.create_model()
        registry[display_name] = wrapper

    # CNN (Note: May require special handling due to PyTorch)
    try:
        cnn_wrapper = CNNModel()
        cnn_wrapper.create_model()
    except Exception as e:
        print(f"[ERROR] CNN model creation failed: {e}")
        print("CNN will be skipped in quick demo")
    else:
        registry["Convolutional Neural Network"] = cnn_wrapper

    return registry


# Create model instances
# (the returned dict maps a display name to a model wrapper instance)
sample_models = create_sample_models()

print("Model classes loaded successfully!")
# List every display name next to the wrapper class behind it
for name, model in sample_models.items():
    print(f"  {name}: {type(model).__name__}")

# The count can be lower than five: create_sample_models() leaves the CNN out
# when building it fails.
print(f"\nTotal models available: {len(sample_models)}")

## Quick Hyperparameter Testing

In [None]:
import itertools
import random


def get_class_names_from_dataset(dataset_name):
    """Return the list of class names for ``dataset_name``.

    For "MNIST" and "CIFAR-10" the names stored on the label feature of the
    loaded dataset's train split are returned when present and non-empty.
    Otherwise, or when the lookup raises (a warning is printed in that case),
    a built-in list is used. Any other dataset name yields ten generic
    "Class i" names.
    """
    digit_names = [str(i) for i in range(10)]
    cifar_names = [
        "airplane",
        "automobile",
        "bird",
        "cat",
        "deer",
        "dog",
        "frog",
        "horse",
        "ship",
        "truck",
    ]
    generic_names = [f"Class {i}" for i in range(10)]  # Default to 10 classes

    try:
        if dataset_name == "MNIST":
            label_feature = mnist_dataset["train"].features["label"]
            builtin_names = digit_names
        elif dataset_name == "CIFAR-10":
            label_feature = cifar10_dataset["train"].features["label"]
            builtin_names = cifar_names
        else:
            return generic_names

        # Prefer the names recorded in the dataset, if it has any
        stored_names = getattr(label_feature, "names", None)
        return stored_names if stored_names else builtin_names
    except Exception as e:
        print(f"Warning: Could not extract class names for {dataset_name}: {e}")
        if dataset_name == "MNIST":
            return digit_names
        if dataset_name == "CIFAR-10":
            return cifar_names
        return generic_names


def _is_cnn_model(model_name, model):
    """Tell whether ``train`` must be given the class names (CNN wrappers)."""
    # The display name may spell the model out ("Convolutional Neural
    # Network"), so the wrapper's class name is checked as well.
    return "CNN" in model_name or "CNN" in type(model).__name__


def _quick_candidate_values(param_def):
    """Return the short candidate list tried for one parameter.

    ``param_def`` is either a plain list of values (returned unchanged) or a
    ParamSpace-like object exposing ``param_type.value`` and, depending on
    the type, ``choices``, ``min_value``, ``max_value`` and ``default``.
    """
    if not hasattr(param_def, "param_type"):
        # This is already a list of values
        return param_def

    kind = param_def.param_type.value
    if kind == "categorical":
        return param_def.choices
    if kind == "boolean":
        return [True, False]

    values = []
    if kind in ("integer", "float"):
        low, high = param_def.min_value, param_def.max_value
        if low is not None and high is not None:
            if kind == "integer":
                # A few evenly spaced integers, limited to 5 for quick testing
                step = max(1, (high - low) // 3)
                values = list(range(low, high + 1, step))[:5]
            else:
                import numpy as np

                # Three evenly spaced floats across the range
                values = list(np.linspace(low, high, 3))

    # Unknown parameter types reach this point with no sampled values and are
    # pinned to their default, so every parameter name keeps exactly one
    # candidate list and names/values stay aligned.
    default = getattr(param_def, "default", None)
    if default is not None and default not in values:
        values.append(default)
    return values if values else [default]


def quick_hyperparameter_test(
    models_dict, X_train, y_train, X_test, y_test, dataset_name="Dataset"
):
    """Perform a quick hyperparameter test with limited iterations.

    For every model wrapper, up to three randomly chosen parameter
    combinations are trained on ``(X_train, y_train)`` and scored on
    ``(X_test, y_test)`` using the "accuracy" entry of ``evaluate``'s result.
    Wrappers without ``get_param_space`` are trained once with their defaults.

    Args:
        models_dict: Mapping of display name to model wrapper.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        dataset_name: Name used for printing and for looking up class names.

    Returns:
        Dict keyed by display name. Each value is either
        ``{"best_params", "best_score", "metrics"}`` or ``{"error": str}``
        when the model could not be tuned (including the case where every
        tested combination failed).
    """
    print(f"Starting quick hyperparameter test on {dataset_name}")
    print("=" * 60)

    # Extract class names from HuggingFace dataset - ALWAYS get class names for CNN compatibility
    class_names = get_class_names_from_dataset(dataset_name)
    print(f"Extracted class names from {dataset_name}: {class_names}")

    results = {}

    for model_name, model in models_dict.items():
        print(f"\nTesting {model_name}...")
        pass_class_names = _is_cnn_model(model_name, model)

        # Check if model supports hyperparameter tuning
        if not hasattr(model, "get_param_space"):
            print(
                f"[WARNING] {model_name} does not support hyperparameter tuning. Using default params."
            )
            try:
                model_copy = deepcopy(model)
                # CNN models ALWAYS need class_names (required positional argument)
                if pass_class_names:
                    model_copy.train(X_train, y_train, class_names)
                else:
                    model_copy.train(X_train, y_train)
                metrics = model_copy.evaluate(X_test, y_test)
                results[model_name] = {
                    "best_params": "default",
                    "best_score": metrics.get("accuracy", 0.0),
                    "metrics": metrics,
                }
                print(f"Default accuracy: {metrics.get('accuracy', 0.0):.4f}")
            except Exception as e:
                print(f"[ERROR] {e}")
                results[model_name] = {"error": str(e)}
            continue

        # Get parameter space and sample a few combinations
        try:
            param_space = model.get_param_space()
            param_names = list(param_space.keys())
            param_values = [
                _quick_candidate_values(param_def)
                for param_def in param_space.values()
            ]

            # Generate all combinations and sample a few
            all_combinations = list(itertools.product(*param_values))
            max_test = min(
                3, len(all_combinations)
            )  # Test max 3 combinations for speed
            test_combinations = (
                random.sample(all_combinations, max_test)
                if len(all_combinations) > max_test
                else all_combinations
            )

            print(
                f"Testing {len(test_combinations)}/{len(all_combinations)} parameter combinations..."
            )

            best_score = -1
            best_params = None
            best_metrics = None
            last_error = None

            for i, param_combo in enumerate(test_combinations):
                current_params = dict(zip(param_names, param_combo))

                try:
                    # Create fresh model copy
                    model_copy = deepcopy(model)

                    # Set parameters - handle both sklearn and PyTorch models
                    if hasattr(model_copy.model, "set_params"):
                        # sklearn models
                        model_copy.model.set_params(**current_params)
                    elif hasattr(model_copy, "set_params"):
                        # Custom Models like PyTorch
                        model_copy.set_params(**current_params)
                    else:
                        last_error = "Model does not support parameter setting"
                        print(
                            f"Combo {i + 1}: ERROR - Model does not support parameter setting"
                        )
                        continue

                    # Train and evaluate - CNN models ALWAYS need class_names
                    if pass_class_names:
                        model_copy.train(X_train, y_train, class_names)
                    else:
                        model_copy.train(X_train, y_train)
                    metrics = model_copy.evaluate(X_test, y_test)
                    accuracy = metrics.get("accuracy", 0.0)

                    if accuracy > best_score:
                        best_score = accuracy
                        best_params = current_params.copy()
                        best_metrics = metrics.copy()

                    print(f"Combo {i + 1}: accuracy={accuracy:.4f}")

                except Exception as e:
                    last_error = str(e)
                    print(f"Combo {i + 1}: ERROR - {e}")

            if best_params is None:
                # Nothing produced a usable accuracy: report an error instead
                # of the placeholder score of -1.
                message = (
                    "No parameter combination produced a valid accuracy "
                    f"(last error: {last_error})"
                )
                print(f"[ERROR] {message}")
                results[model_name] = {"error": message}
                continue

            results[model_name] = {
                "best_params": best_params,
                "best_score": best_score,
                "metrics": best_metrics,
            }

            print(f"Best accuracy: {best_score:.4f}")

        except Exception as e:
            print(f"[ERROR] Hyperparameter Tuning failed: {e}")
            results[model_name] = {"error": str(e)}

    return results


# Run the quick sweep on the MNIST sample (smaller images, easier for a quick demo)
print("Testing all models on MNIST sample...")
mnist_results = quick_hyperparameter_test(
    sample_models,
    X_mnist,
    y_mnist,
    X_mnist_test,
    y_mnist_test,
    dataset_name="MNIST",
)

# One summary line per model: best accuracy, or the recorded error
print(f"\n{'=' * 80}")
print("QUICK TEST RESULTS SUMMARY - MNIST")
print("=" * 80)
for model_name, result in mnist_results.items():
    if "error" not in result:
        print(f"[RESULT] {model_name}: Accuracy = {result['best_score']:.4f}")
    else:
        print(f"[ERROR] {model_name} failed - {result['error']}")

In [None]:
# Repeat the quick sweep on the CIFAR-10 sample
print(f"\n{'=' * 60}")
print("Testing all models on CIFAR-10 sample...")
cifar_results = quick_hyperparameter_test(
    sample_models,
    X_cifar,
    y_cifar,
    X_cifar_test,
    y_cifar_test,
    dataset_name="CIFAR-10",
)

# One summary line per model: best accuracy, or the recorded error
print(f"\n{'=' * 80}")
print("QUICK TEST RESULTS SUMMARY - CIFAR-10")
print("=" * 80)
for model_name, result in cifar_results.items():
    if "error" not in result:
        print(f"[RESULT] {model_name}: Accuracy = {result['best_score']:.4f}")
    else:
        print(f"[ERROR] {model_name} failed - {result['error']}")

The exceptionally low scores are most likely a consequence of the **quick** test itself: it uses only 100 training samples, just enough to verify that the pipeline runs end to end. We therefore continue with a more detailed workflow below.

## Model Interface Verification

In [None]:
# Verify that all models implement the required interface correctly
def verify_model_interface(model_dict):
    """Verify that all models implement the BaseModel interface correctly.

    For every entry of ``model_dict`` (display name -> model wrapper) a report
    is printed with: which required methods exist, the parameters returned by
    ``get_param_space()`` with their types, and whether the wrapper holds an
    underlying ``model`` instance. Nothing is returned.

    Parameter definitions that are plain value lists (no ``param_type``
    attribute) are reported by their Python type name instead of aborting the
    listing.
    """
    required_methods = (
        "create_model",
        "get_param_space",
        "train",
        "predict",
        "evaluate",
    )

    print("Verifying model interfaces...")
    print("=" * 50)

    for name, model in model_dict.items():
        print(f"\n{name}:")

        # Check required methods
        missing_methods = []
        for method in required_methods:
            if hasattr(model, method):
                print(f"{method}()")
            else:
                print(f"[WARNING] {method}() - MISSING")
                missing_methods.append(method)

        # Check parameter space
        try:
            if hasattr(model, "get_param_space"):
                param_space = model.get_param_space()
                print(f"Parameter space: {len(param_space)} parameters")
                for param_name, param_def in param_space.items():
                    param_type = getattr(param_def, "param_type", None)
                    if param_type is None:
                        # Plain list of candidate values, as the tuning code
                        # in this file also accepts
                        print(f"- {param_name}: {type(param_def).__name__}")
                    else:
                        print(f"- {param_name}: {param_type.value}")
            else:
                print("[WARNING] No parameter space available")
        except Exception as e:
            print(f"[WARNING] Parameter space error: {e}")

        # Check if model is created
        if getattr(model, "model", None) is not None:
            print(f"Model instance: {type(model.model).__name__}")
        else:
            print("No model instance found")

        if missing_methods:
            print(f"[WARNING] INTERFACE INCOMPLETE: Missing {missing_methods}")
        else:
            print("[RESULT] INTERFACE COMPLETE")


# Run interface verification
# (the report is printed; verify_model_interface returns nothing)
verify_model_interface(sample_models)

## Quick Demo Summary

This quick sample flow demonstrates:

1. **All Model Classes Loaded**: Successfully imported and instantiated all 5 model classes from the `models/` directory
2. **Interface Compliance**: Verified that all models implement the required `BaseModel` interface
3. **Hyperparameter Testing**: Tested hyperparameter tuning functionality with small sample data
4. **Training & Evaluation**: Confirmed that all models can train and evaluate on both MNIST and CIFAR-10 data

### Models Tested:
- ✅ **Decision Tree Model** (`DecisionTreeModel`)
- ✅ **K-Nearest Neighbors Model** (`KNNModel`) 
- ✅ **Logistic Regression Model** (`LogisticRegressionModel`)
- ✅ **Multi-Layer Perceptron Model** (`MLPModel`)
- ✅ **Convolutional Neural Network Model** (`CNNModel`)

This quick flow uses small sample sizes (100 training, 50 test samples) and limited hyperparameter combinations (max 3 per model) to ensure fast execution while still validating that the complete machine learning pipeline works correctly for all implemented model classes.

---

**Note**: For full experiments, use the comprehensive workflow sections below with complete datasets and extensive hyperparameter search.

## Model Interface Verification

# Establishing an Ordinary Model Training Workflow

## Load the Data

In [None]:
import os

# Cache root of the preprocessed datasets, relative to the working directory
DATASET_DIR = os.path.join(
    os.path.join(os.path.curdir, ".."), ".cache", "processed_datasets"
)
# One sub-directory per dataset
CIFAR10_PATH, MNIST_PATH = (
    os.path.join(DATASET_DIR, leaf) for leaf in ("cifar10", "mnist")
)

In [None]:
# Load from HuggingFace datasets
from datasets import load_from_disk

# Load all datasets from cache
# (unlike the quick demo cell, a loading failure is not caught here and stops the cell)
cifar10_dataset = load_from_disk(CIFAR10_PATH)
mnist_dataset = load_from_disk(MNIST_PATH)

# Access train and test splits for all datasets
CIFAR10_TRAIN = cifar10_dataset["train"]
CIFAR10_TEST = cifar10_dataset["test"]
MNIST_TRAIN = mnist_dataset["train"]
MNIST_TEST = mnist_dataset["test"]

# Display dataset information
print("Loaded datasets from cache:")
print(f"CIFAR-10 Train: {len(CIFAR10_TRAIN):,} examples")
print(f"CIFAR-10 Test: {len(CIFAR10_TEST):,} examples")
print(f"MNIST Train: {len(MNIST_TRAIN):,} examples")
print(f"MNIST Test: {len(MNIST_TEST):,} examples")
# CIFAR-10 class names are read from the label feature; the MNIST line prints
# the digits 0-9 directly instead of reading them from the dataset.
print(f"\nCIFAR-10 classes: {CIFAR10_TRAIN.features['label'].names}")
print(f"MNIST classes: {list(range(10))}")  # MNIST has digits 0-9

## Prepare a Validation Set

In [None]:
# Split using HuggingFace datasets train_test_split method
# An integer test_size requests an absolute number of held-out examples (500);
# the fixed seed keeps the split reproducible.
# NOTE: CIFAR10_TRAIN / MNIST_TRAIN are rebound to the reduced train split, so
# re-running this cell holds out another 500 examples from what is left.
cifar10_split = CIFAR10_TRAIN.train_test_split(test_size=500, seed=42)
CIFAR10_TRAIN = cifar10_split["train"]
CIFAR10_VAL = cifar10_split["test"]

mnist_split = MNIST_TRAIN.train_test_split(test_size=500, seed=42)
MNIST_TRAIN = mnist_split["train"]
MNIST_VAL = mnist_split["test"]

# Inspect the sizes
print("After splitting into Train and Validation sets:")
print(f"CIFAR-10 Train: {len(CIFAR10_TRAIN):,} examples")
print(f"CIFAR-10 Validation: {len(CIFAR10_VAL):,} examples")
print(f"MNIST Train: {len(MNIST_TRAIN):,} examples")
print(f"MNIST Validation: {len(MNIST_VAL):,} examples")

## Get the Machine Learning Models

In [None]:
import sys

# Make the parent directory importable (same path tweak as the quick demo setup cell)
sys.path.append(os.path.join(os.path.curdir, ".."))


# Instantiate model classes (not the sklearn models directly)
# NOTE: the model classes are not imported in this cell; it relies on the
# imports done in the "Load All Model Classes" cell of the quick demo.
dt_model = DecisionTreeModel()
dt_model.create_model()
knn_model = KNNModel()
knn_model.create_model()
lr_model = LogisticRegressionModel()
lr_model.create_model()
mlp_model = MLPModel()
mlp_model.create_model()
# Unlike the quick demo, a CNN construction failure is not caught here.
cnn_model = CNNModel()
cnn_model.create_model()

# Display those model instances
# (display name -> wrapper instance; the tuning loop further down iterates over this dict)
models = {
    "Decision Tree Model": dt_model,
    "K-Nearest Neighbors Model": knn_model,
    "Logistic Regression Model": lr_model,
    "Multi-Layer Perceptron Model": mlp_model,
    "Convolutional Neural Network Model": cnn_model,
}

# Show the model classes
for name, model in models.items():
    print(f"{name}: {type(model).__name__}")

# Bare expression; presumably left as the last line so the notebook displays the dict
models

## Tune the Hyperparameters

We tune the hyperparameters with our own multi-objective fitness functions across different metaheuristics. For simplicity, this section only demonstrates how we weigh the importance of each metric.

In [None]:

# Note: extract_features_and_labels is defined in the quick demo section above with max_samples parameter. We are reusing it here.
# (max_samples is left at its default, so the whole validation split is extracted)

# Extract validation features and labels for CIFAR-10
X_CIFAR10_VAL, y_CIFAR10_VAL = extract_features_and_labels(
    CIFAR10_VAL, flatten_images=True
)
print("CIFAR-10 Validation:")
print(f"X_VAL shape: {X_CIFAR10_VAL.shape}")
print(f"y_VAL shape: {y_CIFAR10_VAL.shape}")
print(f"Data type: X={X_CIFAR10_VAL.dtype}, y={y_CIFAR10_VAL.dtype}")

# Extract validation features and labels for MNIST
X_MNIST_VAL, y_MNIST_VAL = extract_features_and_labels(MNIST_VAL, flatten_images=True)
print("\nMNIST Validation:")
print(f"X_VAL shape: {X_MNIST_VAL.shape}")
print(f"y_VAL shape: {y_MNIST_VAL.shape}")
print(f"Data type: X={X_MNIST_VAL.dtype}, y={y_MNIST_VAL.dtype}")

# Show label examples
# (first five labels of each set, followed by the raw pixel value ranges)
print(f"\nCIFAR-10 label examples: {y_CIFAR10_VAL[:5]}")
print(f"MNIST label examples: {y_MNIST_VAL[:5]}")
print(f"Pixel value range CIFAR-10: [{X_CIFAR10_VAL.min()}, {X_CIFAR10_VAL.max()}]")
print(f"Pixel value range MNIST: [{X_MNIST_VAL.min()}, {X_MNIST_VAL.max()}]")

The cell below shows 3 alternative methods of extracting the images array and the labels array, just as a reference to check whether the above extraction behaves correctly. 

In [None]:
# Alternative methods for extracting X and y
# NOTE: `np` is not imported in this cell; it relies on the import in the
# quick demo setup cell.

# Method 1: Direct column access (simpler but less flexible)
# Only the label column is extracted here; images are not converted.
print("=== Alternative Method 1: Direct Column Access ===")
y_cifar_simple = np.array(CIFAR10_VAL["label"])
print(f"y_CIFAR10_VAL shape: {y_cifar_simple.shape}")

# Method 2: Using dataset.to_pandas()
# Converts the whole validation split into a DataFrame and reports its layout.
print("\n=== Alternative Method 2: Using to_pandas() ===")
cifar_df = CIFAR10_VAL.to_pandas()
print(f"DataFrame shape: {cifar_df.shape}")
print(f"DataFrame columns: {list(cifar_df.columns)}")

# Method 3: Batch processing for large datasets (memory efficient)
print("\n=== Alternative Method 3: Batch Processing ===")


def extract_in_batches(dataset, batch_size=1000, flatten_images=True):
    """Extract features and labels in batches to save memory.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            yields a mapping with "image" and "label" sequences (e.g. a
            HuggingFace ``Dataset``).
        batch_size: Number of rows converted per step; must be at least 1.
        flatten_images: If True, every image is flattened into a 1-D vector.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. For an empty dataset both arrays are
        empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    total_samples = len(dataset)
    X_batches = []
    y_batches = []

    for start in range(0, total_samples, batch_size):
        batch = dataset[start : start + batch_size]

        # Convert (and optionally flatten) every image of this batch
        batch_images = [
            np.array(img).flatten() if flatten_images else np.array(img)
            for img in batch["image"]
        ]

        X_batches.append(np.array(batch_images))
        y_batches.append(np.array(batch["label"]))

    if not X_batches:
        # np.vstack / np.concatenate raise on an empty sequence of arrays
        return np.array([]), np.array([])

    # Concatenate all batches
    X = np.vstack(X_batches)
    y = np.concatenate(y_batches)

    return X, y


# Example with small batch for demonstration
# (the first 100 validation rows, processed in batches of 50)
X_batch, y_batch = extract_in_batches(CIFAR10_VAL.select(range(100)), batch_size=50)
print(f"Batch extraction example - X shape: {X_batch.shape}, y shape: {y_batch.shape}")

### Tuning the hyperparameters by iteratively training on only the **validation set**, for each dataset and each model

In [None]:
import os
import time


def calculate_weighted_score(metrics_dict):
    """Return the equally weighted mean of all metric values, scaled by 0.8.

    Every entry of ``metrics_dict`` contributes ``value / number_of_metrics``;
    the total is then multiplied by 0.8 so that the score stays below 1.
    An empty dict gives 0.0.
    """
    metric_count = len(metrics_dict)
    total = 0.0
    for value in metrics_dict.values():
        # Equal weights for simplicity
        total = total + value * 1 / metric_count
    return total * 0.8  # avoid going to 1


def get_class_names_from_dataset(dataset_name):
    """Extract class names from HuggingFace dataset features with reliable fallbacks.

    "MNIST" and "CIFAR-10" are looked up on the label feature of the
    corresponding train split; a non-empty ``names`` attribute wins. If it is
    missing or empty, or if the lookup raises (a warning is printed then), the
    built-in names are returned. Other dataset names get ten generic
    "Class i" names.
    """

    def _builtin_names():
        # Names used whenever the dataset itself cannot provide them
        if dataset_name == "MNIST":
            # MNIST typically uses digits 0-9
            return [str(i) for i in range(10)]
        if dataset_name == "CIFAR-10":
            return [
                "airplane",
                "automobile",
                "bird",
                "cat",
                "deer",
                "dog",
                "frog",
                "horse",
                "ship",
                "truck",
            ]
        return [f"Class {i}" for i in range(10)]  # Default to 10 classes

    try:
        if dataset_name == "MNIST":
            label_feature = MNIST_TRAIN.features["label"]
        elif dataset_name == "CIFAR-10":
            label_feature = CIFAR10_TRAIN.features["label"]
        else:
            return _builtin_names()

        if hasattr(label_feature, "names") and label_feature.names:
            return label_feature.names
        return _builtin_names()
    except Exception as e:
        print(f"Warning: Could not extract class names for {dataset_name}: {e}")
        return _builtin_names()


# Since ParamSpace is a custom class, we need to extract actual values with a helper function
def _comprehensive_candidate_values(param_def):
    """Return the candidate values for one parameter definition."""
    if not hasattr(param_def, "param_type"):
        # This is already a list of values
        return param_def

    kind = param_def.param_type.value
    if kind == "categorical":
        return param_def.choices
    if kind == "boolean":
        return [True, False]

    values = []
    if kind in ("integer", "float"):
        low, high = param_def.min_value, param_def.max_value
        if low is not None and high is not None:
            if kind == "integer":
                # Evenly spaced integers, limited to 8 values
                step = max(1, (high - low) // 5)
                values = list(range(low, high + 1, step))[:8]
            else:
                import numpy as np

                # Five evenly spaced floats across the range
                values = list(np.linspace(low, high, 5))

    # Unknown parameter types reach this point with no sampled values and are
    # pinned to their default instead of being skipped.
    default = getattr(param_def, "default", None)
    if default is not None and default not in values:
        values.append(default)
    return values if values else [default]


def extract_param_values_from_param_space(param_space):
    """Extract actual values from ParamSpace objects for hyperparameter tuning.

    Args:
        param_space: Mapping of parameter name to either a plain list of
            values or a ParamSpace-like object exposing ``param_type.value``
            and, depending on the type, ``choices``, ``min_value``,
            ``max_value`` and ``default``.

    Returns:
        A list with exactly one candidate list per parameter, in the
        iteration order of ``param_space``. Parameters of an unrecognised
        type contribute ``[default]``, so the result always lines up with
        ``param_space.keys()``.
    """
    return [
        _comprehensive_candidate_values(param_def)
        for param_def in param_space.values()
    ]


def _sanitize_paths_in_obj(obj, start=None):
    """Return a copy of *obj* with absolute path strings made relative.

    Intended for safe printing: absolute filesystem paths are rewritten
    relative to *start* so they do not leak into notebook output.

    Args:
        obj: Value to sanitize. Absolute-path strings are converted; dicts,
            lists and tuples (including namedtuples) are rebuilt with their
            values/items sanitized recursively. Dict keys are left untouched.
            Any other type is returned unchanged.
        start: Directory the relative paths are computed against. Defaults to
            the current working directory, or "." if it cannot be determined.

    Returns:
        The sanitized value, using the same container types as the input.
    """
    if start is None:
        try:
            start = os.getcwd()
        except Exception:
            start = "."

    # Strings: convert absolute paths to relative where possible.
    if isinstance(obj, str):
        # Deliberately best-effort: this helper only feeds print statements,
        # so any failure (e.g. relpath across Windows drives) falls back to
        # the original string instead of raising.
        try:
            if os.path.isabs(obj):
                return os.path.relpath(obj, start)
        except Exception:
            return obj
        return obj

    # Dicts: recurse into the values only.
    if isinstance(obj, dict):
        return {key: _sanitize_paths_in_obj(value, start) for key, value in obj.items()}

    # Lists / tuples: recurse and preserve the concrete type.
    if isinstance(obj, (list, tuple)):
        items = [_sanitize_paths_in_obj(item, start) for item in obj]
        if isinstance(obj, tuple) and hasattr(obj, "_fields"):
            # namedtuple constructors take one positional argument per field,
            # not a single iterable.
            return type(obj)(*items)
        return type(obj)(items)

    # Other types: leave as-is.
    return obj


# Storage for hyperparameter tuning results
tuning_results = {}
model_dataset_param = {}  # To Store "Best" Models

for dataset in ["CIFAR-10", "MNIST"]:
    for modelName, model in models.items():
        print(f"\n{'=' * 60}")
        print(f"Tuning hyperparameters for {modelName} on {dataset}")
        print(f"{'=' * 60}")

        # Get class names for this dataset - ALWAYS get them for CNN compatibility
        class_names = get_class_names_from_dataset(dataset)
        print(f"Using class names: {class_names}")

        # Get the appropriate validation data
        match dataset:
            case "CIFAR-10":
                X_val, y_val = X_CIFAR10_VAL, y_CIFAR10_VAL
            case "MNIST":
                X_val, y_val = X_MNIST_VAL, y_MNIST_VAL

        # Get parameter space
        if not hasattr(model, "get_param_space"):
            print(
                f"[WARNING] Model {modelName} does not support hyperparameter tuning. Using default params."
            )
            try:
                model_copy = deepcopy(model)
                # CNN models ALWAYS need class_names (required positional argument)
                if "CNN" in modelName:
                    model_copy.train(X_val, y_val, class_names)
                else:
                    model_copy.train(X_val, y_val)
                eval_metrics = model_copy.evaluate(X_val, y_val)
                weighted_score = calculate_weighted_score(eval_metrics)

                # Store results for models that don't support hyperparameter tuning
                tuning_results[(modelName, dataset)] = {
                    "best_params": "default",
                    "best_score": weighted_score,
                    "best_metrics": eval_metrics,
                    "all_results": [
                        {
                            "params": "default",
                            "metrics": eval_metrics,
                            "weighted_score": weighted_score,
                        }
                    ],
                }
                model_dataset_param[(modelName, dataset)] = model_copy
                print(f"Default weighted score: {weighted_score:.4f}")
                continue
            except Exception as e:
                print(f"[ERROR] Error with default params: {e}")
                tuning_results[(modelName, dataset)] = {"error": str(e)}
                model_dataset_param[(modelName, dataset)] = None
                continue

        param_space = model.get_param_space()
        param_names = list(param_space.keys())

        # Extract actual values from ParamSpace objects
        param_values = extract_param_values_from_param_space(param_space)

        # Generate all combinations of parameters
        param_combinations = list(itertools.product(*param_values))
        # Shuffle to randomize order because of iteration limit
        # Make sure every combination have an equal chance to be explored
        random.shuffle(param_combinations)
        total_combinations = len(param_combinations)
        print(f"Testing {total_combinations} parameter combinations...")

        best_score = -1
        best_params = None
        best_model = None
        current_results = []

        # Iterate through all parameter combinations
        time_start = time.perf_counter()
        time_limit = 60 * 10  # 10 minutes
        iteration_count = 0
        iteration_limit = total_combinations * 0.6
        for i, param_combo in enumerate(param_combinations):
            # Check the Limits
            if time.perf_counter() - time_start > time_limit:
                print("Time limit reached, stopping further evaluations.")
                break
            if iteration_count >= iteration_limit:
                print("Iteration limit reached, stopping further evaluations.")
                break
            # Create parameter dictionary
            current_params = dict(zip(param_names, param_combo))

            # Create a fresh copy of the model for this configuration
            model_copy = deepcopy(model)

            # Set parameters - handle both sklearn and PyTorch models
            try:
                if hasattr(model_copy.model, "set_params"):
                    # sklearn models
                    model_copy.model.set_params(**current_params)
                elif hasattr(model_copy, "set_params"):
                    # Custom Models like PyTorch
                    model_copy.set_params(**current_params)
                else:
                    safe_params = _sanitize_paths_in_obj(current_params)
                    print(
                        f"Error with params {safe_params}: Model does not support parameter setting"
                    )
                    continue
            except Exception as e:
                safe_params = _sanitize_paths_in_obj(current_params)
                print(f"Error setting params {safe_params}: {e}")
                continue

            try:
                # Train the model - CNN models ALWAYS need class_names (required positional argument)
                if "CNN" in modelName:
                    model_copy.train(X_val, y_val, class_names)
                else:
                    model_copy.train(X_val, y_val)

                # Evaluate the model
                eval_metrics = model_copy.evaluate(X_val, y_val)

                # Calculate weighted score
                weighted_score = calculate_weighted_score(eval_metrics)

                # Store results
                result_entry = {
                    "params": current_params.copy(),
                    "metrics": eval_metrics.copy(),
                    "weighted_score": weighted_score,
                }
                current_results.append(result_entry)

                # Update best model if this one is better
                if weighted_score > best_score:
                    best_score = weighted_score
                    best_params = current_params.copy()
                    best_model = deepcopy(model_copy)

                # Progress indicator
                if (i + 1) % max(
                    1, total_combinations // 10
                ) == 0 or i == total_combinations - 1:
                    print(
                        f"Progress: {i + 1}/{total_combinations} ({(i + 1) / total_combinations * 100:.1f}%) - "
                        f"Current best weighted score: {best_score:.4f}"
                    )

                # PERFORMANCE: Halt the process when it reaches near-perfect score
                # Avoids Overfitting!
                if best_score >= 0.93 * 0.8:  # applied penalty to scores
                    print(
                        "Reached excellent score (≥0.93), stopping further evaluations."
                    )
                    break

            except Exception as e:
                # Sanitize any path-like values in current_params before printing
                safe_params = _sanitize_paths_in_obj(current_params)
                print(f"Error with params {safe_params}: {str(e)}")
                continue

            iteration_count += 1

        # Store results for this model-dataset combination
        tuning_results[(modelName, dataset)] = {
            "best_params": best_params,
            "best_score": best_score,
            "best_metrics": best_model.evaluate(X_val, y_val) if best_model else None,
            "all_results": current_results,
        }

        # Store the best trained model
        model_dataset_param[(modelName, dataset)] = best_model

        # Print summary for this model-dataset combination
        print(f"\nBest configuration for {modelName} on {dataset}:")
        print(f"Parameters: {_sanitize_paths_in_obj(best_params)}")
        print(f"Weighted Score: {best_score:.4f}")
        if best_model:
            best_metrics = best_model.evaluate(X_val, y_val)
            for metric, value in best_metrics.items():
                print(f"  {metric}: {value:.4f}")

print(f"\n{'=' * 80}")
print("HYPERPARAMETER TUNING COMPLETED!")
print(f"{'=' * 80}")
print("Results summary:")
for (model_name, dataset_name), results in tuning_results.items():
    if "error" in results:
        print(f"{model_name} on {dataset_name}: [ERROR] {results['error']}")
    else:
        safe_best_params = _sanitize_paths_in_obj(results.get("best_params"))
        print(
            f"{model_name} on {dataset_name}: Best weighted score = {results['best_score']:.4f}; params={safe_best_params}"
        )

The tuning is likely too naive: it simply keeps searching until it reaches the best metrics (although the search already stops early once the score reaches the 0.93 cap). Because each configuration is trained and evaluated on the same data, a near-perfect score most likely reflects overfitting (i.e., memorizing the seen data) rather than real generalization. We need more sophisticated logic. For now, at least, we know the end-to-end flow works during the plain tuning process, before introducing metaheuristics.

## Train the Model

Train each model with the best set of parameters found.

In [None]:
trained_best_models: dict = {}
for (modelName, datasetName), results in tuning_results.items():
    print(
        f"\nTraining final model for {modelName} on {datasetName} with best hyperparameters..."
    )

    # Check if tuning was successful
    if "error" in results:
        print(
            f"[ERROR] Skipping {modelName} on {datasetName} due to tuning error: {results['error']}"
        )
        continue

    best_model = model_dataset_param.get((modelName, datasetName))
    if best_model is None:
        print(
            f"[ERROR] No best model found for {modelName} on {datasetName}, skipping."
        )
        continue

    # Get class names for CNN models - ALWAYS get them for CNN compatibility
    class_names = get_class_names_from_dataset(datasetName)

    # Retrain on full training data (train only)
    match datasetName:
        case "CIFAR-10":
            X_train, y_train = extract_features_and_labels(
                CIFAR10_TRAIN, flatten_images=True
            )
        case "MNIST":
            X_train, y_train = extract_features_and_labels(
                MNIST_TRAIN, flatten_images=True
            )
    if not hasattr(best_model, "train"):
        print(
            f"[WARNING] Model {modelName} does not support training method. Skipping."
        )
        continue

    # Train - CNN models ALWAYS need class_names (required positional argument)
    if "CNN" in modelName:
        best_model.train(X_train, y_train, class_names)
    else:
        best_model.train(X_train, y_train)
    print("Training completed.")
    # Save Trained Models
    trained_best_models[(modelName, datasetName)] = best_model

## Evaluate the Trained Best Models

In [None]:
metrics: dict = {}
for (modelName, datasetName), best_model in trained_best_models.items():
    print(f"\nEvaluating final trained model for {modelName} on {datasetName}...")
    # Get test data
    match datasetName:
        case "CIFAR-10":
            X_test, y_test = extract_features_and_labels(
                CIFAR10_TEST, flatten_images=True
            )
        case "MNIST":
            X_test, y_test = extract_features_and_labels(
                MNIST_TEST, flatten_images=True
            )
    # Evaluate
    test_metrics = best_model.evaluate(X_test, y_test)
    print(f"Test Metrics for {modelName} on {datasetName}:")
    for metric, value in test_metrics.items():
        print(f"  {metric}: {value:.4f}")
        # Save metrics
    metrics[(modelName, datasetName)] = test_metrics

In [None]:
# Find the Best Model from Weighted Sum
best_overall_score = -1
# Reset explicitly so an empty `metrics` dict cannot raise NameError or reuse
# a stale value left over from an earlier run of this cell.
best_model_info = None
for (modelName, datasetName), test_metrics in metrics.items():
    weighted_score = calculate_weighted_score(test_metrics)
    print(f"Weighted Score for {modelName} on {datasetName}: {weighted_score:.4f}")
    # The first candidate is always accepted, so a winner exists whenever
    # `metrics` is non-empty, whatever the score range.
    if best_model_info is None or weighted_score > best_overall_score:
        best_overall_score = weighted_score
        best_model_info = (modelName, datasetName)

if best_model_info is None:
    print("\nNo evaluated models available; cannot select a best overall model.")
else:
    print(
        f"\nBest Overall Model: {best_model_info[0]} on {best_model_info[1]} with Weighted Score: {best_overall_score:.4f}"
    )