In [1]:
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import cifar10
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real
from cuml.ensemble import RandomForestClassifier as CuMLRandomForestClassifier
from cuml.svm import SVC as CuMLSVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from tqdm import tqdm
import logging
from datetime import datetime
import cupy as cp

ModuleNotFoundError: No module named 'cuml'

In [None]:
# Logging setup: create the logger used by the rest of the notebook and
# configure the root logger at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [None]:
# Load the CIFAR-10 dataset and unpack its (features, labels) pairs for the
# training and test splits.
train_split, test_split = cifar10.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

In [None]:
# Keep only the first SUBSET_SIZE training samples so that the tuning cells
# below run quickly; raise this value for a fuller experiment.
SUBSET_SIZE = 5000  # Adjust this value as needed
logger.info(f"Using subset of {SUBSET_SIZE} samples for testing")
X_train, y_train = X_train[:SUBSET_SIZE], y_train[:SUBSET_SIZE]

In [None]:
# Preprocessing: flatten every image to one feature row, standardize the
# features, and keep both raw and scaled matrices as CuPy arrays.
logger.info("Preprocessing data...")

# One row per sample; the remaining axes are collapsed into a single axis.
train_rows = X_train.reshape(X_train.shape[0], -1)
test_rows = X_test.reshape(X_test.shape[0], -1)
X_train, X_test = cp.asarray(train_rows), cp.asarray(test_rows)

# StandardScaler is fed host (NumPy) copies. Statistics are fitted on the
# training split only and then reused for the test split.
scaler = StandardScaler()
train_scaled_host = scaler.fit_transform(X_train.get())
test_scaled_host = scaler.transform(X_test.get())

# Move the standardized matrices to the GPU as CuPy arrays.
X_train_scaled = cp.asarray(train_scaled_host)
X_test_scaled = cp.asarray(test_scaled_host)

# Labels become 1-D vectors.
y_train, y_test = y_train.flatten(), y_test.flatten()

In [None]:
# # Full Hyperparameter Search Spaces
# rf_param_grid = {
#     "n_estimators": Integer(50, 500),
#     "criterion": Categorical(["gini", "entropy"]),
#     "max_depth": Categorical([None, 3, 5, 10, 20, 30]),
#     "min_samples_leaf": Integer(1, 16),
#     "max_features": Categorical(["sqrt", "log2", 0.25, 0.5]),
#     "bootstrap": Categorical([True, False]),
#     "min_samples_split": Integer(2, 10),
#     "class_weight": Categorical([None, "balanced", "balanced_subsample"]),
# }

# svm_param_grid = {
#     "C": Real(1e-4, 1e3, "log-uniform"),
#     "kernel": Categorical(["linear", "rbf", "poly", "sigmoid"]),
#     "gamma": Categorical(["scale", "auto"] + [0.001, 0.01, 0.1, 1, 10]),
#     "degree": Integer(2, 4),
#     "coef0": Real(0.0, 1.0),
#     "class_weight": Categorical([None, "balanced"]),
# }

# ensemble_param_grid = {
#     "voting": Categorical(["soft", "hard"]),
#     "weights": Categorical([None, (0.5, 0.5), (0.3, 0.7), (0.7, 0.3)]),
#     "rf__n_estimators": Integer(100, 300),
#     "rf__max_depth": Categorical([None, 10, 20]),
#     "svm__C": Real(0.1, 10, "log-uniform"),
#     "svm__gamma": Categorical(["scale", "auto", 0.01]),
# }

# Reduced Testing Version
# Narrow search spaces used for quick test runs; the commented-out grids above
# hold the full ranges.

# Random forest search space.
rf_param_grid = dict(
    n_estimators=Integer(50, 100),
    max_depth=Categorical([10, 20]),
    min_samples_split=Integer(2, 5),
    max_features=Categorical(["sqrt"]),
)

# SVM search space: only C varies; kernel and gamma are pinned to one choice.
svm_param_grid = dict(
    C=Real(0.1, 1.0, "log-uniform"),
    kernel=Categorical(["rbf"]),
    gamma=Categorical(["scale"]),
)

# Ensemble search space. The "rf__" / "svm__" prefixes match the estimator
# names given to the VotingClassifier.
ensemble_param_grid = dict(
    voting=Categorical(["soft"]),
    weights=Categorical([None]),
    rf__n_estimators=Integer(50, 100),
    rf__max_depth=Categorical([10]),
    svm__C=Real(0.1, 0.5, "log-uniform"),
    svm__gamma=Categorical(["scale"]),
)

In [None]:
class TqdmBayesSearchCV(BayesSearchCV):
    """BayesSearchCV variant that shows a tqdm progress bar and logs timing.

    Only ``_run_search`` is overridden: the ``evaluate_candidates`` callable
    handed in by the base class is wrapped so that every evaluation batch is
    logged, timed, and reflected in the progress bar.
    """

    def _run_search(self, evaluate_candidates):
        """Run the parent search with an instrumented ``evaluate_candidates``.

        Parameters
        ----------
        evaluate_candidates : callable
            Callable supplied by the search framework; it is invoked with a
            collection of candidate parameter settings.

        Returns
        -------
        Whatever the parent ``_run_search`` returns.
        """
        # NOTE(review): the bar total is ``self.n_iter``; if ``search_spaces``
        # carries per-space iteration counts the total may differ — confirm.
        with tqdm(total=self.n_iter, desc="Bayesian optimization") as pbar:

            def wrapped_evaluate(candidate_params, *args, **kwargs):
                # Materialize once so the batch can be counted and forwarded
                # even if an arbitrary iterable was supplied.
                candidate_params = list(candidate_params)
                start_time = datetime.now()
                logger.info(f"Testing parameters: {candidate_params}")
                # Forward any extra arguments untouched so the wrapper stays
                # transparent to the caller.
                result = evaluate_candidates(candidate_params, *args, **kwargs)
                duration = (datetime.now() - start_time).total_seconds()
                logger.info(f"Iteration completed in {duration:.2f} seconds")
                # Advance by the number of candidates in this batch; a fixed
                # step of 1 under-counts when several points are evaluated
                # per call.
                pbar.update(len(candidate_params))
                return result

            return super()._run_search(wrapped_evaluate)

In [None]:
# # Tune Random Forest (cuML)
# rf = cuRF(random_state=42)
# bayes_search_rf = BayesSearchCV(estimator=rf, search_spaces=rf_param_grid, cv=3, random_state=42) 
# bayes_search_rf.fit(X_train_scaled, y_train)

# Tune Random Forest with CuML RandomForest
# Bayesian search over rf_param_grid using 3-fold cross-validation.
rf = CuMLRandomForestClassifier(random_state=42)
bayes_search_rf = TqdmBayesSearchCV(
    estimator=rf,
    search_spaces=rf_param_grid,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=3,
)
# The search is fitted on a host (NumPy) copy of the scaled training matrix.
bayes_search_rf.fit(
    X_train_scaled.get(),
    y_train,
)
print(
    "Best Random Forest (BayesSearch) Hyperparameters:",
    bayes_search_rf.best_params_,
)

In [None]:
# # Tune SVM (cuML)
# svm = cuSVC(probability=True, random_state=42)
# bayes_search_svm = BayesSearchCV(estimator=svm, search_spaces=svm_param_grid, cv=3, random_state=42) 
# bayes_search_svm.fit(X_train_scaled, y_train)

# Tune SVM with CuML SVC
# Bayesian search over svm_param_grid using 3-fold cross-validation.
# probability=True is set so the fitted model exposes predict_proba.
svm = CuMLSVC(probability=True, random_state=42)
bayes_search_svm = TqdmBayesSearchCV(
    estimator=svm,
    search_spaces=svm_param_grid,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=3,
)
# The search is fitted on a host (NumPy) copy of the scaled training matrix.
bayes_search_svm.fit(
    X_train_scaled.get(),
    y_train,
)
print(
    "Best SVM (BayesSearch) Hyperparameters:",
    bayes_search_svm.best_params_,
)

In [None]:
# Build fresh, unfitted base models configured with the best hyperparameters
# found by the individual searches above.
rf_best_params = dict(bayes_search_rf.best_params_)
svm_best_params = dict(bayes_search_svm.best_params_)

best_rf = CuMLRandomForestClassifier(random_state=42, **rf_best_params)
best_svm = CuMLSVC(probability=True, random_state=42, **svm_best_params)

In [None]:
# Combine the two tuned base models into a soft-voting ensemble. The names
# "rf" and "svm" are the prefixes used by ensemble_param_grid.
base_estimators = [("rf", best_rf), ("svm", best_svm)]
ensemble = VotingClassifier(estimators=base_estimators, voting="soft")

In [None]:
# # Tuning Ensemble 
# bayes_search_ensemble = BayesSearchCV(estimator=ensemble, search_spaces=ensemble_param_grid, cv=3, n_jobs=-1, random_state=42)
# bayes_search_ensemble.fit(X_train_scaled, y_train)
# print("Best Ensemble (BayesSearch) Hyperparameters:", bayes_search_ensemble.best_params_) 

# Tuning Ensemble
# Bayesian search over ensemble_param_grid using 3-fold cross-validation.
bayes_search_ensemble = TqdmBayesSearchCV(
    estimator=ensemble,
    search_spaces=ensemble_param_grid,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=3,
)
# The search is fitted on a host (NumPy) copy of the scaled training matrix.
bayes_search_ensemble.fit(
    X_train_scaled.get(),
    y_train,
)
print(
    "Best Ensemble (BayesSearch) Hyperparameters:",
    bayes_search_ensemble.best_params_,
)
 

In [None]:
# --- Evaluation ---
def evaluate_model(model, X_test, y_test):
    """Print test accuracy and plot ROC curves and a confusion matrix.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict`` and ``predict_proba``.
    X_test : array-like (NumPy or CuPy)
        Test features. They are converted to a host array before prediction.
    y_test : array-like (NumPy or CuPy)
        True test labels.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    # Predict on host arrays: the models in this notebook are fitted on host
    # data, and a scikit-learn ensemble cannot stack device outputs returned
    # by its sub-estimators. cp.asnumpy passes NumPy arrays through as-is.
    X_host = cp.asnumpy(X_test)
    y_true = cp.asnumpy(y_test)  # converted once, reused below

    # Model outputs may still live on the device, so normalize them to NumPy.
    pred = cp.asnumpy(model.predict(X_host))
    accuracy = accuracy_score(y_true, pred)
    print(f"Accuracy: {accuracy:.4f}")

    y_score = cp.asnumpy(model.predict_proba(X_host))

    # One-vs-rest indicator matrix of the true labels for the ROC curves.
    y_test_bin = LabelBinarizer().fit_transform(y_true)
    plot_roc_curve(y_test_bin, y_score)
    plot_confusion_matrix(y_true, pred)


def plot_roc_curve(y_test_bin, y_score):
    """Plot one ROC curve per class, with the AUC shown in the legend.

    Parameters
    ----------
    y_test_bin : 2-D array
        Binarized true labels, one column per class.
    y_score : 2-D array
        Predicted scores, one column per class.
        NOTE(review): columns are assumed to be in the same class order as
        ``y_test_bin`` — confirm against the caller.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of columns.
    """
    # Derive the class count from the data instead of hard-coding 10.
    n_classes = y_test_bin.shape[1]
    if y_score.shape[1] != n_classes:
        raise ValueError(
            "y_test_bin has {} columns but y_score has {}".format(
                n_classes, y_score.shape[1]
            )
        )

    # Compute every curve before creating the figure, so a failure here does
    # not leave an empty figure behind.
    curves = []
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        curves.append((fpr, tpr, auc(fpr, tpr)))

    plt.figure(figsize=(10, 8))
    for i, (fpr, tpr, roc_auc) in enumerate(curves):
        plt.plot(fpr, tpr, label="Class {} (AUC = {:.2f})".format(i, roc_auc))
    plt.plot([0, 1], [0, 1], "k--")  # diagonal reference line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) Curves")
    plt.legend(loc="lower right")
    plt.show()


def plot_confusion_matrix(y_test, pred):
    """Plot the confusion matrix of predictions as an annotated heatmap.

    Parameters
    ----------
    y_test : 1-D array-like
        True labels.
    pred : 1-D array-like
        Predicted labels.
    """
    # Use the sorted set of labels present in either input for both the
    # matrix and the tick labels, so the axes always match the matrix
    # instead of assuming exactly ten classes numbered 0-9.
    labels = np.union1d(y_test, pred)
    tick_labels = labels.tolist()

    cm = confusion_matrix(y_test, pred, labels=labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()

In [None]:

# Evaluate the best SVM found by the search on the scaled test set.
svm_best_model = bayes_search_svm.best_estimator_
print("\n--- SVM (Best from BayesSearch) ---")
evaluate_model(svm_best_model, X_test_scaled, y_test)

In [None]:
# Evaluate the best random forest found by the search on the scaled test set.
rf_best_model = bayes_search_rf.best_estimator_
print("\n--- Random Forest (Best from BayesSearch) ---")
evaluate_model(rf_best_model, X_test_scaled, y_test)

In [None]:
print("\n--- Ensemble Model (Best from BayesSearch) ---")
# The ensemble is a scikit-learn VotingClassifier fitted on host (NumPy)
# arrays, so it is evaluated on a host copy of the test features as well;
# a CuPy input would yield device outputs that scikit-learn cannot stack.
evaluate_model(
    bayes_search_ensemble.best_estimator_,
    cp.asnumpy(X_test_scaled),
    y_test,
)