# 11_model_zoo.ipynb

In [1]:
import pandas as pd
from pathlib import Path

# Load the raw imaging features and the cleaned clinical table.
# The project root is taken to be two levels above the current working directory
# (i.e. wherever the kernel was started), and the CSVs are expected under
# <root>/data/raw. Adjust `path` if your project layout differs.
path = Path.cwd().parent.parent
raw_dir = path.joinpath("data", "raw")
img, clin = (
    pd.read_csv(raw_dir / csv_name)
    for csv_name in ("imagingFeatures.csv", "clinicalData_clean.csv")
)

In [None]:
# Attach the target label to the imaging features (only patients present in both
# tables survive the inner join), discard the stray CSV index column if there is
# one, and remove every row that still contains a NaN.
TARGET = 'ER'
data = (
    img.merge(clin[['Patient ID', TARGET]], on='Patient ID', how='inner')
    .drop('Unnamed: 0', axis=1, errors='ignore')
    .dropna()
)

861

In [3]:
from sklearn.preprocessing import LabelEncoder
# Split the merged table into the label vector (y) and the feature matrix (X).
y = data[TARGET]

# Encode non-numeric labels as integers. Besides the legacy `object` dtype and
# `category`, also catch pandas' dedicated string dtype: it does not compare
# equal to 'object', so string labels stored that way would otherwise reach the
# models unencoded.
if (
    y.dtype == 'object'
    or y.dtype.name == 'category'
    or pd.api.types.is_string_dtype(y)
):
    le = LabelEncoder()
    y = le.fit_transform(y)

# Everything except the label and the patient identifier is a model input.
X = data.drop([TARGET, 'Patient ID'], axis=1, errors='ignore')

## Define search grids for RF, XGB, MLP
A *search grid* is a set of hyperparameter values that are tested in order to find the optimal configuration.

In [8]:
from skopt.space import Categorical, Real, Integer # Required for MLPClassifier grid search

# Candidate hyperparameter values for RandomForestClassifier
rf_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Candidate hyperparameter values for XGBClassifier
xgb_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5, 10],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# Search space for MLPClassifier, expressed with explicit skopt dimensions
mlp_grid = dict(
    # a single hidden layer whose width is searched between 50 and 100 units
    hidden_layer_sizes=Integer(50, 100),
    activation=Categorical(['relu', 'tanh']),
    solver=Categorical(['adam', 'sgd']),
    # regularisation strength and initial step size are sampled on a log scale
    alpha=Real(0.0001, 0.01, prior='log-uniform'),
    learning_rate_init=Real(0.001, 0.01, prior='log-uniform'),
)

## Bayesian search for efficient tuning
* Each `BayesSearchCV` runs a Bayesian optimization over the respective hyperparameter grid
* Uses cross-validation to evaluate each set of parameters
* Attempts to find the best combination of hyperparameters that maximize the cross-validation score (in this case, accuracy)

May need to run `pip install scikit-optimize`.

In [5]:
# Import necessary libraries for Bayesian optimization
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

In [6]:
# Settings shared by all three searches, defined once so they cannot drift apart:
# 32 candidate evaluations, 3-fold cross-validation, all CPU cores, fixed seed.
SEED = 42
search_kwargs = dict(n_iter=32, cv=3, n_jobs=-1, random_state=SEED, verbose=0)

# The base estimators are seeded as well: the search's own random_state governs
# how candidate parameters are sampled, while each model's training randomness
# (bootstrap draws, weight initialisation, row/column subsampling) is controlled
# by the estimator's random_state.

# Bayesian search for RandomForest
rf_bayes = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=SEED),
    search_spaces=rf_grid,
    **search_kwargs,
)

# Bayesian search for XGBoost
xgb_bayes = BayesSearchCV(
    estimator=XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=SEED,
    ),
    search_spaces=xgb_grid,
    **search_kwargs,
)

# Bayesian search for MLPClassifier
mlp_bayes = BayesSearchCV(
    estimator=MLPClassifier(max_iter=2000, random_state=SEED),
    search_spaces=mlp_grid,
    **search_kwargs,
)

## Fitting the models

In [7]:
# Fit all three searches. The logging step at the end of the notebook reads
# best_params_ / cv_results_ from rf_bayes and xgb_bayes, so they have to be
# fitted here for a Restart & Run All to succeed.
rf_bayes.fit(X, y)
xgb_bayes.fit(X, y)
mlp_bayes.fit(X, y)

ValueError: can only convert an array of size 1 to a Python scalar

### Understanding the models
* RF = Random Forest
* XGB = eXtreme Gradient Boosting: open source library that implements optimized distributed gradient boosting machine learning algorithms [(source)](https://www.nvidia.com/en-us/glossary/xgboost/)
    * A scalable, distributed gradient-boosted decision tree (GBDT) machine learning library
    * Provides parallel tree boosting
    * GBDT: decision tree ensemble learning algorithm (consists of multiple decision trees, like random forest)
* MLP = Multilayer Perceptron

## Log results
* `best_params_`: the best hyperparameters found
* `best_score_`: the best cross-validation score (accuracy) achieved
* `cv_results_`: per-candidate details (mean and standard deviation of the test score, plus the parameters) for every hyperparameter combination tried

In [None]:
import json

def log_search_results(search, model_name, filename="cv_results.json"):
    """Append one fitted search's results to ``filename`` as a single JSON line.

    Parameters
    ----------
    search : fitted search object
        Must expose ``best_params_``, ``best_score_`` and a ``cv_results_``
        mapping with ``mean_test_score``, ``std_test_score`` and ``params``.
    model_name : str
        Label stored under the ``"model"`` key of the record.
    filename : str, default "cv_results.json"
        File opened in append mode; one JSON object is written per line.

    Raises
    ------
    TypeError
        If a value is neither JSON-serialisable nor convertible via ``tolist()``.
    """

    def _to_builtin(value):
        # numpy scalars and arrays (e.g. np.int64 parameter values) are not
        # understood by json, but they all provide tolist() to get plain
        # Python objects.
        if hasattr(value, "tolist"):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    def _as_list(values):
        # Accept both ndarray-backed and plain-list score columns.
        return values.tolist() if hasattr(values, "tolist") else list(values)

    cv_results = search.cv_results_
    results = {
        "model": model_name,
        "best_params": search.best_params_,
        "best_score": search.best_score_,
        "cv_results": {
            "mean_test_score": _as_list(cv_results["mean_test_score"]),
            "std_test_score": _as_list(cv_results["std_test_score"]),
            "params": cv_results["params"]
        }
    }

    # Serialise before touching the file so that a failure cannot leave a
    # truncated line behind in the append-only log.
    line = json.dumps(results, default=_to_builtin)
    with open(filename, "a") as f:
        f.write(line + "\n")  # Write each result on a new line

# Log every search that has actually been fitted. Unfitted searches do not
# expose best_params_, so they are reported and skipped instead of raising.
for search, model_name in [
    (rf_bayes, "RandomForest"),
    (xgb_bayes, "XGBoost"),
    (mlp_bayes, "MLP"),
]:
    if hasattr(search, "best_params_"):
        log_search_results(search, model_name)
    else:
        print(f"Skipping {model_name}: search has not been fitted")