# Frankenstein Model

## Summary

- 

## Table of Contents

- [Notebook Setup](#Notebook-Setup)
- [MLP GridSearch](#MLP-GridSearch)
- [MLP Parameters](#MLP-Parameters)
- [Results](#Results)

## Notebook Setup

Significant functions from [assignment_3_tools.py](./assignment_3_tools.py)

In [81]:
import os # List 
import time # Runtime
import pickle # Model Saving
import logging # Log Checkpoints
import numpy as np # Flatten y vectors
import pandas as pd # DataFrame
import polars as pl # LazyFrame
from sklearn.preprocessing import StandardScaler # X Standardization
from sklearn.neural_network import MLPClassifier as mlp # model
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score, auc, roc_curve  # Scoring
from sklearn.model_selection import GridSearchCV, StratifiedKFold, ParameterGrid
from great_tables import GT, md, html, from_column, style, loc, vals
from assignment_3_tools import parquet_to_dict, unq_df_names, corr_testset
import xgboost as xgb
# add scikit optimize for bayesian optimization
from skopt import BayesSearchCV

## MLP GridSearch

In [82]:
def mlp_gridsearch(lazy_dict, unq_names, param_grid, save_pth, test_name, threads=None):
    """
    MLP GridSearch using 5-fold Cross Validation. Saves best model and results.
    ---
    Args:
        lazy_dict: Dictionary with names and LazyFrames of train and test sets.
        unq_names: List of unique names of parent datasets.
        param_grid: Dictionary of parameters for MLPClassifier. CHANGE FOR BAESIAN
        save_pth: String of the path to save the best model.
        test_name: String of the test performed. PARAMETER BEING TESTED
        threads: Integer of CPU threads for cross-validation (optional).
    Return:
        None
    """
    ## Initializing
    # Define number of threads to be used in GridSearch
    if threads is None:
        threads = os.cpu_count() - 4
        print(f"Using {threads} CPU threads!")

    # Log for debugging
    logging.basicConfig(
        filename=f"./log/MLP_{test_name}.log",
        filemode='w', 
        level=logging.INFO, 
        format='%(asctime)s - %(levelname)s - %(message)s')
    
    ## GridSearch and Results
    for name in unq_names:
        # Results from prediction on test_set. FOR TEST TABLE
        best_results = {
            "Dataset_Name": [],
            "Grid_Variable": [],
            "Parameters": [],
            "Recall": [], 
            "ROC_AUC": [], 
            "Accuracy": [],
            "Fit_Time": []}
        
        # Results from prediction on Cross Validation Set. FOR CV TABLE
        param_results = {
            "Dataset_Name": [],
            "Grid_Variable": [],
            "Parameters": [],
            "Recall": [], 
            "Fit_Time": []}
        
        ## Reading and Preparing Data
        # Dataset names in path
        X_train_name = f"{name}_X_train"
        y_train_name = f"{name}_y_train"
        X_test_name = f"{name}_X_test"
        y_test_name = f"{name}_y_test"

        # Train and test sets.
        X_train = lazy_dict[X_train_name].collect().to_pandas()
        y_train = lazy_dict[y_train_name].collect().to_pandas()
        X_test = lazy_dict[X_test_name].collect().to_pandas()
        y_test = lazy_dict[y_test_name].collect().to_pandas()

        # Drop index column
        X_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
        y_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
        X_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
        y_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)

        # Flatten response sets
        y_train = y_train.to_numpy().ravel()
        y_test = y_test.to_numpy().ravel()

        # Standardize predictor sets
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        ## Defining Modeling and GridSearch. CHANGE FOR BEASIAN
        # Define cross-validation folds
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=212)

        # Define mlp model
        mlp_model = mlp()

        # Define GridSearch. CHANGE TO BEASIAN!!!
        grid_search = GridSearchCV(
            mlp_model, #mlp model
            param_grid=param_grid, #parameter dictionary
            cv=cv, # cv
            scoring='recall', # best is by recall
            n_jobs=threads,
            verbose=3, 
            return_train_score=True) # For making CV table


        ## Performing GridSearch
        # Debugging Checkpoint
        logging.info(f"Processing dataset: {name}")
        print(f"Training on {name}...", flush=True)

        # GridSearch Training and Results
        grid_search.fit(X_train_scaled, y_train)

        # Debugging Checkpoint
        print(f"GridSearch completed", flush=True)
        logging.info(f"GridSearch for {test_name} completed.")

        ## Results from GridSearch
        # Storing Results for each parameter combination
        for i in range(len(grid_search.cv_results_['params'])):
            param_combination = grid_search.cv_results_['params'][i]
            recall = grid_search.cv_results_['mean_test_score'][i]
            fit_time = grid_search.cv_results_['mean_fit_time'][i]
            param_results["Dataset_Name"].append(name)
            param_results["Grid_Variable"].append(test_name)
            param_results["Parameters"].append(param_combination)
            param_results["Recall"].append(recall)
            param_results["Fit_Time"].append(fit_time)
        
        # Convert to DataFrame
        param_results_df = pd.DataFrame(param_results)
        param_results_df = param_results_df.sort_values(by="Recall", ascending=False)

        # Best model by Recall on Cross Validation data
        best_fit_time = param_results_df.iloc[0]["Fit_Time"]
        best_model = grid_search.best_estimator_

        # Metrics on test set
        y_pred_test = best_model.predict(X_test_scaled)
        test_recall = recall_score(y_test, y_pred_test)
        test_roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
        test_accuracy = accuracy_score(y_test, y_pred_test)

        # Save best model as pickle
        # example: best_model_neurons-hidden_layer_sizes.pkl
        with open(f"{save_pth}best_model{test_name}-{name}.pkl", 'wb') as file:
            pickle.dump(best_model, file)

        # Debugging Checkpoint
        logging.info(f"Model saved to {save_pth}")

        # Results from predicting test data using the best model.
        best_results["Dataset_Name"].append(name)
        best_results["Grid_Variable"].append(test_name)
        best_results["Parameters"].append(grid_search.best_params_)
        best_results["Recall"].append(test_recall)
        best_results["ROC_AUC"].append(test_roc_auc)
        best_results["Accuracy"].append(test_accuracy)
        best_results["Fit_Time"].append(best_fit_time)

        # Convert to DataFrame
        best_results_df = pd.DataFrame(best_results)
        
        # Save results as Parquet
        # example: test_results_layers-hidden_layer_sizes.parquet
        best_results_df.to_parquet(f"{save_pth}test_results{test_name}-{name}.parquet", index=False)
        # example: grid_results_neurons-hidden_layer_sizes.parquet
        param_results_df.to_parquet(f"{save_pth}grid_results{test_name}-{name}.parquet", index=False)

        # Debugging Checkpoint
        print(f"{test_name} GridSearch completed!", flush=True)
        logging.info(f"{test_name} GridSearch completed!")

## MLP Parameters

Change the `param_grid` and `test_name` to match the test being performed.

In [83]:
# Train and test sets are in MLP_Dataset.
# Save results and best model to MLP_Results.
data_pth = "../../../Data/GoogleDrive/MLP_Dataset/"
save_pth = "../../../Data/GoogleDrive/MLP_Results/"

# Read in Parquet files in path and add to a LazyFrame dictionary.
pq_jar = parquet_to_dict(data_pth) # all lazy

# Record the unique dataset names (drop X_train, etc.).
unq_names = unq_df_names(pq_jar)

# A dictionary of parmeter dictionaries
# Schema {testname:{parameter:values}}

all_test_parameters = {
    '_best_params':{
        'hidden_layer_sizes': [(47, 46, 46, 46)],
        'learning_rate_init': [0.01]}
}

frank_param = {
    '_best_params_vem_2':{
        'hidden_layer_sizes': [(47, 46, 46, 46)],  # Number of neurons and layers, represented as a tuple
        'activation': ['relu'],         # Activation function
        'alpha': [0.0001],              # Regularization parameter
        'batch_size': ['auto'],         # Size of minibatches
        'learning_rate': ['constant'],  # Learning rate schedule
        'learning_rate_init': [0.01],   # Initial learning rate
        'max_iter': [200],              # Maximum number of iterations
        'momentum': [0.9],              # Momentum for gradient descent
        'n_iter_no_change': [10],       # Number of iterations with no improvement to stop training
        'solver': ['adam']              # Solver for weight optimization
}
}

test = '_best_params_vem_2'

In [10]:
%%time
# Run the model
for test, param_dict in all_test_parameters.items():
    mlp_gridsearch(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 5/5] END hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01;, score=(train=0.880, test=0.871) total time= 1.8min
[CV 1/5] END hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01;, score=(train=0.933, test=0.927) total time= 2.7min
[CV 3/5] END hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01;, score=(train=0.915, test=0.908) total time= 2.8min
[CV 2/5] END hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01;, score=(train=0.947, test=0.939) total time= 2.8min
[CV 4/5] END hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01;, score=(train=0.928, test=0.920) total time= 3.4min
GridSearch completed
_best_params GridSearch completed!
CPU times: user 1h 42min 13s, sys: 3min 23s, total: 1h 45min 36s
Wall time: 12min 37s


In [84]:
%%time
# Run the model
for test, param_dict in frank_param.items():
    mlp_gridsearch(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 2/5] END activation=relu, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(47, 46, 46, 46), learning_rate=constant, learning_rate_init=0.01, max_iter=200, momentum=0.9, n_iter_no_change=10, solver=adam;, score=(train=0.956, test=0.948) total time= 2.5min
[CV 1/5] END activation=relu, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(47, 46, 46, 46), learning_rate=constant, learning_rate_init=0.01, max_iter=200, momentum=0.9, n_iter_no_change=10, solver=adam;, score=(train=0.804, test=0.794) total time= 2.5min
[CV 4/5] END activation=relu, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(47, 46, 46, 46), learning_rate=constant, learning_rate_init=0.01, max_iter=200, momentum=0.9, n_iter_no_change=10, solver=adam;, score=(train=0.937, test=0.929) total time= 2.5min
[CV 5/5] END activation=relu, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(47, 46, 46,

## Results

In [12]:
## Reading in Results

name = 'best_params-Under_Sample_1:1_threshold_20'

# Best model by Recall
with open(f"{save_pth}best_model_{name}.pkl", 'rb') as file:
    model = pickle.load(file)

# Test set prediction results
results_df = pd.read_parquet(f"{save_pth}test_results_{name}.parquet")

# Cross Validation results
grid_df = pd.read_parquet(f"{save_pth}grid_results_{name}.parquet")

In [37]:
# data type of the model
frank_results_df = pd.concat([results_df, grid_df], axis=0)
frank_results_df = frank_results_df[['Recall', 
                                     'ROC_AUC', 
                                       'Accuracy']]
frank_results_df['Result_Type'] = ['Test'] * len(results_df) + ['CV'] * len(grid_df)
frank_results_df = frank_results_df[['Result_Type', 'Recall', 'ROC_AUC', 'Accuracy']]
frank_results_df

Unnamed: 0,Result_Type,Recall,ROC_AUC,Accuracy
0,Test,0.811796,0.814387,0.691627
0,CV,0.912884,,


In [85]:
## Reading in Results

name = 'best_params_vem_2-Under_Sample_1:1_threshold_20'

# Best model by Recall
with open(f"{save_pth}best_model_{name}.pkl", 'rb') as file:
    model = pickle.load(file)

# Test set prediction results
results_df_2 = pd.read_parquet(f"{save_pth}test_results_{name}.parquet")

# Cross Validation results
grid_df_2 = pd.read_parquet(f"{save_pth}grid_results_{name}.parquet")

In [87]:
# data type of the model
frank_results_df_2 = pd.concat([results_df_2, grid_df_2], axis=0)
frank_results_df_2 = frank_results_df_2[['Recall', 
                                     'ROC_AUC', 
                                       'Accuracy']]
frank_results_df_2['Result_Type'] = ['Test'] * len(results_df) + ['CV'] * len(grid_df)
frank_results_df_2 = frank_results_df_2[['Result_Type', 'Recall', 'ROC_AUC', 'Accuracy']]
frank_results_df_2

Unnamed: 0,Result_Type,Recall,ROC_AUC,Accuracy
0,Test,0.815167,0.815958,0.694148
0,CV,0.906449,,


# XGBoost Model

In [45]:
# Prepare data for XGBoost model

name = 'Under_Sample_1:1_threshold_20'
X_train_name = f"{data_pth}{name}_X_train.parquet"
y_train_name = f"{data_pth}{name}_y_train.parquet"
X_test_name = f"{data_pth}{name}_X_test.parquet"
y_test_name = f"{data_pth}{name}_y_test.parquet"

# Train and test sets.
X_train = pd.read_parquet(X_train_name)
y_train = pd.read_parquet(y_train_name)
X_test = pd.read_parquet(X_test_name)
y_test = pd.read_parquet(y_test_name)

# Drop index column
X_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
y_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
X_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
y_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)

# Flatten response sets
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

# Standardize predictor sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [46]:
# Perform GridSearch

# Define cross-validation folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=212)

# Define XGBoost model
xgb_model = xgb.XGBClassifier()

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1],
}

# Define GridSearch
grid_search = GridSearchCV(
    xgb_model,
    param_grid=param_grid,
    cv=cv,
    scoring='recall',
    n_jobs=8,
    return_train_score=True)

# Grid search training and results
grid_search.fit(X_train_scaled, y_train)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)



Best set of hyperparameters:  {'learning_rate': 0.001, 'max_depth': 3, 'subsample': 1}
Best score:  0.8345360549838865


In [47]:
# Final XGBoost model

# Get the best model
best_xgb_model = grid_search.best_estimator_

# Use the best model to make predictions on the testing data
predictions = best_xgb_model.predict(X_test)

# Get predicted probabilities for the positive class
y_probs = best_xgb_model.predict_proba(X_test)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

# Calculate AUC-ROC
roc_auc = auc(fpr, tpr)
print("ROC AUC Score:", roc_auc)

# Save XG Model
pickle.dump(best_xgb_model, open(save_pth + 'xgb_model.pkl', 'wb'))

ROC AUC Score: 0.7073852000260137


In [54]:
# Save XGBoost results
xgb_results = {
    "Recall": recall_score(y_test, predictions),
    "ROC_AUC": roc_auc,
    "Accuracy": accuracy_score(y_test, predictions)
}
xgb_results_df = pd.DataFrame(xgb_results, index=[0])
xgb_results_df.to_parquet(f"{save_pth}xgb_results.parquet", index=False)

# Read in XGBoost results
xgb_results_df = pd.read_parquet(f"{save_pth}xgb_results.parquet")

# Save grid search results
grid_results = pd.DataFrame(grid_search.cv_results_)
grid_results.to_parquet(f"{save_pth}grid_results_xgb.parquet", index=False)

# Read in grid search results
grid_results_df = pd.read_parquet(f"{save_pth}grid_results_xgb.parquet")

In [62]:
# Filter for best rank_test_score from grid_results_df
best_cv_xgb = grid_results_df[grid_results_df['rank_test_score'] == 1]
best_cv_xgb = best_cv_xgb['mean_test_score']
# Convert to DataFrame
best_cv_xgb = pd.DataFrame(best_cv_xgb)
best_cv_xgb['Result_Type'] = 'CV'
best_cv_xgb = best_cv_xgb[['Result_Type', 'mean_test_score']]
# Change column name
best_cv_xgb.columns = ['Result_Type', 'Recall']
best_cv_xgb

Unnamed: 0,Result_Type,Recall
20,CV,0.834536


In [63]:
# Prepare results for table
xgb_results_df['Result_Type'] = 'Test'
xgb_results_df = xgb_results_df[['Result_Type', 'Recall', 'ROC_AUC', 'Accuracy']]

# Combine results
final_xgb_results_df = pd.concat([xgb_results_df, best_cv_xgb], axis=0)
final_xgb_results_df

Unnamed: 0,Result_Type,Recall,ROC_AUC,Accuracy
0,Test,0.705783,0.707385,0.662288
20,CV,0.834536,,


# Bayesian Optimization

## Bayesian Search

In [70]:
def mlp_bayesian_search(lazy_dict, unq_names, param_grid, save_pth, test_name, threads=None):
    """
    MLP Bayesian Search using 5-fold Cross Validation. Saves best model and results.
    ---
    Args:
        lazy_dict: Dictionary with names and LazyFrames of train and test sets.
        unq_names: List of unique names of parent datasets.
        param_grid: Dictionary of parameters for BayesSearchCV.
        save_pth: String of the path to save the best model.
        test_name: String of the test performed.
        threads: Integer of CPU threads for cross-validation (optional).
    Return:
        None
    """
    ## Initializing
    # Define number of threads to be used in Bayesian Search
    if threads is None:
        threads = os.cpu_count() - 4
        print(f"Using {threads} CPU threads!")

    # Log for debugging
    logging.basicConfig(
        filename=f"./log/MLP_{test_name}.log",
        filemode='w', 
        level=logging.INFO, 
        format='%(asctime)s - %(levelname)s - %(message)s')
    
    ## Bayesian Search and Results
    for name in unq_names:
        # Results from prediction on test_set
        best_results = {
            "Dataset_Name": [],
            "Grid_Variable": [],
            "Parameters": [],
            "Recall": [], 
            "ROC_AUC": [], 
            "Accuracy": [],
            "Fit_Time": []}
        
        # Results from prediction on Cross Validation Set
        param_results = {
            "Dataset_Name": [],
            "Grid_Variable": [],
            "Parameters": [],
            "Recall": [], 
            "Fit_Time": []}
        
        ## Reading and Preparing Data
        # Dataset names in path
        X_train_name = f"{name}_X_train"
        y_train_name = f"{name}_y_train"
        X_test_name = f"{name}_X_test"
        y_test_name = f"{name}_y_test"

        # Train and test sets
        X_train = lazy_dict[X_train_name].collect().to_pandas()
        y_train = lazy_dict[y_train_name].collect().to_pandas()
        X_test = lazy_dict[X_test_name].collect().to_pandas()
        y_test = lazy_dict[y_test_name].collect().to_pandas()

        # Drop index column
        X_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
        y_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
        X_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
        y_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)

        # Flatten response sets
        y_train = y_train.to_numpy().ravel()
        y_test = y_test.to_numpy().ravel()

        # Standardize predictor sets
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        ## Defining Modeling and Bayesian Search
        # Define cross-validation folds
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=212)

        # Define MLP model
        mlp_model = mlp()

        # Define Bayesian Search
        bayes_search = BayesSearchCV(
            mlp_model,  # MLP model
            search_spaces=param_grid,  # Parameter dictionary
            cv=cv,  # Cross-validation
            scoring='recall',  # Best is by recall
            n_jobs=threads,
            verbose=3,
            return_train_score=True,  # For making CV table
            n_iter=50  # Number of iterations for Bayesian optimization
        )

        ## Performing Bayesian Search
        # Debugging Checkpoint
        logging.info(f"Processing dataset: {name}")
        print(f"Training on {name}...", flush=True)

        # Bayesian Search Training and Results
        bayes_search.fit(X_train_scaled, y_train)

        # Debugging Checkpoint
        print(f"Bayesian Search completed", flush=True)
        logging.info(f"Bayesian Search for {test_name} completed.")

        ## Results from Bayesian Search
        # Storing Results for each parameter combination
        for i in range(len(bayes_search.cv_results_['params'])):
            param_combination = bayes_search.cv_results_['params'][i]
            recall = bayes_search.cv_results_['mean_test_score'][i]
            fit_time = bayes_search.cv_results_['mean_fit_time'][i]
            param_results["Dataset_Name"].append(name)
            param_results["Grid_Variable"].append(test_name)
            param_results["Parameters"].append(param_combination)
            param_results["Recall"].append(recall)
            param_results["Fit_Time"].append(fit_time)
        
        # Convert to DataFrame
        param_results_df = pd.DataFrame(param_results)
        param_results_df = param_results_df.sort_values(by="Recall", ascending=False)

        # Best model by Recall on Cross Validation data
        best_fit_time = param_results_df.iloc[0]["Fit_Time"]
        best_model = bayes_search.best_estimator_

        # Metrics on test set
        y_pred_test = best_model.predict(X_test_scaled)
        test_recall = recall_score(y_test, y_pred_test)
        test_roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
        test_accuracy = accuracy_score(y_test, y_pred_test)

        # Save best model as pickle
        with open(f"{save_pth}best_model_{test_name}_{name}.pkl", 'wb') as file:
            pickle.dump(best_model, file)

        # Debugging Checkpoint
        logging.info(f"Model saved to {save_pth}")

        # Results from predicting test data using the best model
        best_results["Dataset_Name"].append(name)
        best_results["Grid_Variable"].append(test_name)
        best_results["Parameters"].append(bayes_search.best_params_)
        best_results["Recall"].append(test_recall)
        best_results["ROC_AUC"].append(test_roc_auc)
        best_results["Accuracy"].append(test_accuracy)
        best_results["Fit_Time"].append(best_fit_time)

        # Convert to DataFrame
        best_results_df = pd.DataFrame(best_results)
        
        # Save results as Parquet
        best_results_df.to_parquet(f"{save_pth}test_results_{test_name}_{name}.parquet", index=False)
        param_results_df.to_parquet(f"{save_pth}grid_results_{test_name}_{name}.parquet", index=False)

        # Debugging Checkpoint
        print(f"{test_name} Bayesian Search completed!", flush=True)
        logging.info(f"{test_name} Bayesian Search completed!")


## MLP Parameters

In [75]:
# Train and test sets are in MLP_Dataset.
# Save results and best model to MLP_Results.
data_pth = "../../../Data/GoogleDrive/MLP_Dataset/"
save_pth = "../../../Data/GoogleDrive/MLP_Results/"

# Read in Parquet files in path and add to a LazyFrame dictionary.
pq_jar = parquet_to_dict(data_pth) # all lazy

# Record the unique dataset names (drop X_train, etc.).
unq_names = unq_df_names(pq_jar)

# A dictionary of parmeter dictionaries
# Schema {testname:{parameter:values}}

# bayes_param = {
#     '_bayes_params':{
#         'hidden_layer_sizes': [(47, 46, 46, 46)],  # Different configurations of neurons and layers
#         'activation': ['relu', 'tanh', 'logistic'],  # Including another activation function
#         'alpha': [0.0, 0.0001, 0.001],  # Different regularization parameters
#         'batch_size': ['auto', 100, 1000],  # Different sizes of minibatches
#         'learning_rate': ['constant', 'adaptive'],  # Including adaptive learning rate
#         'learning_rate_init': [0.001, 0.01],  # Different initial learning rates
#         'max_iter': [100, 200],  # Different maximum number of iterations
#         'momentum': [0.0, 0.5, 0.9],  # Different values for momentum
#         'n_iter_no_change': [10, 100],  # Different values for stopping criteria
#         'solver': ['adam', 'sgd']  # Including stochastic gradient descent as a solver
#     }
# }

bayes_param = {
    '_bayes_params_vem': {
        'hidden_layer_sizes': [(47, 46, 46, 46)],  # Different configurations of neurons and layers
        'activation': ['relu', 'tanh', 'logistic'],  # Including another activation function
        'alpha': [0.0, 0.0001, 0.001],  # Different regularization parameters
        'batch_size': ['auto', 100, 1000],  # Different sizes of minibatches
        'learning_rate': ['constant', 'adaptive'],  # Including adaptive learning rate
        'learning_rate_init': [0.001, 0.01],  # Different initial learning rates
        'max_iter': [100, 200],  # Different maximum number of iterations
        'momentum': [0.0, 0.5, 0.9],  # Different values for momentum
        'n_iter_no_change': [10, 100],  # Different values for stopping criteria
        'solver': ['adam', 'sgd']  # Including stochastic gradient descent as a solver
    }
}

test = '_bayes_params_vem'

In [76]:
%%time
# Run the model
for test, param_dict in bayes_param.items():
    mlp_bayesian_search(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...




ValueError: can only convert an array of size 1 to a Python scalar

In [79]:
import os
import logging
import pickle
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score
from skopt import BayesSearchCV

def mlp_bayesian_search(lazy_dict, unq_names, param_grid, save_pth, test_name, threads=None):
    if threads is None:
        threads = os.cpu_count() - 4
        print(f"Using {threads} CPU threads!")

    logging.basicConfig(
        filename=f"./log/MLP_{test_name}.log",
        filemode='w', 
        level=logging.INFO, 
        format='%(asctime)s - %(levelname)s - %(message)s')
    
    for name in unq_names:
        best_results = {
            "Dataset_Name": [],
            "Grid_Variable": [],
            "Parameters": [],
            "Recall": [], 
            "ROC_AUC": [], 
            "Accuracy": [],
            "Fit_Time": []}
        
        param_results = {
            "Dataset_Name": [],
            "Grid_Variable": [],
            "Parameters": [],
            "Recall": [], 
            "Fit_Time": []}
        
        X_train_name = f"{name}_X_train"
        y_train_name = f"{name}_y_train"
        X_test_name = f"{name}_X_test"
        y_test_name = f"{name}_y_test"

        X_train = lazy_dict[X_train_name].collect().to_pandas()
        y_train = lazy_dict[y_train_name].collect().to_pandas()
        X_test = lazy_dict[X_test_name].collect().to_pandas()
        y_test = lazy_dict[y_test_name].collect().to_pandas()

        X_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
        y_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
        X_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
        y_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)

        y_train = y_train.to_numpy().ravel()
        y_test = y_test.to_numpy().ravel()

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=212)
        mlp_model = MLPClassifier()

        # Debugging: Print the search space before fitting
        print("Search space:", param_grid)
        logging.info(f"Search space: {param_grid}")

        bayes_search = BayesSearchCV(
            estimator=mlp_model,
            search_spaces=param_grid,
            cv=cv,
            scoring='recall',
            n_jobs=threads,
            verbose=3,
            return_train_score=True)

        logging.info(f"Processing dataset: {name}")
        print(f"Training on {name}...", flush=True)

        bayes_search.fit(X_train_scaled, y_train)

        print(f"Bayesian Search completed", flush=True)
        logging.info(f"Bayesian Search for {test_name} completed.")

        for i in range(len(bayes_search.cv_results_['params'])):
            param_combination = bayes_search.cv_results_['params'][i]
            recall = bayes_search.cv_results_['mean_test_score'][i]
            fit_time = bayes_search.cv_results_['mean_fit_time'][i]
            param_results["Dataset_Name"].append(name)
            param_results["Grid_Variable"].append(test_name)
            param_results["Parameters"].append(param_combination)
            param_results["Recall"].append(recall)
            param_results["Fit_Time"].append(fit_time)
        
        param_results_df = pd.DataFrame(param_results)
        param_results_df = param_results_df.sort_values(by="Recall", ascending=False)

        best_fit_time = param_results_df.iloc[0]["Fit_Time"]
        best_model = bayes_search.best_estimator_

        y_pred_test = best_model.predict(X_test_scaled)
        test_recall = recall_score(y_test, y_pred_test)
        test_roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
        test_accuracy = accuracy_score(y_test, y_pred_test)

        with open(f"{save_pth}best_model{test_name}-{name}.pkl", 'wb') as file:
            pickle.dump(best_model, file)

        logging.info(f"Model saved to {save_pth}")

        best_results["Dataset_Name"].append(name)
        best_results["Grid_Variable"].append(test_name)
        best_results["Parameters"].append(bayes_search.best_params_)
        best_results["Recall"].append(test_recall)
        best_results["ROC_AUC"].append(test_roc_auc)
        best_results["Accuracy"].append(test_accuracy)
        best_results["Fit_Time"].append(best_fit_time)

        best_results_df = pd.DataFrame(best_results)
        best_results_df.to_parquet(f"{save_pth}test_results{test_name}-{name}.parquet", index=False)
        param_results_df.to_parquet(f"{save_pth}grid_results{test_name}-{name}.parquet", index=False)

        print(f"{test_name} Bayesian Search completed!", flush=True)
        logging.info(f"{test_name} Bayesian Search completed!")

# Run the model
for test, param_dict in bayes_param.items():
    mlp_bayesian_search(pq_jar, unq_names, param_dict, save_pth, test)


Using 8 CPU threads!
Search space: {'hidden_layer_sizes': [(47, 46, 46, 46)], 'activation': ['relu', 'tanh', 'logistic'], 'alpha': [0.0, 0.0001, 0.001], 'batch_size': ['auto', 100, 1000], 'learning_rate': ['constant', 'adaptive'], 'learning_rate_init': [0.001, 0.01], 'max_iter': [100, 200], 'momentum': [0.0, 0.5, 0.9], 'n_iter_no_change': [10, 100], 'solver': ['adam', 'sgd']}
Training on Under_Sample_1:1_threshold_20...




ValueError: can only convert an array of size 1 to a Python scalar

In [80]:
import os
import logging
import pickle
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Initialize paths
data_pth = "../../../Data/GoogleDrive/MLP_Dataset/"
save_pth = "../../../Data/GoogleDrive/MLP_Results/"

# Load data
pq_jar = parquet_to_dict(data_pth)  # all lazy
unq_names = unq_df_names(pq_jar)

# Parameter grid
bayes_param = {
    '_bayes_params': {
        'hidden_layer_sizes': [(47, 46, 46, 46)],  # Different configurations of neurons and layers
        'activation': ['relu', 'tanh', 'logistic'],  # Including another activation function
        'alpha': [0.0, 0.0001, 0.001],  # Different regularization parameters
        'batch_size': ['auto', 100, 1000],  # Different sizes of minibatches
        'learning_rate': ['constant', 'adaptive'],  # Including adaptive learning rate
        'learning_rate_init': [0.001, 0.01],  # Different initial learning rates
        'max_iter': [100, 200],  # Different maximum number of iterations
        'momentum': [0.0, 0.5, 0.9],  # Different values for momentum
        'n_iter_no_change': [10, 100],  # Different values for stopping criteria
        'solver': ['adam', 'sgd']  # Including stochastic gradient descent as a solver
    }
}

# Set test name
test = '_bayes_params_vem'

# Define number of threads
threads = os.cpu_count() - 4
print(f"Using {threads} CPU threads!")

# Setup logging
logging.basicConfig(
    filename=f"./log/MLP_{test}.log",
    filemode='w', 
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Iterate through datasets
for name in unq_names:
    # Initialize results containers
    best_results = {
        "Dataset_Name": [],
        "Grid_Variable": [],
        "Parameters": [],
        "Recall": [], 
        "ROC_AUC": [], 
        "Accuracy": [],
        "Fit_Time": []
    }
    
    param_results = {
        "Dataset_Name": [],
        "Grid_Variable": [],
        "Parameters": [],
        "Recall": [], 
        "Fit_Time": []
    }
    
    # Load datasets
    X_train_name = f"{name}_X_train"
    y_train_name = f"{name}_y_train"
    X_test_name = f"{name}_X_test"
    y_test_name = f"{name}_y_test"

    X_train = pq_jar[X_train_name].collect().to_pandas()
    y_train = pq_jar[y_train_name].collect().to_pandas()
    X_test = pq_jar[X_test_name].collect().to_pandas()
    y_test = pq_jar[y_test_name].collect().to_pandas()

    # Drop index column
    X_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
    y_train.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
    X_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)
    y_test.drop(columns=['__index_level_0__'], errors='ignore', inplace=True)

    # Flatten response sets
    y_train = y_train.to_numpy().ravel()
    y_test = y_test.to_numpy().ravel()

    # Standardize predictor sets
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define cross-validation folds
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=212)

    # Define MLP model
    mlp_model = MLPClassifier()

    # Define search space
    # search_space = {
    #     'hidden_layer_sizes': Categorical([(47, 46, 46, 46)]),
    #     'activation': Categorical(['relu', 'tanh', 'logistic']),
    #     'alpha': Real(1e-6, 1e-1, prior='log-uniform'),
    #     'batch_size': Categorical(['auto', 100, 1000]),
    #     'learning_rate': Categorical(['constant', 'adaptive']),
    #     'learning_rate_init': Real(1e-4, 1e-2, prior='log-uniform'),
    #     'max_iter': Integer(100, 200),
    #     'momentum': Real(0.0, 0.9),
    #     'n_iter_no_change': Integer(10, 100),
    #     'solver': Categorical(['adam', 'sgd'])
    # }

    search_space = {
    'hidden_layer_sizes': [(47, 46, 46, 46)],  # Different configurations of neurons and layers
    'activation': ['relu', 'tanh', 'logistic'],  # Including another activation function
    'alpha': (1e-6, 1e-1, 'log-uniform'),  # Different regularization parameters
    'batch_size': ['auto', 100, 1000],  # Different sizes of minibatches
    'learning_rate': ['constant', 'adaptive'],  # Including adaptive learning rate
    'learning_rate_init': (1e-4, 1e-2, 'log-uniform'),  # Different initial learning rates
    'max_iter': (100, 200),  # Different maximum number of iterations
    'momentum': (0.0, 0.9),  # Different values for momentum
    'n_iter_no_change': (10, 100),  # Different values for stopping criteria
    'solver': ['adam', 'sgd']  # Including stochastic gradient descent as a solver
    }


    # Debugging: Print the search space before fitting
    print("Search space:", search_space)
    logging.info(f"Search space: {search_space}")

    # Initialize Bayesian Search
    bayes_search = BayesSearchCV(
        estimator=mlp_model,
        search_spaces=search_space,
        n_iter=50,  # Adjust the number of iterations as needed
        cv=cv,
        scoring='recall',
        n_jobs=threads,
        verbose=3,
        return_train_score=True
    )

    # Logging
    logging.info(f"Processing dataset: {name}")
    print(f"Training on {name}...", flush=True)

    # Fit Bayesian Search
    try:
        bayes_search.fit(X_train_scaled, y_train)
    except Exception as e:
        print(f"Error during fit: {e}")
        logging.error(f"Error during fit: {e}")
        continue

    # Logging
    print(f"Bayesian Search completed", flush=True)
    logging.info(f"Bayesian Search for {test} completed.")

    # Record results
    for i in range(len(bayes_search.cv_results_['params'])):
        param_combination = bayes_search.cv_results_['params'][i]
        recall = bayes_search.cv_results_['mean_test_score'][i]
        fit_time = bayes_search.cv_results_['mean_fit_time'][i]
        param_results["Dataset_Name"].append(name)
        param_results["Grid_Variable"].append(test)
        param_results["Parameters"].append(param_combination)
        param_results["Recall"].append(recall)
        param_results["Fit_Time"].append(fit_time)
    
    param_results_df = pd.DataFrame(param_results)
    param_results_df = param_results_df.sort_values(by="Recall", ascending=False)

    best_fit_time = param_results_df.iloc[0]["Fit_Time"]
    best_model = bayes_search.best_estimator_

    # Predict on test set
    y_pred_test = best_model.predict(X_test_scaled)
    test_recall = recall_score(y_test, y_pred_test)
    test_roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Save best model
    with open(f"{save_pth}best_model{test}-{name}.pkl", 'wb') as file:
        pickle.dump(best_model, file)

    logging.info(f"Model saved to {save_pth}")

    best_results["Dataset_Name"].append(name)
    best_results["Grid_Variable"].append(test)
    best_results["Parameters"].append(bayes_search.best_params_)
    best_results["Recall"].append(test_recall)
    best_results["ROC_AUC"].append(test_roc_auc)
    best_results["Accuracy"].append(test_accuracy)
    best_results["Fit_Time"].append(best_fit_time)

    best_results_df = pd.DataFrame(best_results)
    best_results_df.to_parquet(f"{save_pth}test_results{test}-{name}.parquet", index=False)
    param_results_df.to_parquet(f"{save_pth}grid_results{test}-{name}.parquet", index=False)

    # Logging
    print(f"{test} Bayesian Search completed!", flush=True)
    logging.info(f"{test} Bayesian Search completed!")

Using 8 CPU threads!
Search space: {'hidden_layer_sizes': [(47, 46, 46, 46)], 'activation': ['relu', 'tanh', 'logistic'], 'alpha': (1e-06, 0.1, 'log-uniform'), 'batch_size': ['auto', 100, 1000], 'learning_rate': ['constant', 'adaptive'], 'learning_rate_init': (0.0001, 0.01, 'log-uniform'), 'max_iter': (100, 200), 'momentum': (0.0, 0.9), 'n_iter_no_change': (10, 100), 'solver': ['adam', 'sgd']}
Training on Under_Sample_1:1_threshold_20...
Error during fit: can only convert an array of size 1 to a Python scalar


## Results