# Multi-Layer Perceptron Grid Search

## Summary

## Table of Contents

- [Notebook Setup](#Notebook-Setup)
- [Read in Parquet](#Read-in-Parquet)
- [MLP Baseline Model Loop](#MLP-Baseline-Model-Loop)
- [MLP Baseline Parallelization](#MLP-Baseline-Parallelization)

## Notebook Setup

Significant functions from [assignment_3_tools.py](./assignment_3_tools.py)

In [1]:
import time # Runtime
import pickle # Model Saving
import logging # Log Checkpoints
import numpy as np # Flatten y vectors
import pandas as pd # DataFrame
import polars as pl # LazyFrame
from sklearn.preprocessing import StandardScaler # X Standardization
from sklearn.neural_network import MLPClassifier as mlp # model
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score # Scoring
from sklearn.model_selection import GridSearchCV, StratifiedKFold # Grid and CV
from assignment_3_tools import parquet_to_dict, unq_df_names, corr_testset

## Unique Datasets and Corresponding Testsets

In the [preprocess notebook](./taylor_preprocess.ipynb), each null-threshold dataset was split into X_train, y_train, X_test, and y_test. The X_train and y_train sets of each null-threshold dataset were balanced using random over/under sampling. Therefore, when `parquet_to_dict()` is called, the dictionary contains separate X_train, y_train, X_test, and y_test entries that all correspond to one parent dataset. To resolve this, `unq_df_names()` and `corr_testset` record the parent dataset names and their corresponding test sets.

## MLP Baseline Model Loop

The function below loops through all of the unique dataset names from `unq_df_names()` and runs a cross-validated MLP grid search on each unique dataset.

In [4]:
import os
import time
import pickle
import logging
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold, ParameterGrid
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier as mlp

def _collect_split(lazy_dict, key):
    """Collect lazy_dict[key] into a pandas DataFrame, dropping '__index_level_0__' if present."""
    frame = lazy_dict[key].collect().to_pandas()
    return frame.drop(columns=['__index_level_0__'], errors='ignore')


def mlp_gridsearch(lazy_dict, unq_names, param_grid, save_pth, test_name, threads=None):
    """
    MLP GridSearch using 5-fold Cross Validation. Saves best model and results.

    For every name in unq_names the function collects the four splits
    ("{name}_X_train", "{name}_y_train", "{name}_X_test", "{name}_y_test"),
    standardizes the predictors (scaler fitted on the train split only), runs
    a recall-scored GridSearchCV, evaluates the refitted best estimator on the
    test split, and writes the outputs below after each dataset (so partial
    results survive an interrupted run):
        {save_pth}best_model_{test_name}.pkl      -> best estimator (overwritten
            per dataset, so it ends up holding the last dataset's model)
        {save_pth}test_results_{test_name}.parquet -> test-set metrics per dataset
        {save_pth}grid_results_{test_name}.parquet -> CV recall / fit time for
            every parameter combination, sorted by recall (descending)
    ---
    Args:
        lazy_dict: Dictionary with names and LazyFrames of train and test sets.
        unq_names: List of unique names of parent datasets.
        param_grid: Dictionary of parameters for MLPClassifier.
        save_pth: String prefix of the path to save outputs (file names are
            appended directly, so a directory must end with a separator).
        test_name: String of the test performed.
        threads: Integer of CPU threads for cross-validation (optional).
            Defaults to all but two CPUs, never fewer than one.
    Return:
        None
    """
    ## Initializing
    # Define number of threads to be used in GridSearch.
    # os.cpu_count() may return None, and "count - 2" may be <= 0 on small
    # machines (n_jobs=0 is rejected), so clamp to at least one worker.
    if threads is None:
        threads = max(1, (os.cpu_count() or 1) - 2)
        print(f"Using {threads} CPU threads!")

    # Make sure the output locations exist before any expensive work starts.
    os.makedirs("./log", exist_ok=True)
    save_dir = os.path.dirname(save_pth)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Log for debugging
    # NOTE(review): basicConfig is a no-op when the root logger already has
    # handlers (e.g. a previous call in the same kernel), in which case
    # messages keep going to the earlier log file.
    logging.basicConfig(
        filename=f"./log/MLP_{test_name}.log",
        filemode='w',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s')

    # Results from prediction on test_set
    best_results = {
        "Dataset_Name": [],
        "Parameters": [],
        "Recall": [],
        "ROC_AUC": [],
        "Accuracy": [],
        "Fit_Time": []}

    # Results from prediction on Cross Validation Set
    param_results = {
        "Dataset_Name": [],
        "Parameters": [],
        "Recall": [],
        "Fit_Time": []}

    ## GridSearch and Results
    for name in unq_names:
        ## Reading and Preparing Data
        # Train and test sets (index column dropped); responses flattened to 1-D.
        X_train = _collect_split(lazy_dict, f"{name}_X_train")
        y_train = _collect_split(lazy_dict, f"{name}_y_train").to_numpy().ravel()
        X_test = _collect_split(lazy_dict, f"{name}_X_test")
        y_test = _collect_split(lazy_dict, f"{name}_y_test").to_numpy().ravel()

        # Standardize predictor sets; the scaler is fitted on train data only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        ## Defining Modeling and GridSearch
        # Define cross-validation folds
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=212)

        # Define GridSearch
        grid_search = GridSearchCV(
            mlp(),
            param_grid=param_grid,
            cv=cv,
            scoring='recall',
            n_jobs=threads,
            verbose=1,
            return_train_score=True)

        ## Performing GridSearch
        # Debugging Checkpoint
        logging.info(f"Processing dataset: {name}")
        print(f"Training on {name}...", flush=True)

        # GridSearch Training and Results
        grid_search.fit(X_train_scaled, y_train)

        # Debugging Checkpoint
        print("GridSearch completed", flush=True)
        logging.info(f"GridSearch for {name} completed.")

        ## Results from GridSearch
        # Storing Results for each parameter combination
        cv_results = grid_search.cv_results_
        for param_combination, recall, fit_time in zip(
                cv_results['params'],
                cv_results['mean_test_score'],
                cv_results['mean_fit_time']):
            param_results["Dataset_Name"].append(name)
            param_results["Parameters"].append(param_combination)
            param_results["Recall"].append(recall)
            param_results["Fit_Time"].append(fit_time)

        # Convert to DataFrame (accumulated over every dataset processed so far)
        param_results_df = pd.DataFrame(param_results)
        param_results_df = param_results_df.sort_values(by="Recall", ascending=False)

        # Best model by Recall on Cross Validation data.
        # The fit time is read at best_index_ so it belongs to THIS dataset's
        # winning candidate; the accumulated, globally sorted frame above may
        # have another dataset's row on top.
        best_fit_time = cv_results['mean_fit_time'][grid_search.best_index_]
        best_model = grid_search.best_estimator_

        # Metrics on test set
        y_pred_test = best_model.predict(X_test_scaled)
        test_recall = recall_score(y_test, y_pred_test)
        test_roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
        test_accuracy = accuracy_score(y_test, y_pred_test)

        # Save best model as pickle (file name matches the one read back later:
        # best_model_{test_name}.pkl).
        with open(f"{save_pth}best_model_{test_name}.pkl", 'wb') as file:
            pickle.dump(best_model, file)

        # Debugging Checkpoint
        logging.info(f"Model saved to {save_pth}")

        # Results from predicting test data using the best model.
        best_results["Dataset_Name"].append(name)
        best_results["Parameters"].append(grid_search.best_params_)
        best_results["Recall"].append(test_recall)
        best_results["ROC_AUC"].append(test_roc_auc)
        best_results["Accuracy"].append(test_accuracy)
        best_results["Fit_Time"].append(best_fit_time)

        # Convert to DataFrame
        best_results_df = pd.DataFrame(best_results)

        # Save results as Parquet
        best_results_df.to_parquet(f"{save_pth}test_results_{test_name}.parquet", index=False)
        param_results_df.to_parquet(f"{save_pth}grid_results_{test_name}.parquet", index=False)

    # Debugging Checkpoint: emitted once, after every dataset has been processed.
    print(f"{test_name} GridSearch completed!", flush=True)
    logging.info(f"{test_name} GridSearch completed!")

In [5]:
%%time

# Inputs: the train/test Parquet splits live in MLP_Dataset.
# Outputs: the best model and the result tables are written to MLP_Results.
data_pth = "../../Data/GoogleDrive/MLP_Dataset/"
save_pth = "../../Data/GoogleDrive/MLP_Results/"

# Label used in the log, model, and result file names for this run.
test_name = "baseline"

# Read the Parquet files found under data_pth into a LazyFrame dictionary.
pq_jar = parquet_to_dict(data_pth)

# Parent dataset names, i.e. dictionary names without the X_train/y_train/... suffix.
unq_names = unq_df_names(pq_jar)

# Baseline sklearn MLPClassifier configuration.
# Every hyperparameter lists exactly one value, so the grid has a single
# candidate and GridSearchCV only cross-validates this baseline setup.
param_grid = dict(
    hidden_layer_sizes=[(100,)],     # one hidden layer with 100 neurons
    activation=['relu'],             # 'relu' activation function
    solver=['adam'],                 # 'adam' solver
    alpha=[0.0001],                  # L2 penalty (regularization term)
    batch_size=['auto'],             # 'auto' -> min(200, n_samples)
    learning_rate=['constant'],      # learning rate schedule
    learning_rate_init=[0.001],      # initial learning rate
    max_iter=[200],                  # maximum number of iterations
    shuffle=[True],                  # shuffle samples in each iteration
    random_state=[212],              # fixed seed for reproducibility
    momentum=[0.9],                  # momentum for gradient descent update
    nesterovs_momentum=[True],       # use Nesterov's momentum
    early_stopping=[True],           # stop training early when not improving
    n_iter_no_change=[10],           # epochs allowed without meeting the improvement threshold
)

results = mlp_gridsearch(pq_jar, unq_names, param_grid, save_pth, test_name)

Using 6 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
GridSearch completed
baseline GridSearch completed!
CPU times: user 14min 43s, sys: 2min 2s, total: 16min 46s
Wall time: 6min 28s


In [11]:
## Reading in Results

# Locations of the artifacts for this test_name under save_pth.
model_path = f"{save_pth}best_model_{test_name}.pkl"
test_results_path = f"{save_pth}test_results_{test_name}.parquet"
grid_results_path = f"{save_pth}grid_results_{test_name}.parquet"

# Best model by Recall (unpickled from disk)
with open(model_path, 'rb') as handle:
    model = pickle.load(handle)

# Test set prediction results
results_df = pd.read_parquet(test_results_path)

# Cross Validation results
grid_df = pd.read_parquet(grid_results_path)