# Baseline Multi-Layer Perceptron (MLP) Model

## Summary

## Table of Contents

- [Notebook Setup](#Notebook-Setup)
- [Read in Parquet](#Read-in-Parquet)
- [MLP Baseline Model Loop](#MLP-Baseline-Model-Loop)
- [MLP Baseline Parallelization](#MLP-Baseline-Parallelization)

## Notebook Setup

Significant functions from [assignment_3_tools.py](./assignment_3_tools.py)

In [1]:
import os
import time
import pickle #for saving and loading trained models
import numpy as np # for vector / matrix operations
import pandas as pd # for data manipulation
import logging
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score
from sklearn.model_selection import cross_validate, GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier as mlp
from assignment_3_tools import parquet_to_dict

## Unique Datasets and Corresponding Testsets

In the [preprocess notebook](./taylor_preprocess.ipynb), each null-threshold dataset was split into X_train, y_train, X_test, and y_test. The X_train and y_train sets of each null-threshold dataset were then balanced using random over/under sampling. As a result, the dictionary returned by `parquet_to_dict()` holds the X_train, y_train, X_test, and y_test pieces of a single dataset as separate entries. To tie them back together, `unq_df_names()` records the unique dataset names and `corr_testset()` returns the names of the test sets that correspond to a given dataset name.

In [3]:
# Unique Datasets.
def unq_df_names(lazy_dict):
    """
    Collect the distinct parent-dataset names found among the dictionary keys.
    ---
    Args:
        lazy_dict (dict): Maps LazyFrame names to their LazyFrames. Only the
            keys are inspected.
    Returns:
        unq_names (set): Dataset names with the trailing split marker removed.
            Keys that end in neither "_train" nor "_test" are ignored.
    """
    # (key ending, number of trailing characters to cut off)
    #   "_train" -> cut 8 chars, i.e. "_X_train" / "_y_train"
    #   "_test"  -> cut 7 chars, i.e. "_X_test"  / "_y_test"
    trims = (("_train", 8), ("_test", 7))

    unq_names = set()
    for key in lazy_dict:
        for ending, width in trims:
            if key.endswith(ending):
                unq_names.add(key[:-width])
                break
    return unq_names

# Return Corresponding Test Set.
def corr_testset(unq_name):
    """
    Look up the test-set names that belong to a given dataset name.
    ---
    Args:
        unq_name (str): A single dataset name, as produced by `unq_df_names`.
    Returns:
        X_test_name (str): Name of corresponding predictor testset.
        y_test_name (str): Name of corresponding response testset.
    """
    # The last two characters are either a numeric null-threshold ("##")
    # or the tail of "..._imp" ("mp").
    tail = unq_name[-2:]

    # A numeric tail means the name ends in a threshold, so the test sets
    # are those of the "df_heart_drop_<threshold>_imp" dataset. Otherwise
    # the name is used as its own test-set prefix.
    prefix = f"df_heart_drop_{tail}_imp" if tail.isnumeric() else unq_name

    return f"{prefix}_X_test", f"{prefix}_y_test"

## MLP Baseline Model Loop

The `mlp_baseline` function loops through all of the unique dataset names from `unq_df_names` and trains an MLP model on each unique dataset.

In [6]:
def _collect_frame(lazy_dict, key):
    """Collect the LazyFrame stored under `key` into pandas, dropping the '__index_level_0__' column if present."""
    frame = lazy_dict[key].collect().to_pandas()
    return frame.drop(columns=['__index_level_0__'], errors='ignore')


def mlp_baseline(lazy_dict, unq_names, param_grid, save_pth, threads=None,
                 results_pth="../../Data/GoogleDrive/baseline_results.csv"):
    """
    Baseline MLP model using 5-fold Cross Validation.

    For every dataset name, the train/test frames are collected, the
    predictors are standardized (scaler fitted on the train set only), a
    GridSearchCV scored on recall is fitted, the best estimator is evaluated
    on the corresponding test set and pickled to disk.
    ---
    Args:
        lazy_dict: dict with names and LazyFrames of train and test sets.
        unq_names: iterable of unique names of parent datasets.
        param_grid: dict of parameters for MLPClassifier.
        save_pth: path prefix used to save the best model of each dataset
            (the file is written to f"{save_pth}{name}_MLPbaseline").
        threads: number of parallel jobs for the grid search (optional).
            Defaults to the CPU count minus 2, but never less than 1.
        results_pth: CSV file the results table is written to (optional).
            Pass None to skip writing.
    Return:
        results_df (pd.DataFrame): one row per dataset with the columns
            "Dataset Name", "Best Recall", "Best ROCAUC", "Best Accuracy"
            (all measured on the test set) and "Fit Time" (seconds).
    """
    if threads is None:
        # os.cpu_count() can return None, and "count - 2" reaches 0 on a
        # 2-core machine, which joblib rejects as n_jobs. Clamp to >= 1.
        threads = max(1, (os.cpu_count() or 1) - 2)
        print(f"Using {threads} CPU threads!")

    results = {
        "Dataset Name": [],
        "Best Recall": [],
        "Best ROCAUC": [],
        "Best Accuracy": [],
        "Fit Time": []}

    for name in unq_names:
        (X_test_name, y_test_name) = corr_testset(name)

        X_train = _collect_frame(lazy_dict, f"{name}_X_train")
        y_train = _collect_frame(lazy_dict, f"{name}_y_train")
        X_test = _collect_frame(lazy_dict, X_test_name)
        y_test = _collect_frame(lazy_dict, y_test_name)

        # sklearn expects 1-D label arrays.
        y_train = y_train.to_numpy().ravel()
        y_test = y_test.to_numpy().ravel()

        # Fit the scaler on the train set only and reuse it for the test set.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        print(X_train.shape)
        print(y_train.shape)
        print(X_test.shape)
        print(y_test.shape)

        mlp_model = mlp(early_stopping=True, verbose=False, random_state=212)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=212)
        grid_search = GridSearchCV(mlp_model, param_grid=param_grid, cv=cv,
                                   scoring='recall', n_jobs=threads, verbose=1)

        logging.info("Processing dataset: %s", name)
        print(f"Training on {name}...", flush=True)

        start_time = time.time()
        grid_search.fit(X_train_scaled, y_train)
        fit_time = time.time() - start_time

        print("GridSearch completed", flush=True)
        logging.info("GridSearch completed.")

        best_model = grid_search.best_estimator_

        # Evaluate the refitted best estimator on the held-out test set.
        y_pred_test = best_model.predict(X_test_scaled)
        test_recall = recall_score(y_test, y_pred_test)
        test_roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])
        test_accuracy = accuracy_score(y_test, y_pred_test)

        # NOTE(review): only the classifier is pickled; the fitted
        # StandardScaler is not saved, so a reloaded model must be given
        # inputs that were scaled the same way.
        with open(f"{save_pth}{name}_MLPbaseline", 'wb') as file:
            pickle.dump(best_model, file)
        logging.info("Model saved to %s", save_pth)

        results["Dataset Name"].append(name)
        results["Best Recall"].append(test_recall)
        results["Best ROCAUC"].append(test_roc_auc)
        results["Best Accuracy"].append(test_accuracy)
        results["Fit Time"].append(fit_time)

    # The path must not be passed to the DataFrame constructor: its second
    # positional argument is `index`, not a destination file.
    results_df = pd.DataFrame(results)
    if results_pth is not None:
        try:
            results_df.to_csv(results_pth, index=False)
        except OSError:
            # Do not lose the in-memory results because the write failed.
            logging.exception("Could not write results to %s", results_pth)
        else:
            logging.info("Results saved to %s", results_pth)
    return results_df

In [7]:
%%time

## Log Initialization
# basicConfig opens the log file right away, so its directory must exist.
os.makedirs('./log', exist_ok=True)
logging.basicConfig(filename='./log/MLP_baseline.log', filemode='w', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
## Paths
data_pth = "../../Data/GoogleDrive/Encoded_Data/"
save_pth = "../../Data/GoogleDrive/Baseline/"
# Create the model output directory up front so a missing folder cannot
# abort the run after the first (expensive) model has been trained.
os.makedirs(save_pth, exist_ok=True)

## Read in Parquet to LazyFrame Dictionary.
pq_jar = parquet_to_dict(data_pth)

## Record the unique dataset names.
unq_names = unq_df_names(pq_jar)

## Single-candidate grid that spells out the sklearn MLPClassifier defaults
## (except early_stopping and random_state), so the "search" fits one
## configuration per dataset.
param_grid = {
    'hidden_layer_sizes': [(100,)],  # Single layer with 100 neurons
    'activation': ['relu'],  # Using 'relu' activation function
    'solver': ['adam'],  # Solver set to 'adam'
    'alpha': [0.0001],  # L2 penalty (regularization term)
    'batch_size': ['auto'],  # 'auto' sets batch size to min(200, n_samples)
    'learning_rate': ['constant'],  # Learning rate schedule
    'learning_rate_init': [0.001],  # Initial learning rate
    'power_t': [0.5],  # The exponent for inverse scaling learning rate
    'max_iter': [200],  # Maximum number of iterations
    'shuffle': [True],  # Whether to shuffle samples in each iteration
    'random_state': [212],  # Fixed seed for reproducibility
    'tol': [0.0001],  # Tolerance for the optimization
    'verbose': [False],  # Whether to print progress messages to stdout
    'warm_start': [False],  # Reuse solution of the previous call to fit as initialization
    'momentum': [0.9],  # Momentum for gradient descent update
    'nesterovs_momentum': [True],  # Whether to use Nesterov's momentum
    'early_stopping': [True],  # Whether to use early stopping to terminate training
    'validation_fraction': [0.1],  # Proportion of training data to set aside as validation set
    'beta_1': [0.9],  # Exponential decay rate for estimates of first moment vector in adam
    'beta_2': [0.999],  # Exponential decay rate for estimates of second moment vector in adam
    'epsilon': [1e-08],  # Value for numerical stability in adam
    'n_iter_no_change': [10],  # Maximum number of epochs to not meet improvement threshold
    'max_fun': [15000]  # Maximum number of loss function calls
}

results = mlp_baseline(pq_jar, unq_names, param_grid, save_pth)

Using 14 CPU threads!
(781716, 121)
(781716,)
(98675, 121)
(98675,)
Training on Over_Sample_1:2_threshold_05...
Fitting 5 folds for each of 1 candidates, totalling 5 fits




GridSearch completed




(453648, 121)
(453648,)
(85741, 121)
(85741,)
Training on Under_Sample_1:1_threshold_01...
Fitting 5 folds for each of 1 candidates, totalling 5 fits




GridSearch completed




(2324712, 121)
(2324712,)
(109919, 121)
(109919,)
Training on Over_Sample_1:7_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
GridSearch completed




KeyboardInterrupt: 