# Lego Model

In [2]:
import os # List 
import time # Runtime
import pickle # Model Saving
import logging # Log Checkpoints
import numpy as np # Flatten y vectors
import pandas as pd # DataFrame
import polars as pl # LazyFrame
from sklearn.preprocessing import StandardScaler # X Standardization
from sklearn.neural_network import MLPClassifier as mlp # model
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score, auc, roc_curve  # Scoring
from sklearn.model_selection import GridSearchCV, StratifiedKFold, ParameterGrid
from great_tables import GT, md, html, from_column, style, loc, vals
from assignment_3_tools import parquet_to_dict, unq_df_names, corr_testset
import xgboost as xgb
# add scikit optimize for bayesian optimization
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [3]:
def mlp_gridsearch(lazy_dict, unq_names, param_grid, save_pth, test_name, threads=None):
    """
    MLP GridSearch using 5-fold stratified Cross Validation, scored by recall.

    For every dataset name the train/test LazyFrames are collected, the
    predictors are standardized (scaler fitted on the training set only), the
    grid search is run, and the refit best estimator is evaluated on the test
    set. Three files are written per dataset, using ``save_pth`` as a prefix:
        best_model{test_name}-{name}.pkl        pickled best estimator
        test_results{test_name}-{name}.parquet  test-set metrics (one row)
        grid_results{test_name}-{name}.parquet  CV recall and fit time per candidate
    A log file is written to ./log/MLP_{test_name}.log.

    NOTE: only the estimator is pickled; the fitted StandardScaler is not
    saved, so the pickled model expects already-standardized inputs.
    NOTE: the scaler is fitted on the full training set before the folds are
    split, so CV validation folds are scaled with statistics that include them.
    ---
    Args:
        lazy_dict: Dictionary mapping "{name}_X_train", "{name}_y_train",
            "{name}_X_test" and "{name}_y_test" to LazyFrames.
        unq_names: Iterable of dataset base names to process.
        param_grid: Parameter grid passed unchanged to GridSearchCV.
        save_pth: String prepended to every output file name. It is
            concatenated as-is, so a directory must end with a path separator.
        test_name: Label of the test performed; used in the log file name,
            the output file names and the "Grid_Variable" column.
        threads: Integer number of parallel jobs for GridSearchCV (optional).
            Defaults to the CPU count minus 4, but never less than 1.
    Return:
        None
    """
    ## Initializing
    # Define number of threads to be used in GridSearch
    if threads is None:
        # os.cpu_count() may return None, and on machines with <= 4 cores the
        # plain subtraction gives 0 (rejected by joblib) or a negative value
        # (which joblib interprets differently), so clamp to at least 1.
        threads = max(1, (os.cpu_count() or 1) - 4)
        print(f"Using {threads} CPU threads!")

    # Log for debugging.
    # The log directory must exist before the file handler is created.
    log_dir = "./log"
    os.makedirs(log_dir, exist_ok=True)
    # force=True (Python 3.8+) replaces handlers left by an earlier call;
    # without it basicConfig is a no-op on every call after the first, and all
    # later tests would keep logging into the first test's file.
    logging.basicConfig(
        filename=f"{log_dir}/MLP_{test_name}.log",
        filemode='w',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        force=True)

    # Make sure the directory part of the output prefix exists so the pickle
    # and Parquet writes below cannot fail after a long training run.
    out_dir = os.path.dirname(save_pth)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ## GridSearch and Results
    for name in unq_names:
        ## Reading and Preparing Data
        # Collect each LazyFrame and drop the index column if it is present.
        X_train, y_train, X_test, y_test = (
            lazy_dict[f"{name}_{part}"]
            .collect()
            .to_pandas()
            .drop(columns=['__index_level_0__'], errors='ignore')
            for part in ("X_train", "y_train", "X_test", "y_test"))

        # Flatten response sets
        y_train = y_train.to_numpy().ravel()
        y_test = y_test.to_numpy().ravel()

        # Standardize predictor sets (statistics come from the training set only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        ## Defining Modeling and GridSearch
        # Define cross-validation folds
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=212)

        # Define GridSearch
        grid_search = GridSearchCV(
            mlp(), # mlp model
            param_grid=param_grid, # parameter dictionary
            cv=cv, # cv
            scoring='recall', # best is by recall
            n_jobs=threads,
            verbose=3,
            return_train_score=True) # For making CV table

        ## Performing GridSearch
        # Debugging Checkpoint
        logging.info(f"Processing dataset: {name}")
        print(f"Training on {name}...", flush=True)

        # GridSearch Training and Results
        grid_search.fit(X_train_scaled, y_train)

        # Debugging Checkpoint
        print(f"GridSearch completed", flush=True)
        logging.info(f"GridSearch for {test_name} completed.")

        ## Results from GridSearch
        # One row per parameter combination (CV table). The scalar columns
        # are broadcast to the number of candidates.
        cv_results = grid_search.cv_results_
        param_results_df = pd.DataFrame({
            "Dataset_Name": name,
            "Grid_Variable": test_name,
            "Parameters": list(cv_results['params']),
            "Recall": cv_results['mean_test_score'],
            "Fit_Time": cv_results['mean_fit_time']})
        # Stable sort keeps tied candidates in grid order, so the saved table
        # is deterministic.
        param_results_df = param_results_df.sort_values(
            by="Recall", ascending=False, kind="stable")

        # Best model by Recall on Cross Validation data.
        # Read the fit time through best_index_ so it always belongs to the
        # candidate behind best_estimator_, even when recall scores are tied.
        best_fit_time = cv_results['mean_fit_time'][grid_search.best_index_]
        best_model = grid_search.best_estimator_

        # Metrics on test set
        y_pred_test = best_model.predict(X_test_scaled)
        # Probability of the second class in classes_, used for ROC AUC
        test_proba = best_model.predict_proba(X_test_scaled)[:, 1]
        test_recall = recall_score(y_test, y_pred_test)
        test_roc_auc = roc_auc_score(y_test, test_proba)
        test_accuracy = accuracy_score(y_test, y_pred_test)

        # Save best model as pickle
        # example: best_model_neurons-hidden_layer_sizes.pkl
        with open(f"{save_pth}best_model{test_name}-{name}.pkl", 'wb') as file:
            pickle.dump(best_model, file)

        # Debugging Checkpoint
        logging.info(f"Model saved to {save_pth}")

        # Results from predicting test data using the best model (test table).
        best_results_df = pd.DataFrame({
            "Dataset_Name": [name],
            "Grid_Variable": [test_name],
            "Parameters": [grid_search.best_params_],
            "Recall": [test_recall],
            "ROC_AUC": [test_roc_auc],
            "Accuracy": [test_accuracy],
            "Fit_Time": [best_fit_time]})

        # Save results as Parquet
        # example: test_results_layers-hidden_layer_sizes.parquet
        best_results_df.to_parquet(f"{save_pth}test_results{test_name}-{name}.parquet", index=False)
        # example: grid_results_neurons-hidden_layer_sizes.parquet
        param_results_df.to_parquet(f"{save_pth}grid_results{test_name}-{name}.parquet", index=False)

        # Debugging Checkpoint
        print(f"{test_name} GridSearch completed!", flush=True)
        logging.info(f"{test_name} GridSearch completed!")

In [4]:
# Input and output locations, relative to the notebook's working directory.
# Train and test sets are read from MLP_Dataset; results and the best model
# are written to MLP_Results.
# save_pth must keep its trailing "/": mlp_gridsearch concatenates it directly
# with the output file names.
data_pth = "../../../Data/GoogleDrive/MLP_Dataset/"
save_pth = "../../../Data/GoogleDrive/MLP_Results/"

# Read in Parquet files in path and add to a LazyFrame dictionary.
# mlp_gridsearch looks entries up as "{name}_X_train", "{name}_y_train", etc.
pq_jar = parquet_to_dict(data_pth) # all lazy

# Record the unique dataset names (drop X_train, etc.).
unq_names = unq_df_names(pq_jar)

In [None]:
# A dictionary of parameter dictionaries.
# Schema: {test_name: {parameter: [candidate values]}}
# Every list holds a single value, so the grid contains exactly one candidate.

two_param = {
    '_lego_2':{
        'solver': ['adam'],              # Solver for weight optimization
        'hidden_layer_sizes': [(47, 46, 46, 46)]  # Number of neurons and layers, represented as a tuple
    }
}

# NOTE(review): this label ('_best_lego_2') differs from the dict key above
# ('_lego_2'). The run cell iterates over two_param.items(), so the key is the
# name that ends up in the log and output file names - confirm which is intended.
test_2 = '_best_lego_2'
# `test` is rebound by the loop variable in the run cell below.
test = test_2

In [None]:
%%time
# Run the grid search once per {test_name: param_grid} entry in two_param.
# The dict key is passed as test_name, which mlp_gridsearch uses in its log
# and output file names; the loop also rebinds the notebook-level `test`.
for test, param_dict in two_param.items():
    mlp_gridsearch(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits




[CV 5/5] END hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.932, test=0.912) total time= 5.8min




[CV 1/5] END hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.943, test=0.922) total time= 5.8min




[CV 2/5] END hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.932, test=0.917) total time= 5.8min




[CV 4/5] END hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.938, test=0.919) total time= 5.8min




[CV 3/5] END hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.940, test=0.922) total time= 5.8min
GridSearch completed




_lego_2 GridSearch completed!
CPU times: user 4h 48min 41s, sys: 9min 34s, total: 4h 58min 15s
Wall time: 32min 1s


In [5]:
# Step 3 grid: the step-2 settings plus `activation`.
# Schema: {test_name: {parameter: [candidate values]}}; every list holds a
# single value, so the grid contains exactly one candidate.
three_param = {
    '_best_lego_3_2': {
        'solver': ['adam'],               # Solver for weight optimization
        'hidden_layer_sizes': [(47, 46, 46, 46)],  # Number of neurons and layers, represented as a tuple
        'activation': ['logistic'],           # Activation function
    }
}
# Label for this step (same string as the dict key above).
test_3 = '_best_lego_3_2'
# `test` is rebound by the loop variable in the run cell below.
test = test_3

In [6]:
%%time
# Run the grid search once per {test_name: param_grid} entry in three_param.
# The dict key is passed as test_name and names the log and output files.
for test, param_dict in three_param.items():
    mlp_gridsearch(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits




[CV 3/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.955, test=0.947) total time= 6.4min




[CV 4/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.945, test=0.935) total time= 6.5min
[CV 2/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.950, test=0.940) total time= 6.5min
[CV 1/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.940, test=0.929) total time= 6.5min




[CV 5/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), solver=adam;, score=(train=0.940, test=0.929) total time= 6.5min
GridSearch completed




_best_lego_3_2 GridSearch completed!
CPU times: user 4h 3min 51s, sys: 7min 15s, total: 4h 11min 6s
Wall time: 28min 22s


In [7]:
# Step 4 grid: the step-3 settings plus `learning_rate_init`.
# Schema: {test_name: {parameter: [candidate values]}}; every list holds a
# single value, so the grid contains exactly one candidate.
four_param = {
    '_best_lego_4_2': {
        'solver': ['adam'],               # Solver for weight optimization
        'hidden_layer_sizes': [(47, 46, 46, 46)],  # Number of neurons and layers, represented as a tuple
        'activation': ['logistic'],           # Activation function
        'learning_rate_init': [0.01],     # Initial learning rate
    }
}
# Label for this step (same string as the dict key above).
test_4 = '_best_lego_4_2'
# `test` is rebound by the loop variable in the run cell below.
test = test_4

In [8]:
%%time
# Run the grid search once per {test_name: param_grid} entry in four_param.
# The dict key is passed as test_name and names the log and output files.
for test, param_dict in four_param.items():
    mlp_gridsearch(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 4/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, solver=adam;, score=(train=0.939, test=0.926) total time= 2.3min
[CV 1/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, solver=adam;, score=(train=0.943, test=0.930) total time= 2.5min
[CV 2/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, solver=adam;, score=(train=0.933, test=0.920) total time= 2.5min
[CV 5/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, solver=adam;, score=(train=0.890, test=0.876) total time= 2.6min
[CV 3/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, solver=adam;, score=(train=0.947, test=0.936) total time= 2.7min
GridSearch completed
_best_lego_4_2 GridSearch completed!
CPU times: u

In [9]:
# Step 5 grid: the step-4 settings plus `max_iter`.
# Schema: {test_name: {parameter: [candidate values]}}; every list holds a
# single value, so the grid contains exactly one candidate.
five_param = {
    '_best_lego_5_2': {
        'solver': ['adam'],               # Solver for weight optimization
        'hidden_layer_sizes': [(47, 46, 46, 46)],  # Number of neurons and layers, represented as a tuple
        'activation': ['logistic'],           # Activation function
        'learning_rate_init': [0.01],     # Initial learning rate
        'max_iter': [100],                # Maximum number of iterations
    }
}
# Label for this step (same string as the dict key above).
test_5 = '_best_lego_5_2'
# `test` is rebound by the loop variable in the run cell below.
test = test_5

In [10]:
%%time
# Run the grid search once per {test_name: param_grid} entry in five_param.
# The dict key is passed as test_name and names the log and output files.
for test, param_dict in five_param.items():
    mlp_gridsearch(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 5/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, solver=adam;, score=(train=0.922, test=0.908) total time= 2.0min
[CV 3/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, solver=adam;, score=(train=0.878, test=0.866) total time= 2.2min
[CV 2/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, solver=adam;, score=(train=0.907, test=0.892) total time= 2.4min
[CV 4/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, solver=adam;, score=(train=0.937, test=0.923) total time= 2.4min
[CV 1/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, solver=adam;, score=(train=0.901, test=0.884) total time= 2.4min


In [11]:
# Step 6 grid: the step-5 settings plus `n_iter_no_change`.
# Schema: {test_name: {parameter: [candidate values]}}; every list holds a
# single value, so the grid contains exactly one candidate.
six_param = {
    '_best_lego_6_2': {
        'solver': ['adam'],               # Solver for weight optimization
        'hidden_layer_sizes': [(47, 46, 46, 46)],  # Number of neurons and layers, represented as a tuple
        'activation': ['logistic'],           # Activation function
        'learning_rate_init': [0.01],     # Initial learning rate
        'max_iter': [100],                # Maximum number of iterations
        # NOTE(review): same value as max_iter, so the no-improvement stop
        # presumably cannot trigger before the iteration cap - confirm intended.
        'n_iter_no_change': [100],         # Number of iterations with no improvement to stop training
    }
}
# Label for this step (same string as the dict key above).
test_6 = '_best_lego_6_2'
# `test` is rebound by the loop variable in the run cell below.
test = test_6

In [12]:
%%time
# Run the grid search once per {test_name: param_grid} entry in six_param.
# The dict key is passed as test_name and names the log and output files.
for test, param_dict in six_param.items():
    mlp_gridsearch(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits




[CV 3/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.917, test=0.902) total time= 3.1min
[CV 5/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.893, test=0.877) total time= 3.1min
[CV 1/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.949, test=0.935) total time= 3.1min




[CV 2/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.901, test=0.887) total time= 3.1min
[CV 4/5] END activation=logistic, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.913, test=0.900) total time= 3.1min
GridSearch completed




_best_lego_6_2 GridSearch completed!
CPU times: user 1h 59min 56s, sys: 4min 7s, total: 2h 4min 4s
Wall time: 13min 51s


In [19]:
# Step 7 grid: the step-6 settings plus `batch_size`.
# Schema: {test_name: {parameter: [candidate values]}}; every list holds a
# single value, so the grid contains exactly one candidate.
seven_param = {
    '_best_lego_7_3': {
        'solver': ['adam'],               # Solver for weight optimization
        'hidden_layer_sizes': [(47, 46, 46, 46)],  # Number of neurons and layers, represented as a tuple
        'activation': ['logistic'],           # Activation function
        'learning_rate_init': [0.01],     # Initial learning rate
        'max_iter': [100],                # Maximum number of iterations
        # NOTE(review): same value as max_iter, so the no-improvement stop
        # presumably cannot trigger before the iteration cap - confirm intended.
        'n_iter_no_change': [100],         # Number of iterations with no improvement to stop training
        'batch_size': [100],           # Size of minibatches
    }
}
# Label for this step (same string as the dict key above).
test_7 = '_best_lego_7_3'
# `test` is rebound by the loop variable in the run cell below.
test = test_7

In [20]:
%%time
# Run the grid search once per {test_name: param_grid} entry in seven_param.
# The dict key is passed as test_name and names the log and output files.
for test, param_dict in seven_param.items():
    mlp_gridsearch(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits




[CV 2/5] END activation=logistic, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.905, test=0.897) total time= 3.7min




[CV 5/5] END activation=logistic, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.899, test=0.887) total time= 3.7min




[CV 1/5] END activation=logistic, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.918, test=0.906) total time= 3.7min
[CV 3/5] END activation=logistic, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.882, test=0.868) total time= 3.7min




[CV 4/5] END activation=logistic, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.912, test=0.899) total time= 3.8min
GridSearch completed




_best_lego_7_3 GridSearch completed!
CPU times: user 1h 37min 2s, sys: 1min 57s, total: 1h 38min 59s
Wall time: 12min 23s


In [21]:
# Step 8 grid: the step-7 settings plus `alpha`.
# Schema: {test_name: {parameter: [candidate values]}}; every list holds a
# single value, so the grid contains exactly one candidate.
eight_param = {
    '_best_lego_8_3': {
        'solver': ['adam'],               # Solver for weight optimization
        'hidden_layer_sizes': [(47, 46, 46, 46)],  # Number of neurons and layers, represented as a tuple
        'activation': ['logistic'],           # Activation function
        'learning_rate_init': [0.01],     # Initial learning rate
        'max_iter': [100],                # Maximum number of iterations
        # NOTE(review): same value as max_iter, so the no-improvement stop
        # presumably cannot trigger before the iteration cap - confirm intended.
        'n_iter_no_change': [100],         # Number of iterations with no improvement to stop training
        'batch_size': [100],           # Size of minibatches
        'alpha': [0.0]                 # Regularization parameter (0.0 here)
    }
}
# Label for this step (same string as the dict key above).
test_8 = '_best_lego_8_3'
# `test` is rebound by the loop variable in the run cell below.
test = test_8

In [22]:
%%time
# Run the model
for test, param_dict in eight_param.items():
    mlp_gridsearch(pq_jar, unq_names, param_dict, save_pth, test)

Using 8 CPU threads!
Training on Under_Sample_1:1_threshold_20...
Fitting 5 folds for each of 1 candidates, totalling 5 fits




[CV 3/5] END activation=logistic, alpha=0.0, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.903, test=0.886) total time= 3.7min




[CV 1/5] END activation=logistic, alpha=0.0, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.915, test=0.903) total time= 3.8min




[CV 4/5] END activation=logistic, alpha=0.0, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.853, test=0.840) total time= 3.8min




[CV 2/5] END activation=logistic, alpha=0.0, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.887, test=0.877) total time= 3.8min




[CV 5/5] END activation=logistic, alpha=0.0, batch_size=100, hidden_layer_sizes=(47, 46, 46, 46), learning_rate_init=0.01, max_iter=100, n_iter_no_change=100, solver=adam;, score=(train=0.895, test=0.885) total time= 3.9min
GridSearch completed




_best_lego_8_3 GridSearch completed!
CPU times: user 1h 34min 19s, sys: 1min 55s, total: 1h 36min 14s
Wall time: 12min 12s
