# Create Datasets that Drop the Yes or No Columns

## Notebook Setup

Significant functions from [assignment_3_tools.py](./assignment_3_tools.py)

In [9]:
import os
import time # Runtime
import pickle # Model Saving
import logging # Log Checkpoints
import numpy as np # Flatten y vectors
import pandas as pd # DataFrame
import polars as pl # LazyFrame
from sklearn.preprocessing import StandardScaler # X Standardization
from sklearn.neural_network import MLPClassifier as mlp # model
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score # Scoring
from sklearn.model_selection import GridSearchCV, StratifiedKFold, ParameterGrid
from great_tables import GT, md, html, from_column, style, loc, vals
from assignment_3_tools import parquet_to_dict, unq_df_names, corr_testset
import glob

## Unique Datasets and Corresponding Testsets

In the [preprocess notebook](./taylor_preprocess.ipynb), each null-threshold dataset was split into X_train, y_train, X_test, and y_test. The X_train and y_train sets of each null-threshold dataset were then balanced using random over/under sampling. Therefore, when `parquet_to_dict()` is called, the dictionary contains separate X_train, y_train, X_test, and y_test entries that all correspond to a single dataset. To resolve this, `unq_df_names()` and `corr_testset` record the dataset names and their corresponding test sets.

In [None]:
def lazy_read_parquet(path):
    """
    Lazily scan every parquet file found directly inside a folder.
    ---
    Args:
        path: Relative path to the folder (only its direct entries are inspected)
    Return:
        lazy_frames_dict: Dictionary keyed by file name without its extension;
            each value is the result of `pl.scan_parquet` for that file
    """
    # Keep only the directory entries whose name ends in ".parquet"
    parquet_names = [name for name in os.listdir(path) if name.endswith(".parquet")]
    # key = file name minus extension, value = lazy scan of the full file path
    return {
        os.path.splitext(name)[0]: pl.scan_parquet(os.path.join(path, name))
        for name in parquet_names
    }

In [None]:
# Folder holding the parquet datasets; every file in it is scanned lazily
root_path = "../../Data/GoogleDrive/MLP_Dataset/"
df_dict_all = lazy_read_parquet(root_path)

# Pick the two feature matrices used in this notebook and materialise them
# as pandas DataFrames (the lazy frames in df_dict_all stay untouched).
lazy_train = df_dict_all['Under_Sample_1:1_threshold_20_X_train']
lazy_test = df_dict_all['Under_Sample_1:1_threshold_20_X_test']
X_train = lazy_train.collect().to_pandas()
X_test = lazy_test.collect().to_pandas()

In [None]:
# Report (rows, columns) for the train split first, then the test split
for split_frame in (X_train, X_test):
    print(split_frame.shape)

(581178, 121)
(109919, 122)


In [None]:
# Display the column index of the test feature matrix as the cell output
X_test.columns

Index(['onehot__State_Alabama', 'onehot__State_Alaska',
       'onehot__State_Arizona', 'onehot__State_Arkansas',
       'onehot__State_California', 'onehot__State_Colorado',
       'onehot__State_Connecticut', 'onehot__State_Delaware',
       'onehot__State_District of Columbia', 'onehot__State_Florida',
       ...
       'LastCheckupTime_label__LastCheckupTime',
       'RemovedTeeth_label__RemovedTeeth', 'SmokerStatus_label__SmokerStatus',
       'ECigaretteUsage_label__ECigaretteUsage',
       'remainder__PhysicalHealthDays', 'remainder__MentalHealthDays',
       'remainder__SleepHours', 'remainder__HeightInMeters',
       'remainder__WeightInKilograms', '__index_level_0__'],
      dtype='object', length=122)

### Keep all the No columns

In [None]:
# Bucket the column names by categorical variable.
# One-hot columns are named 'onehot__<Category>_<Value>'; the bucket key is the
# text between the first '__' and the next '_'.
groups = {}
for column in X_train.columns:
    category = column.split('__')[1].split('_')[0]
    groups.setdefault(category, []).append(column)

# For every variable that spans more than one column, drop its LAST column
# (the "Yes" level for the Yes/No variables) to reduce sparsity in the feature
# space. Both frames are modified in place.
last_columns = [columns[-1] for columns in groups.values() if len(columns) != 1]
for dropped in last_columns:
    print(dropped)
X_train.drop(columns=last_columns, inplace=True)
X_test.drop(columns=last_columns, inplace=True)

onehot__State_Wyoming
onehot__Sex_Male
onehot__PhysicalActivities_Yes
onehot__HadAsthma_Yes
onehot__HadSkinCancer_Yes
onehot__HadCOPD_Yes
onehot__HadDepressiveDisorder_Yes
onehot__HadKidneyDisease_Yes
onehot__HadArthritis_Yes
onehot__HadDiabetes_Yes, but only during pregnancy (female)
onehot__DeafOrHardOfHearing_Yes
onehot__BlindOrVisionDifficulty_Yes
onehot__DifficultyConcentrating_Yes
onehot__DifficultyWalking_Yes
onehot__DifficultyDressingBathing_Yes
onehot__DifficultyErrands_Yes
onehot__ChestScan_Yes
onehot__RaceEthnicityCategory_White only, Non-Hispanic
onehot__AlcoholDrinkers_Yes
onehot__HIVTesting_Yes
onehot__FluVaxLast12_Yes
onehot__PneumoVaxEver_Yes
onehot__TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap
onehot__HighRiskLastYear_Yes
onehot__CovidPos_Yes


In [None]:
# Shapes after the drop. The cell above modifies X_train / X_test in place and
# never creates X_train_No / X_test_No, so printing those names raises NameError
# on a fresh kernel. Report the frames that exist and that are written to disk
# in the next cell.
print(X_train.shape)
print(X_test.shape)

(581178, 120)
(109919, 121)


In [None]:
# Write the reduced train/test feature matrices next to the source files,
# tagged with the "_No" suffix.
for split_name, split_frame in (("train", X_train), ("test", X_test)):
    split_frame.to_parquet(f"{root_path}Under_Sample_1:1_threshold_20_X_{split_name}_No.parquet")

### Keep all the Yes columns

In [None]:
# Start again from the untouched frames. The "Keep all the No columns" section
# above dropped columns from X_train / X_test in place; grouping those frames
# here would find every Yes/No variable already reduced to a single column
# (skipped by the len check), so the "No" level would survive into this dataset,
# and multi-level variables would lose both their first and last level.
X_train = df_dict_all['Under_Sample_1:1_threshold_20_X_train'].collect().to_pandas()
X_test = df_dict_all['Under_Sample_1:1_threshold_20_X_test'].collect().to_pandas()

# Bucket the column names by categorical variable.
# One-hot columns are named 'onehot__<Category>_<Value>'; the bucket key is the
# text between the first '__' and the next '_'.
groups = {}
for column in X_train.columns:
    category = column.split('__')[1].split('_')[0]
    groups.setdefault(category, []).append(column)

# For every variable that spans more than one column, drop its FIRST column
# (the "No" level for the Yes/No variables). The frames are rebound rather than
# mutated, so re-running this cell always yields the same result.
first_columns = [columns[0] for columns in groups.values() if len(columns) != 1]
X_train = X_train.drop(columns=first_columns)
X_test = X_test.drop(columns=first_columns)

In [None]:
# Display the remaining training columns to check which levels were dropped
X_train.columns

Index(['onehot__State_Alaska', 'onehot__State_Arizona',
       'onehot__State_Arkansas', 'onehot__State_California',
       'onehot__State_Colorado', 'onehot__State_Connecticut',
       'onehot__State_Delaware', 'onehot__State_District of Columbia',
       'onehot__State_Florida', 'onehot__State_Georgia', 'onehot__State_Guam',
       'onehot__State_Hawaii', 'onehot__State_Idaho', 'onehot__State_Illinois',
       'onehot__State_Indiana', 'onehot__State_Iowa', 'onehot__State_Kansas',
       'onehot__State_Kentucky', 'onehot__State_Louisiana',
       'onehot__State_Maine', 'onehot__State_Maryland',
       'onehot__State_Massachusetts', 'onehot__State_Michigan',
       'onehot__State_Minnesota', 'onehot__State_Mississippi',
       'onehot__State_Missouri', 'onehot__State_Montana',
       'onehot__State_Nebraska', 'onehot__State_Nevada',
       'onehot__State_New Hampshire', 'onehot__State_New Jersey',
       'onehot__State_New Mexico', 'onehot__State_New York',
       'onehot__State_North 

In [None]:
# Write the reduced train/test feature matrices next to the source files,
# tagged with the "_Yes" suffix.
for split_name, split_frame in (("train", X_train), ("test", X_test)):
    split_frame.to_parquet(f"{root_path}Under_Sample_1:1_threshold_20_X_{split_name}_Yes.parquet")

### Read in model results parquet files:

XGB test set performance:

In [18]:
# NOTE: root_path is re-pointed here from the dataset folder to the results
# folder; every cell below reads from / writes to this location.
root_path = "../../Data/GoogleDrive/MLP_Results/"
xgb_results_file = f"{root_path}xgb_results.parquet"
xgb_results = pd.read_parquet(xgb_results_file)
xgb_results

Unnamed: 0,Recall,ROC_AUC,Accuracy
0,0.705783,0.707385,0.662288


MLP models test set performance:

In [21]:
# Collect every per-experiment result file in the results folder.
# The combined table this notebook writes in its last cell
# ('test_results-combined.parquet') lives in the same folder and matches the
# same pattern, so it is skipped here; otherwise a second run would read its own
# output back in and duplicate every row.
combined_name = 'test_results-combined.parquet'
parquet_files = [
    file
    for file in glob.glob(root_path + 'test_results*.parquet')
    if os.path.basename(file) != combined_name
]

# One DataFrame per result file, with the XGB result appended as the last entry
model_results = [pd.read_parquet(file) for file in parquet_files]
model_results.append(xgb_results)

# Stack everything into a single table with a fresh 0..n-1 index
combined_df = pd.concat(model_results, ignore_index=True)

# Display the combined DataFrame
combined_df

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,learning_rate_init,{'learning_rate_init': 0.01},0.767905,0.816309,0.73001,72.824502
1,Under_Sample_1:1_threshold_20,max_iter,{'max_iter': 100},0.752738,0.814254,0.732085,105.687169
2,Under_Sample_1:1_threshold_20,batch_size,{'batch_size': 100},0.736729,0.815755,0.74262,150.978772
3,Under_Sample_1:1_threshold_20,iter_no_change,{'n_iter_no_change': 100},0.712601,0.808466,0.745294,399.201779
4,Under_Sample_1:1_threshold_20,_best_params,"{'hidden_layer_sizes': [47, 46, 46, 46], 'lear...",0.811796,0.814387,0.691627,162.356431
5,Under_Sample_1:1_threshold_20,_bayes_params,"{'activation': 'relu', 'alpha': 0.000523565603...",0.770816,0.813239,0.722086,49.996617
6,Under_Sample_1:1_threshold_20,momentum,{'momentum': 0.0},0.752738,0.814254,0.732085,108.784148
7,Under_Sample_1:1_threshold_20,neurons,{'hidden_layer_sizes': 500},0.448487,0.729403,0.79575,379.136686
8,Under_Sample_1:1_threshold_20,alpha,{'alpha': 0.0},0.719418,0.811282,0.745786,101.764314
9,Under_Sample_1:1_threshold_20,layers,"{'hidden_layer_sizes': [100, 100, 100, 100, 100]}",0.518116,0.760817,0.809887,832.888808


In [27]:
# The XGB result is appended last when combined_df is built and carries no
# Dataset_Name / Grid_Variable of its own, so address it as "the last row"
# rather than by a hard-coded position that changes with the number of files.
xgb_row = combined_df.index[-1]
combined_df.loc[xgb_row, 'Dataset_Name'] = 'Under_Sample_1:1_threshold_20'
combined_df.loc[xgb_row, 'Grid_Variable'] = 'xgb'

# NOTE(review): row 14 is still positional because nothing visible in this
# notebook identifies that row by value; its position depends on the order in
# which glob returned the files. Confirm it is the intended row.
combined_df.iloc[14, combined_df.columns.get_loc('Grid_Variable')] = 'frank_results'

# Rename the Bayesian-search row by its label instead of its position, which
# depends on the file order.
combined_df.loc[combined_df['Grid_Variable'] == '_bayes_params', 'Grid_Variable'] = 'bayes'
combined_df

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,learning_rate_init,{'learning_rate_init': 0.01},0.767905,0.816309,0.73001,72.824502
1,Under_Sample_1:1_threshold_20,max_iter,{'max_iter': 100},0.752738,0.814254,0.732085,105.687169
2,Under_Sample_1:1_threshold_20,batch_size,{'batch_size': 100},0.736729,0.815755,0.74262,150.978772
3,Under_Sample_1:1_threshold_20,iter_no_change,{'n_iter_no_change': 100},0.712601,0.808466,0.745294,399.201779
4,Under_Sample_1:1_threshold_20,_best_params,"{'hidden_layer_sizes': [47, 46, 46, 46], 'lear...",0.811796,0.814387,0.691627,162.356431
5,Under_Sample_1:1_threshold_20,bayes,"{'activation': 'relu', 'alpha': 0.000523565603...",0.770816,0.813239,0.722086,49.996617
6,Under_Sample_1:1_threshold_20,momentum,{'momentum': 0.0},0.752738,0.814254,0.732085,108.784148
7,Under_Sample_1:1_threshold_20,neurons,{'hidden_layer_sizes': 500},0.448487,0.729403,0.79575,379.136686
8,Under_Sample_1:1_threshold_20,alpha,{'alpha': 0.0},0.719418,0.811282,0.745786,101.764314
9,Under_Sample_1:1_threshold_20,layers,"{'hidden_layer_sizes': [100, 100, 100, 100, 100]}",0.518116,0.760817,0.809887,832.888808


In [36]:
# Rows without a parameter set get an empty dict.
# Series.fillna({}) cannot do this: a dict argument is read as an
# index -> value mapping, so an empty dict fills nothing. Build the column
# explicitly instead.
params_missing = combined_df['Parameters'].isna()
combined_df['Parameters'] = [
    {} if is_missing else params
    for params, is_missing in zip(combined_df['Parameters'], params_missing)
]

# Rows without a recorded fit time get 0. Assign the result back rather than
# calling fillna(inplace=True) on the selected column, which is chained
# assignment and does not update combined_df under pandas copy-on-write.
combined_df['Fit_Time'] = combined_df['Fit_Time'].fillna(0)
combined_df

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,learning_rate_init,{'learning_rate_init': 0.01},0.767905,0.816309,0.73001,72.824502
1,Under_Sample_1:1_threshold_20,max_iter,{'max_iter': 100},0.752738,0.814254,0.732085,105.687169
2,Under_Sample_1:1_threshold_20,batch_size,{'batch_size': 100},0.736729,0.815755,0.74262,150.978772
3,Under_Sample_1:1_threshold_20,iter_no_change,{'n_iter_no_change': 100},0.712601,0.808466,0.745294,399.201779
4,Under_Sample_1:1_threshold_20,_best_params,"{'hidden_layer_sizes': [47, 46, 46, 46], 'lear...",0.811796,0.814387,0.691627,162.356431
5,Under_Sample_1:1_threshold_20,bayes,"{'activation': 'relu', 'alpha': 0.000523565603...",0.770816,0.813239,0.722086,49.996617
6,Under_Sample_1:1_threshold_20,momentum,{'momentum': 0.0},0.752738,0.814254,0.732085,108.784148
7,Under_Sample_1:1_threshold_20,neurons,{'hidden_layer_sizes': 500},0.448487,0.729403,0.79575,379.136686
8,Under_Sample_1:1_threshold_20,alpha,{'alpha': 0.0},0.719418,0.811282,0.745786,101.764314
9,Under_Sample_1:1_threshold_20,layers,"{'hidden_layer_sizes': [100, 100, 100, 100, 100]}",0.518116,0.760817,0.809887,832.888808


In [46]:
# Remove the Parameters column before saving. Rebinding with errors='ignore'
# lets the cell be re-run without a KeyError once the column is already gone.
combined_df = combined_df.drop(columns=['Parameters'], errors='ignore')

In [47]:
# Persist the combined results table (without the DataFrame index) in the
# results folder.
combined_results_file = f"{root_path}test_results-combined.parquet"
combined_df.to_parquet(combined_results_file, index=False)