# Baseline Multi-Perceptron Model

## Summary

## Table of Contents

- [Notebook Setup](#Notebook-Setup)
- [Read in Parquet](#Read-in-Parquet)
- [MLP Baseline Model](#Encode-Features)
- [Results](#Results)
- [Save as Pickle](#Save-as_Pickle)

## Notebook Setup

Significant functions from [assignment_3_tools.py](./assignment_3_tools.py)

In [12]:
import pickle #for saveing and loading trained models
from icecream import ic # Debugging
import numpy as np # for vector / matrix operations
import pandas as pd # for data manipulation
import seaborn as sns # For plots
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, make_scorer, recall_score
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier as mlp
from assignment_3_tools import parquet_to_dict

data_pth = "../../Data/GoogleDrive/Encoded_Data"
save_pth = "../../Data/GoogleDrive/Baseline"

## Read in Parquet

In [42]:
# Lazy read encoded data.
pq_jar = parquet_to_dict(data_pth)

# Unique Datasets.
def encode_dataset(lazy_dict):
    all_names = list()
    for i, key in enumerate(lazy_dict):
        if key[-6:] == "_train":
            all_names.append(key[:-8])
        elif key[-5:] == "_test":
            all_names.append(key[:-7])
        else:
            pass
    unq_names = set(all_names)
    return unq_names

unq_names = encode_dataset(pq_jar)

# Return Corresponding Test Set.
def corr_testset(unq_name):
    threshold = unq_name[-2:]
    if threshold.isnumeric():
        X_test_name = f"df_heart_drop_{threshold}_imp_X_test"
        y_test_name = f"df_heart_drop_{threshold}_imp_y_test"
    else:
        X_test_name = f"{unq_name}_X_test"
        y_test_name = f"{unq_name}_y_test"
    return X_test_name, y_test_name

## MLP Baseline Model

In [46]:
def mlp_baseline(lazy_dict):
    baseline_results = dict()
    # Initialize MLP parameters
    params = {'activation':'tanh', 'solver':'adam', 'valid_frac':0.2,
      'alpha':0.001, 'learn_rate_init':0.0001,
      'max_iter':1000, 'n_iter_no_change':200,
      'rand_state': 3}
    
    # MLP for each Dataframe
    for name in unq_names:
        # pl.LazyFrame Names
        X_train_name = f"{name}_X_train"
        y_train_name = f"{name}_y_train"
        (X_test_name, y_test_name) = corr_testset(name)
        
        # Collect pl.LazyFrame and convert to pd.DataFrame
        X_train = lazy_dict[X_train_name].collect().to_pandas()
        y_train = lazy_dict[y_train_name].collect().to_pandas()
        X_test = lazy_dict[X_test_name].collect().to_pandas()
        y_test = lazy_dict[y_test_name].collect().to_pandas()
        
        # Remove index column
        if "__index_level_0__" in X_train.columns:
            X_train = X_train.drop(columns=['__index_level_0__'])
        if "__index_level_0__" in y_train.columns:
            y_train = y_train.drop(columns=['__index_level_0__'])
        if "__index_level_0__" in X_test.columns:
            X_test = X_test.drop(columns=['__index_level_0__'])
        if "__index_level_0__" in y_test.columns:
            y_test = y_test.drop(columns=['__index_level_0__'])
            
        # Standardize X_train and X_test by the standardization scalar of X_train
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("checkpoint - Standardized")
        
        # Initialize mlp model
        mlp_model = mlp(
            hidden_layer_sizes=([8,4,8]),
            validation_fraction = params['valid_frac'],
            activation = params['activation'],
            solver = params['solver'],
            alpha = params['alpha'],
            learning_rate = "adaptive",
            learning_rate_init = params['learn_rate_init'],
            batch_size = "auto",
            max_iter = params['max_iter'],
            early_stopping = True,
            n_iter_no_change = params['n_iter_no_change'],
            verbose= True,
            random_state = params['rand_state'])
        
        # Train the model
        mlp_model.fit(X_train_scaled, y_train)
        
        # Save the Trained model
        with open(f"{save_pth}{name}.pkl", 'wb') as file:
            pickle.dump(mlp_model, file)
            
        # Cross-Validation and Evaluation
        scoring = {'recall': make_scorer(recall_score)}
        scores = cross_validate(mlp_model, X_train_scaled, y_train, cv=5, scoring=scoring, return_train_score=True)

        # Store results in the dictionary
        baseline_results['dataset_name'].append(name)
        baseline_results['train_recall'].append(scores['train_recall'].mean())
        baseline_results['test_recall'].append(scores['test_recall'].mean())
        baseline_results['fit_time'].append(scores['fit_time'].mean())
    df_baseline_results = pd.DataFrame(baseline_results)
    df_baseline_results.to_parquet(f"{save_pth}baseline_results.pkl")
    return df_baseline_results

mlp_baseline(pq_jar)

checkpoint - Standardized


  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.37068515
Validation score: 0.843859
Iteration 2, loss = 0.34300745
Validation score: 0.845979
Iteration 3, loss = 0.33964466
Validation score: 0.847059
Iteration 4, loss = 0.33804217
Validation score: 0.847673
Iteration 5, loss = 0.33699357
Validation score: 0.848144
Iteration 6, loss = 0.33620212
Validation score: 0.848388
Iteration 7, loss = 0.33558609
Validation score: 0.848378
Iteration 8, loss = 0.33509143
Validation score: 0.848763
Iteration 9, loss = 0.33466121
Validation score: 0.848935
Iteration 10, loss = 0.33431845
Validation score: 0.848920
Iteration 11, loss = 0.33399581
Validation score: 0.849341
Iteration 12, loss = 0.33374200
Validation score: 0.849676
Iteration 13, loss = 0.33350747
Validation score: 0.849488
Iteration 14, loss = 0.33330164
Validation score: 0.849554
Iteration 15, loss = 0.33311183
Validation score: 0.849448
Iteration 16, loss = 0.33293470
Validation score: 0.849579
Iteration 17, loss = 0.33279296
Validation score: 0.849711
Iterat

NameError: name 'make_scorer' is not defined

## Save as Pickle