In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

from utils import data_preprocessing_util as dpu
from utils import classification_util as cu

from MLP import MLP
from nn_framework import NNFramework

import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# --- Experiment switches -----------------------------------------------------
# Seed handed to train_test_split in the split cell.
random_seed = 32
# Flags forwarded to the project preprocessing helper below.
log_transform = True
outlier_removal = True
# When True, a StandardScaler instance is created; otherwise no scaler is used.
scaling = True

# --- Raw data ----------------------------------------------------------------
data_path = r'data/breast-cancer-diagnostic.shuf.lrn.csv'
df_original = pd.read_csv(data_path)

if scaling:
    scaler = preprocessing.StandardScaler()
else:
    scaler = None

# The raw frame keeps its own name; the helper's return value is bound to `df`.
df = dpu.preprocess_breast_cancer_data(
    df_original,
    log_transform=log_transform,
    outlier_removal=outlier_removal,
)

In [11]:
# Target: the 'class' column. Features: every column except 'ID' and 'class'.
y = df['class']
X = df.loc[:, df.columns.difference(['ID', 'class'])]

# Shuffled, stratified hold-out split (20 % test), seeded with random_seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
    shuffle=True,
    stratify=y,
)


In [12]:
# Hyper-parameter grid. Every combination of the three lists below is turned
# into one MLP configuration by the cell that builds `methods`.
activation_functions = [
    'relu',
    'sigmoid',
]

learning_rates = [
    1e-4,
    1e-3,
    1e-2,
    1e-1,
    0.5,
]

# One tuple per architecture; each entry is the width of one hidden layer.
hidden_layer_sizes = [
    (5,),
    (32,),
    (16, 16),
    (10, 5, 5),
    (16, 8, 8),
    (64, 32, 32),
]

In [14]:
# One (name, estimator) pair per point of the hyper-parameter grid.
# The name encodes activation function, learning rate and layer sizes and is
# later used as the key into `pipelines` and into each fitted pipeline.
methods = [
    (
        f'MLP-{af}-{lr}-{hls}',
        MLP(n_iter=5000, activation_function=af, learning_rate=lr, hidden_layer_sizes=hls),
    )
    for af in activation_functions
    for lr in learning_rates
    for hls in hidden_layer_sizes
]

# Project helper builds the pipelines from the pairs and the optional scaler.
pipelines = cu.define_pipelines(methods, scaler=scaler)

In [15]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
cv_num = 5
model_params = {}   # model name -> scalar summary values (one table row per model)
models = {}         # model name -> list of fitted pipelines, one per fold
models_lists = {}   # model name -> per-fold arrays / lists

# cross_validate result keys that are stored per fold and summarised (mean / std).
num_cols = ['test_score', 'fit_time', 'score_time']

for model_name, pipeline in pipelines.items():
    # NOTE(review): cross-validation runs on the full X / y, not on the
    # X_train / y_train produced by the hold-out split - confirm this is intended.
    cv_results = cross_validate(
        pipeline,
        X,
        y,
        cv=cv_num,
        scoring='f1_macro',
        return_estimator=True,
        n_jobs=10,
    )

    fold_pipelines = cv_results['estimator']
    # Indexing a fitted pipeline by the model name yields its MLP step.
    fold_mlps = [fitted[model_name] for fitted in fold_pipelines]
    first_mlp = fold_mlps[0]

    summary = {}
    per_fold = {}
    models[model_name] = fold_pipelines
    model_params[model_name] = summary
    models_lists[model_name] = per_fold

    for num_col in num_cols:
        fold_values = cv_results[num_col]
        per_fold[num_col] = fold_values
        summary[f'{num_col}_mean'] = fold_values.mean()
        summary[f'{num_col}_std'] = fold_values.std()

    # Configuration values are read from the first fold's estimator.
    summary.update({
        'parameter_num': first_mlp.number_of_params_,
        'hidden_layer_sizes': first_mlp.hidden_layer_sizes,
        'activation_function': first_mlp.activation_function,
        'learning_rate': first_mlp.learning_rate,
    })

    per_fold['converged'] = [mlp.converged_ for mlp in fold_mlps]
    per_fold['validation_losses'] = [mlp.validation_losses_ for mlp in fold_mlps]
    per_fold['training_losses'] = [mlp.training_losses_ for mlp in fold_mlps]
    # Iteration count per fold = number of recorded training losses; store the mean.
    summary['num_iter'] = np.mean([len(mlp.training_losses_) for mlp in fold_mlps])

    print(model_name)
    print(
        f"f1 scores: {per_fold['test_score']}\n"
        f"f1 mean: {summary['test_score_mean']:.3f}\n"
        f"f1 std: {summary['test_score_std']:.3f}\n"
    )
    print('----------------------------------------------------------------------------------------------------')
    

MLP-relu-0.0001-(5,)
f1 scores: [0.93215739 0.97715618 0.97777778 0.93146853 0.95359848]
f1 mean: 0.954
f1 std: 0.020

----------------------------------------------------------------------------------------------------
MLP-relu-0.0001-(32,)
f1 scores: [0.95404412 0.97715618 0.95609319 0.95496324 0.93146853]
f1 mean: 0.955
f1 std: 0.014

----------------------------------------------------------------------------------------------------
MLP-relu-0.0001-(16, 16)
f1 scores: [0.39759036 0.39506173 0.39506173 0.39506173 0.39506173]
f1 mean: 0.396
f1 std: 0.001

----------------------------------------------------------------------------------------------------
MLP-relu-0.0001-(10, 5, 5)
f1 scores: [0.89010989 0.25757576 0.25757576 0.79166667 0.25757576]
f1 mean: 0.491
f1 std: 0.287

----------------------------------------------------------------------------------------------------
MLP-relu-0.0001-(16, 8, 8)
f1 scores: [0.15254237 0.10909091 0.13383838 0.09866221 0.161869  ]
f1 mean: 0.131

In [18]:
# Build the summary table: one row per model, with the model name moved out of
# the index into a regular 'model' column.
df_param = (
    pd.DataFrame(model_params)
    .transpose()
    .reset_index(drop=False)
    .rename(columns={'index': 'model'})
)

In [19]:
# Persist the summary table as CSV, leaving out the positional index.
# NOTE(review): the file name mentions only "sigmoid", yet the grid defined in
# this notebook also contains 'relu' models - confirm the name is intended.
df_param.to_csv(
    r'results/breast_cancer_params_sigmoid.csv',
    index=False,
)