# Imports

In [None]:
import os
from tqdm.auto import tqdm

import numpy as np
import pandas as pd

from sklearn import svm,neural_network,ensemble
from sklearn.model_selection import GridSearchCV, cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, make_scorer
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

import pickle as pkl

# Input Pre-processing

In [None]:
dataset = pd.read_csv("data/R22_breast_cancer.csv")

In [None]:
ds_dict['df'] = dataset
ds_dict['name'] = 'Breast'
ds_dict['thresholds'] = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

# Model Fitting/Eval

## Param Grid

In [None]:
gammas = [2**gamma for gamma in range(-15, 4, 1)]
Cs = [2**c for c in range(-5, 12, 1)]
tols = [10**(-3), 10**(-4)]
param_grid_svm = [{'clf__C': Cs,
                   'clf__tol': tols,
                   'clf__gamma': gammas,
                   'clf__kernel': ['rbf', 'sigmoid']},
                  {'clf__C': Cs,
                   'clf__tol': tols,
                   'clf__kernel': ['linear']}]
param_grid_mlp = {'clf__solver': ['adam', 'lbfgs'],
                  'clf__hidden_layer_sizes': [(50,),(100,),(50,50)],
                  'clf__learning_rate': ['constant', 'adaptive'],
                  'clf__learning_rate_init': [0.1,0.01,0.001],
                  'clf__max_iter': [10000],
                  'clf__batch_size': [16,32,64]}
param_grid_ensemble = {'clf__max_depth': [10, 20, 40, None],
                       'clf__max_features': ['auto', 'sqrt'],
                       'clf__min_samples_leaf': [1, 2, 3],
                       'clf__min_samples_split': [2, 3, 5],
                       'clf__n_estimators': [100, 200, 500, 1000]}

methods = [('SVM', svm.SVC(), param_grid_svm), 
           ('MLP', neural_network.MLPClassifier(), param_grid_mlp),
           ('RandForest',ensemble.RandomForestClassifier(), param_grid_ensemble)]

## Inner CV

In [None]:
def param_scan(X, y, method, param_grid, cv=10):
    
    pipe = Pipeline([('scaler', StandardScaler()), ('clf', method)])
    
    clf = GridSearchCV(estimator = pipe, 
                       param_grid = param_grid,
                       refit = True,
                       cv=cv,
                       n_jobs=20)
    
    clf.fit(X, y)
    
    return clf

## Outer CV

In [None]:
def nested_cv(X, y, method, param_grid, NUM_TRIALS=3):
    scoring = {'accuracy': 'accuracy', 
               'balanced_accuracy': 'balanced_accuracy',
               'average_precision': make_scorer(precision_score, average='micro'), 
               'average_recall': make_scorer(recall_score, average='micro'),
               'recall_cancer': make_scorer(recall_score, pos_label='cancer'),
               'precision_cancer': make_scorer(precision_score, pos_label='cancer'),
               'recall_normal': make_scorer(recall_score, pos_label='normal'),
               'precision_normal': make_scorer(precision_score, pos_label='normal')}
    nested_scores = {key: np.array([]) for key in scoring}

    # Loop for each trial
    for i in tqdm(range(NUM_TRIALS), leave=False, position=2):
        inner_cv = KFold(n_splits=5, shuffle=True, random_state=i)
        outer_cv = KFold(n_splits=5, shuffle=True, random_state=i)

        clf = param_scan(X, y, method, param_grid)
        
        nested_score = cross_validate(clf, X=X, y=y, cv=outer_cv, scoring=scoring)
        
        for key in scoring:
            nested_scores[key] = np.append(nested_scores[key], nested_score['test_'+key].mean())
    
    result = {key+'_'+val: nested_scores[key].mean() if val=='mean' else nested_scores[key].std() for key in scoring for val in ['mean','std']}
    result_all = {key+'_all': nested_scores[key] for key in scoring}
    result.update(result_all)
    return result

## Features/Classifier Loop

In [71]:
def grid_forall_th_m(ds, methods):
    
    thresholds = ds['thresholds']
    df = ds['df']
        
    pcs = [col for col in df.columns if col.startswith('pca')]
    sing_th = [("single_th", "Th_"+str(i), ["thre=" + str(th)])
               for i, th in enumerate(thresholds)]
    features_list = sing_th + [('pca', 'pca', pcs)]
    
    results_df = pd.DataFrame()
    
    for i, method in enumerate(tqdm(methods)):
        pbar = tqdm(features_list, leave=False, position=1)
        for features_kind, features_name, features in pbar:
            
            columns = [col for col in df.columns if col.startswith(tuple(features))]
            
            X, y = df[columns], df['name_class']
            
            class_name, classifier, param_grid = method
            result_dict = nested_cv(X, y, classifier, param_grid)
            
            new_setting = dict(method = class_name,
                               features_kind = features_kind,
                               features_name = features_name,
                               features = features)
            new_setting.update(result_dict)
            
            results_df = results_df.append(new_setting, ignore_index=True)
        
        results_df.to_csv(f"results_partial_{i}.csv",index=False)
    return results_df

# Main

In [None]:
results = grid_forall_th_m(ds_dict, methods)

## Save

In [None]:
results.to_csv("output/result.csv",index=False)

## Load

In [69]:
results = pd.read_csv("output/results.csv")

In [70]:
selection = results[(results['features_name']=="Th_3") & (results['method']=="SVM")]
selection[['accuracy_mean','precision_cancer_mean','recall_cancer_mean']].T

Unnamed: 0,3
accuracy_mean,0.866667
precision_cancer_mean,0.861005
recall_cancer_mean,0.879992
