In [14]:
import pandas as pd
import numpy as np 
import seaborn as sns 
sns.set(style='ticks', palette='Set2')
sns.set_context("talk", font_scale=1.2)
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings('ignore')

from sklearn import svm 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_decision_regions
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from pathlib import Path
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from tqdm import tqdm

import tp1
import tensorflow as tf

In [15]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier

class customMLPClassifer(MLPClassifier):
    def resample_with_replacement(self, X_train, y_train, sample_weight):

        # normalize sample_weights if not already
        sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)

        X_train_resampled = np.zeros((len(X_train), len(X_train[0])), dtype=np.float32)
        y_train_resampled = np.zeros((len(y_train)), dtype=np.int)
        for i in range(len(X_train)):
            # draw a number from 0 to len(X_train)-1
            draw = np.random.choice(np.arange(len(X_train)), p=sample_weight)

            # place the X and y at the drawn number into the resampled X and y
            X_train_resampled[i] = X_train[draw]
            y_train_resampled[i] = y_train[draw]

        return X_train_resampled, y_train_resampled


    def fit(self, X, y, sample_weight=None):
        if sample_weight is not None:
            X, y = self.resample_with_replacement(X, y, sample_weight)

        return self._fit(X, y, incremental=(self.warm_start and
                                            hasattr(self, "classes_")))

In [44]:
def build_models_dict():
    mlp_hidden_dimensions =(8,)
    adabooster = AdaBoostClassifier(base_estimator=customMLPClassifer(), n_estimators = 20)

    adabooster = adabooster.set_params(base_estimator__hidden_layer_sizes = mlp_hidden_dimensions, 
                                    base_estimator__alpha = 0.0001, 
                                    base_estimator__activation = 'tanh', 
                                    base_estimator__early_stopping = False, 
                                    base_estimator__learning_rate_init = 0.001)

    # KERNEL_PARAMETERS = {'kernel_type': 'linear', 'coef0': 1}
    # KERNEL_PARAMETERS = {'kernel_type': 'polynomial', 'degree': 4, 'gamma': 1, 'coef0': 1}
    KERNEL_PARAMETERS = {'kernel_type': 'gaussian_rbf', 'gamma': 0.03125}

    def build_perceptron_hinge():
        perceptron = Perceptron()
        perceptron.loss = 'hinge'
        return perceptron

    models = {
        "Perceptron" : Perceptron(eta0=0.1, random_state=42),

        'Perceptron + L2':  Perceptron(penalty = 'l2', alpha = 0.01, eta0=0.1, random_state=42),

        'Perceptron + Hinge': build_perceptron_hinge(),

        "MLP" : customMLPClassifer(hidden_layer_sizes = mlp_hidden_dimensions, 
                                    alpha = 0.0, 
                                    activation = 'tanh', 
                                    early_stopping = False, 
                                    learning_rate_init = 0.001),


        "MLP + l2" : customMLPClassifer(hidden_layer_sizes = mlp_hidden_dimensions, 
                                    alpha = 0.0, 
                                    activation = 'tanh', 
                                    early_stopping = False, 
                                    learning_rate_init = 0.001),

        "MLP + Adaboost" : adabooster,

        "Perceptron_LargeMargin" : tp1.VotedPerceptron(KERNEL_PARAMETERS, error_threshold = 0, max_epochs = 10),

        "SVM_RBF" : svm.SVC(kernel='rbf', C = 1.0),

        "SVM_Linear" : svm.SVC(kernel='linear', C = 1.0)
    }

    return models


In [17]:
num_examples = 200
classe1 = np.column_stack((np.random.normal(1.0, 1.5, num_examples), np.random.normal(1.0, 1.5, num_examples)))
classe2 = np.random.normal(8, 1.5, num_examples * 2).reshape(num_examples, 2)
X = np.row_stack((classe1, classe2))
y = np.array([1] * num_examples + [0] * num_examples)

In [18]:
from sklearn.datasets import load_iris
iris = pd.read_csv(Path("./Datasets/iris.csv"))

control = tp1.parse_dataset_control(Path("./Datasets/data.csv"))
lp4 = tp1.parse_lp_data(Path("./Datasets/lp5.txt"))

ionosphere = pd.read_csv(Path("./Datasets/ionosphere.data"), header = None)
ionosphere[34] = np.where(ionosphere[34] == 'g', 1, 0)

parkinson = pd.read_csv(Path("./Datasets/parkinsons.data"))
parkinson.pop("name")

glass = pd.read_csv(Path("./Datasets/glass.data"), header = None)
glass.pop(0)
glass[10] = np.where(glass[10] == 2, 1, 0)

climate = pd.read_csv(Path("./Datasets/climate_model.csv"))
climate.pop("Study")
climate.pop("Run")

australian = pd.read_csv(Path("./Datasets/australian.dat"), sep =" ", header = None)
australian_categorical_features = [0, 3, 4, 5, 7, 8, 11, 12]

bank = pd.read_csv(Path("./Datasets/data_banknote_authentication.txt"), header = None)

eeg = pd.read_csv(Path("./Datasets/EEG Eye State.arff"), header = None)

cancer = pd.read_csv(Path("./Datasets/breast_cancer.csv"))
cancer['diagnosis'] = np.where(cancer['diagnosis'] == "B", 1, 0)
cancer.pop("id")
_ = cancer.pop("Unnamed: 32")

In [138]:
parkinson.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MDVP:Fo(Hz)       195 non-null    float64
 1   MDVP:Fhi(Hz)      195 non-null    float64
 2   MDVP:Flo(Hz)      195 non-null    float64
 3   MDVP:Jitter(%)    195 non-null    float64
 4   MDVP:Jitter(Abs)  195 non-null    float64
 5   MDVP:RAP          195 non-null    float64
 6   MDVP:PPQ          195 non-null    float64
 7   Jitter:DDP        195 non-null    float64
 8   MDVP:Shimmer      195 non-null    float64
 9   MDVP:Shimmer(dB)  195 non-null    float64
 10  Shimmer:APQ3      195 non-null    float64
 11  Shimmer:APQ5      195 non-null    float64
 12  MDVP:APQ          195 non-null    float64
 13  Shimmer:DDA       195 non-null    float64
 14  NHR               195 non-null    float64
 15  HNR               195 non-null    float64
 16  RPDE              195 non-null    float64
 1

In [19]:
imbalanced_datasets = ['iris', 'lp4', 'control', 'ionosphere', 'climate', 'glass', 'parksinson', 'cancer']

In [20]:
iris['Species'] = np.where(iris['Species'] == 'Iris-setosa', 1, 0)
lp4['label'] = np.where(lp4['label'] == 'normal', 1, 0)
control['label'] = np.where(control['label'] == 'normal', 1, 0)


In [21]:
iris_labels = iris.pop("Species")
lp4_labels = lp4.pop("label")
control_labels = control.pop("label")
bank_labels = bank.pop(4)
australian_labels = australian.pop(14)
climate_labels = climate.pop("outcome")
glass_labels = glass.pop(10)
parksinson_labels = parkinson.pop("status")
ionosphere_labels = ionosphere.pop(34)
eeg_labels = eeg.pop(14)
cancer_labels = cancer.pop("diagnosis")

In [22]:
datasets = {
    "2Gaussians" : (X, y),
    "Iris" : (iris.values, iris_labels.values),
    "lp4" : (lp4.values, lp4_labels.values),
    "controle" : (control.values, control_labels.values),
    "bank" : (bank.values, bank_labels.values),
    "australian" : (australian.values, australian_labels.values),
    "climate" : (climate.values, climate_labels.values),
    "glass" : (glass.values, glass_labels.values),
    "parkinsons" : (parkinson.values, parksinson_labels.values),
    "ionosphere" : (ionosphere.values, ionosphere_labels.values),
    # "eeg" : (eeg.values, eeg_labels.values),
    "cancer" : (cancer.values, cancer_labels.values)
}

In [23]:
# data = []
# for ds_name, ds in datasets.items():
#     models = build_models_dict()

#     for model_name, model in models.items():
#         print(ds_name, model_name)
#         scoring = ['accuracy', 'f1_weighted', 'roc_auc']

#         kfold = model_selection.KFold(n_splits=3, shuffle=True, random_state=90210)
#         scaler = StandardScaler()
        
#         training_data = scaler.fit_transform(ds[0])
        
#         sm = SMOTE(random_state=90210)
#         X_res, y_res = sm.fit_sample(training_data, ds[1])


#         cv_results = model_selection.cross_validate(model, X_res, y_res, cv=kfold, scoring = scoring, error_score="raise")
#         df = pd.DataFrame(cv_results)
#         df['model'] = [model_name] * len(df)
#         df['dataset'] = [ds_name] * len(df)
#         data.append(df)
# results = pd.concat(data, ignore_index=True)

In [45]:
import time

it = 40
data = []
for ds_name, ds in datasets.items():
    print(ds_name)
    models = build_models_dict()

    for model_name, model in models.items():
        if model_name != "SVM_Linear":
            continue
        
        scaler = StandardScaler()
        training_data = scaler.fit_transform(ds[0])
        
        for i in range(it):
            X_train, X_test, y_train, y_test = train_test_split(training_data, ds[1], test_size=0.2, random_state=i)

            if ds_name in imbalanced_datasets:
                sm = SMOTE(random_state=i)
                X_res, y_res = sm.fit_sample(X_train, y_train)
            else:
                X_res = X_train
                y_res = y_train

            start_time = time.time()
            model.fit(X_res, y_res)
            t = time.time() - start_time
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            model_data = {"model" : model_name, "dataset" : ds_name, "accuracy" : acc, "iteration" : i}
            df = pd.DataFrame(model_data, index = [0])
            df['model'] = [model_name] * len(df)
            df['dataset'] = [ds_name] * len(df)
            df['timespent'] = [t] * len(df)
            data.append(df)
results = pd.concat(data, ignore_index=True)

2Gaussians
Iris
lp4
controle
bank
australian
climate
glass
parkinsons
ionosphere
cancer


In [30]:
results.to_csv("results.csv", index = False)

In [None]:
# results = pd.read_csv("resultados.csv")

In [51]:
results.groupby(['dataset', 'model']).agg({'accuracy' : np.mean}).round(3).reset_index().pivot(index = 'dataset', columns = 'model', values = 'accuracy')

model,MLP,MLP + Adaboost,MLP + l2,Perceptron,Perceptron + Hinge,Perceptron + L2,Perceptron_LargeMargin,SVM_Linear,SVM_RBF
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2Gaussians,0.999,0.998,0.999,1.0,1.0,1.0,0.999,1.0,1.0
Iris,0.991,0.998,0.972,0.997,0.997,0.992,0.994,1.0,0.998
australian,0.864,0.843,0.861,0.804,0.821,0.802,0.855,0.85,0.854
bank,0.981,0.995,0.982,0.977,0.985,0.954,0.999,0.984,1.0
cancer,0.975,0.973,0.975,0.959,0.963,0.957,0.974,0.97,0.976
climate,0.899,0.95,0.903,0.908,0.913,0.885,0.938,0.928,0.945
controle,0.941,0.978,0.954,0.7,0.71,0.71,0.993,0.834,0.992
glass,0.653,0.726,0.638,0.595,0.585,0.565,0.715,0.599,0.728
ionosphere,0.88,0.895,0.875,0.839,0.843,0.821,0.919,0.873,0.952
lp4,0.631,0.76,0.639,0.69,0.717,0.685,0.811,0.724,0.652


In [52]:
results.groupby(['dataset', 'model']).agg({'accuracy' : np.std}).round(3).reset_index().pivot(index = 'dataset', columns = 'model', values = 'accuracy')

model,MLP,MLP + Adaboost,MLP + l2,Perceptron,Perceptron + Hinge,Perceptron + L2,Perceptron_LargeMargin,SVM_Linear,SVM_RBF
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2Gaussians,0.003,0.004,0.004,0.002,0.0,0.002,0.003,0.0,0.0
Iris,0.02,0.009,0.042,0.013,0.01,0.029,0.015,0.0,0.007
australian,0.033,0.033,0.035,0.061,0.031,0.056,0.03,0.029,0.027
bank,0.009,0.007,0.009,0.012,0.008,0.028,0.003,0.007,0.0
cancer,0.012,0.014,0.011,0.018,0.018,0.022,0.013,0.014,0.009
climate,0.035,0.018,0.027,0.031,0.031,0.041,0.02,0.028,0.021
controle,0.039,0.013,0.027,0.082,0.082,0.087,0.007,0.033,0.007
glass,0.092,0.07,0.085,0.094,0.083,0.092,0.081,0.068,0.079
ionosphere,0.039,0.03,0.034,0.052,0.054,0.081,0.025,0.035,0.019
lp4,0.1,0.092,0.085,0.085,0.078,0.083,0.075,0.083,0.088


In [54]:
results.groupby(['dataset', 'model']).agg({'timespent' : np.sum}).round(3).reset_index().pivot(index = 'dataset', columns = 'model', values = 'timespent')

model,MLP,MLP + Adaboost,MLP + l2,Perceptron,Perceptron + Hinge,Perceptron + L2,Perceptron_LargeMargin,SVM_Linear,SVM_RBF
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2Gaussians,8.691,10.145,8.49,0.033,0.033,0.029,0.713,0.03,0.043
Iris,4.429,8.365,4.26,0.028,0.027,0.036,0.316,0.024,0.026
australian,10.573,249.057,10.946,0.031,0.033,0.032,602.964,0.96,0.407
bank,19.892,57.005,20.387,0.057,0.069,0.056,164.036,0.388,0.308
cancer,12.715,184.713,11.861,0.056,0.072,0.066,124.375,0.191,0.214
climate,14.452,374.523,15.717,0.049,0.049,0.053,230.676,0.554,0.668
controle,12.982,135.709,12.107,0.091,0.077,0.095,9.314,0.93,0.204
glass,7.181,160.173,6.869,0.067,0.038,0.04,154.06,0.128,0.129
ionosphere,7.884,178.885,8.519,0.043,0.063,0.067,101.543,0.289,0.188
lp4,4.667,100.113,4.527,0.062,0.064,0.042,74.659,0.152,0.121


In [117]:
dados = results.copy(deep = True)
dados['model'] = dados['model'].str.replace(' ', '').str.replace('_', '-')
dados['accuracy'] = -1*dados['accuracy']
dados.rename(columns = {'dataset' : 'datasets'}, inplace = True)
pivoted = dados.groupby(['datasets', 'model']).agg({'accuracy' : np.median}).round(3).reset_index().pivot(index = 'datasets', columns = 'model', values = 'accuracy').reset_index()

In [118]:
pivoted.to_csv('pivoted.csv', index=False)

In [125]:
data = []

dados = results.copy(deep = True)
dados['accuracy'] = -1*dados['accuracy']
dados['index'] = dados['dataset'] + " - " + dados['iteration'].astype(str)
dados.set_index('index', inplace = True)

for g, dfg in dados.groupby('model'):
    dfg.rename(columns = {'accuracy' : g}, inplace = True)
    dfg.drop(['model', 'timespent', 'iteration', 'dataset'], axis = 1, inplace = True)
    data.append(dfg)

pivoted = pd.concat(data, axis = 1)

In [126]:
pivoted = pivoted.reset_index()
pivoted['dataset'] = pivoted['index'].str.split(' - ', expand = True)[0]

In [127]:
pivoted.drop('index', axis = 1, inplace = True)

In [128]:
pivoted.columns


Index(['MLP', 'MLP + Adaboost', 'MLP + l2', 'Perceptron', 'Perceptron + Hinge',
       'Perceptron + L2', 'Perceptron_LargeMargin', 'SVM_Linear', 'SVM_RBF',
       'dataset'],
      dtype='object')

In [129]:
pivoted =  pivoted[['dataset', 'MLP', 'MLP + Adaboost', 'MLP + l2', 'Perceptron', 'Perceptron + Hinge',
       'Perceptron + L2', 'Perceptron_LargeMargin', 'SVM_Linear', 'SVM_RBF',
       ]]
pivoted.to_csv('pivoted.csv', index = False)

In [132]:
glass.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       214 non-null    float64
 1   2       214 non-null    float64
 2   3       214 non-null    float64
 3   4       214 non-null    float64
 4   5       214 non-null    float64
 5   6       214 non-null    float64
 6   7       214 non-null    float64
 7   8       214 non-null    float64
 8   9       214 non-null    float64
dtypes: float64(9)
memory usage: 15.2 KB
