In [1]:
import sys
sys.path.append('../')

from dataset_data.constants.var_types import VAR_TYPES

import itertools

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Ask TensorFlow which physical GPU devices it can see
tf.config.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Set TensorFlow's global random seed
tf.random.set_seed(seed=42)

In [4]:
# Path of all datasets
data_path = "../dataset_data/data"

# Every path is built the same way from the file stem, so a separator can never be
# forgotten for a single entry (the StudentPerf path used to miss its "/").

# Datasets with numerical (normalized) features only
num_datasets = [
    f"{data_path}/{stem}.csv"
    for stem in ("NORM_BCW", "NORM_Ecoli", "NORM_Iris", "NORM_ISOLET", "NORM_SDD",
                 "NORM_PBC", "NORM_CMSC", "NORM_MagicGT", "NORM_Wine")
]

# Datasets with categorical (one-hot encoded) features only
cat_datasets = [
    f"{data_path}/{stem}.csv"
    for stem in ("OH_BalanceScale", "OH_CarEvaluation", "OH_HayesRoth", "OH_Chess",
                 "OH_Lymphography", "OH_Nursery", "OH_SoybeanSmall", "OH_TicTacToe")
]

# Datasets mixing one-hot encoded and normalized features
mix_datasets = [
    f"{data_path}/{stem}.csv"
    for stem in ("OH_NORM_DefaultOfCCC", "OH_NORM_StudentPerf", "OH_NORM_Adult",
                 "OH_NORM_InternetAdv", "OH_NORM_StatlogGC")
]

In [5]:
def data_splitter(dsname, df, classes):
    """ Splitter for binary class DataFrames into train, validation and test in
    the proportion 60 20 20, respectively. It explicitly enforces the same proportion
    for each class by splitting every class separately.

    The split is retried with a new random seed (starting at 42) until every
    non-binary categorical feature of the original dataset has all of its values
    present in the train partition.

    Args:
        dsname (str): Dataset name, used as key of VAR_TYPES and as the file stem of
            the original dataset ({data_path}/{dsname}.csv)
        df (DataFrame): DataFrame to be split, it must have an 'output' column
        classes (list): The DataFrame classes (only 0 and 1 are accepted)
    Returns:
        (DataFrame, DataFrame, DataFrame, Series, Series,
            Series): Train_X, Validation_X, Test_X, Train_y, Validation_y, Test_y
    Raises:
        AssertionError: If classes has any value other than 0 and 1
        ValueError: If no suitable split is found after 100 attempts

    """

    # Assert the output classes are binary
    assert len(list(set(classes) - {0, 1})) == 0

    # Maximum number of seeds tried before giving up
    max_tries = 100

    # Get categorical features for the dataset
    cat_feats = VAR_TYPES[dsname]['categorical']

    # Get the original dataset
    original_ds = pd.read_csv(f'{data_path}/{dsname}.csv')

    # Number of distinct values of each non-binary categorical feature. This does not
    # depend on the split, so it is computed once instead of on every attempt.
    expected_n_values = {}
    for cat in cat_feats:
        n_values = len(original_ds[cat].unique())
        if n_values > 2:
            expected_n_values[cat] = n_values

    random_seed_number = 42

    # One iteration == one attempt, so the loop is always bounded by max_tries
    # (the counter used to grow once per failing feature and could skip the limit).
    for _ in range(max_tries):

        # Sets
        X_trains, X_validations, X_tests = [], [], []
        y_trains, y_validations, y_tests = [], [], []

        # Iteration for each class
        for c in classes:

            df_s = df[df['output'] == c]

            # 60% train, 40% held out
            X_train, X_test_validation, y_train, y_test_validation = train_test_split(
                df_s.drop(columns=['output']), df_s['output'], test_size=0.4, random_state=random_seed_number)

            # Held out part is halved into test and validation (20% each)
            X_test, X_validation, y_test, y_validation = train_test_split(
                X_test_validation, y_test_validation, test_size=0.5, random_state=random_seed_number)

            X_trains.append(X_train)
            X_validations.append(X_validation)
            X_tests.append(X_test)
            y_trains.append(y_train)
            y_validations.append(y_validation)
            y_tests.append(y_test)

        # Rows of the original dataset selected for train (looked up by index label)
        train_rows = original_ds.loc[pd.concat(X_trains).index]

        # Verify if all categories, non-binary, are represented in the train set
        train_has_all_cats = all(
            len(train_rows[cat].unique()) == n_values
            for cat, n_values in expected_n_values.items())

        if train_has_all_cats:
            return pd.concat(X_trains), pd.concat(X_validations), pd.concat(X_tests), pd.concat(y_trains), pd.concat(
                y_validations), pd.concat(y_tests)

        random_seed_number += 1

    # No seed produced a valid train set within the allowed number of attempts
    print(dsname)
    raise ValueError('Could not find a way to get all categorical features inside the train')

In [6]:
import itertools

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


def _dataset_name(path_dataset):
    """Return the dataset name: the last '_'-separated token of the file stem."""
    file_name = path_dataset.replace('\\', '/').rsplit('/', 1)[-1]
    return file_name.rsplit('.', 1)[0].split('_')[-1]


def _hidden_layer_sizes(n_features):
    """Return the distinct hidden layer sizes (1/5 .. 5/5 of 2*n_features+1), at least 1 each."""
    nn_sizes = []
    for i in range(1, 6):
        # max(1, ...) avoids a hidden layer with 0 units when n_features is 1
        nnsize = max(1, int((n_features * 2 + 1) * i / 5))
        if nnsize not in nn_sizes:
            nn_sizes.append(nnsize)
    return nn_sizes


def generate_models(paths_datasets):
    """ Model generator for a dataset in a specific path. It creates and saves the model, the
    train, validation and test indexes, each model parameter for the gridsearch and the parameters
    for the best model chosen (based on validation AUC)

    For every dataset the 'output' column is binarized (majority class -> 0, any other
    class -> 1) and the data is split with data_splitter. Each grid search combination
    is appended to ./all_params.txt, the best one to ./best_params.txt and the best
    model is saved to ./models/<dataset name>.h5

    Args:
        paths_datasets (list): A list containing the paths for each dataset to be analyzed
    Returns:
        (None)
    Raises:
        RuntimeError: If no grid search combination produced a usable validation AUC

    """
    for path_dataset in paths_datasets:
        # Generate df
        df = pd.read_csv(path_dataset)

        # Define the majority class as 0 and the other classes as 1, binarization
        most_common_class = df['output'].value_counts().index[0]
        df['output'] = df['output'].apply(lambda x: 0 if x == most_common_class else 1)

        # Get the name of the dataset (parsed from the file name only, so it does not
        # depend on how many dots or directories the path has)
        ds_name = _dataset_name(path_dataset)

        # Get the possible classes of DS
        classes = list(df['output'].unique())

        # Split DataFrame to train, validation and test
        X_train, X_validation, X_test, y_train, y_validation, y_test = data_splitter(ds_name, df, classes)

        # Make y multiclass (as we need a 2 class output)
        # Column 0 is the binarized output, column 1 is its complement
        y_train = pd.concat([y_train, y_train.map({0:1, 1:0})], axis=1)
        y_validation = pd.concat([y_validation, y_validation.map({0:1, 1:0})], axis=1)
        y_test = pd.concat([y_test, y_test.map({0:1, 1:0})], axis=1)

        print('##########################################')
        print(f'Size train: {len(X_train)}, {len(y_train)} \n')
        print(f'Size validation: {len(X_validation)}, {len(y_validation)} \n')
        print(f'Size test: {len(X_test)}, {len(y_test)} \n')
        print('##########################################')

        # Save model train Data indexes
        pd.DataFrame(y_train.index).rename(columns={0: 'index'}).to_csv(f'../dataset_data/idxstrain/{ds_name}.csv', index=False)

        # Save model validation Data indexes
        pd.DataFrame(y_validation.index).rename(columns={0: 'index'}).to_csv(f'../dataset_data/idxsvalidation/{ds_name}.csv', index=False)

        # Save test Data indexes
        pd.DataFrame(y_test.index).rename(columns={0: 'index'}).to_csv(f'../dataset_data/idxstest/{ds_name}.csv', index=False)

        # GridSearch Parameters
        learning_rates = [0.01, 0.001, 0.0001]
        epoch_numbers = [50, 100, 500]
        nn_sizes = _hidden_layer_sizes(X_train.shape[1])
        parameters = [learning_rates, epoch_numbers, nn_sizes]
        comb_param = list(itertools.product(*parameters))

        # Ground truth does not change during the grid search, compute it only once
        y_train_labels = np.argmax(y_train.to_numpy(), axis=1)
        y_validation_labels = np.argmax(y_validation.to_numpy(), axis=1)
        y_test_labels = np.argmax(y_test.to_numpy(), axis=1)
        y_train_auc_target = y_train.iloc[:, 1]
        y_validation_auc_target = y_validation.iloc[:, 1]

        # best scores placeholders (None until a model is selected)
        best_model = None
        best_params = None
        best_auc = float('-inf')

        for params in comb_param:

            # Get parameters
            lr, epoch, nn_size = params

            # Create model
            model = keras.Sequential(
                [layers.Dense(nn_size, activation="relu", name="layer1"),
                 layers.Dense(2, activation="softmax", name="outputLayer"),
                ])

            # Configure optimizer
            opt = tf.keras.optimizers.RMSprop(learning_rate=lr, name='RMSprop')

            # Compile
            model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

            # Train
            model.fit(X_train, y_train, epochs=epoch, verbose=0)

            # Get Prediction for train, validation and test set
            y_train_pred = model.predict(X_train)
            y_validation_pred = model.predict(X_validation)
            y_test_pred = model.predict(X_test)

            # Get Accuracies
            train_acc = accuracy_score(y_train_labels, np.argmax(y_train_pred, axis=1))
            validation_acc = accuracy_score(y_validation_labels, np.argmax(y_validation_pred, axis=1))
            test_acc = accuracy_score(y_test_labels, np.argmax(y_test_pred, axis=1))

            print(f"\n\nModel for {ds_name}:\nTrain Accuracy:{train_acc}\nValidation Accuracy:{validation_acc}\nTest Accuracy: {test_acc}\nClass Balance={y_train.sum()/y_train.shape[0]}\n\n")

            # Calculate auc score for Train and Validation, using the second output
            # column of both the targets and the predictions
            fpr_train, tpr_train, _ = roc_curve(y_train_auc_target, y_train_pred[:, 1])
            fpr_validation, tpr_validation, _ = roc_curve(y_validation_auc_target, y_validation_pred[:, 1])

            auc_m_train = auc(fpr_train, tpr_train)
            auc_m_validation = auc(fpr_validation, tpr_validation)

            # Report auc for the params
            with open('./all_params.txt', 'a') as f:
                f.write(f'{ds_name} {auc_m_validation} {params} \n')

            if auc_m_validation > best_auc:
                best_auc = auc_m_validation
                best_model = model
                best_params = params

            # Release GPU memory
            tf.keras.backend.clear_session()
            del model

        # Fail with a clear message instead of an AttributeError on the placeholder
        if best_model is None:
            raise RuntimeError(f'No model could be selected for {ds_name}: no validation AUC was comparable')

        # Report the best params and auc score for the dataset and class
        with open('./best_params.txt', 'a') as f:
            f.write(f'{ds_name} {best_auc} {best_params} \n')

        # Save
        best_model.save("./models/"+ds_name+".h5")

In [7]:
# Create models for the numerical, categorical and mixed datasets, in this order
for dataset_group in (num_datasets, cat_datasets, mix_datasets):
    generate_models(dataset_group)

##########################################
Size train: 118, 118 

Size validation: 41, 41 

Size test: 39, 39 

##########################################


Model for BCW:
Train Accuracy:1.0
Validation Accuracy:0.8536585365853658
Test Accuracy: 0.7948717948717948
Class Balance=output    0.237288
output    0.762712
dtype: float64




Model for BCW:
Train Accuracy:1.0
Validation Accuracy:0.8292682926829268
Test Accuracy: 0.7948717948717948
Class Balance=output    0.237288
output    0.762712
dtype: float64




Model for BCW:
Train Accuracy:1.0
Validation Accuracy:0.7804878048780488
Test Accuracy: 0.7435897435897436
Class Balance=output    0.237288
output    0.762712
dtype: float64




Model for BCW:
Train Accuracy:0.940677966101695
Validation Accuracy:0.7073170731707317
Test Accuracy: 0.6153846153846154
Class Balance=output    0.237288
output    0.762712
dtype: float64




Model for BCW:
Train Accuracy:1.0
Validation Accuracy:0.7804878048780488
Test Accuracy: 0.717948717948718
Class Balan