In [None]:
import itertools
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Show the GPU devices TensorFlow has detected on this machine
tf.config.list_physical_devices(device_type='GPU')

In [None]:
# Set seeds for reproducibility.
# NumPy's global RNG is seeded as well as TensorFlow's, so that any library
# code drawing from it behaves the same on every Restart & Run All.
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Paths of all datasets, grouped the way they are fed to generate_models

# Numerical datasets
num_datasets = [
    "./data/NORM_BCW.csv",
    "./data/NORM_Ecoli.csv",
    "./data/NORM_Iris.csv",
    "./data/NORM_ISOLET.csv",
    "./data/NORM_SDD.csv",
    "./data/NORM_PBC.csv",
    "./data/NORM_CMSC.csv",
    "./data/NORM_MagicGT.csv",
    "./data/NORM_Wine.csv",
]

# Categorical datasets
cat_datasets = [
    "./data/OH_BalanceScale.csv",
    "./data/OH_CarEvaluation.csv",
    "./data/OH_HayesRoth.csv",
    "./data/OH_Chess.csv",
    "./data/OH_Lenses.csv",
    "./data/OH_Lymphography.csv",
    "./data/OH_Nursery.csv",
    "./data/OH_SoybeanSmall.csv",
    "./data/OH_TicTacToe.csv",
]

# Mixed (numerical + categorical) datasets
mix_datasets = [
    "./data/OH_NORM_DefaultOfCCC.csv",
    "./data/OH_NORM_StudentPerf.csv",
    "./data/OH_NORM_Adult.csv",
    "./data/OH_NORM_InternetAdv.csv",
    "./data/OH_NORM_StatlogGC.csv",
]

In [None]:
def data_splitter(df, classes):
    """Split ``df`` into train and test parts, one class at a time.

    For every value in ``classes``, the rows whose ``'output'`` column equals
    that value are split with ``train_test_split`` (``test_size=0.2``,
    ``random_state=42``). The per-class pieces are then concatenated in the
    order the classes were given.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing an ``'output'`` label column.
    classes : iterable
        Label values to split on.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The X parts are the rows of
        ``df`` without the ``'output'`` column; the y parts are the
        corresponding ``'output'`` values.
    """
    # One [X_train, X_test, y_train, y_test] result per class
    per_class = []
    for label in classes:
        rows = df[df['output'] == label]
        per_class.append(
            train_test_split(rows.drop(columns=['output']), rows['output'],
                             test_size=0.2, random_state=42)
        )

    # Regroup the pieces by role, then stitch the classes back together
    X_parts_train, X_parts_test, y_parts_train, y_parts_test = (
        [split[role] for split in per_class] for role in range(4)
    )
    return (pd.concat(X_parts_train), pd.concat(X_parts_test),
            pd.concat(y_parts_train), pd.concat(y_parts_test))

In [None]:
def _dataset_name(path_dataset):
    """Return the dataset's short name: the last '_'-separated token of the file stem."""
    # e.g. './data/OH_NORM_Adult.csv' -> 'Adult'. Working on the stem (instead
    # of splitting the whole path on '.') keeps this correct for paths without
    # a leading './' or with dots/underscores in directory names.
    return Path(path_dataset).stem.split('_')[-1]


def _hidden_layer_sizes(n_features):
    """Return the distinct hidden-layer widths to try for ``n_features`` inputs."""
    widths = []
    for i in range(1, 6):
        # i/5 of (2 * n_features + 1), truncated; clamped to at least one unit
        # so a Dense layer is never requested with 0 units on tiny inputs.
        width = max(1, int((n_features * 2 + 1) * i / 5))
        if width not in widths:
            widths.append(width)
    return widths


def _save_index(index, out_dir, ds_name):
    """Write ``index`` as a one-column ('index') CSV to ``{out_dir}/{ds_name}.csv``."""
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    pd.DataFrame({'index': index}).to_csv(f'{out_dir}/{ds_name}.csv', index=False)


def generate_models(paths_datasets):
    """Grid-search, train and save one binary classifier per dataset.

    For every CSV path in ``paths_datasets``:

    1. The ``'output'`` column is binarised: the most frequent class becomes 0
       and every other class becomes 1.
    2. The data is split per class with ``data_splitter`` and the row indices
       of both parts are written to ``./idxstrain/<name>.csv`` and
       ``./idxstest/<name>.csv``.
    3. A one-hidden-layer softmax network is trained for every combination of
       learning rate, epoch count and hidden-layer width. Each combination's
       test F1 is appended to ``./all_params.txt``.
    4. The combination with the highest test F1 (and a non-zero train F1) is
       appended to ``./best_params.txt`` and its model is saved to
       ``./models/<name>.h5``. If no combination qualifies, a message is
       printed and no model file is written for that dataset.

    Parameters
    ----------
    paths_datasets : iterable of str
        Paths of CSV files that contain an ``'output'`` label column.

    Returns
    -------
    None
    """
    for path_dataset in paths_datasets:
        # Generate df
        df = pd.read_csv(path_dataset)

        # Define the majority class as 0 and the other classes as 1
        most_common_class = df['output'].value_counts().index[0]
        df['output'] = (df['output'] != most_common_class).astype('int64')

        # Get the name of the dataset
        ds_name = _dataset_name(path_dataset)

        # Get the possible classes of DS
        classes = list(df['output'].unique())

        # Split DataFrame to train and test
        X_train, X_test, y_train, y_test = data_splitter(df, classes)

        # Make y two-column: column 0 is the binarised label, column 1 its
        # complement. A majority-class row is therefore [0, 1] (argmax 1) and
        # any other row is [1, 0] (argmax 0).
        y_train = pd.concat([y_train, y_train.map({0: 1, 1: 0})], axis=1)
        y_test = pd.concat([y_test, y_test.map({0: 1, 1: 0})], axis=1)

        # Save train / test data indexes
        _save_index(y_train.index, './idxstrain', ds_name)
        _save_index(y_test.index, './idxstest', ds_name)

        # GridSearch Parameters
        learning_rates = [0.01, 0.001, 0.0001]
        epoch_numbers = [50, 100, 500]
        nn_sizes = _hidden_layer_sizes(X_train.shape[1])
        comb_param = list(itertools.product(learning_rates, epoch_numbers, nn_sizes))

        # True labels and class balance do not depend on the hyper-parameters,
        # so compute them once instead of on every grid point.
        y_train_true = np.argmax(y_train.to_numpy(), axis=1)
        y_test_true = np.argmax(y_test.to_numpy(), axis=1)
        class_balance = y_train.sum() / y_train.shape[0]

        # best scores placeholders
        best_model = None
        best_params = []
        best_f1 = 0.0

        for params in comb_param:
            lr, epoch, nn_size = params

            # Create model
            model = keras.Sequential(
                [layers.Dense(nn_size, activation="relu", name="layer1"),
                 layers.Dense(2, activation="softmax", name="outputLayer"),
                ])

            # Configure optimizer
            opt = tf.keras.optimizers.RMSprop(learning_rate=lr, name='RMSprop')

            # Compile
            model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

            # Train
            model.fit(X_train, y_train, epochs=epoch, verbose=0)

            # Predicted class index for train and test set
            y_train_pred = np.argmax(model.predict(X_train), axis=1)
            y_test_pred = np.argmax(model.predict(X_test), axis=1)

            # Get Accuracies
            train_acc = accuracy_score(y_train_true, y_train_pred)
            test_acc = accuracy_score(y_test_true, y_test_pred)

            print(f"\n\nModel for {ds_name}:\nTrain Accuracy:{train_acc}\nTest Accuracy:{test_acc}\nClass Balance={class_balance}\n\n")

            # Calculate F1 score for Train Test
            # NOTE(review): with the label layout above, label 1 is the
            # majority class, so f1_score's default positive label scores the
            # majority class - confirm this is intended.
            f1train = f1_score(y_train_true, y_train_pred)
            f1s = f1_score(y_test_true, y_test_pred)

            # Report f1 for the params
            with open('./all_params.txt', 'a') as f:
                f.write(f'{ds_name} {f1s} {params} \n')

            # NOTE(review): the winner is chosen on the test split, so the
            # reported best F1 is an optimistic estimate.
            if f1s > best_f1 and f1train > 0:
                best_f1 = f1s
                best_model = model
                best_params = params

            # Release GPU memory; `del` only drops the loop's name, the best
            # model stays alive through `best_model`.
            tf.keras.backend.clear_session()
            del model

        # Report the best params and f1 score for the dataset and class
        with open('./best_params.txt', 'a') as f:
            f.write(f'{ds_name} {best_f1} {best_params} \n')

        # Save. When no combination reached a positive F1 there is no model to
        # save; report it and carry on with the remaining datasets instead of
        # failing on a missing model.
        if best_model is None:
            print(f"No model with a positive F1 score was found for {ds_name}; nothing saved.")
            continue
        Path('./models').mkdir(parents=True, exist_ok=True)
        best_model.save("./models/"+ds_name+".h5")

In [None]:
# Create models for each dataset family in turn:
# numerical first, then categorical, then mixed
for dataset_paths in (num_datasets, cat_datasets, mix_datasets):
    generate_models(dataset_paths)