In [1]:
from pathlib import Path

import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Ask TensorFlow which GPU devices it can see; the cell displays the resulting list.
gpu_devices = tf.config.list_physical_devices('GPU')
gpu_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Seed TensorFlow's global random number generator so repeated runs start from the same state
SEED = 42
tf.random.set_seed(SEED)

In [4]:
# CSV locations of every dataset, grouped the same way the models are generated:
# numerical, categorical and mixed. The file-name prefixes (NORM / OH / OH_NORM)
# presumably record the preprocessing applied (normalised / one-hot) - confirm
# against the data preparation step. The text after the last "_" is the dataset name.
num_datasets = [
    f"./data/NORM_{name}.csv"
    for name in ("BCW", "Ecoli", "Iris", "ISOLET", "SDD", "PBC", "CMSC", "MagicGT", "Wine")
]
cat_datasets = [
    f"./data/OH_{name}.csv"
    for name in (
        "BalanceScale", "CarEvaluation", "HayesRoth", "Chess", "Lenses",
        "Lymphography", "Nursery", "SoybeanSmall", "TicTacToe",
    )
]
mix_datasets = [
    f"./data/OH_NORM_{name}.csv"
    for name in ("DefaultOfCCC", "StudentPerf", "Adult", "InternetAdv", "StatlogGC")
]

In [5]:
def data_splitter(df, classes, target_col='output', test_size=0.1, random_state=42):
    """Split ``df`` into train and test parts, one class at a time.

    Every class in ``classes`` is split on its own with ``train_test_split``, so
    each class hands the same ``test_size`` share of its rows to the test set.
    The per-class pieces are concatenated in the order of ``classes``; the
    original row index is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the label column ``target_col``.
    classes : iterable
        Label values to split on; rows with other labels are left out.
    target_col : str, default 'output'
        Name of the label column.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split`` for every class.
    random_state : int, default 42
        Forwarded to ``train_test_split`` for every class.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the X parts are DataFrames
        without ``target_col`` and the y parts are Series of ``target_col``.

    Raises
    ------
    ValueError
        If ``classes`` is empty. ``train_test_split`` may also raise for a class
        that has too few rows to be split.
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    classes = list(classes)
    if not classes:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError("data_splitter needs at least one class to split on")

    features = df.drop(columns=[target_col])
    labels = df[target_col]

    # One (X_train, X_test, y_train, y_test) tuple per class.
    per_class = []
    for c in classes:
        in_class = labels == c
        per_class.append(
            train_test_split(features[in_class], labels[in_class],
                             test_size=test_size, random_state=random_state)
        )

    X_trains, X_tests, y_trains, y_tests = zip(*per_class)
    return (pd.concat(list(X_trains)), pd.concat(list(X_tests)),
            pd.concat(list(y_trains)), pd.concat(list(y_tests)))

In [6]:
def _dataset_name(path_dataset):
    """Return the dataset name of a CSV path: the text after the last "_" of the file stem."""
    return Path(path_dataset).stem.split('_')[-1]


def _build_classifier(n_features, layer_multipliers):
    """Build the dense net: one ReLU layer of n_features * m units per multiplier, then one sigmoid unit."""
    hidden = [
        layers.Dense(n_features * m, activation="relu", name=f"layer{i}")
        for i, m in enumerate(layer_multipliers, start=1)
    ]
    return keras.Sequential(hidden + [layers.Dense(1, activation="sigmoid", name="outputLayer")])


def _binary_accuracy(model, features, labels):
    """Return the share of rows whose sigmoid output, thresholded at 0.5, equals the 0/1 label."""
    # "> 0.5" reproduces round() on sigmoid outputs, including round(0.5) == 0.
    predicted = (model.predict(features).ravel() > 0.5).astype(int)
    return (predicted == labels.to_numpy()).sum() / features.shape[0]


def generate_models(paths_datasets, layer_multipliers=(20, 10, 4, 1), epochs=200):
    """Train and save one binary "class vs. rest" Keras model per class of every dataset.

    For each CSV in ``paths_datasets`` the data is split with ``data_splitter``.
    Then, for every distinct value ``c`` of the ``output`` column:

    * labels are binarised (``c`` -> 0, every other class -> 1), always starting
      from the original multi-class labels;
    * the train and test sets are written to ``./modeldata/<c>_<name>.csv`` and
      ``./testdata/<c>_<name>.csv`` (the DataFrame index is written as the first column);
    * a dense network is trained with RMSprop and binary cross-entropy, with
      checkpoints in ``./tempModels`` and TensorBoard logs in ``./logs``;
    * train/test accuracy and the share of label 1 are printed;
    * the model is saved to ``./models/<c>_<name>.h5``.

    ``<name>`` is the part of the file stem after its last underscore and ``<c>``
    is ``int(c)``, so class labels must be convertible to ``int``.

    Parameters
    ----------
    paths_datasets : iterable of str
        Paths of the dataset CSV files; each needs an ``output`` label column.
    layer_multipliers : sequence of int, default (20, 10, 4, 1)
        Hidden layer ``i`` gets ``n_features * layer_multipliers[i]`` units.
        The original inline notes list (4, 2, 2, 1) as the values used for the
        ISOLET and InternetAdv datasets; pass that tuple when training those.
    epochs : int, default 200
        Number of training epochs for every model.

    Returns
    -------
    None
    """
    # Every location written below must exist before to_csv / model.save are called.
    for out_dir in ('./modeldata', './testdata', './tempModels', './models'):
        Path(out_dir).mkdir(parents=True, exist_ok=True)

    for path_dataset in paths_datasets:
        # Generate df
        df = pd.read_csv(path_dataset)

        # Get the name of the dataset
        ds_name = _dataset_name(path_dataset)

        # Get the possible classes of DS
        classes = list(df['output'].unique())

        # Split DataFrame to train and test
        X_train, X_test, y_train, y_test = data_splitter(df, classes)

        for c in classes:
            tag = f'{int(c)}_{ds_name}'

            # The selected class is 0 and all others are 1. Separate variables keep
            # the multi-class labels intact for the next class in the loop.
            y_train_bin = (y_train != c).astype('int64')
            y_test_bin = (y_test != c).astype('int64')

            # Save model train Data
            pd.concat([X_train, y_train_bin], axis=1).to_csv(f'./modeldata/{tag}.csv')

            # Save test Data
            pd.concat([X_test, y_test_bin], axis=1).to_csv(f'./testdata/{tag}.csv')

            # Create model
            model = _build_classifier(X_train.shape[1], layer_multipliers)

            # Configure callbacks
            my_callbacks = [
                tf.keras.callbacks.ModelCheckpoint(filepath=f'./tempModels/{tag}_model.temp.h5'),
                tf.keras.callbacks.TensorBoard(log_dir='./logs'),
            ]

            # Configure optimizer
            opt = tf.keras.optimizers.RMSprop(
                learning_rate=0.001, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False, name='RMSprop')

            # Compile
            model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])

            # Train
            model.fit(X_train, y_train_bin, epochs=epochs, callbacks=my_callbacks)

            # Get Accuracies
            train_acc = _binary_accuracy(model, X_train, y_train_bin)
            test_acc = _binary_accuracy(model, X_test, y_test_bin)

            print(f"\n\nModel for {ds_name}:\nTrain Accuracy:{train_acc}\nTest Accuracy:{test_acc}\nClass Balance={y_train_bin.sum()/y_train_bin.shape[0]}\n\n")

            # Save
            model.save(f'./models/{tag}.h5')

            # Release GPU memory
            tf.keras.backend.clear_session()
            del model

            

In [7]:
# Train and save the models for each dataset group in turn:
# numerical first, then categorical, then mixed.
for dataset_group in (num_datasets, cat_datasets, mix_datasets):
    generate_models(dataset_group)