In [25]:
import pandas as pd
import numpy as np
from numpy import loadtxt

In [26]:
from sklearn.model_selection import StratifiedKFold
from api.keras.binary_nn import BinaryNN
from api.data_handler import DataHandler

# Creation of a DataHandler Object
data_handler = DataHandler(['id', 'input1', 'input2', 'input3', 'input4', 'input5', 'input6', 'input7', 'input8', 'input9', 'input10', 'input11', 'target_x', 'target_y', 'target_z'])
                                #ID, INPUTS, TARGET_x, TARGET_y, TARGET_z
# Number of different Datasets
datasets_number = 1

# Lists of DataFrames
df_train : list[pd.DataFrame] = []
df_test  : list[pd.DataFrame] = []

# Load the Training/Test sets into pandas DataFrames
for i in range(datasets_number):
    df_train.append(data_handler.load_data(f'data/cup/ML-CUP23-TR.csv'))
    df_test.append(data_handler.load_data(f'data/cup/ML-CUP23-TS.csv'))

    # Print the head of the loaded data
    print(df_train[i].head())
    print(df_test[i].head())

#nn_i = BinaryNN(params=params, monk_i=dataset_i+1, trial=trial+1)
    




  id   input1               input2 input3                      input4  \
0  #     Data                  Set  (v2):                     3D-2023   
1  #      Nov                 2023      -                     ML-2023   
2  #    INFO:  micheli@di.unipi.it      -  lorenzo.simone@di.unipi.it   
3  #      (C)                 CIML  group                           -   
4  #  Format:                  NaN    NaN                         NaN   

    input5 input6 input7 input8  input9 input10  input11  target_x  target_y  \
0      NaN    NaN    NaN    NaN     NaN     NaN      NaN       NaN       NaN   
1      CUP    NaN    NaN    NaN     NaN     NaN      NaN       NaN       NaN   
2      NaN    NaN    NaN    NaN     NaN     NaN      NaN       NaN       NaN   
3  Micheli   2023    NaN    NaN     NaN     NaN      NaN       NaN       NaN   
4      NaN    NaN    NaN    NaN     NaN     NaN      NaN       NaN       NaN   

   target_z  
0       NaN  
1       NaN  
2       NaN  
3       NaN  
4       Na

In [27]:
dataset = loadtxt("data/cup/ML-CUP23-TR.csv", delimiter=',', usecols=range(1, 14), dtype=np.float64)
print(dataset[0]) # check the correctness
      

# Build training set
x = dataset[:, :-3]
y = dataset[:, -3:]

[ -0.9172796   -0.7127266   -0.9899035    0.9928187    0.9936488
   0.995543     0.7110739    0.40764457  -0.68854785   0.6168897
   7.897453   -35.936382    21.077147  ]


In [28]:
# Set the seed for reproducibility
np.random.seed(42)

# Define the proportions for train, validation, and test sets
train_percent = 0.8
val_percent = 0.1
test_percent = 0.1

# Calculate the number of samples for each set
num_samples = len(dataset)
num_train = int(train_percent * num_samples)
num_val = int(val_percent * num_samples)
num_test = int(test_percent * num_samples)

# Shuffle the indices
indices = np.arange(num_samples)
np.random.shuffle(indices)

# Split the indices into train, validation, and test sets
train_indices = indices[:num_train]
val_indices = indices[num_train:num_train + num_val]
test_indices = indices[num_train + num_val:]

# Use the indices to get the corresponding data for each set
x_train, y_train = x[train_indices], y[train_indices]
x_val, y_val = x[val_indices], y[val_indices]
x_test, y_test = x[test_indices], y[test_indices]

# Print the sizes of the resulting sets
print("Train set size:", len(x_train))
print("Validation set size:", len(x_val))
print("Internal Test set size:", len(x_test))

Train set size: 800
Validation set size: 100
Internal Test set size: 100


In [29]:
# Parameters' space for Grid Search (1 for each Dataset)
param_space = {
    0: {
        'input_units': [17],
        'hidden_units': [2, 3, 4, 5],
        'patience': [10,100,200],
        'factor_lr_dec': [0.5, 1],
        'step_decay': [500, 1000, 1500],
        'learning_rate': [float(i/10) for i in range(1,10)] + [float(i/100) for i in range(1,10)] + [0.99, 0.999],
        'batch_size': [7, 8, 9, 15, 16, 17, 31, 32, 33, 62, 63, 64, 65],
        'epochs': [int(350+epochs) for epochs in range(0,50,10)],
        'weight_decay': [float(i/10) for i in range(1,10)] + [0.01, 0.001, 0.0001],
        'weight_init': ['glorot_normal', 'lecun_normal', 'he_normal', 'he_uniform'],
        'momentum': [float(i/100) for i in range(1,9)] + [float(i/10) for i in range(1,9)],
        'activation': ['tanh', 'relu'],
        'output_activation': ['linear'],
        'metrics': ['mean_squared_error'],
    }
}

In [22]:
# Creation of a BinaryNN objct for each dataset
nn: list[BinaryNN] = []

# Different values per dataset
trials_list = 30
k_values = 5
n_hidden_layers_list = 1

# Search of the best Hyperparameters to each Training set
k = k_values
# K-fold Cross-validation
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # For each iteration we choose the hyperparameters (randomly) and we use them with K-fold CV
for i in range(trials_list):
        #if i != 2:
        #    continue
        
        # Random parameters
        params = data_handler.random_dictionary(param_space[0])

        # Creation of the Neural Network object
        nn_i = BinaryNN(params=params, monk_i=1, trial=i+1)

        # For each K-fold returns the indexes of the data splitted in: <X_train,y_train> and <X_val,y_val>
        for train_index, val_index in kfold.split(x_train, y_train):
            x_kfold_train, x_kfold_val = x_train[train_index], y_train[val_index]
            y_kfold_train, y_kfold_val = y_train[train_index], y_train[val_index]

            # Building the model
            nn_i.create_model(n_hidden_layers=1)

            # Training the model
            nn_i.fit(
                x_train=x_kfold_train,
                y_train=y_kfold_train,
                x_val=x_kfold_val,
                y_val=y_kfold_val
            )

            # Evaluating the model
            nn_i.evaluate(
                x_train=x_kfold_train,
                y_train=y_kfold_train,
                x_val=x_kfold_val,
                y_val=y_kfold_val
            )

        # Case of first append
        #if len(nn) == dataset_i:
        #    nn.append(nn_i)
        
        # Print the results of this trial
        print("------------------ Current Hyperparameters ------------------")
        nn_i.print_training_info()
        print("-------------------- Best Hyperparameters -------------------")
        nn[0].print_training_info()
        print("\n\n")

        # Update best hyperparameters if: no high overfitting AND (higher mean VL accuracy OR (equal mean AND
        if nn_i.mean_tr_accuracy-0.1 <= nn_i.mean_vl_accuracy \
            and (
                    nn[0].mean_vl_accuracy < nn_i.mean_vl_accuracy \
                or (
                    nn[0].mean_vl_accuracy == nn_i.mean_vl_accuracy and nn[0].mean_tr_accuracy < nn_i.mean_tr_accuracy
                    )
            ):
            nn[0] = nn_i
        
        # Case of TR/VL accuracy = 1.0 AND TR/VL loss minor
        if nn_i.mean_tr_accuracy == 1 and nn_i.mean_vl_accuracy == 1 \
            and nn_i.mean_tr_accuracy == nn[0].mean_tr_accuracy \
            and nn_i.mean_vl_accuracy == nn[0].mean_vl_accuracy \
            and abs(nn_i.mean_tr_accuracy - nn_i.mean_vl_accuracy) < 0.02 \
            and nn_i.mean_vl_loss < nn[0].mean_vl_loss \
            and nn_i.mean_tr_loss < nn[0].mean_tr_loss:
            nn[0] = nn_i
        
        # Exit case
        if nn_i.mean_tr_accuracy == 1 and nn_i.mean_vl_accuracy == 1 \
            and nn_i.mean_vl_loss < 0.1 and nn_i.mean_tr_loss < 0.1 \
            and abs(nn_i.mean_vl_loss - nn_i.mean_tr_loss) < 0.01:
            nn[0] = nn_i
            break

# Print output
print(f"### Best Hyperparameters of Monk {i+1} ###")
nn[0].print_training_info()
print("\n\n")


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous-multioutput' instead.