In [None]:
import pandas as pd

# Data Loading

In [None]:
from api.data_handler import DataHandler

# Column names handed to the DataHandler: 'id', 'col1'..'col10', then the three target columns
cup_columns = ['id', *(f'col{idx}' for idx in range(1, 11)), 'target_x', 'target_y', 'target_z']
data_handler = DataHandler(cup_columns)

# Read the CUP Training/Test CSV files into pandas DataFrames
df_train : pd.DataFrame = data_handler.load_data('data/cup/ML-CUP23-TR.csv', delimiter=',')
df_test  : pd.DataFrame = data_handler.load_data('data/cup/ML-CUP23-TS.csv', delimiter=',')

# Preview the first rows of each loaded frame (training first, then test)
for frame in (df_train, df_test):
    print(frame.head())

# Data Split

In [None]:
# Keep the ID columns aside (selecting a single column yields a Series, not a DataFrame)
df_id_train: pd.Series = df_train['id']
df_id_test: pd.Series = df_test['id']

# Build the ID-free frames under new names instead of overwriting df_train/df_test:
# the loaded frames stay intact, so re-running this cell no longer fails with
# KeyError: 'id'. drop() already returns a new DataFrame, so no deep copy is needed.
df_train_no_id: pd.DataFrame = df_train.drop(columns=['id'])
df_test_no_id: pd.DataFrame = df_test.drop(columns=['id'])

# Split of columns and rows (0.8/0.2) into: TR set and Internal TS set
x_train, y_train, x_internal_test, y_internal_test = data_handler.split_data(
    data=df_train_no_id,
    cols_name_split=['target_x', 'target_y', 'target_z'],
    rows_split_perc=0.8
)

# Split on columns only (no row split is requested for the test frame)
x_test, y_test = data_handler.split_data(
    data=df_test_no_id,
    cols_name_split=['target_x', 'target_y', 'target_z']
)

# Print of the shapes
shape_report = (
    ("IDs TR SET", df_id_train),
    ("IDs TS SET", df_id_test),
    ("TR SET - x", x_train),
    ("TR SET - y", y_train),
    ("Internal TS SET - x", x_internal_test),
    ("Internal TS SET - y", y_internal_test),
    ("TS SET - x", x_test),
    ("TS SET - y", y_test),
)
for label, part in shape_report:
    print(f"[{label}]: {part.shape}")

# Grid Search parameters

In [None]:
# Parameters' space for Grid Search (original note: "CUP 0.49").
# Every entry is a list of candidate values; only 'hidden_size' and 'learning_rate'
# list more than one candidate. Key order is kept as-is on purpose.
param_space = dict(
    input_size=[10],
    hidden_size=[100, 80],
    output_size=[3],
    hidden_layers=[4],
    hidden_activation=['Tanh'],
    output_activation=[''],
    tolerance=[0.02],
    learning_rate=[0.00345, 0.0032],
    batch_size=[64],
    momentum=[0.8],
    weight_init=["glorot_normal"],
    seed_init=[19],
    epochs=[700],
    weight_decay=[0.0001],
    optimizer=['SGD'],
    nesterov=[True],
    metrics=['MEE'],
)

# Model Selection

## Best Hyperparameters Search

In [None]:
from sklearn.model_selection import KFold
from api.pytorch.nn import NN

# Best model found so far; reset to None so that re-running the cell starts a fresh search
nn: NN = None

# Not read anywhere in this cell; kept in case other cells rely on it — TODO confirm and remove
trials = 1
# Number of folds of the cross-validation
k = 5

# Inputs and targets of the training split, cast to float arrays
X = x_train.values.astype(dtype=float)
y = y_train.values.astype(dtype=float)

# K-fold Cross-validation (shuffled with a fixed random_state, so folds are reproducible)
kfold = KFold(n_splits=k, shuffle=True, random_state=42)

# Sets all the combinations of the entire set of parameters
data_handler.set_params_combinations(params=param_space)

# Gets the list with the combinations of all the parameters
params_combinations = data_handler.get_params_combinations()

# Fail early with a clear message: with no combination the loop below never runs
# and `nn` would still be None when its training info is printed.
if len(params_combinations) == 0:
    raise ValueError("No hyperparameter combinations to evaluate: check param_space.")

# Each combination of the grid is evaluated with K-fold CV
for trial, params in enumerate(params_combinations):

    # Creation of the Neural Network object for this combination
    nn_i = NN(params=params, current_trial=trial+1, trials=len(params_combinations))

    # For each K-fold returns the indexes of the data splitted in: <X_train,y_train> and <X_val,y_val>
    for train_index, val_index in kfold.split(X, y):
        x_kfold_train, x_kfold_val = X[train_index], X[val_index]
        y_kfold_train, y_kfold_val = y[train_index], y[val_index]

        nn_i.fit(
            x_train=x_kfold_train,
            y_train=y_kfold_train,
            x_val=x_kfold_val,
            y_val=y_kfold_val
        )

    # Update the incumbent BEFORE reporting, so the "Best" section below already
    # accounts for the trial that has just finished (first trial wins by default).
    if nn is None or nn_i.is_better_model_than(model=nn):
        nn = nn_i

    # Print the results of this trial
    print("\n------------------ Current Hyperparameters ------------------")
    nn_i.print_training_info()
    print("------------------ CUP Best Hyperparameters -----------------")
    nn.print_training_info()
    print("\n\n")

# Print output
print("### Best Hyperparameters for CUP ###")
nn.print_training_info()
print("\n\n")


## Retraining Phase

In [None]:
import matplotlib.pyplot as plt

def print_acc_plot(history):
    '''
        Draws the learning curve of a trained model on a new figure.

        The training series is read from history['tr_metric'] (solid line) and the
        validation series from history['vl_metric'] (dashed line); both are labelled
        as MEE and plotted against the epoch axis.
    '''
    plt.figure()

    # (history key, legend label, extra line style) for each curve, in drawing order
    curves = (
        ('tr_metric', 'Training MEE', {}),
        ('vl_metric', 'Validation MEE', {'linestyle': '--'}),
    )
    for key, legend_label, line_style in curves:
        plt.plot(history[key], label=legend_label, **line_style)

    plt.title('Learning Curve')
    plt.xlabel('Epoch')
    plt.ylabel('MEE')
    plt.legend()

# Fit `nn` on the whole training split (no validation data is passed here)
retrain_inputs = x_train.values
retrain_targets = y_train.values
nn.fit(x_train=retrain_inputs, y_train=retrain_targets)

# Report the outcome of the retraining: summary first, then the two plots
print("\n### Retraining for CUP ###")
nn.print_training_info()
nn.print_loss_plot()
nn.print_acc_plot()

# Internal Testing Phase

In [None]:

# Inputs and targets of the internal test split, as arrays
X, y = x_internal_test.values, y_internal_test.values

# Evaluate the Model on the internal TS set
nn.test(x_test=X, y_test=y)

# Prints the results obtained (string representation of the model)
print(nn)

# Prediction Phase

In [None]:
# Run the model on the test inputs
predictions = nn.predict(x_test=x_test.values)

# Write the test IDs and the predictions into the .CSV file
test_ids = df_id_test.values.tolist()
output_columns = ['id', 'output_x', 'output_y', 'output_z']
data_handler.write_data(
    filename='EmmElle_ML-CUP23-TS.csv',
    id_list=test_ids,
    data=predictions,
    cols_name=output_columns
)

# Leave the predictions as the cell's displayed value
predictions