# MONK


1. Title: The Monk's Problems (CLASSIFICATION PROBLEMS)

4. Relevant Information:

   The MONK's problems were the basis of a first international comparison
   of learning algorithms. The result of this comparison is summarized in
   "The MONK's Problems - A Performance Comparison of Different Learning
   Algorithms" by S.B. Thrun, J. Bala, E. Bloedorn, I. Bratko, B.
   Cestnik, J. Cheng, K. De Jong, S. Dzeroski, S.E. Fahlman, D. Fisher,
   R. Hamann, K. Kaufman, S. Keller, I. Kononenko, J. Kreuziger, R.S.
   Michalski, T. Mitchell, P. Pachowicz, Y. Reich, H. Vafaie, W. Van de
   Welde, W. Wenzel, J. Wnek, and J. Zhang, which was published as
   Technical Report CS-CMU-91-197, Carnegie Mellon University, in Dec.
   1991.

   One significant characteristic of this comparison is that it was
   performed by a collection of researchers, each of whom was an advocate
   of the technique they tested (often they were the creators of the
   various methods). In this sense, the results are less biased than in
   comparisons performed by a single person advocating a specific
   learning method, and more accurately reflect the generalization
   behavior of the learning techniques as applied by knowledgeable users.

   There are three MONK's problems.  The domains for all MONK's problems
   are the same (described below).  One of the MONK's problems has noise
   added. For each problem, the domain has been partitioned into a train
   and test set.

5. Number of Instances: 432

6. Number of Attributes: 8 (including class attribute)

7. Attribute information:
    1. class: 0, 1 
    2. a1:    1, 2, 3
    3. a2:    1, 2, 3
    4. a3:    1, 2
    5. a4:    1, 2, 3
    6. a5:    1, 2, 3, 4
    7. a6:    1, 2
    8. Id:    (A unique symbol for each instance)

8. Missing Attribute Values: None

9. Target Concepts associated with the MONK's problems:

   MONK-1: (a1 = a2) or (a5 = 1)

   MONK-2: EXACTLY TWO of {a1 = 1, a2 = 1, a3 = 1, a4 = 1, a5 = 1, a6 = 1}

   MONK-3: (a5 = 3 and a4 = 1) or (a5 /= 4 and a2 /= 3)
           (5% class noise added to the training set)



In [None]:
# Base libraries re-exported by the project-local utils.py
# (presumably this is where `np`, `plt` and `load_monk_data_one_hot_enc`
# used further down come from — TODO confirm against utils.py).
from utils import *

# Libraries used for the classification experiments
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import mean_squared_error

# Silence scikit-learn convergence warnings raised while fitting
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## Monk 1
MONK-1: (a1 = a2) or (a5 = 1) --> Class label 1 else 0

In [None]:
# Load the MONK-1 training and test files through the project helper
# (presumably returning one-hot encoded frames, judging by its name).
monk_train_1, monk_test_1 = (
    load_monk_data_one_hot_enc("../data/monk+s+problems/monks-1.train"),
    load_monk_data_one_hot_enc("../data/monk+s+problems/monks-1.test"),
)

# Show the first rows as the cell output.
monk_train_1.head()

In [None]:
# Shapes of the MONK-1 training and test frames, one per line.
print(monk_train_1.shape, monk_test_1.shape, sep="\n")

In [None]:
# Column 0 holds the target; every remaining column is an input feature.
y_dev = monk_train_1.iloc[:, 0]
X_dev = monk_train_1.iloc[:, 1:]

y_test = monk_test_1.iloc[:, 0]
X_test = monk_test_1.iloc[:, 1:]

In [None]:
# Shapes of the development and test features/targets, one per line.
print(X_dev.shape, y_dev.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Hold out 20% of the development set as validation data (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_dev,
    y_dev,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training and validation features/targets, one per line.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, sep="\n")

In [None]:
# One batch spans every training row (full-batch updates on X_train).
batch_size = len(X_train)
print(batch_size)

In [None]:
# Grid search over MLP hyperparameters for MONK-1.
#
# Two sub-grids are used so that 'learning_rate' and 'momentum' are only
# generated (and therefore only passed to the model) for the SGD solver.
iperparameters = [
    {
        'hidden_layer_sizes': [(2,), (3,), (4,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['adam'],
        'learning_rate_init': [0.01, 0.02, 0.05, 0.001, 0.002],
    },
    {
        'hidden_layer_sizes': [(2,), (3,), (4,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['sgd'],
        'learning_rate_init': [0.01, 0.02, 0.05, 0.001, 0.002],
        'learning_rate': ['constant', 'adaptive'],  # SGD only
        'momentum': [0.9, 0.95],  # SGD only
    },
]

# Running record of the best configuration seen so far.
best_model = None
best_train_acc = float('-inf')
best_val_acc = float('-inf')
best_train_mse = float('inf')
best_val_mse = float('inf')
best_params = None

# Try every hyperparameter combination.
for params in ParameterGrid(iperparameters):
    # Each `params` dict only holds valid MLPRegressor keyword arguments, so
    # it can be unpacked directly; the SGD-only keys are present only when
    # solver == 'sgd'.
    nn = MLPRegressor(
        **params,
        batch_size=batch_size,  # full batch on X_train
        max_iter=500,
        shuffle=True,
        random_state=7,
        verbose=False,
    )

    # Fit on the training split only; validation data is used for selection.
    nn.fit(X_train, y_train)

    y_pred_train = nn.predict(X_train)
    y_pred_val = nn.predict(X_val)

    train_mse = mean_squared_error(y_train, y_pred_train)
    val_mse = mean_squared_error(y_val, y_pred_val)

    # The regressor output is rounded to the nearest integer to obtain a
    # class label; the true labels are already discrete.
    train_acc = accuracy_score(y_train, y_pred_train.round())
    val_acc = accuracy_score(y_val, y_pred_val.round())

    # Select on validation accuracy, breaking ties with the lower validation
    # MSE. The validation set is small, so many configurations reach the same
    # accuracy; without the tie-break the winner would just be whichever of
    # them comes first in grid order.
    if (val_acc, -val_mse) > (best_val_acc, -best_val_mse):
        best_train_mse = train_mse
        best_val_mse = val_mse
        best_train_acc = train_acc
        best_val_acc = val_acc
        best_model = nn
        best_params = params

In [None]:
# Report the grid-search winner and its accuracy on every split.
print("\nMiglior modello trovato per MONK-1 (random state 7 - 500 epochs):")
print(f"Parametri: {best_params}")

# Regressor outputs are rounded to the nearest integer to obtain labels.
train_accuracy = accuracy_score(y_train, best_model.predict(X_train).round())
print(F'Training Accuracy: {train_accuracy:.4f}')

val_accuracy = accuracy_score(y_val, best_model.predict(X_val).round())
print(f'Validation Accuracy: {val_accuracy:.4f}')

test_accuracy = accuracy_score(y_test, best_model.predict(X_test).round())
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Build a fresh network with the hyperparameters picked by the grid search.
# The following cell trains it epoch by epoch with partial_fit on the whole
# development set (X_dev), so the batch size is taken from X_dev: reusing the
# size of the smaller X_train split would make every epoch run two
# mini-batches instead of one full-batch update.
sgd_only = (
    {'learning_rate': best_params['learning_rate'],
     'momentum': best_params['momentum']}
    if best_params['solver'] == 'sgd'
    else {}
)

nn = MLPRegressor(
    hidden_layer_sizes=best_params['hidden_layer_sizes'],
    activation=best_params['activation'],
    solver=best_params['solver'],
    learning_rate_init=best_params['learning_rate_init'],
    batch_size=X_dev.shape[0],  # full batch over the development set
    **sgd_only,  # learning_rate / momentum only apply to the 'sgd' solver
    max_iter=1,  # note: one iteration per fit call
    warm_start=True,  # note: keep the learned weights between calls
    shuffle=True,
    random_state=7,
    verbose=False,
)

In [None]:
# MODEL TRAINING
# One partial_fit call per epoch on the development set, recording MSE and
# accuracy on both the development and the test set after every epoch.

epochs = 500

# Per-epoch metric histories.
train_loss_curve, test_loss_curve = [], []
train_accuracy_curve, test_accuracy_curve = [], []

for epoch in range(epochs):
    nn.partial_fit(X_dev, y_dev)

    dev_output = nn.predict(X_dev)
    test_output = nn.predict(X_test)

    # Development-set metrics (outputs rounded to get class labels).
    train_loss = mean_squared_error(y_dev, dev_output)
    train_acc = accuracy_score(y_dev, dev_output.round())
    train_loss_curve.append(train_loss)
    train_accuracy_curve.append(train_acc)

    # Test-set metrics.
    test_loss = mean_squared_error(y_test, test_output)
    test_acc = accuracy_score(y_test, test_output.round())
    test_loss_curve.append(test_loss)
    test_accuracy_curve.append(test_acc)

    # Progress line every 50 epochs.
    if (epoch + 1) % 50:
        continue
    print(f"Epoca {epoch + 1}/{epochs}, Train MSE: {train_loss:.5f}, Train Acc: {train_acc:.4f}")

In [None]:
# Side-by-side learning curves: MSE on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# (axis, y label, train curve, train label, test curve, test label,
#  legend position, title)
panels = [
    (ax1, "MSE", train_loss_curve, "Train Loss",
     test_loss_curve, "Test Loss", "upper right", "MONK-1 Loss"),
    (ax2, "Accuracy", train_accuracy_curve, "Train Accuracy",
     test_accuracy_curve, "Test Accuracy", "lower right", "MONK-1 Accuracy"),
]

for axis, y_label, train_curve, train_label, test_curve, test_label, legend_loc, title in panels:
    axis.plot(train_curve, label=train_label, color="blue", linestyle="solid")
    axis.plot(test_curve, label=test_label, color="red", linestyle="solid")
    axis.set_xlabel("Epoche")
    axis.set_ylabel(y_label)
    axis.set_title(title)
    axis.legend(loc=legend_loc)
    axis.grid(True)

# Tidy up the subplot spacing before showing the figure.
plt.tight_layout()
plt.show()

In [None]:
# Final accuracy of the retrained model on the development and test sets.
dev_accuracy = accuracy_score(y_dev, nn.predict(X_dev).round())
print(F'Training Accuracy: {dev_accuracy:.4f}')

test_accuracy = accuracy_score(y_test, nn.predict(X_test).round())
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Five runs with five different random_state values (MODEL SELECTION).
# Each run trains on X_train epoch by epoch, tracks train/validation MSE and
# accuracy, plots them, and finally the curves are averaged over the seeds.

epochs = 500

# Seeds of the five initialisations.
random_states = [7, 13, 26, 39, 47]

# One history (list of per-epoch values) per seed, kept for the averages.
all_train_mse, all_val_mse = [], []
all_train_acc, all_val_acc = [], []

# learning_rate and momentum are only passed when the solver is 'sgd'.
sgd_only = {}
if best_params['solver'] == 'sgd':
    sgd_only = {
        'learning_rate': best_params['learning_rate'],
        'momentum': best_params['momentum'],
    }

for rs in random_states:
    print(f"Training con random_state={rs}...")

    # Fresh model with the best hyperparameters and the current seed.
    nn = MLPRegressor(
        hidden_layer_sizes=best_params['hidden_layer_sizes'],
        activation=best_params['activation'],
        solver=best_params['solver'],
        learning_rate_init=best_params['learning_rate_init'],
        batch_size=batch_size,  # full batch on X_train
        max_iter=1,  # note
        warm_start=True,  # note
        shuffle=True,
        random_state=rs,
        verbose=False,
        **sgd_only,
    )

    train_mse_list, val_mse_list = [], []
    train_acc_list, val_acc_list = [], []

    for epoch in range(epochs):
        nn.partial_fit(X_train, y_train)

        train_output = nn.predict(X_train)
        val_output = nn.predict(X_val)

        # MSE on the raw outputs, accuracy on the rounded outputs.
        train_mse_list.append(mean_squared_error(y_train, train_output))
        val_mse_list.append(mean_squared_error(y_val, val_output))
        train_acc_list.append(accuracy_score(y_train, train_output.round()))
        val_acc_list.append(accuracy_score(y_val, val_output.round()))

    # Keep this seed's histories for the averages computed below.
    all_train_mse.append(train_mse_list)
    all_val_mse.append(val_mse_list)
    all_train_acc.append(train_acc_list)
    all_val_acc.append(val_acc_list)

    # One figure per seed: MSE on the left, accuracy on the right.
    fig, ax1 = plt.subplots(1, 2, figsize=(12, 5))
    seed_panels = [
        (ax1[0], "MSE", train_mse_list, "Train MSE", val_mse_list,
         "Validation MSE", f"MSE MONK-1 (random_state={rs})"),
        (ax1[1], "Accuracy", train_acc_list, "Train Accuracy", val_acc_list,
         "Validation Accuracy", f"Accuracy MONK-1 (random_state={rs})"),
    ]
    for axis, y_label, train_curve, train_label, val_curve, val_label, title in seed_panels:
        axis.plot(train_curve, label=train_label, color="blue")
        axis.plot(val_curve, label=val_label, color="green")
        axis.set_xlabel("Epochs")
        axis.set_ylabel(y_label)
        axis.set_title(title)
        axis.legend()
        axis.grid(True)

    plt.tight_layout()
    plt.show()

    seed_train_accuracy = accuracy_score(y_train, nn.predict(X_train).round())
    seed_val_accuracy = accuracy_score(y_val, nn.predict(X_val).round())
    print(F'Training Accuracy: {seed_train_accuracy:.4f}')
    print(f'Validation Accuracy: {seed_val_accuracy:.4f}')
    print(f'Best params: {best_params}')

# Epoch-wise averages over the five seeds.
mean_train_mse = np.mean(all_train_mse, axis=0)
mean_val_mse = np.mean(all_val_mse, axis=0)
mean_train_acc = np.mean(all_train_acc, axis=0)
mean_val_acc = np.mean(all_val_acc, axis=0)

# One figure per averaged metric.
mean_panels = [
    ("MSE", mean_train_mse, "Mean Train MSE", mean_val_mse,
     "Mean Validation MSE", "Media MSE su 5 random_state MONK-1"),
    ("Accuracy", mean_train_acc, "Mean Train Accuracy", mean_val_acc,
     "Mean Validation Accuracy", "Media Accuracy su 5 random_state MONK-1"),
]
for y_label, train_curve, train_label, val_curve, val_label, title in mean_panels:
    plt.figure(figsize=(10, 5))
    plt.plot(train_curve, label=train_label, color="blue")
    plt.plot(val_curve, label=val_label, color="green")
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

### Best model MONK-1

In [None]:
# Final MONK-1 configuration, hard-coded from the model-selection runs.
best_params = {'activation': 'tanh',
               'hidden_layer_sizes': (3,),
               'learning_rate_init': 0.02,
               'solver': 'adam'}

# Build the final network. The following cell trains it with partial_fit on
# the whole development set (X_dev), so the batch size is taken from X_dev:
# reusing the size of the smaller X_train split would make every epoch run
# two mini-batches instead of one full-batch update.
nn = MLPRegressor(
    hidden_layer_sizes=best_params['hidden_layer_sizes'],
    activation=best_params['activation'],
    solver=best_params['solver'],
    learning_rate_init=best_params['learning_rate_init'],
    batch_size=X_dev.shape[0],  # full batch over the development set
    max_iter=1,  # note: one iteration per fit call
    warm_start=True,  # note: keep the learned weights between calls
    shuffle=True,
    random_state=7,  # seed of the best model found by model selection
    verbose=False,
)

In [None]:
# MODEL TRAINING
# One partial_fit call per epoch on the development set, recording MSE and
# accuracy on both the development and the test set after every epoch.

epochs = 250

# Per-epoch metric histories.
train_loss_curve, test_loss_curve = [], []
train_accuracy_curve, test_accuracy_curve = [], []

for epoch in range(epochs):
    nn.partial_fit(X_dev, y_dev)

    dev_output = nn.predict(X_dev)
    test_output = nn.predict(X_test)

    # Development-set metrics (outputs rounded to get class labels).
    train_loss = mean_squared_error(y_dev, dev_output)
    train_acc = accuracy_score(y_dev, dev_output.round())
    train_loss_curve.append(train_loss)
    train_accuracy_curve.append(train_acc)

    # Test-set metrics.
    test_loss = mean_squared_error(y_test, test_output)
    test_acc = accuracy_score(y_test, test_output.round())
    test_loss_curve.append(test_loss)
    test_accuracy_curve.append(test_acc)

    # Progress line every 50 epochs.
    if (epoch + 1) % 50:
        continue
    print(f"Epoca {epoch + 1}/{epochs}, Train MSE: {train_loss:.5f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test MSE: {test_loss:.5f}")

In [None]:
# Side-by-side learning curves: MSE on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# (axis, y label, train curve, train label, test curve, test label,
#  legend position, title)
panels = [
    (ax1, "MSE", train_loss_curve, "Train Loss",
     test_loss_curve, "Test Loss", "upper right", "MONK-1 Loss"),
    (ax2, "Accuracy", train_accuracy_curve, "Train Accuracy",
     test_accuracy_curve, "Test Accuracy", "lower right", "MONK-1 Accuracy"),
]

for axis, y_label, train_curve, train_label, test_curve, test_label, legend_loc, title in panels:
    axis.plot(train_curve, label=train_label, color="blue", linestyle="solid")
    axis.plot(test_curve, label=test_label, color="red", linestyle="solid")
    axis.set_xlabel("Epoche")
    axis.set_ylabel(y_label)
    axis.set_title(title)
    axis.legend(loc=legend_loc)
    axis.grid(True)

# Tidy up the subplot spacing before showing the figure.
plt.tight_layout()
plt.show()

In [None]:
# Sanity check: recorded curve length versus the configured epoch count.
saved_epochs = len(train_loss_curve)
print(f"Numero di epoche salvate: {saved_epochs}")
print(f"Numero di epoche previste: {epochs}")

In [None]:
# Final report for the MONK-1 model trained on the development set.
print("\nMiglior modello trovato per MONK-1 (random state 7 - 250 epochs):")
print(f"Parametri: {best_params}")

# Regressor outputs are rounded to the nearest integer to obtain labels.
dev_accuracy = accuracy_score(y_dev, nn.predict(X_dev).round())
print(F'Training Accuracy: {dev_accuracy:.4f}')

test_accuracy = accuracy_score(y_test, nn.predict(X_test).round())
print(f"Test Accuracy: {test_accuracy:.4f}")

## Monk 2
MONK-2: EXACTLY TWO of {a1 = 1, a2 = 1, a3 = 1, a4 = 1, a5 = 1, a6 = 1} --> Class label 1 else 0

In [None]:
# Load the MONK-2 training and test files through the project helper
# (presumably returning one-hot encoded frames, judging by its name).
monk_train_2, monk_test_2 = (
    load_monk_data_one_hot_enc("../data/monk+s+problems/monks-2.train"),
    load_monk_data_one_hot_enc("../data/monk+s+problems/monks-2.test"),
)

# Show the first rows as the cell output.
monk_train_2.head()

In [None]:
# Shapes of the MONK-2 training and test frames, one per line.
print(monk_train_2.shape, monk_test_2.shape, sep="\n")

In [None]:
# Column 0 holds the target; every remaining column is an input feature.
y_dev = monk_train_2.iloc[:, 0]
X_dev = monk_train_2.iloc[:, 1:]

y_test = monk_test_2.iloc[:, 0]
X_test = monk_test_2.iloc[:, 1:]

In [None]:
# Shapes of the development and test features/targets, one per line.
print(X_dev.shape, y_dev.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Hold out 20% of the development set as validation data (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_dev,
    y_dev,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training and validation features/targets, one per line.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, sep="\n")

In [None]:
# One batch spans every training row (full-batch updates on X_train).
batch_size = len(X_train)
print(batch_size)

In [None]:
# Grid search over MLP hyperparameters for MONK-2.
#
# Two sub-grids are used so that 'learning_rate' and 'momentum' are only
# generated (and therefore only passed to the model) for the SGD solver.
iperparameters = [
    {
        'hidden_layer_sizes': [(2,), (3,), (4,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['adam'],
        'learning_rate_init': [0.01, 0.02, 0.05, 0.001, 0.002],
    },
    {
        'hidden_layer_sizes': [(2,), (3,), (4,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['sgd'],
        'learning_rate_init': [0.01, 0.02, 0.05, 0.001, 0.002],
        'learning_rate': ['constant', 'adaptive'],  # SGD only
        'momentum': [0.9, 0.95],  # SGD only
    },
]

# Running record of the best configuration seen so far.
best_model = None
best_train_acc = float('-inf')
best_val_acc = float('-inf')
best_train_mse = float('inf')
best_val_mse = float('inf')
best_params = None

# Try every hyperparameter combination.
for params in ParameterGrid(iperparameters):
    # Each `params` dict only holds valid MLPRegressor keyword arguments, so
    # it can be unpacked directly; the SGD-only keys are present only when
    # solver == 'sgd'.
    nn = MLPRegressor(
        **params,
        batch_size=batch_size,  # full batch on X_train
        max_iter=500,
        shuffle=True,
        random_state=7,
        verbose=False,
    )

    # Fit on the training split only; validation data is used for selection.
    nn.fit(X_train, y_train)

    y_pred_train = nn.predict(X_train)
    y_pred_val = nn.predict(X_val)

    train_mse = mean_squared_error(y_train, y_pred_train)
    val_mse = mean_squared_error(y_val, y_pred_val)

    # The regressor output is rounded to the nearest integer to obtain a
    # class label; the true labels are already discrete.
    train_acc = accuracy_score(y_train, y_pred_train.round())
    val_acc = accuracy_score(y_val, y_pred_val.round())

    # Select on validation accuracy, breaking ties with the lower validation
    # MSE. The validation set is small, so many configurations reach the same
    # accuracy; without the tie-break the winner would just be whichever of
    # them comes first in grid order.
    if (val_acc, -val_mse) > (best_val_acc, -best_val_mse):
        best_train_mse = train_mse
        best_val_mse = val_mse
        best_train_acc = train_acc
        best_val_acc = val_acc
        best_model = nn
        best_params = params

In [None]:
# Report the grid-search winner and its accuracy on every split.
print("\nMiglior modello trovato per MONK-2 (random state 7 - 500 epochs):")
print(f"Parametri: {best_params}")

# Regressor outputs are rounded to the nearest integer to obtain labels.
train_accuracy = accuracy_score(y_train, best_model.predict(X_train).round())
print(F'Training Accuracy: {train_accuracy:.4f}')

val_accuracy = accuracy_score(y_val, best_model.predict(X_val).round())
print(f'Validation Accuracy: {val_accuracy:.4f}')

test_accuracy = accuracy_score(y_test, best_model.predict(X_test).round())
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Build a fresh network with the hyperparameters picked by the grid search.
# The following cell trains it epoch by epoch with partial_fit on the whole
# development set (X_dev), so the batch size is taken from X_dev: reusing the
# size of the smaller X_train split would make every epoch run two
# mini-batches instead of one full-batch update.
sgd_only = (
    {'learning_rate': best_params['learning_rate'],
     'momentum': best_params['momentum']}
    if best_params['solver'] == 'sgd'
    else {}
)

nn = MLPRegressor(
    hidden_layer_sizes=best_params['hidden_layer_sizes'],
    activation=best_params['activation'],
    solver=best_params['solver'],
    learning_rate_init=best_params['learning_rate_init'],
    batch_size=X_dev.shape[0],  # full batch over the development set
    **sgd_only,  # learning_rate / momentum only apply to the 'sgd' solver
    max_iter=1,  # note: one iteration per fit call
    warm_start=True,  # note: keep the learned weights between calls
    shuffle=True,
    random_state=7,
    verbose=False,
)

In [None]:
# MODEL TRAINING
# One partial_fit call per epoch on the development set, recording MSE and
# accuracy on both the development and the test set after every epoch.

epochs = 500

# Per-epoch metric histories.
train_loss_curve, test_loss_curve = [], []
train_accuracy_curve, test_accuracy_curve = [], []

for epoch in range(epochs):
    nn.partial_fit(X_dev, y_dev)

    dev_output = nn.predict(X_dev)
    test_output = nn.predict(X_test)

    # Development-set metrics (outputs rounded to get class labels).
    train_loss = mean_squared_error(y_dev, dev_output)
    train_acc = accuracy_score(y_dev, dev_output.round())
    train_loss_curve.append(train_loss)
    train_accuracy_curve.append(train_acc)

    # Test-set metrics.
    test_loss = mean_squared_error(y_test, test_output)
    test_acc = accuracy_score(y_test, test_output.round())
    test_loss_curve.append(test_loss)
    test_accuracy_curve.append(test_acc)

    # Progress line every 50 epochs.
    if (epoch + 1) % 50:
        continue
    print(f"Epoca {epoch + 1}/{epochs}, Train MSE: {train_loss:.5f}, Train Acc: {train_acc:.4f}")

In [None]:
# Side-by-side learning curves: MSE on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# (axis, y label, train curve, train label, test curve, test label,
#  legend position, title)
panels = [
    (ax1, "MSE", train_loss_curve, "Train Loss",
     test_loss_curve, "Test Loss", "upper right", "MONK-2 Loss"),
    (ax2, "Accuracy", train_accuracy_curve, "Train Accuracy",
     test_accuracy_curve, "Test Accuracy", "lower right", "MONK-2 Accuracy"),
]

for axis, y_label, train_curve, train_label, test_curve, test_label, legend_loc, title in panels:
    axis.plot(train_curve, label=train_label, color="blue", linestyle="solid")
    axis.plot(test_curve, label=test_label, color="red", linestyle="solid")
    axis.set_xlabel("Epoche")
    axis.set_ylabel(y_label)
    axis.set_title(title)
    axis.legend(loc=legend_loc)
    axis.grid(True)

# Tidy up the subplot spacing before showing the figure.
plt.tight_layout()
plt.show()

In [None]:
# Final accuracy of the retrained model on the development and test sets.
dev_accuracy = accuracy_score(y_dev, nn.predict(X_dev).round())
print(F'Training Accuracy: {dev_accuracy:.4f}')

test_accuracy = accuracy_score(y_test, nn.predict(X_test).round())
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Five runs with five different random_state values (MODEL SELECTION).
# Each run trains on X_train epoch by epoch, tracks train/validation MSE and
# accuracy, plots them, and finally the curves are averaged over the seeds.

epochs = 500

# Seeds of the five initialisations.
random_states = [7, 18, 28, 31, 42]

# One history (list of per-epoch values) per seed, kept for the averages.
all_train_mse, all_val_mse = [], []
all_train_acc, all_val_acc = [], []

# learning_rate and momentum are only passed when the solver is 'sgd'.
sgd_only = {}
if best_params['solver'] == 'sgd':
    sgd_only = {
        'learning_rate': best_params['learning_rate'],
        'momentum': best_params['momentum'],
    }

for rs in random_states:
    print(f"Training con random_state={rs}...")

    # Fresh model with the best hyperparameters and the current seed.
    nn = MLPRegressor(
        hidden_layer_sizes=best_params['hidden_layer_sizes'],
        activation=best_params['activation'],
        solver=best_params['solver'],
        learning_rate_init=best_params['learning_rate_init'],
        batch_size=batch_size,  # full batch on X_train
        max_iter=1,  # note
        warm_start=True,  # note
        shuffle=True,
        random_state=rs,
        verbose=False,
        **sgd_only,
    )

    train_mse_list, val_mse_list = [], []
    train_acc_list, val_acc_list = [], []

    for epoch in range(epochs):
        nn.partial_fit(X_train, y_train)

        train_output = nn.predict(X_train)
        val_output = nn.predict(X_val)

        # MSE on the raw outputs, accuracy on the rounded outputs.
        train_mse_list.append(mean_squared_error(y_train, train_output))
        val_mse_list.append(mean_squared_error(y_val, val_output))
        train_acc_list.append(accuracy_score(y_train, train_output.round()))
        val_acc_list.append(accuracy_score(y_val, val_output.round()))

    # Keep this seed's histories for the averages computed below.
    all_train_mse.append(train_mse_list)
    all_val_mse.append(val_mse_list)
    all_train_acc.append(train_acc_list)
    all_val_acc.append(val_acc_list)

    # One figure per seed: MSE on the left, accuracy on the right.
    fig, ax1 = plt.subplots(1, 2, figsize=(12, 5))
    seed_panels = [
        (ax1[0], "MSE", train_mse_list, "Train MSE", val_mse_list,
         "Validation MSE", f"MSE MONK-2 (random_state={rs})"),
        (ax1[1], "Accuracy", train_acc_list, "Train Accuracy", val_acc_list,
         "Validation Accuracy", f"Accuracy MONK-2 (random_state={rs})"),
    ]
    for axis, y_label, train_curve, train_label, val_curve, val_label, title in seed_panels:
        axis.plot(train_curve, label=train_label, color="blue")
        axis.plot(val_curve, label=val_label, color="green")
        axis.set_xlabel("Epochs")
        axis.set_ylabel(y_label)
        axis.set_title(title)
        axis.legend()
        axis.grid(True)

    plt.tight_layout()
    plt.show()

    seed_train_accuracy = accuracy_score(y_train, nn.predict(X_train).round())
    seed_val_accuracy = accuracy_score(y_val, nn.predict(X_val).round())
    print(F'Training Accuracy: {seed_train_accuracy:.4f}')
    print(f'Validation Accuracy: {seed_val_accuracy:.4f}')
    print(f'Best params: {best_params}')

# Epoch-wise averages over the five seeds.
mean_train_mse = np.mean(all_train_mse, axis=0)
mean_val_mse = np.mean(all_val_mse, axis=0)
mean_train_acc = np.mean(all_train_acc, axis=0)
mean_val_acc = np.mean(all_val_acc, axis=0)

# One figure per averaged metric.
mean_panels = [
    ("MSE", mean_train_mse, "Mean Train MSE", mean_val_mse,
     "Mean Validation MSE", "Media MSE su 5 random_state MONK-2"),
    ("Accuracy", mean_train_acc, "Mean Train Accuracy", mean_val_acc,
     "Mean Validation Accuracy", "Media Accuracy su 5 random_state MONK-2"),
]
for y_label, train_curve, train_label, val_curve, val_label, title in mean_panels:
    plt.figure(figsize=(10, 5))
    plt.plot(train_curve, label=train_label, color="blue")
    plt.plot(val_curve, label=val_label, color="green")
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

### Best model MONK-2

In [None]:
# Final MONK-2 configuration, hard-coded from the model-selection runs.
best_params = {'activation': 'tanh',
               'hidden_layer_sizes': (2,),
               'learning_rate_init': 0.01,
               'solver': 'adam'}

# Build the final network. The following cell trains it with partial_fit on
# the whole development set (X_dev), so the batch size is taken from X_dev:
# reusing the size of the smaller X_train split would make every epoch run
# two mini-batches instead of one full-batch update.
nn = MLPRegressor(
    hidden_layer_sizes=best_params['hidden_layer_sizes'],
    activation=best_params['activation'],
    solver=best_params['solver'],
    learning_rate_init=best_params['learning_rate_init'],
    batch_size=X_dev.shape[0],  # full batch over the development set
    max_iter=1,  # note: one iteration per fit call
    warm_start=True,  # note: keep the learned weights between calls
    shuffle=True,
    random_state=28,  # seed of the best model found by model selection
    verbose=False,
)

In [None]:
# MODEL TRAINING
# One partial_fit call per epoch on the development set, recording MSE and
# accuracy on both the development and the test set after every epoch.

epochs = 300

# Per-epoch metric histories.
train_loss_curve, test_loss_curve = [], []
train_accuracy_curve, test_accuracy_curve = [], []

for epoch in range(epochs):
    nn.partial_fit(X_dev, y_dev)

    dev_output = nn.predict(X_dev)
    test_output = nn.predict(X_test)

    # Development-set metrics (outputs rounded to get class labels).
    train_loss = mean_squared_error(y_dev, dev_output)
    train_acc = accuracy_score(y_dev, dev_output.round())
    train_loss_curve.append(train_loss)
    train_accuracy_curve.append(train_acc)

    # Test-set metrics.
    test_loss = mean_squared_error(y_test, test_output)
    test_acc = accuracy_score(y_test, test_output.round())
    test_loss_curve.append(test_loss)
    test_accuracy_curve.append(test_acc)

    # Progress line every 50 epochs.
    if (epoch + 1) % 50:
        continue
    print(f"Epoca {epoch + 1}/{epochs}, Train MSE: {train_loss:.5f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test MSE: {test_loss:.5f}")

In [None]:
# Side-by-side learning curves: MSE on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# (axis, y label, train curve, train label, test curve, test label,
#  legend position, title)
panels = [
    (ax1, "MSE", train_loss_curve, "Train Loss",
     test_loss_curve, "Test Loss", "upper right", "MONK-2 Loss"),
    (ax2, "Accuracy", train_accuracy_curve, "Train Accuracy",
     test_accuracy_curve, "Test Accuracy", "lower right", "MONK-2 Accuracy"),
]

for axis, y_label, train_curve, train_label, test_curve, test_label, legend_loc, title in panels:
    axis.plot(train_curve, label=train_label, color="blue", linestyle="solid")
    axis.plot(test_curve, label=test_label, color="red", linestyle="solid")
    axis.set_xlabel("Epoche")
    axis.set_ylabel(y_label)
    axis.set_title(title)
    axis.legend(loc=legend_loc)
    axis.grid(True)

# Tidy up the subplot spacing before showing the figure.
plt.tight_layout()
plt.show()

In [None]:
# Final report for the MONK-2 model trained on the development set.
print("\nMiglior modello trovato per MONK-2 (random state 28 - 300 epochs):")
print(f"Parametri: {best_params}")

# Regressor outputs are rounded to the nearest integer to obtain labels.
dev_accuracy = accuracy_score(y_dev, nn.predict(X_dev).round())
print(F'Training Accuracy: {dev_accuracy:.4f}')

test_accuracy = accuracy_score(y_test, nn.predict(X_test).round())
print(f"Test Accuracy: {test_accuracy:.4f}")

## Monk 3
MONK-3: (a5 = 3 and a4 = 1) or (a5 /= 4 and a2 /= 3)
           (5% class noise added to the training set) --> Class label 1 else 0

## Senza regolarizzazione

In [None]:
# Load the MONK-3 training and test files through the project helper
# (presumably returning one-hot encoded frames, judging by its name).
monk_train_3, monk_test_3 = (
    load_monk_data_one_hot_enc("../data/monk+s+problems/monks-3.train"),
    load_monk_data_one_hot_enc("../data/monk+s+problems/monks-3.test"),
)

# Show the first 5 records as the cell output.
monk_train_3.head()

In [None]:
# Shapes of the MONK-3 training and test frames, one per line.
print(monk_train_3.shape, monk_test_3.shape, sep="\n")

In [None]:
# Column 0 holds the target; every remaining column is an input feature.
y_dev = monk_train_3.iloc[:, 0]
X_dev = monk_train_3.iloc[:, 1:]

y_test = monk_test_3.iloc[:, 0]
X_test = monk_test_3.iloc[:, 1:]

In [None]:
# Shapes of the development and test features/targets, one per line.
print(X_dev.shape, y_dev.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Hold out 20% of the development set as validation data (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_dev,
    y_dev,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training and validation features/targets, one per line.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, sep="\n")

In [None]:
# One batch spans every training row (full-batch updates on X_train).
batch_size = len(X_train)
print(batch_size)

In [None]:
# Grid search over MLP hyperparameters for MONK-3 without regularization
# (alpha is fixed to 0).
#
# Two sub-grids are used so that 'learning_rate' and 'momentum' are only
# generated (and therefore only passed to the model) for the SGD solver.
iperparameters = [
    {
        'hidden_layer_sizes': [(2,), (3,), (4,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['adam'],
        'learning_rate_init': [0.01, 0.02, 0.05, 0.001, 0.002],
        'alpha': [0],  # no regularization
    },
    {
        'hidden_layer_sizes': [(2,), (3,), (4,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['sgd'],
        'learning_rate_init': [0.01, 0.02, 0.05, 0.001, 0.002],
        'alpha': [0],  # no regularization
        'learning_rate': ['constant', 'adaptive'],  # SGD only
        'momentum': [0.9, 0.95],  # SGD only
    },
]

# Running record of the best configuration seen so far.
best_model = None
best_train_acc = float('-inf')
best_val_acc = float('-inf')
best_train_mse = float('inf')
best_val_mse = float('inf')
best_params = None

# Try every hyperparameter combination.
for params in ParameterGrid(iperparameters):
    # Each `params` dict only holds valid MLPRegressor keyword arguments, so
    # it can be unpacked directly; the SGD-only keys are present only when
    # solver == 'sgd'.
    nn = MLPRegressor(
        **params,
        batch_size=batch_size,  # full batch on X_train
        max_iter=500,
        shuffle=True,
        random_state=7,
        verbose=False,
    )

    # Fit on the training split only; validation data is used for selection.
    nn.fit(X_train, y_train)

    y_pred_train = nn.predict(X_train)
    y_pred_val = nn.predict(X_val)

    train_mse = mean_squared_error(y_train, y_pred_train)
    val_mse = mean_squared_error(y_val, y_pred_val)

    # The regressor output is rounded to the nearest integer to obtain a
    # class label; the true labels are already discrete.
    train_acc = accuracy_score(y_train, y_pred_train.round())
    val_acc = accuracy_score(y_val, y_pred_val.round())

    # Select on validation accuracy, breaking ties with the lower validation
    # MSE. The validation set is small, so many configurations reach the same
    # accuracy; without the tie-break the winner would just be whichever of
    # them comes first in grid order.
    if (val_acc, -val_mse) > (best_val_acc, -best_val_mse):
        best_train_mse = train_mse
        best_val_mse = val_mse
        best_train_acc = train_acc
        best_val_acc = val_acc
        best_model = nn
        best_params = params

In [None]:
# Evaluation of the model selected by the grid search
print("\nMiglior modello trovato per MONK-3 (No Regularization) (random state 7 - 500 epochs):")
print(f"Parametri: {best_params}")

# Threshold the continuous outputs at 0.5 to obtain 0/1 labels; rounding would
# turn out-of-range outputs into invalid labels (2 or -1) that always count as errors.
train_labels = (best_model.predict(X_train) > 0.5).astype(int)
val_labels = (best_model.predict(X_val) > 0.5).astype(int)
test_labels = (best_model.predict(X_test) > 0.5).astype(int)

print(f'Training Accuracy: {accuracy_score(y_train, train_labels):.4f}')
print(f'Validation Accuracy: {accuracy_score(y_val, val_labels):.4f}')
print(f"Test Accuracy: {accuracy_score(y_test, test_labels):.4f}")

In [None]:
# Build a fresh model with the best hyperparameters found by the grid search.
# It is trained one epoch at a time with partial_fit on the whole development set.
nn = MLPRegressor(
        hidden_layer_sizes=best_params['hidden_layer_sizes'],
        activation=best_params['activation'],
        solver=best_params['solver'],
        learning_rate_init=best_params['learning_rate_init'],
        alpha=best_params['alpha'],
        # Full batch on the data this model is trained on (X_dev). Reusing the
        # X_train row count here would split every epoch into two uneven
        # mini-batches, because X_dev has more rows than X_train.
        batch_size=X_dev.shape[0],
        **({'learning_rate': best_params['learning_rate']} if best_params['solver'] == 'sgd' else {}),  # SGD only
        **({'momentum': best_params['momentum']} if best_params['solver'] == 'sgd' else {}),  # SGD only
        max_iter=1,  # one pass per call
        warm_start=True,  # keep the learned weights between calls
        shuffle=True,
        random_state=7,
        verbose=False
      )

In [None]:
# MODEL TRAINING: retrain on the full development set, tracking per-epoch metrics

# Number of epochs
epochs = 500

# Per-epoch history of loss and accuracy
train_loss_curve = []
test_loss_curve = []
train_accuracy_curve = []
test_accuracy_curve = []

# One partial_fit call per epoch
for epoch in range(epochs):
    nn.partial_fit(X_dev, y_dev)

    # Continuous outputs on the development and test sets
    y_dev_pred = nn.predict(X_dev)
    y_test_pred = nn.predict(X_test)

    # MSE on the continuous outputs
    train_loss = mean_squared_error(y_dev, y_dev_pred)
    test_loss = mean_squared_error(y_test, y_test_pred)

    # Accuracy on 0/1 labels obtained with a 0.5 threshold. Rounding would map
    # outputs >= 1.5 or < -0.5 to invalid labels that always count as errors.
    train_acc = accuracy_score(y_dev, (y_dev_pred > 0.5).astype(int))
    test_acc = accuracy_score(y_test, (y_test_pred > 0.5).astype(int))

    # Store this epoch's metrics
    train_loss_curve.append(train_loss)
    test_loss_curve.append(test_loss)
    train_accuracy_curve.append(train_acc)
    test_accuracy_curve.append(test_acc)

    # Progress report every 50 epochs
    if (epoch + 1) % 50 == 0:
        print(f"Epoca {epoch + 1}/{epochs}, Train MSE: {train_loss:.5f}, Train Acc: {train_acc:.4f}")

In [None]:
# Side-by-side learning curves: loss on the left, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# One row per panel: axis, y label, title, legend position, then (curve, label, color) triples
panels = (
    (ax1, "MSE", "MONK-3 Loss (No Regularization)", "upper right",
     ((train_loss_curve, "Train Loss", "blue"), (test_loss_curve, "Test Loss", "red"))),
    (ax2, "Accuracy", "MONK-3 Accuracy (No Regularization)", "lower right",
     ((train_accuracy_curve, "Train Accuracy", "blue"), (test_accuracy_curve, "Test Accuracy", "red"))),
)

for axis, y_label, title, legend_loc, curves in panels:
    axis.set_xlabel("Epoche")
    axis.set_ylabel(y_label)
    for values, label, color in curves:
        axis.plot(values, label=label, color=color, linestyle="solid")
    axis.legend(loc=legend_loc)
    axis.set_title(title)
    axis.grid(True)

# Tidy up the spacing between the two panels
plt.tight_layout()
plt.show()

In [None]:
# Final accuracy of the retrained model. Outputs are thresholded at 0.5 because
# rounding would turn out-of-range outputs into invalid labels (2 or -1).
dev_labels = (nn.predict(X_dev) > 0.5).astype(int)
test_labels = (nn.predict(X_test) > 0.5).astype(int)
print(f'Training Accuracy: {accuracy_score(y_dev, dev_labels):.4f}')
print(f"Test Accuracy: {accuracy_score(y_test, test_labels):.4f}")

In [None]:
# Five runs with five different random_state values (MODEL SELECTION).
# Each run trains on X_train, is monitored on X_val, and gets its own plots;
# the curves are then averaged across the runs.

epochs = 500

# Per-run metric histories (one list of length `epochs` per seed)
all_train_mse = []
all_val_mse = []
all_train_acc = []
all_val_acc = []

# Seeds to compare
random_states = [7, 18, 28, 31, 42]

for rs in random_states:
    print(f"Training con random_state={rs}...")

    # Fresh model with the best hyperparameters and the current seed
    nn = MLPRegressor(
        hidden_layer_sizes=best_params['hidden_layer_sizes'],
        activation=best_params['activation'],
        solver=best_params['solver'],
        learning_rate_init=best_params['learning_rate_init'],
        alpha=best_params['alpha'],
        batch_size=batch_size,  # full batch: all rows of X_train
        **({'learning_rate': best_params['learning_rate']} if best_params['solver'] == 'sgd' else {}),  # SGD only
        **({'momentum': best_params['momentum']} if best_params['solver'] == 'sgd' else {}),  # SGD only
        max_iter=1,  # one pass per call
        warm_start=True,  # keep the learned weights between calls
        shuffle=True,
        random_state=rs,
        verbose=False
      )

    train_mse_list = []
    val_mse_list = []
    train_acc_list = []
    val_acc_list = []

    for epoch in range(epochs):
        nn.partial_fit(X_train, y_train)

        # Continuous outputs on the training and validation splits
        y_train_pred = nn.predict(X_train)
        y_val_pred = nn.predict(X_val)

        # MSE on the continuous outputs
        train_mse = mean_squared_error(y_train, y_train_pred)
        val_mse = mean_squared_error(y_val, y_val_pred)

        # Accuracy on 0/1 labels obtained with a 0.5 threshold. Rounding would map
        # outputs >= 1.5 or < -0.5 to invalid labels that always count as errors.
        train_acc = accuracy_score(y_train, (y_train_pred > 0.5).astype(int))
        val_acc = accuracy_score(y_val, (y_val_pred > 0.5).astype(int))

        # Store this epoch's metrics
        train_mse_list.append(train_mse)
        val_mse_list.append(val_mse)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    # Keep this run's curves for the across-seed average
    all_train_mse.append(train_mse_list)
    all_val_mse.append(val_mse_list)
    all_train_acc.append(train_acc_list)
    all_val_acc.append(val_acc_list)

    # Figure with two subplots for this seed
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # MSE panel
    axes[0].plot(train_mse_list, label="Train MSE", color="blue")
    axes[0].plot(val_mse_list, label="Validation MSE", color="green")
    axes[0].set_xlabel("Epochs")
    axes[0].set_ylabel("MSE")
    axes[0].set_title(f"MSE MONK-3 (random_state={rs})")
    axes[0].legend()
    axes[0].grid(True)

    # Accuracy panel
    axes[1].plot(train_acc_list, label="Train Accuracy", color="blue")
    axes[1].plot(val_acc_list, label="Validation Accuracy", color="green")
    axes[1].set_xlabel("Epochs")
    axes[1].set_ylabel("Accuracy")
    axes[1].set_title(f"Accuracy MONK-3 (random_state={rs})")
    axes[1].legend()
    axes[1].grid(True)

    # Show both panels together
    plt.tight_layout()
    plt.show()

    # Final accuracies for this seed (same 0.5 threshold as above)
    final_train_labels = (nn.predict(X_train) > 0.5).astype(int)
    final_val_labels = (nn.predict(X_val) > 0.5).astype(int)
    print(f'Training Accuracy: {accuracy_score(y_train, final_train_labels):.4f}')
    print(f'Validation Accuracy: {accuracy_score(y_val, final_val_labels):.4f}')
    print(f'Best params: {best_params}')

# Epoch-wise means across the runs
mean_train_mse = np.mean(all_train_mse, axis=0)
mean_val_mse = np.mean(all_val_mse, axis=0)
mean_train_acc = np.mean(all_train_acc, axis=0)
mean_val_acc = np.mean(all_val_acc, axis=0)

# Plot of the mean MSE curves
plt.figure(figsize=(10, 5))
plt.plot(mean_train_mse, label="Mean Train MSE", color="blue")
plt.plot(mean_val_mse, label="Mean Validation MSE", color="green")
plt.xlabel("Epochs")
plt.ylabel("MSE")
plt.title("Media MSE su 5 random_state MONK-3")
plt.legend()
plt.grid(True)
plt.show()

# Plot of the mean accuracy curves
plt.figure(figsize=(10, 5))
plt.plot(mean_train_acc, label="Mean Train Accuracy", color="blue")
plt.plot(mean_val_acc, label="Mean Validation Accuracy", color="green")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("Media Accuracy su 5 random_state MONK-3")
plt.legend()
plt.grid(True)
plt.show()

### Best model MONK-3 (No Regularization)

In [None]:
# Hyperparameters of the final MONK-3 model without regularization (hard-coded)
best_params = {'activation': 'relu',
 'alpha': 0,
 'hidden_layer_sizes': (4,),
 'learning_rate_init': 0.001,
 'solver': 'adam'}

# Build the final model; it is trained epoch by epoch with partial_fit on X_dev
nn = MLPRegressor(
        hidden_layer_sizes=best_params['hidden_layer_sizes'],
        activation=best_params['activation'],
        solver=best_params['solver'],
        learning_rate_init=best_params['learning_rate_init'],
        alpha=best_params['alpha'],
        # Full batch on the data this model is trained on (X_dev). Reusing the
        # X_train row count here would split every epoch into two uneven
        # mini-batches, because X_dev has more rows than X_train.
        batch_size=X_dev.shape[0],
        max_iter=1,  # one pass per call
        warm_start=True,  # keep the learned weights between calls
        shuffle=True,
        random_state=7,  # seed of the best model from model selection
        verbose=False
      )

In [None]:
# MODEL TRAINING: final model on the full development set, tracking per-epoch metrics

# Number of epochs
epochs = 500

# Per-epoch history of loss and accuracy
train_loss_curve = []
test_loss_curve = []
train_accuracy_curve = []
test_accuracy_curve = []

# One partial_fit call per epoch
for epoch in range(epochs):
    nn.partial_fit(X_dev, y_dev)

    # Continuous outputs on the development and test sets
    y_dev_pred = nn.predict(X_dev)
    y_test_pred = nn.predict(X_test)

    # MSE on the continuous outputs
    train_loss = mean_squared_error(y_dev, y_dev_pred)
    test_loss = mean_squared_error(y_test, y_test_pred)

    # Accuracy on 0/1 labels obtained with a 0.5 threshold. Rounding would map
    # outputs >= 1.5 or < -0.5 to invalid labels that always count as errors.
    train_acc = accuracy_score(y_dev, (y_dev_pred > 0.5).astype(int))
    test_acc = accuracy_score(y_test, (y_test_pred > 0.5).astype(int))

    # Store this epoch's metrics
    train_loss_curve.append(train_loss)
    test_loss_curve.append(test_loss)
    train_accuracy_curve.append(train_acc)
    test_accuracy_curve.append(test_acc)

    # Progress report every 50 epochs
    if (epoch + 1) % 50 == 0:
        print(f"Epoca {epoch + 1}/{epochs}, Train MSE: {train_loss:.5f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test MSE: {test_loss:.5f}")

In [None]:
# Side-by-side learning curves of the final model: loss on the left, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# One row per panel: axis, y label, title, legend position, then (curve, label, color) triples
panels = (
    (ax1, "MSE", "MONK-3 Loss (No Regularization)", "upper right",
     ((train_loss_curve, "Train Loss", "blue"), (test_loss_curve, "Test Loss", "red"))),
    (ax2, "Accuracy", "MONK-3 Accuracy (No Regularization)", "lower right",
     ((train_accuracy_curve, "Train Accuracy", "blue"), (test_accuracy_curve, "Test Accuracy", "red"))),
)

for axis, y_label, title, legend_loc, curves in panels:
    axis.set_xlabel("Epoche")
    axis.set_ylabel(y_label)
    for values, label, color in curves:
        axis.plot(values, label=label, color=color, linestyle="solid")
    axis.legend(loc=legend_loc)
    axis.set_title(title)
    axis.grid(True)

# Tidy up the spacing between the two panels
plt.tight_layout()
plt.show()

In [None]:
# Evaluation of the final model without regularization
print("\nMiglior modello trovato per MONK-3 (random state 7 - 500 epochs):")
print(f"Parametri: {best_params}")

# Threshold the continuous outputs at 0.5 to obtain 0/1 labels; rounding would
# turn out-of-range outputs into invalid labels (2 or -1) that always count as errors.
dev_labels = (nn.predict(X_dev) > 0.5).astype(int)
test_labels = (nn.predict(X_test) > 0.5).astype(int)

print(f'Training Accuracy: {accuracy_score(y_dev, dev_labels):.4f}')
print(f"Test Accuracy: {accuracy_score(y_test, test_labels):.4f}")

## Con regolarizzazione

In [None]:
# Load the one-hot encoded MONK-3 train and test files
monk_train_3, monk_test_3 = (
    load_monk_data_one_hot_enc(path)
    for path in (
        "../data/monk+s+problems/monks-3.train",
        "../data/monk+s+problems/monks-3.test",
    )
)

# Preview the first 5 training records
monk_train_3.head()

In [None]:
# Report the shape of the train frame, then of the test frame
for frame in (monk_train_3, monk_test_3):
    print(frame.shape)

In [None]:
# Column 0 holds the target; every remaining column is an input feature
X_dev = monk_train_3.iloc[:, 1:]
y_dev = monk_train_3.iloc[:, 0]
X_test = monk_test_3.iloc[:, 1:]
y_test = monk_test_3.iloc[:, 0]

In [None]:
# Sanity check: shapes of development and test features/targets
for part in (X_dev, y_dev, X_test, y_test):
    print(part.shape)

In [None]:
# Hold out 20% of the development set for validation (fixed seed keeps the split reproducible)
split_parts = train_test_split(X_dev, y_dev, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Sanity check: shapes of the training and validation features/targets
for part in (X_train, y_train, X_val, y_val):
    print(part.shape)

In [None]:
# One batch per epoch: the batch size equals the number of training rows
batch_size = len(X_train)
print(batch_size)

In [None]:
# Grid search over MLPRegressor hyperparameters for MONK-3 with L2 regularization.
# Two sub-grids are explored, one per solver, because the learning-rate schedule
# and momentum settings are only forwarded when the solver is SGD.
iperparameters = [
    {
        'hidden_layer_sizes': [(2,), (3,), (4,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['adam'],
        'learning_rate_init': [0.01, 0.02, 0.05, 0.001, 0.002],
        'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01],  # regularization strength
    },
    {
        'hidden_layer_sizes': [(2,), (3,), (4,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['sgd'],
        'learning_rate_init': [0.01, 0.02, 0.05, 0.001, 0.002],
        'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01],  # regularization strength
        'learning_rate': ['constant', 'adaptive'],  # SGD only
        'momentum': [0.9, 0.95],  # SGD only
    },
]

# Running record of the best configuration seen so far
best_model = None
best_train_acc = float('-inf')
best_val_acc = float('-inf')
best_train_mse = float('inf')
best_val_mse = float('inf')
best_params = None

# Try every hyperparameter combination
for params in ParameterGrid(iperparameters):
    # learning_rate and momentum are passed only for the SGD solver
    sgd_kwargs = (
        {'learning_rate': params['learning_rate'], 'momentum': params['momentum']}
        if params['solver'] == 'sgd'
        else {}
    )

    # early_stopping is not passed, so it stays at the estimator default
    nn = MLPRegressor(
        hidden_layer_sizes=params['hidden_layer_sizes'],
        activation=params['activation'],
        solver=params['solver'],
        learning_rate_init=params['learning_rate_init'],
        alpha=params['alpha'],
        batch_size=batch_size,  # full batch: all rows of X_train
        **sgd_kwargs,
        max_iter=500,
        shuffle=True,
        random_state=13,
        verbose=False,
    )

    # Fit on the training split only; the validation split drives the selection
    nn.fit(X_train, y_train)

    # Raw (continuous) regressor outputs
    y_pred_train = nn.predict(X_train)
    y_pred_val = nn.predict(X_val)

    # MSE on the continuous outputs
    train_mse = mean_squared_error(y_train, y_pred_train)
    val_mse = mean_squared_error(y_val, y_pred_val)

    # Turn the continuous outputs into 0/1 labels with a 0.5 threshold.
    # Plain rounding would map outputs >= 1.5 or < -0.5 to the invalid labels
    # 2 / -1 and always count them as errors; the threshold gives the same
    # labels as rounding for outputs inside [-0.5, 1.5).
    train_acc = accuracy_score(y_train, (y_pred_train > 0.5).astype(int))
    val_acc = accuracy_score(y_val, (y_pred_val > 0.5).astype(int))

    # Keep the configuration with the highest validation accuracy
    # (strict comparison: on ties the earliest combination is kept)
    if val_acc > best_val_acc:
        best_train_mse = train_mse
        best_val_mse = val_mse
        best_train_acc = train_acc
        best_val_acc = val_acc
        best_model = nn
        best_params = params

In [None]:
# Evaluation of the model selected by the grid search
print("\nMiglior modello trovato per MONK-3 (Regularized) (random state 13 - 500 epochs):")
print(f"Parametri: {best_params}")

# Threshold the continuous outputs at 0.5 to obtain 0/1 labels; rounding would
# turn out-of-range outputs into invalid labels (2 or -1) that always count as errors.
train_labels = (best_model.predict(X_train) > 0.5).astype(int)
val_labels = (best_model.predict(X_val) > 0.5).astype(int)
test_labels = (best_model.predict(X_test) > 0.5).astype(int)

print(f'Training Accuracy: {accuracy_score(y_train, train_labels):.4f}')
print(f'Validation Accuracy: {accuracy_score(y_val, val_labels):.4f}')
print(f"Test Accuracy: {accuracy_score(y_test, test_labels):.4f}")

In [None]:
# Build a fresh model with the best hyperparameters found by the grid search.
# It is trained one epoch at a time with partial_fit on the whole development set.
# early_stopping is not passed, so it stays at the estimator default.
nn = MLPRegressor(
        hidden_layer_sizes=best_params['hidden_layer_sizes'],
        activation=best_params['activation'],
        solver=best_params['solver'],
        learning_rate_init=best_params['learning_rate_init'],
        alpha=best_params['alpha'],
        # Full batch on the data this model is trained on (X_dev). Reusing the
        # X_train row count here would split every epoch into two uneven
        # mini-batches, because X_dev has more rows than X_train.
        batch_size=X_dev.shape[0],
        **({'learning_rate': best_params['learning_rate']} if best_params['solver'] == 'sgd' else {}),  # SGD only
        **({'momentum': best_params['momentum']} if best_params['solver'] == 'sgd' else {}),  # SGD only
        max_iter=1,  # one pass per call
        warm_start=True,  # keep the learned weights between calls
        shuffle=True,
        random_state=13,
        verbose=False,
      )

In [None]:
# MODEL TRAINING: retrain on the full development set, tracking per-epoch metrics

# Number of epochs
epochs = 500

# Per-epoch history of loss and accuracy
train_loss_curve = []
test_loss_curve = []
train_accuracy_curve = []
test_accuracy_curve = []

# One partial_fit call per epoch
for epoch in range(epochs):
    nn.partial_fit(X_dev, y_dev)

    # Continuous outputs on the development and test sets
    y_dev_pred = nn.predict(X_dev)
    y_test_pred = nn.predict(X_test)

    # MSE on the continuous outputs
    train_loss = mean_squared_error(y_dev, y_dev_pred)
    test_loss = mean_squared_error(y_test, y_test_pred)

    # Accuracy on 0/1 labels obtained with a 0.5 threshold. Rounding would map
    # outputs >= 1.5 or < -0.5 to invalid labels that always count as errors.
    train_acc = accuracy_score(y_dev, (y_dev_pred > 0.5).astype(int))
    test_acc = accuracy_score(y_test, (y_test_pred > 0.5).astype(int))

    # Store this epoch's metrics
    train_loss_curve.append(train_loss)
    test_loss_curve.append(test_loss)
    train_accuracy_curve.append(train_acc)
    test_accuracy_curve.append(test_acc)

    # Progress report every 50 epochs
    if (epoch + 1) % 50 == 0:
        print(f"Epoca {epoch + 1}/{epochs}, Train MSE: {train_loss:.5f}, Train Acc: {train_acc:.4f}")

In [None]:
# Side-by-side learning curves: loss on the left, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# One row per panel: axis, y label, title, legend position, then (curve, label, color) triples
panels = (
    (ax1, "MSE", "MONK-3 Loss (Regularization)", "upper right",
     ((train_loss_curve, "Train Loss", "blue"), (test_loss_curve, "Test Loss", "red"))),
    (ax2, "Accuracy", "MONK-3 Accuracy (Regularization)", "lower right",
     ((train_accuracy_curve, "Train Accuracy", "blue"), (test_accuracy_curve, "Test Accuracy", "red"))),
)

for axis, y_label, title, legend_loc, curves in panels:
    axis.set_xlabel("Epoche")
    axis.set_ylabel(y_label)
    for values, label, color in curves:
        axis.plot(values, label=label, color=color, linestyle="solid")
    axis.legend(loc=legend_loc)
    axis.set_title(title)
    axis.grid(True)

# Tidy up the spacing between the two panels
plt.tight_layout()
plt.show()

In [None]:
# Final accuracy of the retrained model. Outputs are thresholded at 0.5 because
# rounding would turn out-of-range outputs into invalid labels (2 or -1).
dev_labels = (nn.predict(X_dev) > 0.5).astype(int)
test_labels = (nn.predict(X_test) > 0.5).astype(int)
print(f'Training Accuracy: {accuracy_score(y_dev, dev_labels):.4f}')
print(f"Test Accuracy: {accuracy_score(y_test, test_labels):.4f}")

In [None]:
# Five runs with five different random_state values (MODEL SELECTION).
# Each run trains on X_train, is monitored on X_val, and gets its own plots;
# the curves are then averaged across the runs.

epochs = 500

# Per-run metric histories (one list of length `epochs` per seed)
all_train_mse = []
all_val_mse = []
all_train_acc = []
all_val_acc = []

# Seeds to compare
random_states = [7, 13, 27, 31, 42]

for rs in random_states:
    print(f"Training con random_state={rs}...")

    # Fresh model with the best hyperparameters and the current seed
    nn = MLPRegressor(
        hidden_layer_sizes=best_params['hidden_layer_sizes'],
        activation=best_params['activation'],
        solver=best_params['solver'],
        learning_rate_init=best_params['learning_rate_init'],
        alpha=best_params['alpha'],
        batch_size=batch_size,  # full batch: all rows of X_train
        **({'learning_rate': best_params['learning_rate']} if best_params['solver'] == 'sgd' else {}),  # SGD only
        **({'momentum': best_params['momentum']} if best_params['solver'] == 'sgd' else {}),  # SGD only
        max_iter=1,  # one pass per call
        warm_start=True,  # keep the learned weights between calls
        shuffle=True,
        random_state=rs,
        verbose=False
      )

    train_mse_list = []
    val_mse_list = []
    train_acc_list = []
    val_acc_list = []

    for epoch in range(epochs):
        nn.partial_fit(X_train, y_train)

        # Continuous outputs on the training and validation splits
        y_train_pred = nn.predict(X_train)
        y_val_pred = nn.predict(X_val)

        # MSE on the continuous outputs
        train_mse = mean_squared_error(y_train, y_train_pred)
        val_mse = mean_squared_error(y_val, y_val_pred)

        # Accuracy on 0/1 labels obtained with a 0.5 threshold. Rounding would map
        # outputs >= 1.5 or < -0.5 to invalid labels that always count as errors.
        train_acc = accuracy_score(y_train, (y_train_pred > 0.5).astype(int))
        val_acc = accuracy_score(y_val, (y_val_pred > 0.5).astype(int))

        # Store this epoch's metrics
        train_mse_list.append(train_mse)
        val_mse_list.append(val_mse)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    # Keep this run's curves for the across-seed average
    all_train_mse.append(train_mse_list)
    all_val_mse.append(val_mse_list)
    all_train_acc.append(train_acc_list)
    all_val_acc.append(val_acc_list)

    # Figure with two subplots for this seed
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # MSE panel
    axes[0].plot(train_mse_list, label="Train MSE", color="blue")
    axes[0].plot(val_mse_list, label="Validation MSE", color="green")
    axes[0].set_xlabel("Epochs")
    axes[0].set_ylabel("MSE")
    axes[0].set_title(f"MSE MONK-3 (random_state={rs})")
    axes[0].legend()
    axes[0].grid(True)

    # Accuracy panel
    axes[1].plot(train_acc_list, label="Train Accuracy", color="blue")
    axes[1].plot(val_acc_list, label="Validation Accuracy", color="green")
    axes[1].set_xlabel("Epochs")
    axes[1].set_ylabel("Accuracy")
    axes[1].set_title(f"Accuracy MONK-3 (random_state={rs})")
    axes[1].legend()
    axes[1].grid(True)

    # Show both panels together
    plt.tight_layout()
    plt.show()

    # Final accuracies for this seed (same 0.5 threshold as above)
    final_train_labels = (nn.predict(X_train) > 0.5).astype(int)
    final_val_labels = (nn.predict(X_val) > 0.5).astype(int)
    print(f'Training Accuracy: {accuracy_score(y_train, final_train_labels):.4f}')
    print(f'Validation Accuracy: {accuracy_score(y_val, final_val_labels):.4f}')
    print(f'Best params: {best_params}')

# Epoch-wise means across the runs
mean_train_mse = np.mean(all_train_mse, axis=0)
mean_val_mse = np.mean(all_val_mse, axis=0)
mean_train_acc = np.mean(all_train_acc, axis=0)
mean_val_acc = np.mean(all_val_acc, axis=0)

# Plot of the mean MSE curves
plt.figure(figsize=(10, 5))
plt.plot(mean_train_mse, label="Mean Train MSE", color="blue")
plt.plot(mean_val_mse, label="Mean Validation MSE", color="green")
plt.xlabel("Epochs")
plt.ylabel("MSE")
plt.title("Media MSE su 5 random_state MONK-3")
plt.legend()
plt.grid(True)
plt.show()

# Plot of the mean accuracy curves
plt.figure(figsize=(10, 5))
plt.plot(mean_train_acc, label="Mean Train Accuracy", color="blue")
plt.plot(mean_val_acc, label="Mean Validation Accuracy", color="green")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("Media Accuracy su 5 random_state MONK-3")
plt.legend()
plt.grid(True)
plt.show()

### Best model MONK-3 (Regularization)

In [None]:
# Hyperparameters of the final MONK-3 model with regularization (hard-coded)
best_params = {'activation': 'tanh',
 'alpha': 0.0001,
 'hidden_layer_sizes': (2,),
 'learning_rate_init': 0.01,
 'solver': 'adam'}

# Build the final model; it is trained epoch by epoch with partial_fit on X_dev
nn = MLPRegressor(
        hidden_layer_sizes=best_params['hidden_layer_sizes'],
        activation=best_params['activation'],
        solver=best_params['solver'],
        learning_rate_init=best_params['learning_rate_init'],
        alpha=best_params['alpha'],
        # Full batch on the data this model is trained on (X_dev). Reusing the
        # X_train row count here would split every epoch into two uneven
        # mini-batches, because X_dev has more rows than X_train.
        batch_size=X_dev.shape[0],
        max_iter=1,  # one pass per call
        warm_start=True,  # keep the learned weights between calls
        shuffle=True,
        random_state=13,  # seed of the best model from model selection
        verbose=False
      )

In [None]:
# MODEL TRAINING: final model on the full development set, tracking per-epoch metrics

# Number of epochs
epochs = 300

# Per-epoch history of loss and accuracy
train_loss_curve = []
test_loss_curve = []
train_accuracy_curve = []
test_accuracy_curve = []

# One partial_fit call per epoch
for epoch in range(epochs):
    nn.partial_fit(X_dev, y_dev)

    # Continuous outputs on the development and test sets
    y_dev_pred = nn.predict(X_dev)
    y_test_pred = nn.predict(X_test)

    # MSE on the continuous outputs
    train_loss = mean_squared_error(y_dev, y_dev_pred)
    test_loss = mean_squared_error(y_test, y_test_pred)

    # Accuracy on 0/1 labels obtained with a 0.5 threshold. Rounding would map
    # outputs >= 1.5 or < -0.5 to invalid labels that always count as errors.
    train_acc = accuracy_score(y_dev, (y_dev_pred > 0.5).astype(int))
    test_acc = accuracy_score(y_test, (y_test_pred > 0.5).astype(int))

    # Store this epoch's metrics
    train_loss_curve.append(train_loss)
    test_loss_curve.append(test_loss)
    train_accuracy_curve.append(train_acc)
    test_accuracy_curve.append(test_acc)

    # Progress report every 50 epochs
    if (epoch + 1) % 50 == 0:
        print(f"Epoca {epoch + 1}/{epochs}, Train MSE: {train_loss:.5f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test MSE: {test_loss:.5f}")

In [None]:
# Side-by-side learning curves of the final model: loss on the left, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# One row per panel: axis, y label, title, legend position, then (curve, label, color) triples
panels = (
    (ax1, "MSE", "MONK-3 Loss (Regularization)", "upper right",
     ((train_loss_curve, "Train Loss", "blue"), (test_loss_curve, "Test Loss", "red"))),
    (ax2, "Accuracy", "MONK-3 Accuracy (Regularization)", "lower right",
     ((train_accuracy_curve, "Train Accuracy", "blue"), (test_accuracy_curve, "Test Accuracy", "red"))),
)

for axis, y_label, title, legend_loc, curves in panels:
    axis.set_xlabel("Epoche")
    axis.set_ylabel(y_label)
    for values, label, color in curves:
        axis.plot(values, label=label, color=color, linestyle="solid")
    axis.legend(loc=legend_loc)
    axis.set_title(title)
    axis.grid(True)

# Tidy up the spacing between the two panels
plt.tight_layout()
plt.show()

In [None]:
# Evaluation of the final model with regularization
print("\nMiglior modello trovato per MONK-3 (random state 13 - 300 epochs):")
print(f"Parametri: {best_params}")

# Threshold the continuous outputs at 0.5 to obtain 0/1 labels; rounding would
# turn out-of-range outputs into invalid labels (2 or -1) that always count as errors.
dev_labels = (nn.predict(X_dev) > 0.5).astype(int)
test_labels = (nn.predict(X_test) > 0.5).astype(int)

print(f'Training Accuracy: {accuracy_score(y_dev, dev_labels):.4f}')
print(f"Test Accuracy: {accuracy_score(y_test, test_labels):.4f}")