In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
def load_dataset(csv_path):
    """
    Read a CSV dataset whose first column holds the labels and whose
    remaining columns hold the features.
        - csv_path: path (or any source accepted by pandas.read_csv) of the CSV file
    Returns the tuple (features, labels) as NumPy arrays (the `.values` of the
    corresponding columns), features first.
    """
    table = pd.read_csv(csv_path)
    label_column = table.iloc[:, 0]
    feature_columns = table.iloc[:, 1:]
    return feature_columns.values, label_column.values

def standardize_features(features):
  """
  Center every feature column and scale it to unit variance.
      - features: 2D array-like of shape (samples, features)
  Returns the transformed data produced by a freshly fitted StandardScaler.
  The scaler itself is discarded, so the fitted statistics are not reusable.
  """
  return StandardScaler().fit_transform(features)

In [None]:
def create_mlp(input_size, hidden_sizes, output_size=2):
  """
  Build a fully connected network (MLP) with a configurable architecture
      -input_size, output_size : Integers giving the size of the input and of the output
      -hidden_sizes : Sequence (list, tuple, ...) of integers giving the hidden layer widths.
                      May be empty, in which case the model is a single linear layer
  Returns an nn.Sequential alternating nn.Linear and nn.ReLU layers, with no
  activation after the last linear layer (the outputs are raw scores)
  """
  # Unpacking accepts any iterable; `[input_size] + hidden_sizes` raised TypeError for tuples
  sizes = [input_size, *hidden_sizes, output_size]
  layers = []
  for in_features, out_features in zip(sizes[:-1], sizes[1:]):
      layers.append(nn.Linear(in_features, out_features))
      layers.append(nn.ReLU())
  # The loop always appends at least one Linear/ReLU pair; drop the trailing ReLU
  # so the output layer stays linear
  return nn.Sequential(*layers[:-1])

In [None]:
def cross_validation(model, features, labels, num_epochs=10, batch_size=32, learning_rate=0.001, num_splits=5):
  """
  For a given model and dataset (features, labels) this function performs stratified K-fold cross validation
      -model : PyTorch MLP model with appropriate input and output sizes. It is trained in place:
               its weights are reset to their initial values at the start of every fold, so on
               return it holds the weights fitted on the last fold's training split
      -features, labels : Dataset as array-likes (NumPy arrays or Pandas objects), one row per sample.
               Labels are fed to nn.CrossEntropyLoss as class indices, so they must be integers
               in [0, number of model outputs)
      -num_epochs, batch_size, learning_rate : usual NN parameters
      -num_splits : number of splits for the K-fold CV
  Returns a list of num_splits lists: entry [fold][epoch] is the per-sample mean validation
  loss measured on the held-out fold after that epoch
  """
  # The folds are selected by position; a DataFrame indexed with [] would select columns instead
  features = np.asarray(features)
  labels = np.asarray(labels)

  skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)
  criterion = nn.CrossEntropyLoss()

  # Snapshot of the incoming weights, restored before every fold so that no fold starts
  # from a model that has already been trained on its validation samples
  initial_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

  all_val_losses = []

  for fold, (train_index, test_index) in enumerate(skf.split(features, labels)):
    model.load_state_dict(initial_state)

    train_features, test_features = features[train_index], features[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]

    # CrossEntropyLoss expects int64 (torch.long) class-index targets
    train_dataset = TensorDataset(torch.tensor(train_features, dtype=torch.float32),
                                  torch.tensor(train_labels, dtype=torch.long))
    test_dataset = TensorDataset(torch.tensor(test_features, dtype=torch.float32),
                                 torch.tensor(test_labels, dtype=torch.long))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # A new optimizer per fold also discards Adam's running statistics from the previous fold
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        for batch_features, batch_labels in train_loader:
          optimizer.zero_grad()
          outputs = model(batch_features)
          loss = criterion(outputs, batch_labels)
          loss.backward()
          optimizer.step()

        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
          for batch_features, batch_labels in test_loader:
            outputs = model(batch_features)
            batch_count = batch_labels.size(0)
            # criterion returns the batch mean; weight it by the batch size so a
            # shorter last batch does not count as much as a full one
            val_loss += criterion(outputs, batch_labels).item() * batch_count
            _, predicted = torch.max(outputs, 1)
            total += batch_count
            correct += (predicted == batch_labels).sum().item()

        accuracy = correct / total
        val_losses.append(val_loss / total)
        print(f'Fold {fold + 1}, Epoch [{epoch + 1}/{num_epochs}], Test Loss: {val_losses[-1]}, Test Accuracy: {accuracy}')

    all_val_losses.append(val_losses)

  return all_val_losses


In [None]:
def hyperparameter_search(features, labels, num_layers_options, hidden_size_options, num_splits=5, num_epochs=10, batch_size=32, learning_rate=0.001):
  """
  Performs hyperparameters search in terms of width and depth for the MLP
      -features, labels, num_splits=5, num_epochs=10, batch_size=32, learning_rate=0.001 : same as cross_validation
      -num_layers_options : list of integers corresponding to the number of hidden layers in each architecture to be tested
      -hidden_size_options : list of tuples of integers giving the sizes of each hidden layer in each architecture to be tested.
                             Entry i must contain exactly num_layers_options[i] widths
  Each architecture is scored by the validation loss reached at the last epoch, averaged over the folds.
  Returns the model with the lowest score, in the state cross_validation left it in
  (None when no architecture is given or no score is comparable).
  Raises AssertionError when the two option lists are inconsistent.
  """

  # Consistency check, done once up front so nothing is trained on a malformed search space
  assert len(num_layers_options) == len(hidden_size_options), \
      "num_layers_options and hidden_size_options must have the same length"
  for hidden_size_option, num_layers_option in zip(hidden_size_options, num_layers_options):
    assert len(hidden_size_option) == num_layers_option, \
        f"{hidden_size_option} does not describe {num_layers_option} hidden layer(s)"

  input_size = features.shape[1]
  num_classes = len(np.unique(labels))

  best_loss = np.inf
  best_model = None
  best_hidden_size = None
  best_num_layers = None

  for hidden_size_option, num_layers_option in zip(hidden_size_options, num_layers_options):

    # The tuple already lists one width per hidden layer; multiplying it by the layer
    # count would repeat the whole tuple and build a much deeper network than requested
    model = create_mlp(input_size, list(hidden_size_option), num_classes)
    print(f"\nHidden Size: {hidden_size_option}, Num Layers: {num_layers_option}")

    fold_losses = cross_validation(model, features, labels, num_epochs=num_epochs, batch_size=batch_size,
                                   learning_rate=learning_rate, num_splits=num_splits)

    # Compare architectures on their final-epoch loss; averaging every epoch would
    # mix in the losses of the barely trained early epochs
    final_losses = [epoch_losses[-1] for epoch_losses in fold_losses if len(epoch_losses) > 0]
    validation_loss = np.mean(final_losses) if final_losses else np.nan

    if validation_loss < best_loss:
        best_loss = validation_loss
        best_model = model
        best_hidden_size = hidden_size_option
        best_num_layers = num_layers_option

  print("\nBest Model:")
  print(f"Hidden Size: {best_hidden_size}, Num Layers: {best_num_layers}")
  return best_model


In [None]:
# CSV file read by load_dataset
csv_path = 'path.csv'  # Replace with actual dataset path

# Read the raw data, then center and scale every feature column
features, labels = load_dataset(csv_path)
features = standardize_features(features)

# Number of feature columns
input_size = features.shape[1]

# Hyperparameter search space.
# The two lists are read pairwise: entry i of hidden_size_options must hold exactly
# num_layers_options[i] widths, otherwise hyperparameter_search raises an AssertionError.
num_layers_options = [1, 1, 2, 2, 3, 3]
hidden_size_options = [(64,), (128,), (64, 32), (128, 64), (128, 64, 32), (64, 32, 16)]

best_model = hyperparameter_search(
    features,
    labels,
    num_layers_options=num_layers_options,
    hidden_size_options=hidden_size_options,
)


In [None]:
def evaluate_model(model, features, labels):
  """
  Compute the classification accuracy of a model on a dataset in a single forward pass.
      -model : PyTorch model; it is switched to evaluation mode
      -features : array-like converted to a float32 tensor and fed to the model
      -labels : array-like of class indices, converted to an int64 tensor
  Returns the fraction of samples whose highest-scoring output matches the label.
  """
  model.eval()
  inputs = torch.tensor(features, dtype=torch.float32)
  targets = torch.tensor(labels, dtype=torch.long)
  with torch.no_grad():
      # Index of the largest output along dimension 1 is the predicted class
      predicted = torch.max(model(inputs), 1)[1]
      num_correct = (predicted == targets).sum().item()
  return num_correct / len(targets)

In [None]:
# Accuracy of the selected model on the same (features, labels) used for the search,
# so samples the model was trained on are included in this figure
final_accuracy = evaluate_model(model=best_model, features=features, labels=labels)
print(f'\nFinal Accuracy on the Entire Dataset: {final_accuracy}')