In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# **Functions**

## Data loading functions

In [31]:
def load_dataset(csv_path):
    """
    Read the dataset of labels and TMD features stored at csv_path.
        - csv_path: path of the CSV file (anything pandas.read_csv accepts);
          the first column holds the labels, every other column is a feature
    Returns a tuple (features, labels) of NumPy arrays: features is the 2-D
    array of all columns but the first, labels is the first column cast to int.
    """
    table = pd.read_csv(csv_path)
    label_column = table.iloc[:, 0]
    feature_columns = table.iloc[:, 1:]
    return feature_columns.values, label_column.values.astype(int)

def standardize_features(features):
  """
  Center every feature and scale it to unit variance.
      - features: 2-D array-like with one row per sample and one column per
        feature (the notebook passes the NumPy array returned by load_dataset)
  Returns the transformed data produced by a StandardScaler that is fitted on
  the very data it transforms.
  """
  standardized = StandardScaler().fit_transform(features)
  return standardized

## Creation of custom MLP

In [25]:
def create_mlp(input_size, hidden_sizes, output_size=2):
  """
  Build a fully connected network (MLP) with a configurable architecture.
      -input_size, output_size : integers giving the size of the input and of the output
      -hidden_sizes : sequence (list or tuple) of integers giving the hidden layer widths;
                      may be empty, in which case the model is a single linear layer
  Returns an nn.Sequential of Linear layers with a ReLU after every layer except the
  last one, so the network outputs raw (un-normalized) scores.
  """
  # Unpacking accepts any sequence; the former `[input_size] + hidden_sizes`
  # raised a TypeError as soon as hidden_sizes was a tuple.
  sizes = [input_size, *hidden_sizes, output_size]
  last_layer = len(sizes) - 2
  layers = []
  for i, (fan_in, fan_out) in enumerate(zip(sizes[:-1], sizes[1:])):
      layers.append(nn.Linear(fan_in, fan_out))
      if i < last_layer:
          # No activation after the output layer.
          layers.append(nn.ReLU())
  return nn.Sequential(*layers)

## Cross validation for a given model

In [78]:
def cross_validation(model, features, labels, num_epochs=10, batch_size=32, learning_rate=0.001, num_splits=5):
  """
  Perform stratified K-fold cross validation of a model on the dataset (features, labels).
      -model: PyTorch model with appropriate input and output sizes; it is trained in place
      -features, labels: array-likes (NumPy arrays or pandas objects) with one row / entry per sample
      -num_epochs, batch_size, learning_rate : usual NN training parameters (Adam optimizer)
      -num_splits : number of folds of the K-fold CV
  Every fold restarts from the weights the model had when this function was called, so
  no fold is validated on samples the model was already trained on in an earlier fold.
  Prints the validation loss and accuracy after every epoch.
  Returns a list of num_splits lists; each inner list holds the per-epoch validation
  losses (num_epochs values) of one fold.
  On return the model holds the weights obtained while training on the last fold.
  """
  import copy  # local import: only needed to snapshot the initial weights

  # Positional indexing with the fold indices below requires plain arrays
  # (a DataFrame indexed with an integer array would look up column labels).
  features = np.asarray(features)
  labels = np.asarray(labels)

  skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)
  criterion = nn.CrossEntropyLoss()

  # state_dict() shares storage with the live parameters, hence the deep copy.
  initial_state = copy.deepcopy(model.state_dict())

  all_val_losses = []

  for fold, (train_index, test_index) in enumerate(skf.split(features, labels)):
    # Start every fold from the same untrained weights; otherwise the model would
    # already have seen this fold's validation samples during earlier folds.
    model.load_state_dict(initial_state)

    train_dataset = TensorDataset(torch.tensor(features[train_index], dtype=torch.float32),
                                  torch.tensor(labels[train_index], dtype=torch.long))
    test_dataset = TensorDataset(torch.tensor(features[test_index], dtype=torch.float32),
                                 torch.tensor(labels[test_index], dtype=torch.long))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # A fresh optimizer per fold also discards Adam's running moment estimates.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    val_losses = []

    for epoch in range(num_epochs):
      model.train()
      for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

      model.eval()
      val_loss = 0.0
      correct = 0
      total = 0
      with torch.no_grad():
        for batch_features, batch_labels in test_loader:
          outputs = model(batch_features)
          loss = criterion(outputs, batch_labels)
          batch_count = batch_labels.size(0)
          # criterion returns the batch mean; weight it by the batch size so a
          # smaller last batch does not bias the epoch average.
          val_loss += loss.item() * batch_count
          predicted = outputs.argmax(dim=1)
          total += batch_count
          correct += (predicted == batch_labels).sum().item()

      accuracy = correct / total
      val_losses.append(val_loss / total)
      print(f'Fold {fold + 1}, Epoch [{epoch + 1}/{num_epochs}], Test Loss: {val_losses[-1]}, Test Accuracy: {accuracy}')

    all_val_losses.append(val_losses)

  return all_val_losses


## Hyperparameter search (i.e. test the cross_validation results of different models)

In [83]:
def hyperparameter_search(features, labels, hidden_size_options, num_splits=5, num_epochs=10, batch_size=32, learning_rate=0.001):
  """
  Performs a hyperparameter search over the width and depth of the MLP
      -features, labels, num_splits, num_epochs, batch_size, learning_rate : same as cross_validation
      -hidden_size_options : iterable of lists or tuples of integers; each entry gives the
                             hidden layer widths of one architecture to be tested (its length
                             is the number of hidden layers)
  Each architecture is scored by the mean, over the folds, of the validation loss reached
  at the last epoch; the architecture with the lowest score wins.
  Prints the progress and the winning architecture.
  Returns the model instance of the winning architecture, in whatever state
  cross_validation left it (it is not retrained on the full dataset here), or None
  when hidden_size_options is empty or no score is below infinity.
  """
  best_loss = np.inf
  best_model = None
  best_hidden_size = None
  best_num_layers = None

  num_classes = len(np.unique(labels))

  for hidden_size_option in hidden_size_options:

    # create_mlp concatenates lists, so normalise tuples (the documented form) to a list.
    model = create_mlp(features.shape[1], list(hidden_size_option), num_classes)
    print(f"\nHidden Size: {hidden_size_option}, Num Layers: {len(hidden_size_option)}")

    fold_losses = cross_validation(model, features, labels, num_epochs=num_epochs, batch_size=batch_size,
                                   learning_rate=learning_rate, num_splits=num_splits)

    # cross_validation returns one list of per-epoch losses per fold. Only the last
    # epoch reflects the trained model; averaging every epoch would penalise
    # architectures that merely start from a worse initial loss.
    final_losses = [epoch_losses[-1] for epoch_losses in fold_losses if len(epoch_losses) > 0]
    validation_loss = np.mean(final_losses) if final_losses else np.nan

    if validation_loss < best_loss:
      best_loss = validation_loss
      best_model = model
      best_hidden_size = hidden_size_option
      best_num_layers = len(hidden_size_option)

  print("\nBest Model:")
  print(f"Hidden Size: {best_hidden_size}, Num Layers: {best_num_layers}")
  return best_model


# **Actual tests**

## Data loading

In [65]:
# Colab-only: mount Google Drive under /content/gdrive so the dataset path used below is reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [66]:
# Location of the dataset CSV on the mounted Google Drive (Colab-specific absolute path).
data_path = '/content/gdrive/MyDrive/OutputCS433/dataset.csv'

In [67]:
# Load the raw arrays, then standardize every feature column.
# NOTE(review): the scaler is fitted on the whole dataset, before any train/validation
# split, so validation samples contribute to the scaling statistics — confirm this is acceptable.
features, labels = load_dataset(data_path)
features = standardize_features(features)

# Number of feature columns; used as the input width of the MLPs created below.
input_size = features.shape[1]

In [88]:
# Class balance: the distinct label values and how many samples carry each of them.
print(np.unique(labels,return_counts=True))

(array([0, 1]), array([79, 19]))


## A first model (Note: Using weighted cross entropy)

In [104]:
# First model: a single hidden layer of 32 units and 2 outputs (one per class).
model1 = create_mlp(input_size,[32],2)

In [105]:
# Settings of the single-fold training run performed in the next cell.
model=model1
batch_size=32
num_epochs=50
learning_rate=0.0005
number_splits=5  # number of stratified folds; the next cell only uses the first split
# Per-class weights passed to the cross-entropy loss (index = class label):
# errors on class 1 weigh four times as much as errors on class 0.
weights=torch.tensor([0.2, 0.8], dtype=torch.float32)


In [106]:
# Single-fold run: train `model` on the first stratified split only, using the
# class-weighted cross-entropy, and track per epoch the validation loss and the
# accuracy on each class.
skf = StratifiedKFold(n_splits=number_splits, shuffle=True, random_state=42)

all_val_losses = []

fold=0

# Only the first of the `number_splits` splits is used here.
train_index, test_index = next(skf.split(features, labels))

train_features, test_features = features[train_index], features[test_index]
train_labels, test_labels = labels[train_index], labels[test_index]

train_dataset = TensorDataset(torch.tensor(train_features, dtype=torch.float32),
                              torch.tensor(train_labels, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(test_features, dtype=torch.float32),
                              torch.tensor(test_labels, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Bind the optimizer to the same object the loop trains and evaluates (`model`),
# not to `model1`: if `model` is ever rebound to another network, an optimizer
# built on `model1` would silently update the wrong parameters.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

val_losses = []
accuracies_1=[]
accuracies_0=[]

for epoch in range(num_epochs):
    model.train()
    for batch_features, batch_labels in train_loader:
      optimizer.zero_grad()
      outputs = model(batch_features)
      loss = nn.functional.cross_entropy(outputs, batch_labels, weight=weights)
      loss.backward()
      optimizer.step()

    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    total1 = 0
    total0 = 0
    correct1 = 0
    correct0 = 0

    with torch.no_grad():
      for batch_features, batch_labels in test_loader:
        outputs = model(batch_features)
        loss = nn.functional.cross_entropy(outputs, batch_labels, weight=weights)
        val_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        is_correct = predicted == batch_labels
        is_class1 = batch_labels == 1
        is_class0 = batch_labels == 0

        total += batch_labels.size(0)
        total1 += is_class1.sum().item()
        total0 += is_class0.sum().item()
        correct += is_correct.sum().item()

        # Correct predictions split by true class (i.e. per-class recall counts).
        correct0 += (is_correct & is_class0).sum().item()
        correct1 += (is_correct & is_class1).sum().item()

    accuracy = correct / total
    # A class may be absent from the validation split; report NaN instead of
    # crashing on a division by zero.
    accuracy1 = correct1 / total1 if total1 else float('nan')
    accuracy0 = correct0 / total0 if total0 else float('nan')
    val_losses.append(val_loss / len(test_loader))
    accuracies_1.append(accuracy1)
    accuracies_0.append(accuracy0)

    print(f'Fold {fold + 1}, Epoch [{epoch + 1}/{num_epochs}], Test Loss: {val_losses[-1]}, Test Accuracy: {accuracy}, Test accuracy on 1=sick: {accuracy1}, Test accuracy on 0: {accuracy0}')




Fold 1, Epoch [1/50], Test Loss: 0.8375037908554077, Test Accuracy: 0.6, Test accuracy on 1=sick: 0.5, Test accuracy on 0: 0.625
Fold 1, Epoch [2/50], Test Loss: 0.8548016548156738, Test Accuracy: 0.6, Test accuracy on 1=sick: 0.25, Test accuracy on 0: 0.6875
Fold 1, Epoch [3/50], Test Loss: 0.7556112408638, Test Accuracy: 0.7, Test accuracy on 1=sick: 0.5, Test accuracy on 0: 0.75
Fold 1, Epoch [4/50], Test Loss: 0.7878825068473816, Test Accuracy: 0.75, Test accuracy on 1=sick: 0.5, Test accuracy on 0: 0.8125
Fold 1, Epoch [5/50], Test Loss: 1.0118687152862549, Test Accuracy: 0.7, Test accuracy on 1=sick: 0.25, Test accuracy on 0: 0.8125
Fold 1, Epoch [6/50], Test Loss: 0.9976857304573059, Test Accuracy: 0.65, Test accuracy on 1=sick: 0.25, Test accuracy on 0: 0.75
Fold 1, Epoch [7/50], Test Loss: 0.7385992407798767, Test Accuracy: 0.8, Test accuracy on 1=sick: 0.5, Test accuracy on 0: 0.875
Fold 1, Epoch [8/50], Test Loss: 0.7359012365341187, Test Accuracy: 0.75, Test accuracy on 1=s

## Hyperparameters search (Careful! Not using weighted cross entropy loss yet!)

In [84]:
# Define hyperparameter search space
hidden_size_options = [[64], [128], [64, 32], [128, 64], [128,64,32], [64,32,16]]


best_model = hyperparameter_search(features, labels, hidden_size_options)



Hidden Size: [64], Num Layers: 1
Fold 1, Epoch [1/10], Test Loss: 0.9525598287582397, Test Accuracy: 0.75
Fold 1, Epoch [2/10], Test Loss: 1.4524471759796143, Test Accuracy: 0.65
Fold 1, Epoch [3/10], Test Loss: 2.043156862258911, Test Accuracy: 0.65
Fold 1, Epoch [4/10], Test Loss: 1.8187230825424194, Test Accuracy: 0.65
Fold 1, Epoch [5/10], Test Loss: 1.0540103912353516, Test Accuracy: 0.7
Fold 1, Epoch [6/10], Test Loss: 0.8914045095443726, Test Accuracy: 0.75
Fold 1, Epoch [7/10], Test Loss: 0.9733399152755737, Test Accuracy: 0.75
Fold 1, Epoch [8/10], Test Loss: 0.9803611040115356, Test Accuracy: 0.7
Fold 1, Epoch [9/10], Test Loss: 1.1543691158294678, Test Accuracy: 0.7
Fold 1, Epoch [10/10], Test Loss: 1.109277606010437, Test Accuracy: 0.75
Fold 2, Epoch [1/10], Test Loss: 0.9855219125747681, Test Accuracy: 0.75
Fold 2, Epoch [2/10], Test Loss: 0.9539388418197632, Test Accuracy: 0.8
Fold 2, Epoch [3/10], Test Loss: 1.0097142457962036, Test Accuracy: 0.7
Fold 2, Epoch [4/10], T

## Best model evaluation

In [None]:
## TODO : Further analysis of the best model