# Notebook 1: Cross Validation for Binary Classification
__Author: Bibek Poudel__

This Notebook performs 5 fold cross validation to find the best "number of hidden units" in the MLP size specified by the question. For all datasets given, the number of hidden units are varied in the range 1 to 10 and at each value 5 fold cross validation is performed.

The results are as follows: 

| Dataset | No. of hidden units | Average Cross Validation Accuracy across 5 folds|
| --- | --- | --- |
| Iris | 9 | 95.83% | 
| Digit | 9 | 96.25% |
| Breast Cancer |10 | 96.71% |
| Wine | 7 | 73.12% |
| Diabetes | 8 | 77.07% |

In [1]:
import glob
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import random
from sklearn.model_selection import KFold

In [2]:
# set seeds for reproducibility of results
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fbd7eb0ba10>

In [3]:
# Load dataset paths
datasets = glob.glob('./datasets/bi-class/*.npz')
print("Datasets Loaded:\n")
    
# Data Exploration
for dataset_path in datasets:
    print(dataset_path.split('/')[-1])
    dataset = np.load(dataset_path)
    train_X, train_Y, test_X, test_Y = dataset['train_X'],dataset['train_Y'],dataset['test_X'],dataset['test_Y']
    print("\tTrain Features:",train_X.shape)
    print("\t"+"Train Labels:",train_Y.shape)
    print("\t"+"Test Features:",test_X.shape)
    print("\t"+"Test Labels:",test_Y.shape)

Datasets Loaded:

iris.npz
	Train Features: (120, 4)
	Train Labels: (120,)
	Test Features: (30, 4)
	Test Labels: (30,)
digit.npz
	Train Features: (800, 64)
	Train Labels: (800,)
	Test Features: (200, 64)
	Test Labels: (200,)
breast-cancer.npz
	Train Features: (547, 10)
	Train Labels: (547,)
	Test Features: (136, 10)
	Test Labels: (136,)
wine.npz
	Train Features: (142, 13)
	Train Labels: (142,)
	Test Features: (36, 13)
	Test Labels: (36,)
diabetes.npz
	Train Features: (615, 8)
	Train Labels: (615,)
	Test Features: (153, 8)
	Test Labels: (153,)


In [4]:
# Define a dataset class (Required by pytorch)
class BinaryDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs_list = inputs
        self.target_list = targets
        assert (len(self.inputs_list) == len(self.target_list))
        
    def __len__(self):
        return len(self.inputs_list)
    
    def __getitem__(self, key):
        input_idx = self.inputs_list[key]
        target_idx = self.target_list[key]
        return [input_idx.astype(np.float32), target_idx.astype(np.float32)]

In [5]:
# Define a neural network with a single hidden layer
class model(nn.Module):
    def __init__(self, input_units, hidden_units):
        super().__init__()
        self.activation = nn.ReLU() # ReLU activation function
        self.fc1 = nn.Linear(input_units, hidden_units) # Hidden units can be specified arbitrarily
        self.fc2 = nn.Linear(hidden_units, 1)
        self.output_activation = nn.Sigmoid()
        
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.output_activation(self.fc2(x)) # For Binary classification
        return x

In [6]:
# Define a performance metric
def akkuracy(ground_truths, predictions):
    
    predictions = torch.Tensor(predictions)
    ground_truths = torch.ByteTensor(ground_truths)   # a Tensor of 0s and 1s
    actual_preds = predictions >= 0.5       # a Tensor of 0s and 1s or True and False
    num_correct = torch.sum(ground_truths==actual_preds)  # a Tensor
    acc = (num_correct.item() * 100.0 / len(ground_truths))  # scalar
    return acc

In [7]:
# Define a method to train neural network model
def train(model, dataloader, optimizer, learning_rate, loss_function, epochs, train_size):
    
    for epoch in range(epochs):
        predictions =  np.zeros(train_size)
        ground_truths_total= np.zeros(train_size)
        count =0
        
        #print("Epoch:{}".format(epoch), end = "\t")
        total_loss = 0.0

        for bi, (inputs, targets) in enumerate(dataloader):
            current_size=targets.shape[0]
            ground_truths_total[count: count+current_size] = targets
            
            targets= targets.view(-1,1) # or do torch squeeze to outputs
            optimizer.zero_grad()

            outputs = model(inputs)
            outputs_np = np.squeeze(outputs.cpu().detach().numpy())
            
            loss = loss_function(outputs, targets)

            loss.backward()

            optimizer.step()
            
            total_loss+=loss.item()
            
            
            predictions[count: count+current_size] = outputs_np
            count+=current_size

        #print("Average batch loss:{}".format(round(total_loss/len(dataloader),3)))
        torch.save(model, "./saved_models/cross_val_model.pt")
        train_accuracy = akkuracy(ground_truths_total, predictions)
    return train_accuracy

In [8]:
# Define a method to perform model validation
def validate(trained_model, dataloader, val_size):
    predictions =  np.zeros(val_size)
    ground_truths_total= np.zeros(val_size)
    count =0
    with torch.no_grad():
        for bi, (inputs, ground_truths) in enumerate(dataloader):
            
            outputs = trained_model(inputs)
            outputs_np = np.squeeze(outputs.cpu().detach().numpy()) #Although ... 

            current_size=ground_truths.shape[0]
            predictions[count: count+current_size] = outputs_np
            ground_truths_total[count: count+current_size] = ground_truths
            count+=current_size

    validation_accuracy = akkuracy(ground_truths_total, predictions)
    return validation_accuracy

In [9]:
# Define a method to perform corss five fold validation (80%, 20% split)
def cross_validate(dataset_train_X, dataset_train_Y, folds, hidden_units):
    #print("Training set received", dataset_train_X.shape)
    results=0.0
    kfold = KFold(n_splits=folds, shuffle=True)
    
    for fold, (train_ids, val_ids) in enumerate(kfold.split(dataset_train_X)):
        print("\tCross validation fold",fold+1)
        #print(train_ids)
        #print(val_ids)

        # Training
        net = model(dataset_train_X.shape[1], hidden_units).train()
        
        # Using some standard hyper-parameter values for cross validation
        epochs =20
        loss = nn.BCELoss() # Loss function as Binary Cross Entropy
        lr = 0.001 # Learning rate 
        optimizer = torch.optim.Adam(net.parameters(), lr = lr) # Optimizer as Adam
        BATCH_SIZE = 4
        
        fold_train_features =  dataset_train_X[train_ids]
        fold_train_targets = dataset_train_Y[train_ids]
        
        fold_train_dataset = BinaryDataset(fold_train_features, fold_train_targets)

        fold_train_dataloader = torch.utils.data.DataLoader(dataset=fold_train_dataset,
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=None,
                                                   shuffle=False)
        
        train_acc = train(net, fold_train_dataloader, optimizer, lr, loss, epochs,fold_train_targets.shape[0])
        print("\tTraining Complete with accuracy = {}% ..saving a model".format(round(train_acc,3)))
        
        # Validation
        # Load the model that was trained this fold
        
        trained_model = torch.load("./saved_models/cross_val_model.pt")

        fold_val_features = dataset_train_X[val_ids]
        fold_val_targets = dataset_train_Y[val_ids]
        
        fold_val_dataset = BinaryDataset(fold_val_features, fold_val_targets)
        
       
        fold_val_dataloader = torch.utils.data.DataLoader(dataset=fold_val_dataset,
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=None,
                                                   shuffle=False)
    
        valid_acc = validate(trained_model, fold_val_dataloader, fold_val_targets.shape[0])
        results+=valid_acc
        print("\tValidation Accuracy = {}%\n".format(round(valid_acc,3)))
        
    final_acc = round(results/folds,3)
    print("Mean validation accuracy across {} folds: {}%".format(folds, final_acc))
    return final_acc

In [10]:
# Vary hidden units from 1 to 10 
folds = 5
for dataset_path in datasets:
    results=[]
    print("\n__________________________________________________________")
    print("\n{} dataset: performing 5 fold cross validation".format(dataset_path.split('/')[-1]))
    print("Varying number of hidden units from 1 to 10")
    print("__________________________________________________________\n")
    for hidden_unit in range(1,11):
        print("\nNumber of hidden units (H)= {}\n".format(hidden_unit))
        dataset = np.load(dataset_path)
        train_X, train_Y, test_X, test_Y = dataset['train_X'],dataset['train_Y'],dataset['test_X'],dataset['test_Y']
        results.append(cross_validate(train_X, train_Y, folds, hidden_unit))
        print("__________________________________________________________\n")
    print("********************************************************************\n")  
    results = np.array(results)
    print("In {} dataset, best number of hidden units found to be {} with validation accuracy of {} %".format(dataset_path.split('/')[-1],
                                                                                                              np.where(results==np.max(results))[0][0]+1,
                                                                                                              np.max(results)))
    print("\n********************************************************************\n")  


__________________________________________________________

iris.npz dataset: performing 5 fold cross validation
Varying number of hidden units from 1 to 10
__________________________________________________________


Number of hidden units (H)= 1

	Cross validation fold 1
	Training Complete with accuracy = 69.792% ..saving a model
	Validation Accuracy = 54.167%

	Cross validation fold 2
	Training Complete with accuracy = 65.625% ..saving a model
	Validation Accuracy = 70.833%

	Cross validation fold 3
	Training Complete with accuracy = 65.625% ..saving a model
	Validation Accuracy = 70.833%

	Cross validation fold 4
	Training Complete with accuracy = 34.375% ..saving a model
	Validation Accuracy = 29.167%

	Cross validation fold 5
	Training Complete with accuracy = 66.667% ..saving a model
	Validation Accuracy = 66.667%

Mean validation accuracy across 5 folds: 58.333%
__________________________________________________________


Number of hidden units (H)= 2

	Cross validation fold 1

	Training Complete with accuracy = 52.656% ..saving a model
	Validation Accuracy = 46.875%

	Cross validation fold 5
	Training Complete with accuracy = 92.5% ..saving a model
	Validation Accuracy = 85.0%

Mean validation accuracy across 5 folds: 71.625%
__________________________________________________________


Number of hidden units (H)= 2

	Cross validation fold 1
	Training Complete with accuracy = 90.625% ..saving a model
	Validation Accuracy = 90.625%

	Cross validation fold 2
	Training Complete with accuracy = 90.312% ..saving a model
	Validation Accuracy = 93.75%

	Cross validation fold 3
	Training Complete with accuracy = 96.562% ..saving a model
	Validation Accuracy = 91.25%

	Cross validation fold 4
	Training Complete with accuracy = 90.781% ..saving a model
	Validation Accuracy = 90.625%

	Cross validation fold 5
	Training Complete with accuracy = 91.562% ..saving a model
	Validation Accuracy = 86.875%

Mean validation accuracy across 5 folds: 90.625%
______________________

	Training Complete with accuracy = 97.032% ..saving a model
	Validation Accuracy = 97.248%

	Cross validation fold 4
	Training Complete with accuracy = 96.347% ..saving a model
	Validation Accuracy = 97.248%

	Cross validation fold 5
	Training Complete with accuracy = 96.804% ..saving a model
	Validation Accuracy = 97.248%

Mean validation accuracy across 5 folds: 95.985%
__________________________________________________________


Number of hidden units (H)= 3

	Cross validation fold 1
	Training Complete with accuracy = 96.11% ..saving a model
	Validation Accuracy = 98.182%

	Cross validation fold 2
	Training Complete with accuracy = 96.339% ..saving a model
	Validation Accuracy = 99.091%

	Cross validation fold 3
	Training Complete with accuracy = 96.575% ..saving a model
	Validation Accuracy = 97.248%

	Cross validation fold 4
	Training Complete with accuracy = 97.717% ..saving a model
	Validation Accuracy = 93.578%

	Cross validation fold 5
	Training Complete with accuracy = 97.032

	Training Complete with accuracy = 40.708% ..saving a model
	Validation Accuracy = 37.931%

	Cross validation fold 2
	Training Complete with accuracy = 59.292% ..saving a model
	Validation Accuracy = 62.069%

	Cross validation fold 3
	Training Complete with accuracy = 39.474% ..saving a model
	Validation Accuracy = 42.857%

	Cross validation fold 4
	Training Complete with accuracy = 81.579% ..saving a model
	Validation Accuracy = 85.714%

	Cross validation fold 5
	Training Complete with accuracy = 58.772% ..saving a model
	Validation Accuracy = 64.286%

Mean validation accuracy across 5 folds: 58.571%
__________________________________________________________


Number of hidden units (H)= 4

	Cross validation fold 1
	Training Complete with accuracy = 73.451% ..saving a model
	Validation Accuracy = 68.966%

	Cross validation fold 2
	Training Complete with accuracy = 78.761% ..saving a model
	Validation Accuracy = 75.862%

	Cross validation fold 3
	Training Complete with accuracy = 78.94

	Training Complete with accuracy = 78.659% ..saving a model
	Validation Accuracy = 73.171%

	Cross validation fold 2
	Training Complete with accuracy = 72.764% ..saving a model
	Validation Accuracy = 76.423%

	Cross validation fold 3
	Training Complete with accuracy = 76.829% ..saving a model
	Validation Accuracy = 73.984%

	Cross validation fold 4
	Training Complete with accuracy = 77.439% ..saving a model
	Validation Accuracy = 70.732%

	Cross validation fold 5
	Training Complete with accuracy = 74.593% ..saving a model
	Validation Accuracy = 79.675%

Mean validation accuracy across 5 folds: 74.797%
__________________________________________________________


Number of hidden units (H)= 5

	Cross validation fold 1
	Training Complete with accuracy = 76.626% ..saving a model
	Validation Accuracy = 78.862%

	Cross validation fold 2
	Training Complete with accuracy = 77.846% ..saving a model
	Validation Accuracy = 71.545%

	Cross validation fold 3
	Training Complete with accuracy = 75.81