# Notebook 3: Cross Validation for Multi-Class Classification
__Author: Bibek Poudel__

This notebook performs 5-fold cross-validation to find the best number of hidden units in each of the two hidden layers of the MLP specified by the question. For the given dataset:

The number of hidden units in the first hidden layer (L1) is chosen from {50, 75, 100} and

The number of hidden units in the second hidden layer (L2) is chosen from {10, 15, 20}.


The results are as follows: 

| Dataset | No. of hidden units in L1 | No. of hidden units in L2 |Average Cross Validation Accuracy across 5 folds|
| --- | --- | --- | ---|
| Given Multiclass Dataset | 100 | 20 | 84.43% | 


In [1]:
import glob
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.io import loadmat
from sklearn.model_selection import KFold
from torch.utils.data import Dataset

In [2]:
# Pin every random number generator used in this notebook (NumPy, the
# standard library and PyTorch) to one seed so repeated runs match.
SEED = 101
np.random.seed(SEED)
random.seed(SEED)
# Left as the last expression so the cell still displays the returned
# torch generator.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fe063251a10>

In [3]:
# Collect the path of every .mat file of the multi-class dataset.
# NOTE: glob returns the paths in arbitrary (filesystem-dependent) order.
datasets = glob.glob('./datasets/multi-class/*.mat')
print("Dataset Loaded:\n")

# Data exploration: print the name and shape of the array held by each file.
for dataset_path in datasets:
    # os.path.basename honours the platform's path separator, whereas
    # splitting on '/' keeps the directory part when glob yields '\\'.
    path_name_end = os.path.basename(dataset_path)

    d_set = loadmat(dataset_path)
    # The array is looked up under the file name without its extension.
    d_name = path_name_end.split('.')[0]
    data = d_set[d_name]
    print("\tdata: {} , shape = {}\n".format(d_name, data.shape))


Dataset Loaded:

	data: train_images , shape = (10000, 784)

	data: test_labels , shape = (1, 1000)

	data: train_labels , shape = (1, 10000)

	data: test_images , shape = (1000, 784)



In [4]:
# Define a dataset class (Required by pytorch)
class MultiDataset(Dataset):
    """Dataset that pairs every input sample with its target.

    ``inputs`` and ``targets`` are indexed with the same key and must have
    the same length.
    """

    def __init__(self, inputs, targets):
        self.inputs_list, self.target_list = inputs, targets
        # Every sample needs exactly one target.
        assert len(self.inputs_list) == len(self.target_list)

    def __len__(self):
        """Return the number of samples."""
        return len(self.inputs_list)

    def __getitem__(self, key):
        """Return ``[input, target]`` for ``key``, both cast to float32."""
        sample = self.inputs_list[key]
        label = self.target_list[key]
        return [item.astype(np.float32) for item in (sample, label)]

In [5]:
# Define a neural network with two hidden layers
class model(nn.Module):
    """Fully connected network: input -> hidden L1 -> hidden L2 -> output.

    ReLU is applied after each hidden layer; the output layer returns its
    raw scores with no activation.
    """

    def __init__(self, input_units, hidden_units_l1, hidden_units_l2, output_units):
        super().__init__()
        self.activation = nn.ReLU()
        # The three linear layers, created in the order they are applied.
        self.fc1 = nn.Linear(input_units, hidden_units_l1)
        self.fc2 = nn.Linear(hidden_units_l1, hidden_units_l2)
        self.fc3 = nn.Linear(hidden_units_l2, output_units)

    def forward(self, x):
        """Return the output-layer scores for the batch ``x``."""
        hidden_1 = self.activation(self.fc1(x))
        hidden_2 = self.activation(self.fc2(hidden_1))
        return self.fc3(hidden_2)

In [6]:
# Performance metric
def akkuracy(ground_truths, predictions):
    """Return the percentage of predictions that equal their ground truth.

    The two arrays are compared element-wise; the sample count is the size
    of the first dimension of ``ground_truths``.
    """
    n_samples = ground_truths.shape[0]
    n_matching = np.sum(ground_truths == predictions)
    return n_matching * 100.0 / n_samples

In [7]:
# Define a method to train neural network model
def train(model, dataloader, optimizer, learning_rate, loss_function, epochs, train_size):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    After every epoch the whole model is saved to
    ``./saved_models/cross_val_model_multi.pt``, overwriting the previous
    file; the directory is created if it does not exist yet.

    Args:
        model: Network to optimise in place; it is called on each input batch.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer used for the parameter updates.
        learning_rate: Not used here (the step size is whatever ``optimizer``
            was configured with); kept so existing callers keep working.
        loss_function: Callable invoked as
            ``loss_function(outputs.float(), targets.long())``.
        epochs: Number of passes over ``dataloader``; must be at least 1.
        train_size: Number of samples ``dataloader`` yields in one pass.

    Returns:
        Training accuracy in percent over the final epoch. Predictions are
        collected batch by batch while the weights are still being updated.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        # Without at least one epoch there is nothing to measure or save.
        raise ValueError("epochs must be at least 1, got {}".format(epochs))

    checkpoint_path = "./saved_models/cross_val_model_multi.pt"
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    for epoch in range(epochs):
        # Per-epoch buffers; the ones from the last epoch give the result.
        predictions = np.zeros(train_size)
        ground_truths_total = np.zeros(train_size)
        count = 0

        for inputs, targets in dataloader:
            current_size = targets.shape[0]
            ground_truths_total[count: count + current_size] = targets

            optimizer.zero_grad()
            outputs = model(inputs)
            # The targets arrive as floats and are cast to long for the loss.
            loss = loss_function(outputs.float(), targets.long())
            loss.backward()
            optimizer.step()

            # Predicted class = index of the largest output score per sample.
            predicted = torch.argmax(outputs.detach(), dim=1)
            predictions[count: count + current_size] = predicted.cpu().numpy()

            count += current_size

        # Overwrite the saved model with the state after this epoch.
        torch.save(model, checkpoint_path)

    return akkuracy(ground_truths_total, predictions)

In [8]:
# Define a method to perform model validation
def validate(trained_model, dataloader, val_size):
    """Return the accuracy (in percent) of ``trained_model`` on ``dataloader``.

    ``dataloader`` yields ``(inputs, ground_truths)`` batches and
    ``val_size`` is the length of the buffers the labels are collected in.
    Gradients are disabled while the batches are evaluated.
    """
    predicted_labels = np.zeros(val_size)
    true_labels = np.zeros(val_size)
    offset = 0
    with torch.no_grad():
        for inputs, ground_truths in dataloader:
            # Predicted class = index of the largest output score per sample.
            scores = trained_model(inputs)
            best_class = torch.max(scores.data, 1)[1]
            best_class_np = np.squeeze(best_class.cpu().detach().numpy())

            batch_len = ground_truths.shape[0]
            batch_slice = slice(offset, offset + batch_len)
            predicted_labels[batch_slice] = best_class_np
            true_labels[batch_slice] = ground_truths
            offset += batch_len

    return akkuracy(true_labels, predicted_labels)

In [9]:
# Define a method to perform k-fold cross-validation (an 80%/20% split when folds = 5)
def cross_validate(dataset_train_X, dataset_train_Y, folds, hidden_units_l1, hidden_units_l2, output_units,
                   *, epochs=50, lr=0.0001, batch_size=32):
    """Run k-fold cross-validation for one choice of hidden-layer sizes.

    For every fold a fresh network is trained on the training part with Adam
    and cross-entropy loss, then scored on the held-out part. The folds come
    from a shuffled ``KFold`` with no fixed ``random_state``.

    Args:
        dataset_train_X: Feature array indexed by sample along its first
            axis; its second dimension sets the network's input size.
        dataset_train_Y: Target array indexed by sample.
        folds: Number of folds.
        hidden_units_l1: Units in the first hidden layer.
        hidden_units_l2: Units in the second hidden layer.
        output_units: Units in the output layer.
        epochs: Training epochs per fold.
        lr: Learning rate given to Adam.
        batch_size: Batch size of the training and validation loaders.

    Returns:
        Mean validation accuracy in percent over all folds, rounded to three
        decimals.
    """
    results = 0.0
    kfold = KFold(n_splits=folds, shuffle=True)

    for fold, (train_ids, val_ids) in enumerate(kfold.split(dataset_train_X)):
        print("\tCross validation fold", fold + 1)

        # Training: every fold starts from a newly initialised network.
        net = model(dataset_train_X.shape[1], hidden_units_l1, hidden_units_l2, output_units).train()
        loss = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)

        fold_train_features = dataset_train_X[train_ids]
        fold_train_targets = dataset_train_Y[train_ids]
        fold_train_dataset = MultiDataset(fold_train_features, fold_train_targets)
        fold_train_dataloader = torch.utils.data.DataLoader(dataset=fold_train_dataset,
                                                            batch_size=batch_size,
                                                            collate_fn=None,
                                                            shuffle=False)

        train_acc = train(net, fold_train_dataloader, optimizer, lr, loss, epochs, fold_train_targets.shape[0])
        print("\tTraining Complete with accuracy = {}% ..saving a model".format(round(train_acc, 3)))

        # Validation: `net` was trained in place, so it is scored directly
        # instead of being read back from the file written by train().
        # torch.load rejects fully pickled modules by default on recent
        # PyTorch releases, and the round trip added nothing.
        net.eval()

        fold_val_features = dataset_train_X[val_ids]
        fold_val_targets = dataset_train_Y[val_ids]
        fold_val_dataset = MultiDataset(fold_val_features, fold_val_targets)
        fold_val_dataloader = torch.utils.data.DataLoader(dataset=fold_val_dataset,
                                                          batch_size=batch_size,
                                                          collate_fn=None,
                                                          shuffle=False)

        valid_acc = validate(net, fold_val_dataloader, fold_val_targets.shape[0])
        results += valid_acc
        print("\tValidation Accuracy = {}%\n".format(round(valid_acc, 3)))

    final_acc = round(results / folds, 3)
    print("Mean validation accuracy across {} folds: {}%".format(folds, final_acc))
    return final_acc

In [10]:
def return_data(dataset_path):
    """Load a .mat file and return the array stored under its file name.

    The variable is looked up under the file name truncated at its first
    '.', e.g. ``train_images`` for ``.../train_images.mat``.

    Args:
        dataset_path: Path of the .mat file to read.

    Returns:
        The object stored in the file under that key.

    Raises:
        KeyError: If the file holds no variable with that name.
    """
    # os.path.basename honours the platform's path separator, whereas
    # splitting on '/' keeps the directory part of a '\\'-separated path.
    file_name = os.path.basename(dataset_path)
    variable_name = file_name.split('.')[0]
    return loadmat(dataset_path)[variable_name]

# Pick the training files by name. glob returns paths in arbitrary order,
# so fixed list positions may hold different files on another machine.
paths_by_file = {p.replace('\\', '/').split('/')[-1]: p for p in datasets}
train_X = return_data(paths_by_file['train_images.mat'])
train_Y = np.squeeze(return_data(paths_by_file['train_labels.mat']))

folds = 5
output_units = 10
l1_values = [50, 75, 100]
l2_values = [10, 15, 20]


print("\n__________________________________________________________")
print("\nPerforming 5 fold cross validation in the multi class dataset")
print("Varying number of hidden units: L1 in [50, 75, 100] and L2 in [10, 15, 20]")
print("__________________________________________________________\n")
# Vary hidden units in L1 and L2; results are appended with l1 as the outer
# and l2 as the inner loop.
results = []
for l1 in l1_values:
    for l2 in l2_values:
        print("\nNumber of hidden units in L1= {} and L2= {}\n".format(l1, l2))
        results.append(cross_validate(train_X, train_Y, folds, l1, l2, output_units))
        print("__________________________________________________________\n")
print("********************************************************************\n")
results = np.array(results)
best_acc = np.max(results)
# argmax gives one scalar index (the first best entry on ties), so the lookup
# below cannot fail when several configurations share the top accuracy.
ind = int(np.argmax(results))

# Undo the flattening of the (l1, l2) grid: one row per L1 value, each row
# holding len(l2_values) entries.
l1_index, l2_index = divmod(ind, len(l2_values))
l1_best = l1_values[l1_index]
l2_best = l2_values[l2_index]

print("Best number of hidden units in L1 and L2 found to be {} and {} respectively with validation accuracy of {} %".format(l1_best, l2_best, best_acc))
print("\n********************************************************************\n")


__________________________________________________________

Performing 5 fold cross validation in the multi class dataset
Varying number of hidden units: L1 in [50, 75, 100] and L2 in [10, 15, 20]
__________________________________________________________


Number of hidden units in L1= 50 and L2= 10

	Cross validation fold 1
	Training Complete with accuracy = 92.312% ..saving a model
	Validation Accuracy = 81.6%

	Cross validation fold 2
	Training Complete with accuracy = 92.775% ..saving a model
	Validation Accuracy = 82.4%

	Cross validation fold 3
	Training Complete with accuracy = 91.412% ..saving a model
	Validation Accuracy = 82.1%

	Cross validation fold 4
	Training Complete with accuracy = 92.75% ..saving a model
	Validation Accuracy = 84.35%

	Cross validation fold 5
	Training Complete with accuracy = 94.125% ..saving a model
	Validation Accuracy = 82.7%

Mean validation accuracy across 5 folds: 82.63%
__________________________________________________________


Number of hi