# The following code was used to partially analyse which hyperparameters are best suited to our Sign Language MNIST dataset
## The analysis results (console output, TensorBoard logs) were produced with Ray Tune

In [None]:
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
import functools
import operator

from sign_language_mnist import get_train_val_loaders


In [None]:
class CNN(nn.Module):
    """Three-stage convolutional classifier with a two-hidden-layer dense head.

    Every convolutional stage is Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2)
    -> Dropout(0.25). The head is two Linear/ReLU/Dropout(0.5) layers followed
    by a Linear layer and LogSoftmax, so ``forward`` returns log-probabilities.

    Parameters
    ----------
    layer_1_size, layer_2_size, layer_3_size : int
        Number of output channels of the first, second and third convolution.
    kernel_l1, kernel_l2, kernel_l3 : int
        Kernel size of the first, second and third convolution
        (all of them use ``padding=1``).
    fc1, fc2 : int
        Width of the first and second hidden fully connected layer.
    input_dims : tuple of int
        Shape ``(channels, height, width)`` of one input sample. The channel
        count is also used as the input channel count of the first convolution.
    num_classes : int
        Number of output units; defaults to 25, the value that was previously
        hard-coded.
    """

    def __init__(self, layer_1_size, kernel_l1, layer_2_size, kernel_l2, layer_3_size, kernel_l3, fc1, fc2, input_dims=(1, 28, 28), num_classes=25):

        super().__init__()

        # The stages are unpacked into ONE flat Sequential so that the layer
        # indices (and therefore the state_dict keys) stay 0..14.
        self.feature_extractor = nn.Sequential(
            *self._conv_stage(input_dims[0], layer_1_size, kernel_l1),
            *self._conv_stage(layer_1_size, layer_2_size, kernel_l2),
            *self._conv_stage(layer_2_size, layer_3_size, kernel_l3),
        )

        # Trick to find the number of flattened features entering the first
        # fully connected layer: push one dummy sample through the extractor.
        # The probe runs in eval mode without gradients so that it neither
        # updates the BatchNorm running statistics nor builds an autograd graph.
        self.feature_extractor.eval()
        with torch.no_grad():
            probe = self.feature_extractor(torch.zeros(1, *input_dims))
        self.feature_extractor.train()
        num_features_before_fcnn = functools.reduce(operator.mul, probe.shape[1:], 1)

        self.classifier = nn.Sequential(
            nn.Linear(num_features_before_fcnn, fc1),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(fc1, fc2),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(fc2, num_classes),
            nn.LogSoftmax(dim=1)
        )

    @staticmethod
    def _conv_stage(in_channels, out_channels, kernel_size):
        """Return the layers of one Conv -> BatchNorm -> ReLU -> MaxPool -> Dropout stage."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),
        ]

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, num_classes)`` for the batch ``x``."""
        out = self.feature_extractor(x)
        # Flatten everything except the batch dimension for the dense head.
        out = out.view(x.size(0), -1)
        return self.classifier(out)

In [None]:
def train_tune(config, num_epochs=35, checkpoint_dir=None, data_dir=None):
    """
    Train and validate one CNN; used as the trainable for every trial created in "main_tune".

    After every epoch the model/optimizer state is checkpointed and the
    validation loss and accuracy are reported to Ray Tune under the metric
    names "loss" and "accuracy".

    Parameters
    ----------
    config : dict
        Hyperparameters of this trial. Required keys: "layer_1_size",
        "layer_2_size", "layer_3_size", "kernel_l1", "kernel_l2", "kernel_l3",
        "fc1" and "fc2". Optional key: "lr" (Adam learning rate, default 0.001).
    num_epochs : int
        Number of epochs to train the model for.
    checkpoint_dir : str
        Directory holding a "checkpoint" file to restore the model and
        optimizer state from; nothing is restored when it is None.
    data_dir : str
        A path where the data is stored. Currently unused: the loaders are
        obtained from get_train_val_loaders(), which is called without arguments.
    """
    net = CNN(
        config["layer_1_size"], config["kernel_l1"],
        config["layer_2_size"], config["kernel_l2"],
        config["layer_3_size"], config["kernel_l3"],
        config["fc1"], config["fc2"],
    )

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    # The network already ends in LogSoftmax; CrossEntropyLoss applies
    # log-softmax again, which leaves log-probabilities unchanged.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=config.get("lr", 0.001))

    # Resume from a previous checkpoint of this trial if one was handed in.
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"), map_location=device)
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    data_loaders = get_train_val_loaders()
    train_loader = data_loaders['train']
    val_loader = data_loaders['val']

    for epoch in range(num_epochs):  # loop over the dataset multiple times
        # Training mode: Dropout active, BatchNorm uses batch statistics.
        net.train()
        running_loss = 0.0
        epoch_steps = 0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                # Reset both accumulators so the next print averages only
                # over the mini-batches seen since this one.
                running_loss = 0.0
                epoch_steps = 0

        # Validation loss and accuracy. Evaluation mode switches Dropout off
        # and makes BatchNorm use its running statistics.
        net.eval()
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                val_loss += criterion(outputs, labels).item()
                val_steps += 1

        # A separate name is used so the "checkpoint_dir" argument is not shadowed.
        with tune.checkpoint_dir(epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)

    print("Finished Training")

In [None]:
def main_tune(num_samples=10, num_epochs=35, gpus_per_trial=1):
    """
    Make an analysis with ray tune creating many models with different hyperparameter values,
    produce local output table with the CLIReporter and store additional results in a tensorboard

    Parameters
    ----------
    num_samples : int
        Number of times the search space is sampled. With the grid-search
        config used here Ray Tune repeats the complete grid ``num_samples``
        times; with ``tune.choice`` entries instead it would be the number of
        randomly drawn configurations.
    num_epochs : int
        Maximum number of epochs to train each model (also the ``max_t`` of
        the ASHA scheduler).
    gpus_per_trial : int
        Number of GPUs requested for each trial.

    Returns
    -------
    dict
        Best hyperparameter value combination (lowest reported "loss").
    """
    # Only hyperparameters that train_tune actually reads are searched.
    # "fc3" was dropped because no layer consumes it, so searching it merely
    # tripled the number of trials. To sample randomly instead of running an
    # exhaustive grid, replace tune.grid_search with tune.choice.
    config = {
        "layer_1_size": tune.grid_search([32, 64, 128]),
        "layer_2_size": tune.grid_search([64, 128, 256]),
        "layer_3_size": tune.grid_search([64, 128, 256]),
        "kernel_l1": tune.grid_search([3, 5]),
        "kernel_l2": tune.grid_search([3, 5]),
        "kernel_l3": tune.grid_search([3, 5]),
        "fc1": tune.grid_search([64, 128, 256]),
        "fc2": tune.grid_search([64, 128, 256]),
        #"lr": tune.grid_search([0.0001, 0.001, 0.01, 0.1]),
        #"batch_size": tune.grid_search([32, 64, 128, 256]),
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    # The metric names must match the keyword names passed to tune.report in
    # train_tune ("loss" and "accuracy").
    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "layer_3_size", "kernel_l1", "kernel_l2", "kernel_l3", "fc1", "fc2"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            train_tune,
            num_epochs=num_epochs,
        ),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="main_tune")

    best_config = analysis.best_config
    print("Best hyperparameters found were: ", best_config)
    return best_config

In [None]:
# Start the hyperparameter search only when this file is executed directly
# (script or notebook cell), not when it is imported from another module.
if __name__ == "__main__":
    main_tune()