Although this Notebook aims to use Tensorboard to visualize the performance metrics and the learning of the network across epochs for a supervised learning dataset of digits available in `sklearn.datasets`, it also covers a few more topics which are mentioned below.
- Define your own Dataset class to load data as batchwise tensors. Refer [How to create a custom dataloader](https://medium.com/analytics-vidhya/writing-a-custom-dataloader-for-a-simple-neural-network-in-pytorch-a310bea680af) and the notebook below for the same.

- Visualize metrics and parameter learnings in tensorboard.

- Perform hyperparameter tuning using cartesian product and an OOP implementation (RunBuilder class).

- Organize the training process using OOP and save the results in an ordered fashion using RunManager class.

In [108]:
# The most important package
import torch

# Package with miscellaneous capabilities for loading image data, transforming etc.
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset

# To define the network
import torch.nn as nn
import torch.nn.functional as F
from torch import optim         # To define the optimizer to study the class

# Basic data related operations
import numpy as np
import pandas as pd

# For loading the dataset and for evaluation purposes
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# For writing to a tensorboard file.
from torch.utils.tensorboard import SummaryWriter

# To compute the cartesian product in hyperparameter tuning
from itertools import product

# To create a named-tuple in RunBuilder class for using `.` notation to access hyperparameters easily.
from collections import namedtuple

import time   # To time the epochs and runs across different hyperparameters
import json   # To write to a json file
from IPython.display import display, clear_output   # To dynamically update the output in the notebook

In [153]:
# Constants
B_SIZE = 64
LEARNING_RATE = 0.01
EPOCHS = 150
RO = 2 # Round off upto RO digits

To install tensorboard, you can simply do 

`!pip install tensorboard` 

from this notebook. If it's already installed, uninstall and reinstall to get the latest version.

In [110]:
!tensorboard --version

2.1.0


In [111]:
bc = load_digits()
X_train, X_ts, y_train, y_ts = train_test_split(bc.data, bc.target, test_size = 0.3)
X_val, X_test, y_val, y_test = train_test_split(X_ts, y_ts, test_size = 0.333)

In [112]:
def add_target(X, y):
    return np.hstack((X, y.reshape(y.shape[0], 1)))

In [113]:
train = add_target(X_train, y_train)
val = add_target(X_val, y_val)
test = add_target(X_test, y_test)

# Defining your own dataloader

torch provides `Dataset` and `Dataloader` classes in order to help you feed the data to the network which you can tweak as you like. If you want to define your own way of feeding data, you'll have to inherit the `Dataset` class and implement the `__getitem__` and `__len__` methods.

In [114]:
class datset(Dataset):
    
    # Convert the data into a float tensor
    def __init__(self, data):
        self.data = torch.FloatTensor(data)
    
    
    def __len__(self):
        return len(self.data)
    
    
    def __getitem__(self, index):
        
        # Last column is the target column
        target = self.data[index][-1].long()
        
        # All other predictor columns (i.e. remove the target column)
        data_val = self.data[index][:-1]
        
        # Return the predictor variables and the target variable
        return data_val, target

In [115]:
# Define datasets to pass to the dataloader object
train_set = datset(train)
val_set = datset(val)
test_set = datset(test)

In [116]:
# Define dataloaders to load data in batches
train_loader = DataLoader(train_set, batch_size = B_SIZE, shuffle = True)
val_loader = DataLoader(val_set, batch_size = B_SIZE)
test_loader = DataLoader(test_set, batch_size = B_SIZE)

In [120]:
# Binary classification with certain number of predictors
input_size = train_set[0][0].size()[0]
output_size = 10

In [121]:
# Define the network architecture
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        
        # First Dense Layer
        self.fc1 = nn.Linear(in_features = input_size, out_features = 50)
        # Output layer
        self.out = nn.Linear(in_features = 50, out_features = output_size)
        
    def forward(self, t):
        # Input
        t = t
        
        # Dense Layer 1
        t = self.fc1(t)
        t = F.relu(t)
        
        # Output Layer
        t = self.out(t)
        t = F.softmax(t, dim = 1)
        
        return t

In [158]:
# Define a function to calculate validation accuracy and validation loss
def validate(model, loader):
    # Define containers to accumulate loss and correct predictions
    total_loss = 0.
    correct_predictions = 0.
    
    # Since we're only predicting, we don't need to compute the gradients
    torch.set_grad_enabled(False)
    
    for batch in loader:
        predictors, target = batch
        
        # Do a forward pass
        predictions = model(predictors)
        
        # Compute and accumulate the loss
        total_loss += F.cross_entropy(predictions, target).item()
        
        # Find out the argmax to get the category of each entry
        predictions = predictions.argmax(dim = 1)
        
        # Get the total number of correct pedictions in the batch and accumulate the same
        correct_predictions += predictions.eq(target).sum().item()
    
    # Switch on the gradient computing 
    torch.set_grad_enabled(True)
    
    # Return the loss and accuracy
    acc = 100 * correct_predictions / (loader.batch_size * len(loader))
    
    return (np.round(total_loss , RO), np.round(acc, RO))

# Hyperparameter tuning and Tensorboard Visualization

We can perform hyperparameter tuning using the following methods and visualize it in tensorboard. Then we can pick the best hyperparameters for building our network and use it.

**Creating a grid of values**
- Individually define lists for all the parameters which you wanna use to build the network architecture.

- Use Cartesian product to create an assortment of all those pairs together.

```python
params = dict(
LR_values = list(np.linspace(0.01, 0.04, 4)),
BATCH_SIZE_values = [10, 32, 64]
)

# Make a list of the lists of values
param_values = [val for val in params.values()]

# Iterate over the product
from itertools import product
for i, j in product(*param_values):
    print(i, j)
```
- Iterate over the parameters in these list to obtain the performance parameters and store them.

- Pick those parameters which gave the best value.

**Storing data in a logfile to visualize in tensorboard**

Use the `SummaryWriter` class in order to create a logfile to store the details of runs for every epoch and for every combination of tried out hyperparameters.

Use the `comment` argument of `SummaryWriter` class to accomplish this purpose.

```python
# Define a writer with respective params in comment
cm = f"Learning Rate- {LR} Batch Size- {BATCH_SZ}"
tb = SummaryWriter(comment = cm)

# Add a scalar to the written file
tb.add_scalar("Loss", epoch_loss, e)

# Add a histogram (weights, biases, gradients) to the written file
tb.add_histogram(name, weight, e)

# Close the file before quitting the program
tb.close()

```

Save the training loss and training and testing accuracies as scalars and the weights of the network layers as histograms. Then you can open command prompt, navigate to the folder which contains these logfiles (generally logfiles are kept in the runs folder, so you can navigate to the folder containing the runs folder) and type the following command in anaconda prompt

`tensorboard --logdir=runs`

It will give you a server which you open in a web browser and there you can find plots for all your parameters. 

A few examples of the same are shown in the snapshot below
![](./images/Tensorboard_graphs.PNG)

In [154]:
params = dict(
LR_values = list(np.linspace(0.01, 0.04, 4)),
BATCH_SIZE_values = [10, 32, 64]
)

param_values = [val for val in params.values()]

In [157]:
for LR, BATCH_SZ in product(*param_values): 
    cm = f"Learning Rate- {LR} Batch Size- {BATCH_SZ}"
    print("\nThe set of parameters for testing are as follows: \n")
    print(cm)
    # Create an instance of the network
    model = Network()

    # Define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr = LR)

    # Implement the training loop
    torch.set_grad_enabled(True)
    
    # Define the loader
    train_loader = DataLoader(train_set, batch_size = BATCH_SZ)
    
    # Define the SummaryWriter to write in a log file
    tb = SummaryWriter(comment = cm)
    
    for e in range(EPOCHS):
        epoch_loss = 0.
        epoch_correct_predictions = 0.

        for batch in train_loader:
            predictors, target = batch        

            # Forward propagation
            predictions = model(predictors)

            # Calculate  and accumulate the loss
            loss = F.cross_entropy(predictions, target)
            epoch_loss += loss.item()

            # Empty the gradients before accumulating them
            optimizer.zero_grad()

            # Backpropagation i.e. find out the gradients
            loss.backward()

            # Modify the weights of the model
            optimizer.step()

            # Get the index that corresponds to the class
            predictions = predictions.argmax(dim = 1)

            # Find out the number of correct predictions for this batch and accumulate them
            epoch_correct_predictions += predictions.eq(target).sum().item()

        val_loss, val_accuracy = validate(model, val_loader)
        
        train_accuracy = 100 * epoch_correct_predictions / len(train_set)
        
        if ((e == 0) or (e == EPOCHS - 1)):
            print(f"Epoch: {e}  Train Loss: {np.round(epoch_loss, RO)} Train Accuracy: {np.round(train_accuracy, RO)} Validation Loss: {val_loss} Validation Accuracy: {val_accuracy}")
    
        tb.add_scalar("Loss", epoch_loss, e)
        tb.add_scalar("Train Accuracy", train_accuracy, e)
        tb.add_scalar("Validation Accuracy", val_accuracy, e)

        for name, weight in model.named_parameters():
            tb.add_histogram(name, weight, e)
            tb.add_histogram(f"{name}.grad", weight.grad, e)
    
    # Close the writer object    
    tb.close()


The set of parameters for testing are as follows: 

Learning Rate- 0.01 Batch Size- 10
Epoch: 0  Train Loss: 239.08 Train Accuracy: 56.25 Validation Loss: 11.08 Validation Accuracy: 56.77
Epoch: 149  Train Loss: 199.4 Train Accuracy: 87.83 Validation Loss: 9.74 Validation Accuracy: 78.65

The set of parameters for testing are as follows: 

Learning Rate- 0.01 Batch Size- 32
Epoch: 0  Train Loss: 75.17 Train Accuracy: 59.03 Validation Loss: 10.44 Validation Accuracy: 68.49
Epoch: 149  Train Loss: 59.49 Train Accuracy: 97.3 Validation Loss: 8.97 Validation Accuracy: 90.36

The set of parameters for testing are as follows: 

Learning Rate- 0.01 Batch Size- 64
Epoch: 0  Train Loss: 39.88 Train Accuracy: 46.38 Validation Loss: 10.94 Validation Accuracy: 60.68
Epoch: 149  Train Loss: 29.32 Train Accuracy: 99.52 Validation Loss: 8.89 Validation Accuracy: 91.67

The set of parameters for testing are as follows: 

Learning Rate- 0.02 Batch Size- 10
Epoch: 0  Train Loss: 297.85 Train Accuracy: 

# Organizing Code
## RunBuilder Class

We can create a class in order to store the values of cartesian product of different hyperparameters which we did above. It helps clean the code up and create order, improve readability and add modularity to the code.

What will this class do?

It will help to get individual named tuples which could be used to iterate over and obtain results for different hyperparameters. This is how we can do it:

```python
class RunBuilder():
    @staticmethod
    def get_runs(params):
        
        # Creates a named tuple which we can use to access values in organized way of .notation
        Run = namedtuple('Run', params.keys())
        # Create a container to hold all the combination of params
        runs = []
        
        # Compute the Cartesian product of parameters
        for v in product(*params.values()):
            runs.append(Run(*v))
        
        return runs
```

**Why Static Method?**

We do not want to create an instance of the class because it has a method which will be used by one and all identically to create groups of hyperparameter values.

We could then access the class method as follows:
```python
params = OrderedDict(
    lr = [.01, .001]
    ,batch_size = [1000, 10000]
)

for run in RunBuilder.get_runs(params):
    comment = f'-{run}'
```

How is this better than unpacking values from a simple cartesian product? In the `for` loop, you've to unpack the values manually and do stuff whereas here, the class' method handles that. 

## RunManager Class

We can see the code above is cluttered and has a lot of stuff written in chunks which could be modularized and organized nicely. Let's implement that.

A brief on what the attributes of the RunManager class are and what are each of the methods doing:

`__init__`: It is initializing all the instance varaibles of the class which are as follows:

   - `e`: It is an instance of the class Epoch which holds information about an epoch's count, it's loss, it's number of correct predictions, and the time at which that epoch started.

   - `run_params`: It's a named tuple returned by the `RunBuilder` class which holds the values of parameters with which the run has happened.

   - `run_count`: It's a running count of which named tuple or which set of hyperparameters am I using for tuning currently.

   - `run_data`: It's a container to hold all the results related to a run as we step through different values of run_params

   - `run_start_time`: When we started experimenting with a particular value of run_params.

   - `network`: It's the neural network architecture with weights.
   
   - `loader`: It's the loader through which batches of data will be passed through the network.
   
   - `tb`: It's the tensorboard object used to write data to runs folder for visualizing results in TensorBoard
   
`begin_run`: It is a method which is called at the begining of a run before the loop of epochs. It does the following things.
   
   - Remembers the start time of the run.
   
   - Sets the hyperparameters for a run.
   
   - Keeps track of how many run combinations are happening. Increments by 1 every time this method is called.
   
   - Defines the loader based on batch_size which may be a potential hyperparameter.
   
   - Instantiate a SummaryWriter object to store the results of all the runs.

`end_run`: When a run is over, it resets the epoch count and closes the SummaryWriter object.

`begin_epoch`: It is called at the beginning of each epoch. What it does is increments the epoch count, registers the start time of an epoch and resets the loss and num_correct attributes of the Epoch instance to zero.

`end_epoch`: It does a lot of things. They're as follows.

- It clocks in the time required for one epoch to complete. Also it calculates the runtime which ain't right except for the last epoch of a run but pardon the inconvenience caused.

- Compute the total loss and accuracy. 

- Write these parameters to tensorboard.

- Store the values corresponding to the epoch finished in the `run_data` container 

`track_loss`: It computes the loss at the end of every epoch and stores it to the Epoch instance attribute.

`track_num_correct`: It computes the number of correct predictions and stores it to the Epoch instance attribute.

`save`: It saves the `run_data` instance variables in an orderly fashion to a csv and json file respectively.

In [167]:
# Define the RunBuilder class

class RunBuilder():
    @staticmethod
    def get_runs(params):
        
        # Creates a named tuple which we can use to access values in organized way of .notation
        Run = namedtuple('Run', params.keys())
        # Create a container to hold all the combination of params
        runs = []
        
        # Compute the Cartesian product of parameters
        for v in product(*params.values()):
            runs.append(Run(*v))
        
        return runs

In [160]:
# Define the epoch class
class Epoch():
    def __init__(self):
        # Keeps track of which epoch it is
        self.count = 0
        # Keeps track of the loss
        self.loss = 0
        # Keeps track of the number of correct predictions
        self.num_correct = 0
        # When to start
        self.start_time = 0

In [183]:
class RunManager():
    def __init__(self):
        self.e = Epoch()
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None
        self.network = None
        self.loader = None
        self.tb = None
    
    def begin_run(self, run, network, loader):
        self.run_start_time = time.time()
        self.run_params = run
        self.run_count += 1
        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment = f"-{run}")
#         self.tb.add_graph(network)
    
    def end_run(self):
        self.tb.close()
        self.e.count = 0
    
    def begin_epoch(self):
        self.e.start_time = time.time()
        self.e.count += 1
        self.e.loss = 0
        self.e.num_correct = 0
        
    def end_epoch(self):
        epoch_duration = time.time() - self.e.start_time
        run_duration = time.time() - self.run_start_time
        
        loss = self.e.loss /len(self.loader.dataset)
        accuracy = self.e.num_correct / (len(self.loader.dataset))
        
        self.tb.add_scalar('Loss', loss, self.e.count)
        self.tb.add_scalar('Accuracy', accuracy, self.e.count)
        
        for name, param in network.named_parameters():
            self.tb.add_histogram(name, param, self.e.count)
            self.tb.add_histogram(f"{name}.grad", param.grad, self.e.count)
            
        results = {'Run': self.run_count, 
                   'Epoch': self.e.count, 
                   'Loss':loss,
                   'Accuracy': accuracy,
                   'Epoch Duration': epoch_duration,
                   'Run Duration': run_duration}
        
        for k, v in self.run_params._asdict().items(): 
            results[k] = v
        
        self.run_data.append(results)

        df = pd.DataFrame.from_dict(self.run_data, orient='columns')
        clear_output(wait = True)
        display(df)
        
    def track_loss(self, loss):
        self.e.loss += loss.item() * self.loader.batch_size
    
    def track_num_correct(self, pred, labels):
        self.e.num_correct += pred.argmax(dim = 1).eq(labels).sum().item()
        
    def save(self, fileName):
        pd.DataFrame.from_dict(
            self.run_data, orient='columns'
        ).to_csv(f'{fileName}.csv')

        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)      

In [184]:
params = dict(
    lr = [.01, 0.02]
    ,batch_size = [100, 200]
    ,shuffle = [True, False]
)

In [187]:
m = RunManager()
for run in RunBuilder().get_runs(params):
    
    network = Network()
    
    loader = DataLoader(train_set, batch_size = run.batch_size, shuffle = run.shuffle)
    
    optimizer = optim.SGD(network.parameters(), lr = run.lr)
    
    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch()
        
        for batch in loader:
            predictors, target = batch
            
            predictions = network(predictors)
            
            loss = F.cross_entropy(predictions, target)
            
            optimizer.zero_grad()
            
            loss.backward()
            
            optimizer.step()
            
            m.track_loss(loss)
            m.track_num_correct(predictions, target)
        
        m.end_epoch()
    m.end_run()

m.save('results')

Unnamed: 0,Run,Epoch,Loss,Accuracy,Epoch Duration,Run Duration,lr,batch_size,shuffle
0,1,1,2.368076,0.170247,0.068634,0.140598,0.01,100,True
1,1,2,2.336621,0.206046,0.115923,0.343686,0.01,100,True
2,1,3,2.31671,0.214797,0.129149,0.550384,0.01,100,True
3,1,4,2.296158,0.237868,0.084952,0.721663,0.01,100,True
4,1,5,2.274956,0.278441,0.111935,0.918147,0.01,100,True
5,2,1,2.40768,0.095465,0.078117,0.44847,0.01,100,False
6,2,2,2.373144,0.162291,0.079953,0.650438,0.01,100,False
7,2,3,2.327235,0.228321,0.08791,0.838034,0.01,100,False
8,2,4,2.286817,0.264916,0.086984,1.005302,0.01,100,False
9,2,5,2.232415,0.383453,0.118281,1.197232,0.01,100,False
