# 

In [1]:
# setup imports
%matplotlib inline
from torchvision import transforms, datasets
import torchvision
import torch.optim as optim  # optimization package
import torch.nn.functional as F
import torch.nn as nn
import torch
from tqdm import tqdm_notebook as tqdm  # progress bar
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.tensorboard import SummaryWriter
plt.style.use('dark_background')
plt.rcParams["figure.figsize"] = (20, 13)

from collections import OrderedDict, namedtuple
from itertools import product # computes a cortege from multilpe lists
import time

In [2]:
# device (GPU) configuration
print('CUDA available:\t', torch.cuda.is_available())
print('Number of GPUs:\t', torch.cuda.device_count())
print('Default device:\t', torch.cuda.get_device_name())

# set the device
device = "cuda:0"

# enable the benchmark mode in cudnn (finds optimal settings for network configuration)
# but if theinput sizes changes at each iteration turn it off
torch.backends.cudnn.benchmark = True

CUDA available:	 True
Number of GPUs:	 1
Default device:	 GeForce GTX 1060


# set the batch size
batch_size = 100

# какие трансформации применяем к сету перед тем как начать его использовать (их может быть много)
transform = transforms.Compose([transforms.ToTensor()]) # ToTensor - because we need to transform the set`s data to the tensor form

# train set and train loader
train_set = torchvision.datasets.FashionMNIST(
    root='D:/Stuff on HDD/My stuff/ML DS/Data_folder',
    train=True, download=True, # download=True to download set from the web, train=True to get the part of the set designated for training
    transform=transform)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)

# test set and test loader
test_set = torchvision.datasets.FashionMNIST(
    root='D:/Stuff on HDD/My stuff/ML DS/Data_folder',
    train=False, download=False, # False & False, cause for testing and already downloaded
    transform=transform)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [3]:
class Network(nn.Module):
    def __init__(self):        
        super().__init__()
        
        # create the layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)        
        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120) # 12*4*4 = 12 "инпутов" 4 на 4 пикселя
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)
        
    def forward(self, t):
        t = self.conv1(t)
        t = F.relu(t) # set an activation function
        t = F.max_pool2d(t, kernel_size=2, stride=2) # pooling layer
        
        t = self.conv2(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size=2, stride=2)
        
        t = t.reshape(-1, 12 * 4 * 4) # flattening the tensor before forwarding it to fc1
        t = self.fc1(t)
        t = F.relu(t)
        
        t = self.fc2(t)
        t = F.relu(t)
        
        t = self.out(t)
        
        return t

In [4]:
class RunBuilder():    
    @staticmethod
    def get_runs(params):
        """Returns a list of tuples of parameters for running a network"""        
        Run = namedtuple('Run', params.keys())
        runs=[]        
        for v in product(*params.values()):
            runs.append(Run(*v))        
        return runs
    
"""    
class Epoch():
    def __init__(self):
        self.count = 0
        self.loss = 0
        self.num_correct = 0
        self.start_time = None
        
class Run():
    def __init__(self):
        self.params = None
        self.count = 0
        self.data = []
        self.start_time = None
"""      

class RunManager():
    def __init__(self):        
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None
        
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None
        
        self.network = None
        self.loader = None
        self.tb = None
        
    def begin_run(self, run, network, loader):        
        self.run_start_time = time.time()
        
        self.run_params = run
        self.run_count += 1
        
        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')
        
        images, labels = next(iter(self.loader))
        images = images.to(device)
        labels = labels.to(device)
        grid = torchvision.utils.make_grid(images)
        
        self.tb.add_image('images', grid)
        self.tb.add_graph(self.network, images)
        
    def end_run(self):        
        self.tb.close()
        self.epoch_count = 0
        
    def begin_epoch(self):        
        self.epoch_start_time = time.time()        
        self.epoch_count += 1
        self.epoch_count = 0
        self.epoch_num_correct = 0
        
    def end_epoch(self):
        """End an epoch, calculate loss and accuracy, save the results to the TensorBoard and
        DataFrame, display the DataFrame"""        
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time
        
        # calculate loss & accuracy relative to the size of the dataset
        loss = self.epoch_loss / len(self.loader.dataset)
        accuracy = self.epoch_num_correct / len(self.loader.dataset)
        
        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)
        
        # pass histogram data to the tensorboard
        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)
        
        # saving run data to DataFrame
        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch duration"] = epoch_duration
        results["run duration"] = run_duration        
        for key, value in self.run_params._asdict().items(): results[key] = value # add run_params to the dictionary
        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient='columns')
        display(df) # display results of the 
        
    def track_loss(self, loss):
        """Update loss value"""
        self.epoch_loss += loss.item() * self.loader.batch_size
        
    def track_num_correct(self, preds, labels):
        """Update number of correct predictions"""
        self.epoch_num_correct += self._get_num_correct(preds, labels)
    
    @torch.no_grad() # turn off gradient tracking
    def _get_num_correct(self, preds, labels): # underscore indicates that this is an internal class method
        """Calculate the number of correct predictions"""
        return preds.argmax(dim=1).eq(labels).sum().item()
    
    def save(self, fileName):
        """Save the results to .csv and .json"""
        pd.DataFrame.from_dict(self.run_data, orient='columns').to_csv(f'{fileName}.csv')        
        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)

In [5]:
# setup the transformations to the dataset
transform = transforms.Compose([transforms.ToTensor()]) # ToTensor - because we need to transform the set`s data to the tensor f

# prepare the training set
train_set = torchvision.datasets.FashionMNIST(
    root='D:/Stuff on HDD/My stuff/ML DS/Data_folder',
    train=True, download=True, # train=True to get the part of the set designated for training
    transform=transform)




# set the training parameters
params = OrderedDict(lr = [.01], batch_size = [100, 2000])





# start the training
m = RunManager()
for run in RunBuilder.get_runs(params):
    network = Network().to(device)
    loader = torch.utils.data.DataLoader(train_set, batch_size=run.batch_size)
    optimizer = optim.Adam(network.parameters(), lr=run.lr)
    
    m.begin_run(run, network, loader)
    for epoch in tqdm(range(5), desc='Epoch'):
        m.begin_epoch()
        
        for batch in tqdm(loader, desc='Batch'):
            images = batch[0]
            images = images.to(device)
            labels = batch[1]
            labels = labels.to(device)
            preds = network(images)
            loss = F.cross_entropy(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()            
            m.track_loss(loss)
            m.track_num_correct(preds, labels)
            
        m.end_epoch()
        print('end epoch')
    m.end_run()
m.save('results')           

HBox(children=(IntProgress(value=0, description='Epoch', max=5, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Batch', max=600, style=ProgressStyle(description_width='initi…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100


end epoch


HBox(children=(IntProgress(value=0, description='Batch', max=600, style=ProgressStyle(description_width='initi…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100
1,1,0,0.986689,0.8492,24.690674,60.078311,0.01,100


end epoch


HBox(children=(IntProgress(value=0, description='Batch', max=600, style=ProgressStyle(description_width='initi…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100
1,1,0,0.986689,0.8492,24.690674,60.078311,0.01,100
2,1,0,1.353712,0.86345,26.064752,86.403368,0.01,100


end epoch


HBox(children=(IntProgress(value=0, description='Batch', max=600, style=ProgressStyle(description_width='initi…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100
1,1,0,0.986689,0.8492,24.690674,60.078311,0.01,100
2,1,0,1.353712,0.86345,26.064752,86.403368,0.01,100
3,1,0,1.708867,0.86855,24.564326,111.223564,0.01,100


end epoch


HBox(children=(IntProgress(value=0, description='Batch', max=600, style=ProgressStyle(description_width='initi…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100
1,1,0,0.986689,0.8492,24.690674,60.078311,0.01,100
2,1,0,1.353712,0.86345,26.064752,86.403368,0.01,100
3,1,0,1.708867,0.86855,24.564326,111.223564,0.01,100
4,1,0,2.053017,0.87225,25.953753,137.456567,0.01,100


end epoch



HBox(children=(IntProgress(value=0, description='Epoch', max=5, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Batch', max=30, style=ProgressStyle(description_width='initia…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100
1,1,0,0.986689,0.8492,24.690674,60.078311,0.01,100
2,1,0,1.353712,0.86345,26.064752,86.403368,0.01,100
3,1,0,1.708867,0.86855,24.564326,111.223564,0.01,100
4,1,0,2.053017,0.87225,25.953753,137.456567,0.01,100
5,2,0,3.342941,0.50355,17.042595,19.806788,0.01,2000


end epoch


HBox(children=(IntProgress(value=0, description='Batch', max=30, style=ProgressStyle(description_width='initia…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100
1,1,0,0.986689,0.8492,24.690674,60.078311,0.01,100
2,1,0,1.353712,0.86345,26.064752,86.403368,0.01,100
3,1,0,1.708867,0.86855,24.564326,111.223564,0.01,100
4,1,0,2.053017,0.87225,25.953753,137.456567,0.01,100
5,2,0,3.342941,0.50355,17.042595,19.806788,0.01,2000
6,2,0,3.996362,0.748133,16.397587,36.449713,0.01,2000


end epoch


HBox(children=(IntProgress(value=0, description='Batch', max=30, style=ProgressStyle(description_width='initia…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100
1,1,0,0.986689,0.8492,24.690674,60.078311,0.01,100
2,1,0,1.353712,0.86345,26.064752,86.403368,0.01,100
3,1,0,1.708867,0.86855,24.564326,111.223564,0.01,100
4,1,0,2.053017,0.87225,25.953753,137.456567,0.01,100
5,2,0,3.342941,0.50355,17.042595,19.806788,0.01,2000
6,2,0,3.996362,0.748133,16.397587,36.449713,0.01,2000
7,2,0,4.517981,0.80035,16.266244,53.020112,0.01,2000


end epoch


HBox(children=(IntProgress(value=0, description='Batch', max=30, style=ProgressStyle(description_width='initia…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100
1,1,0,0.986689,0.8492,24.690674,60.078311,0.01,100
2,1,0,1.353712,0.86345,26.064752,86.403368,0.01,100
3,1,0,1.708867,0.86855,24.564326,111.223564,0.01,100
4,1,0,2.053017,0.87225,25.953753,137.456567,0.01,100
5,2,0,3.342941,0.50355,17.042595,19.806788,0.01,2000
6,2,0,3.996362,0.748133,16.397587,36.449713,0.01,2000
7,2,0,4.517981,0.80035,16.266244,53.020112,0.01,2000
8,2,0,4.975092,0.832017,16.19575,69.480155,0.01,2000


end epoch


HBox(children=(IntProgress(value=0, description='Batch', max=30, style=ProgressStyle(description_width='initia…




Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size
0,1,0,0.580875,0.78105,25.155843,35.098088,0.01,100
1,1,0,0.986689,0.8492,24.690674,60.078311,0.01,100
2,1,0,1.353712,0.86345,26.064752,86.403368,0.01,100
3,1,0,1.708867,0.86855,24.564326,111.223564,0.01,100
4,1,0,2.053017,0.87225,25.953753,137.456567,0.01,100
5,2,0,3.342941,0.50355,17.042595,19.806788,0.01,2000
6,2,0,3.996362,0.748133,16.397587,36.449713,0.01,2000
7,2,0,4.517981,0.80035,16.266244,53.020112,0.01,2000
8,2,0,4.975092,0.832017,16.19575,69.480155,0.01,2000
9,2,0,5.390819,0.847133,16.355875,86.103356,0.01,2000


end epoch

