# Imports

In [357]:
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.utils.data import Dataset, DataLoader, RandomSampler, random_split

# import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

import copy
import os

In [358]:
# Prefer the first CUDA device when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(f"Using {device}")

Using cuda:0


## Import Data

In [359]:
class CTDataset(Dataset):
	"""In-memory dataset built from an ``(inputs, labels)`` pair stored in one file.

	Inputs are divided by 255.0 and labels are one-hot encoded over 10 classes
	and cast to Python ``float`` (i.e. float64) when the dataset is constructed.
	"""

	def __init__(self, filepath):
		# NOTE(review): torch.load unpickles the file — only point this at trusted data.
		features, labels = torch.load(filepath)
		self.x = features / 255.0
		self.y = nn.functional.one_hot(labels, num_classes=10).to(float)

	def __len__(self):
		# Number of samples == size of the leading dimension of the inputs.
		return len(self.x)

	def __getitem__(self, ix):
		sample = self.x[ix]
		target = self.y[ix]
		return sample, target

In [360]:
# The MNIST tensor files (training.pt / test.pt) are taken from this archive:
# https://www.di.ens.fr/~lelarge/MNIST.tar.gz
train_ds = CTDataset("./MNIST/training.pt")
# The test split is not loaded; evaluation uses a hold-out carved from train_ds.
# test_ds = CTDataset('./MNIST/test.pt')

In [361]:
# 60/20/20 split of the training file into train / dev / valid subsets.
# A seeded generator makes subset membership identical across kernel restarts,
# so results from different runs are comparable.
split_generator = torch.Generator().manual_seed(0)
train, dev, valid = random_split(train_ds, [0.6, 0.2, 0.2], generator=split_generator)

In [362]:
# Cap both subsets at 1,000 samples: first check that the model can overfit a
# small sample, to confirm the network is able to learn at all.
train_size = min(len(train), 1_000)
dev_size = min(len(dev), 1_000)

# Training batches hold at most 32 samples, but are shrunk (never below 1)
# so that an epoch contains at least `min_training_batches` batches.
min_training_batches = 4
train_batch_size = min(max(train_size // min_training_batches, 1), 32)

# Evaluation batches hold up to 1,024 samples, capped at the dev sample size.
evaluation_batch_size = min(dev_size, 1_024)

In [363]:
# Each epoch draws `train_size` / `dev_size` random samples from its subset.
train_random_sampler = RandomSampler(train, num_samples=train_size)
dev_random_sampler = RandomSampler(dev, num_samples=dev_size)

# drop_last=True discards a trailing incomplete batch, so every batch has the same size.
train_dl = DataLoader(
	train,
	batch_size=train_batch_size,
	sampler=train_random_sampler,
	drop_last=True,
)

dev_dl = DataLoader(
	dev,
	batch_size=evaluation_batch_size,
	sampler=dev_random_sampler,
	drop_last=True,
)

## Train

In [364]:
class NeuralNet(nn.Module):
	"""Sequential network: caller-supplied hidden layers followed by a lazy linear output layer.

	Args:
		init_dl: iterable of ``(x, y)`` batches. The first batch is used to read
			the input/output sizes and to materialise the lazy layers.
		hidden_layers: iterable of modules placed, in order, before the output layer.

	Raises:
		ValueError: if ``init_dl`` yields no batches.
	"""

	def __init__(self, init_dl, hidden_layers):
		super().__init__()

		# Only the first batch is needed; fail with a clear message if there is none.
		try:
			x, y = next(iter(init_dl))
		except StopIteration:
			raise ValueError("init_dl yielded no batches; cannot infer layer sizes") from None

		self.input_size = x.shape[-1]
		self.output_size = y.shape[-1]

		output_layer = nn.LazyLinear(self.output_size)  # output layer
		self.network = nn.Sequential(*hidden_layers, output_layer)

		# Materialise the lazy layers with one forward pass. No gradients are
		# needed for this, so skip building an autograd graph.
		with torch.no_grad():
			self.forward(x)

	def reshape(self, x):
		"""Insert a channel axis of size 1 after the batch axis of a 3-D input."""
		# batch_size, no_of_channels, width, height
		return x.view(x.shape[0], 1, x.shape[1], x.shape[2])

	def forward(self, x):
		"""Run the network and drop singleton dimensions, always keeping the batch axis.

		A bare ``squeeze()`` would also remove the batch axis for a batch of one
		sample, leaving an output that no longer lines up with the targets.
		"""
		out = self.network(self.reshape(x))
		kept_dims = [size for size in out.shape[1:] if size != 1]
		return out.reshape(out.shape[0], *kept_dims)

In [365]:
def get_max_len(arrays):
    """Return the length of the longest sequence in ``arrays``."""
    lengths = list(map(len, arrays))
    return max(lengths)

def pad(array, max_len):
    """Right-pad a 1-D sequence with NaN up to ``max_len`` and return it as a list."""
    n_missing = max_len - len(array)
    padded = np.pad(array, pad_width=(0, n_missing), constant_values=np.nan)
    return list(padded)

def get_all_nodes(model):
    """List the sub-modules of every direct child of ``model``.

    Returns one inner list per child (in ``named_children`` order) holding the
    modules obtained by iterating over that child, so each child must itself be
    iterable (e.g. an ``nn.Sequential``).
    """
    return [
        [node for node in child]
        for _name, child in model.named_children()
    ]

def get_summary_agg(summary, agg=["mean"]):
    """Aggregate per-sample losses into one wide row per epoch.

    The frame is grouped by ``Epoch`` and ``Subset``, the ``agg`` functions are
    applied, and the result is pivoted so each subset gets its own columns,
    named ``<column>_<agg>_<Subset>`` (e.g. ``Loss_mean_Dev``). ``Epoch`` is
    returned as a regular column.
    """
    per_group = summary.groupby(["Epoch", "Subset"]).agg(agg)
    # Flatten the (column, agg) MultiIndex into "column_agg".
    per_group.columns = ["_".join(parts) for parts in per_group.columns.values]

    wide = per_group.reset_index().pivot(index="Epoch", columns="Subset")
    # Flatten the (column_agg, subset) MultiIndex into "column_agg_subset".
    wide.columns = ["_".join(parts) for parts in wide.columns.values]

    # Derived metrics (e.g. a generalization gap) are deliberately not computed
    # here: this step is data collection only.
    return wide.reset_index()


In [366]:
# @torch.compile(mode="reduce-overhead")
def train_batch(model, optimizer, loss, x, y, train_dl_len, batch_idx, device, accum_iter=1, k_frac=None):
    """Run the forward/backward pass for one batch and step the optimizer when due.

    Args:
        model: module being trained; switched to train mode here.
        optimizer: optimizer bound to ``model``'s parameters.
        loss: callable returning per-element losses; their mean is back-propagated.
        x, y: input and target batch; both are moved to ``device``.
        train_dl_len: number of batches in the epoch (an int; a sized object
            such as a DataLoader is also accepted).
        batch_idx: zero-based index of this batch within the epoch.
        device: device the batch is moved to.
        accum_iter: number of batches whose gradients are accumulated before
            each optimizer step; 1 means a step after every batch.
        k_frac: unused; kept for interface compatibility.

    Raises:
        ValueError: if ``accum_iter`` is smaller than 1.
    """
    if accum_iter < 1:
        raise ValueError(f"accum_iter must be >= 1, got {accum_iter}")

    # The epoch length arrives as an int; calling len() on it would raise.
    n_batches = train_dl_len if isinstance(train_dl_len, int) else len(train_dl_len)

    x = x.to(device)
    y = y.to(device)

    model.train()

    # Clear gradients only at the start of an accumulation window, so the
    # backward passes of the following batches add up.
    if batch_idx % accum_iter == 0:
        optimizer.zero_grad(set_to_none=True)

    # forward pass
    proba = model(x)
    loss_scalar = loss(proba, y).mean()

    # Scale so the accumulated gradient is the average over the window.
    # (A shorter final window is still divided by accum_iter.)
    if accum_iter > 1:
        loss_scalar = loss_scalar / accum_iter

    # backward pass
    loss_scalar.backward()

    # weights update: at the end of each window and on the last batch of the epoch
    batch_num = batch_idx + 1
    if batch_num % accum_iter == 0 or batch_num == n_batches:
        optimizer.step()

    

# @torch.compile(mode="reduce-overhead")
def train_epoch(dl, model, optimizer, loss, train_dl_len, device, eval=False, k_frac=None):
    """Train on every batch of ``dl`` once, optionally collecting post-update losses.

    When ``eval`` is true, each batch is re-scored with ``eval_batch`` right
    after its training step and the resulting per-element losses are appended
    to the returned list; otherwise the list stays empty.
    """
    collected_losses = []

    for step, batch in enumerate(dl):
        inputs, targets = batch
        train_batch(model, optimizer, loss, inputs, targets, train_dl_len, step, device, accum_iter=1, k_frac=k_frac)

        if not eval:
            continue
        collected_losses.extend(eval_batch(model, inputs, targets, loss, device))

    return collected_losses

# @torch.compile(mode="reduce-overhead")
def eval_batch(model, x, y, loss, device):
    """Score one batch without tracking gradients.

    Moves ``x`` and ``y`` to ``device``, puts ``model`` in eval mode (it is left
    in eval mode afterwards) and returns the detached output of ``loss``.
    """
    inputs = x.to(device)
    targets = y.to(device)

    model.eval()
    with torch.inference_mode():  # no autograd history is recorded
        predictions = model(inputs)
        return loss(predictions, targets).detach()

# @torch.compile(mode="reduce-overhead")
def eval_epoch(dl, model, loss, device):
    """Score every batch of ``dl`` and return all per-element losses in one flat list."""
    collected_losses = []
    for inputs, targets in dl:
        collected_losses.extend(eval_batch(model, inputs, targets, loss, device))
    return collected_losses


def train_model(train_dl, dev_dl, model, loss, optimizer, n_epochs, device, train_eval_every=10, dev_eval_every=10, agg=None, k_frac=None, log=False):
    """Train ``model`` for ``n_epochs`` and return the recorded losses as a DataFrame.

    Args:
        train_dl, dev_dl: loaders for the training and dev subsets.
        model: module to train; moved to ``device`` and left in eval mode on return.
        loss: callable returning per-element losses.
        optimizer: optimizer bound to ``model``'s parameters.
        n_epochs: number of passes over ``train_dl``.
        device: device used for the model and the batches.
        train_eval_every: record training losses on epoch 1 and on every epoch
            divisible by this value.
        dev_eval_every: same schedule for the dev losses.
        agg: if given, aggregation function names passed to ``get_summary_agg``;
            if None, the raw per-element rows are returned.
        k_frac: forwarded to ``train_epoch``.
        log: print a line after every completed epoch.

    Returns:
        DataFrame with columns ``Epoch``, ``Subset``, ``Loss`` (or its
        aggregated form when ``agg`` is given).
    """
    model = model.to(device)

    model.train()

    summary_list = []

    train_dl_len = len(train_dl)

    for epoch in range(1, n_epochs + 1):
        # Recomputed every epoch: a flag that is only ever set to True would
        # stay on after epoch 1 and make the *_eval_every settings meaningless.
        eval_train = epoch % train_eval_every == 0 or epoch == 1
        eval_dev = epoch % dev_eval_every == 0 or epoch == 1

        epoch_train_losses = train_epoch(train_dl, model, optimizer, loss, train_dl_len, device, eval=eval_train, k_frac=k_frac)

        if eval_dev:
            epoch_dev_losses = eval_epoch(dev_dl, model, loss, device)
        else:
            epoch_dev_losses = []

        for e in epoch_train_losses:
            summary_list.append(
                [epoch, "Train", float(e)]
            )
        for e in epoch_dev_losses:
            summary_list.append(
                [epoch, "Dev", float(e)]
            )

        if log:
            print(f"Epoch {epoch}/{n_epochs} Completed")

    model.eval()

    summary = pd.DataFrame(
        columns=["Epoch", "Subset", "Loss"],
        data=summary_list,
    )

    if agg is not None:
        summary = summary.pipe(get_summary_agg, agg)

    return summary

In [367]:
# Namespace in which optimizer classes are looked up by name (via getattr).
optim_class = torch.optim
# Earlier approach, kept for reference: derive the list from dir(torch.optim).
# optim_children = dir(optim_class)
# no_of_optimizers = [o.startswith("_") for o in optim_children].index(True)
# optimizer_names = [o for o in optim_children[:no_of_optimizers] if o not in ["Optimizer", "LB"]]

# Optimizers included in the sweep; each is constructed with only `lr` set.
optimizer_names = [
    'ASGD',
    'Adadelta',
    'Adagrad',
    'Adam',
    'AdamW',
    'Adamax',
    # NOTE(review): presumably skipped because LBFGS.step() needs a closure — confirm
    # 'LBFGS',
    'NAdam',
    'RAdam',
    'RMSprop',
    'Rprop',
    'SGD',
    # NOTE(review): presumably skipped because SparseAdam needs sparse gradients — confirm
    # 'SparseAdam'
]

In [368]:
# Show the optimizers that will be swept.
optimizer_names

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'NAdam',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD']

In [369]:
def train_models(loss, model, n_epochs, optimizer_names, learning_rates, train_size, device, agg=["mean"], train_eval_every=10, dev_eval_every=10, log=False, output_path = "summary.csv"):
	"""Train a fresh copy of ``model`` for every (learning rate, optimizer) pair.

	Each run's summary is appended to ``output_path`` as CSV as soon as the run
	finishes, so nothing is accumulated in memory and partial results survive
	an interrupted sweep. The loaders are read from the notebook-level
	``train_dl`` and ``dev_dl`` variables.

	Args:
		loss: callable returning per-element losses.
		model: template network; it is deep-copied for every run and never trained itself.
		n_epochs: epochs per run.
		optimizer_names: names of optimizer classes looked up on ``optim_class``.
		learning_rates: learning rates to sweep.
		train_size: recorded in the ``Train_Size`` column of the summary.
		device: device used for training.
		agg: aggregation function names forwarded to ``train_model``.
		train_eval_every, dev_eval_every: evaluation schedule forwarded to ``train_model``.
		log: forwarded to ``train_model``.
		output_path: CSV file the summaries are appended to.

	Returns:
		None. Results are only written to ``output_path``.
	"""
	for learning_rate in learning_rates:
		for optimizer_name in optimizer_names:
			# Move the copy to the target device *before* building the optimizer,
			# so any state the optimizer allocates at construction is created
			# where it belongs and needs no manual migration afterwards.
			model_copy = copy.deepcopy(model).to(device)

			optimizer_factory = getattr(optim_class, optimizer_name)
			optimizer = optimizer_factory(model_copy.parameters(), lr=learning_rate)

			summary = train_model(
				train_dl,
				dev_dl,
				model_copy,
				loss,
				optimizer,
				n_epochs,
				device = device,
				train_eval_every=train_eval_every,
				dev_eval_every=dev_eval_every,
				log=log,
				agg = agg
			)

			summary["Model"] = str(get_all_nodes(model_copy))
			summary["Optimizer"] = optimizer_name
			summary["Learning_Rate"] = learning_rate
			summary["Train_Size"] = train_size

			# Summaries are not concatenated in memory (too much space);
			# each one is appended to the CSV, with the header written only
			# when the file does not exist yet.
			summary.to_csv(
				output_path,
				index = False,
				mode = "a",
				header = not os.path.exists(output_path)
			)

	return None

In [370]:
# Template network: flatten -> 100 units -> ReLU -> 10 units -> ReLU,
# followed by the output layer that NeuralNet appends itself.
model = NeuralNet(
	train_dl,
	hidden_layers=[
		nn.Flatten(),
		nn.LazyLinear(100),
		nn.ReLU(),
		nn.LazyLinear(10),
		nn.ReLU(),
		# nn.Sigmoid() not required
	],
)

# Sweep every optimizer at three learning rates. train_models writes its
# results to CSV and returns None, so `summaries` is None afterwards.
summaries = train_models(
	model=model,
	loss=nn.CrossEntropyLoss(reduction="none"),
	optimizer_names=optimizer_names,
	learning_rates=[0.01, 0.05, 0.10],
	n_epochs=100,  # 3
	train_size=train_size,
	device=device,
	agg=["mean", "std"],
	train_eval_every=10,
	dev_eval_every=10,
	log=True,
)

Epoch 1/100 Completed
Epoch 2/100 Completed
Epoch 3/100 Completed
Epoch 4/100 Completed
Epoch 5/100 Completed
Epoch 6/100 Completed
Epoch 7/100 Completed
Epoch 8/100 Completed
Epoch 9/100 Completed
Epoch 10/100 Completed
Epoch 11/100 Completed
Epoch 12/100 Completed
Epoch 13/100 Completed
Epoch 14/100 Completed
Epoch 15/100 Completed
Epoch 16/100 Completed
Epoch 17/100 Completed
Epoch 18/100 Completed
Epoch 19/100 Completed
Epoch 20/100 Completed
Epoch 21/100 Completed
Epoch 22/100 Completed
Epoch 23/100 Completed
Epoch 24/100 Completed
Epoch 25/100 Completed
Epoch 26/100 Completed
Epoch 27/100 Completed
Epoch 28/100 Completed
Epoch 29/100 Completed
Epoch 30/100 Completed
Epoch 31/100 Completed
Epoch 32/100 Completed
Epoch 33/100 Completed
Epoch 34/100 Completed
Epoch 35/100 Completed
Epoch 36/100 Completed
Epoch 37/100 Completed
Epoch 38/100 Completed
Epoch 39/100 Completed
Epoch 40/100 Completed
Epoch 41/100 Completed
Epoch 42/100 Completed
Epoch 43/100 Completed
Epoch 44/100 Complet

KeyboardInterrupt: 

In [None]:
# output_path = "summary.csv"
# summaries.to_csv(
#     output_path,
#     index = False,
#     mode = "a",
#     header = not os.path.exists(output_path)
# )