# Imports

In [1]:
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.utils.data import Dataset, DataLoader, RandomSampler, random_split

# import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

import copy

## Import Data

In [2]:
class CTDataset(Dataset):
	"""In-memory dataset of rescaled images paired with one-hot labels.

	The file at ``filepath`` must deserialise (through ``torch.load``) into an
	``(images, labels)`` pair. Images are divided by 255.0; labels are expanded
	into 10-way one-hot vectors.
	"""

	def __init__(self, filepath):
		# NOTE(review): torch.load unpickles the file, so only point it at trusted data.
		images, labels = torch.load(filepath)
		self.x = images / 255.0
		# `float` is the Python builtin here, so the targets end up as float64.
		self.y = nn.functional.one_hot(labels, num_classes=10).to(float)

	def __len__(self):
		"""Number of samples, i.e. the size of the leading image dimension."""
		return self.x.size(0)

	def __getitem__(self, ix):
		"""Return the ``(image, one_hot_label)`` selected by ``ix``."""
		return self.x[ix], self.y[ix]

In [3]:
# The MNIST tensor files are distributed at https://www.di.ens.fr/~lelarge/MNIST.tar.gz
MNIST_TRAINING_PATH = "./MNIST/training.pt"
train_ds = CTDataset(MNIST_TRAINING_PATH)
# The held-out test file is intentionally not loaded in this notebook:
# test_ds = CTDataset('./MNIST/test.pt')

In [4]:
# Seed the split so the 60/20/20 train/dev/valid partition is the same on every
# Restart & Run All; without a generator the membership changes on each run.
SPLIT_SEED = 0
train, dev, valid = random_split(
	train_ds,
	[0.6, 0.2, 0.2],
	generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [5]:
# Work on at most 1,000 samples per subset: first check that the model can
# overfit a small amount of data, to ensure the DNN is actually effective.
SUBSET_CAP = 1_000
train_size = min(SUBSET_CAP, len(train))
dev_size = min(SUBSET_CAP, len(dev))

# Batch size: at most 32, at least 1, and small enough that the capped
# training subset is cut into `min_training_batches` batches or more.
min_training_batches = 4
train_batch_size = max(1, train_size // min_training_batches)
train_batch_size = min(32, train_batch_size)

# Evaluation needs no gradients, so a much larger batch is used.
evaluation_batch_size = min(1_024, dev_size)

In [6]:
# Samplers limit every pass over a loader to `train_size` / `dev_size` samples.
# NOTE(review): RandomSampler appears to draw fresh indices on each pass, so the
# capped subset may differ between epochs; confirm this is intended if the goal
# is to overfit one fixed small subset.
train_random_sampler = RandomSampler(train, num_samples=train_size)
dev_random_sampler = RandomSampler(dev, num_samples=dev_size)

# drop_last=True discards a trailing incomplete batch in both loaders.
train_dl = DataLoader(
	dataset=train,
	batch_size=train_batch_size,
	sampler=train_random_sampler,
	drop_last=True,
)

dev_dl = DataLoader(
	dataset=dev,
	batch_size=evaluation_batch_size,
	sampler=dev_random_sampler,
	drop_last=True,
)

## Model, Training and Results

In [7]:
class NeuralNet(nn.Module):
	"""Sequential network built from caller-supplied hidden layers plus a lazy linear head.

	Args:
		init_dl: iterable of ``(x, y)`` batches. Only the first batch is used: the
			last dimension of ``y`` sets the width of the output layer, and ``x`` is
			pushed through the network once so that lazy layers get their shapes.
		hidden_layers: iterable of modules placed before the output layer.

	Raises:
		ValueError: if ``init_dl`` yields no batch.
	"""

	def __init__(self, init_dl, hidden_layers):
		super().__init__()

		# Peek at a single batch; an empty loader would otherwise leave x / y unbound.
		try:
			x, y = next(iter(init_dl))
		except StopIteration:
			raise ValueError("init_dl yielded no batches; cannot infer layer sizes") from None

		self.input_size = x.shape[-1]
		self.output_size = y.shape[-1]

		output_layer = nn.LazyLinear(self.output_size)  # output layer

		# Unpacking accepts any iterable of layers, not only a list.
		self.network = nn.Sequential(*hidden_layers, output_layer)

		# init lazy layers
		self.forward(x)

	def reshape(self, x):
		"""Insert a channel axis of size 1: (batch, d1, d2) -> (batch, 1, d1, d2)."""
		return x.view(x.shape[0], 1, x.shape[1], x.shape[2])

	def forward(self, x):
		"""Run the network and drop singleton dimensions, always keeping the batch axis.

		A bare ``squeeze()`` would also remove the batch axis for a batch of one
		sample, giving an output whose shape no longer matches the targets.
		"""
		out = self.network(self.reshape(x))
		kept_dims = [size for size in out.shape[1:] if size != 1]
		return out.reshape(out.shape[0], *kept_dims)

In [8]:
def get_max_len(arrays):
	"""Return the length of the longest element of ``arrays``.

	Raises ``ValueError`` when ``arrays`` is empty, as ``max`` does.
	"""
	return max(map(len, arrays))

def pad(array, max_len):
	"""Right-pad a numeric sequence with NaN up to ``max_len`` entries.

	Args:
		array: sequence of numbers.
		max_len: target length; must be at least ``len(array)``.

	Returns:
		A list of ``max_len`` floats: the original values followed by NaNs.
	"""
	# NaN only exists for floating dtypes; padding an integer array with it
	# yields a garbage sentinel or an error, so convert to float first.
	values = np.asarray(array, dtype=float)
	return list(np.pad(
		values,
		pad_width=(0, max_len - len(values)),
		constant_values=np.nan,
	))

# @torch.compile(mode="reduce-overhead")
def train_batch(model, optimizer, loss, x, y, train_dl_len, batch_idx, accum_iter=1, k_frac=None):
	"""Run one forward/backward pass and step the optimizer when an accumulation window closes.

	Args:
		model: module being trained; it is switched to train mode.
		optimizer: optimizer holding the parameters of ``model``.
		loss: callable ``loss(predictions, targets)`` returning a tensor of losses.
		x, y: inputs and targets of the current batch.
		train_dl_len: number of batches in the epoch (an int).
		batch_idx: zero-based index of the current batch within the epoch.
		accum_iter: number of batches whose gradients are accumulated before each
			optimizer step; 1 means a step after every batch.
		k_frac: accepted for interface compatibility; not used here.

	Raises:
		ValueError: if ``accum_iter`` is smaller than 1.
	"""
	if accum_iter < 1:
		raise ValueError(f"accum_iter must be >= 1, got {accum_iter}")

	model.train()

	# Clear gradients only at the start of an accumulation window; clearing
	# before every backward pass would throw away what has been accumulated.
	if batch_idx % accum_iter == 0:
		optimizer.zero_grad(set_to_none=True)

	# forward pass
	proba = model(x)
	loss_scalar = loss(proba, y).mean()

	# backward pass, scaled so the accumulated gradient is an average over the
	# window (dividing by 1 leaves the plain accum_iter=1 case untouched)
	(loss_scalar / accum_iter).backward()

	# weights update: at the end of each window, and on the last batch of the
	# epoch so that a trailing partial window is not lost
	batch_num = batch_idx + 1
	if batch_num % accum_iter == 0 or batch_num == train_dl_len:
		optimizer.step()

# @torch.compile(mode="reduce-overhead")
def train_epoch(dl, model, optimizer, loss, train_dl_len, k_frac=None):
	"""Train on every batch of ``dl`` once and collect the post-update losses.

	Each batch is first passed to ``train_batch`` and then scored again with
	``eval_batch``; whatever ``eval_batch`` returns is iterated and its elements
	appended to the result.

	Returns:
		A flat list of the loss entries gathered over the whole epoch.
	"""
	collected = []
	for step, batch in enumerate(dl):
		inputs, targets = batch
		train_batch(
			model, optimizer, loss, inputs, targets, train_dl_len, step,
			accum_iter=1, k_frac=k_frac,
		)
		# Score the same batch again, after the weight update.
		collected.extend(eval_batch(model, inputs, targets, loss))

	return collected

# @torch.compile(mode="reduce-overhead")
def eval_batch(model, x, y, loss):
	"""Score one batch without tracking gradients.

	Switches ``model`` to eval mode, runs it on ``x`` under
	``torch.inference_mode`` and returns ``loss(predictions, y)`` detached.
	"""
	model.eval()
	with torch.inference_mode():  # no autograd history is recorded
		predictions = model(x)
		return loss(predictions, y).detach()

# @torch.compile(mode="reduce-overhead")
def eval_epoch(dl, model, loss):
	"""Evaluate every batch of ``dl`` and return all loss entries as one flat list.

	The value returned by ``eval_batch`` for each batch is iterated, and its
	elements are appended in batch order.
	"""
	return [
		entry
		for x, y in dl
		for entry in eval_batch(model, x, y, loss)
	]


def _summarise_losses(summary, agg):
	"""Collapse per-sample losses into one row per epoch, one column per (statistic, subset)."""
	if isinstance(agg, str):
		agg = [agg]

	wide = summary.groupby(["Epoch", "Subset"]).agg(list(agg))
	# ("Loss", "mean") -> "Loss_mean"
	wide.columns = list(map('_'.join, wide.columns.values))
	wide = wide.reset_index().pivot(index="Epoch", columns="Subset")
	# ("Loss_mean", "Dev") -> "Loss_mean_Dev"
	wide.columns = list(map('_'.join, wide.columns.values))

	# The gap is defined on the mean loss, so it is only available when
	# "mean" is among the requested statistics.
	if {"Loss_mean_Dev", "Loss_mean_Train"} <= set(wide.columns):
		wide["Generalization_Gap"] = wide["Loss_mean_Dev"] - wide["Loss_mean_Train"]

	return wide.reset_index()


def train_model(train_dl, dev_dl, model, loss, optimizer, n_epochs, eval_every=5, k_frac=None, agg=["mean"], log=False):
	"""Train ``model`` for ``n_epochs`` and return a loss summary.

	Args:
		train_dl, dev_dl: loaders for the training and development subsets.
		model: module to train; left in eval mode on return.
		loss: loss callable forwarded to ``train_epoch`` / ``eval_epoch``.
		optimizer: optimizer over the parameters of ``model``.
		n_epochs: number of epochs to run.
		eval_every: the dev subset is evaluated on epoch 1 and on every epoch
			divisible by this value.
		k_frac: forwarded to ``train_epoch``.
		agg: aggregation name(s) applied to the losses of each epoch and subset.
			A falsy value returns the raw long-format records instead. The
			default list is never mutated.
		log: print a line after each epoch when true.

	Returns:
		With ``agg``: one row per epoch with columns ``Loss_<agg>_<Subset>`` and,
		when "mean" was requested, ``Generalization_Gap`` (dev mean minus train
		mean). Epochs without a dev evaluation hold NaN in the dev columns.
		Without ``agg``: a frame with columns ``Epoch``, ``Subset``, ``Loss``.
	"""
	model.train()

	summary_list = []
	train_dl_len = len(train_dl)

	for epoch in range(1, n_epochs + 1):
		epoch_train_losses = train_epoch(train_dl, model, optimizer, loss, train_dl_len, k_frac)

		if epoch % eval_every == 0 or epoch == 1:
			epoch_dev_losses = eval_epoch(dev_dl, model, loss)
		else:
			epoch_dev_losses = []

		summary_list.extend([epoch, "Train", float(e)] for e in epoch_train_losses)
		summary_list.extend([epoch, "Dev", float(e)] for e in epoch_dev_losses)

		if log:
			print(f"Epoch {epoch}/{n_epochs} Completed")

	model.eval()

	summary = pd.DataFrame(
		columns=["Epoch", "Subset", "Loss"],
		data=summary_list,
	)

	if agg:
		# Use the requested statistics rather than a hard-coded ["mean"].
		summary = _summarise_losses(summary, agg)
	return summary

In [9]:
# Namespace from which optimizer classes are looked up by name (see train_models).
optim_class = torch.optim
# Earlier attempt at discovering the optimizers automatically, kept for reference:
# optim_children = dir(optim_class)
# no_of_optimizers = [o.startswith("_") for o in optim_children].index(True)
# optimizer_names = [o for o in optim_children[:no_of_optimizers] if o not in ["Optimizer", "LB"]]

# Optimizers to compare; each name must be an attribute of `optim_class`.
optimizer_names = [
    'ASGD',
    'Adadelta',
    'Adagrad',
    'Adam',
    'AdamW',
    'Adamax',
    # 'LBFGS',  # NOTE(review): presumably skipped because its step() needs a closure; confirm
    'NAdam',
    'RAdam',
    'RMSprop',
    'Rprop',
    'SGD',
    # 'SparseAdam'  # NOTE(review): presumably skipped because it expects sparse gradients; confirm
]

In [10]:
optimizer_names

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'NAdam',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD']

In [11]:
def train_models(loss, model, n_epochs, optimizer_names, learning_rates):
	"""Train a fresh copy of ``model`` for every (learning rate, optimizer) pair.

	Relies on the notebook-level names ``train_dl``, ``dev_dl`` and
	``optim_class``; optimizer classes are resolved with
	``getattr(optim_class, name)``.

	Args:
		loss: loss callable handed to ``train_model``.
		model: template model; it is deep-copied for each run and never trained itself.
		n_epochs: epochs per run.
		optimizer_names: names of optimizer classes found on ``optim_class``.
		learning_rates: learning rates to try.

	Returns:
		The per-run summaries from ``train_model`` stacked row-wise, with extra
		columns ``Model``, ``Optimizer`` and ``Learning_Rate``. Each run keeps its
		own row index. An empty frame is returned when there is nothing to run.
	"""
	run_summaries = []

	for learning_rate in learning_rates:
		for optimizer_name in optimizer_names:
			model_copy = copy.deepcopy(model)
			optimizer_cls = getattr(optim_class, optimizer_name)
			optimizer = optimizer_cls(model_copy.parameters(), lr=learning_rate)

			summary = train_model(
				train_dl,
				dev_dl,
				model_copy,
				loss,
				optimizer,
				n_epochs,
				eval_every=10,
				agg=["mean"],
			)
			summary["Model"] = model_copy
			summary["Optimizer"] = optimizer_name
			summary["Learning_Rate"] = learning_rate

			run_summaries.append(summary)

	# Concatenate once: growing a frame inside the loop copies all earlier rows
	# on every iteration, and seeding it with an empty frame is deprecated in
	# recent pandas.
	if not run_summaries:
		return pd.DataFrame()
	return pd.concat(run_summaries)

In [12]:
# Baseline MLP: flatten -> 100 units -> ReLU -> 10 units -> ReLU; NeuralNet then
# appends its own lazy linear output layer (a final Sigmoid is not required).
baseline_hidden_layers = [
	nn.Flatten(),
	nn.LazyLinear(100),
	nn.ReLU(),
	nn.LazyLinear(10),
	nn.ReLU(),
]
baseline_model = NeuralNet(train_dl, hidden_layers=baseline_hidden_layers)

# One training run per optimizer, all from the same initial weights.
summaries = train_models(
	loss=nn.CrossEntropyLoss(reduction="none"),
	model=baseline_model,
	n_epochs=100,  # 3 is enough for a quick smoke test
	optimizer_names=optimizer_names,
	learning_rates=[0.01],  # further candidates: 0.1, 1
)



In [13]:
summaries

Unnamed: 0,Epoch,Loss_mean_Dev,Loss_mean_Train,Generalization_Gap,Model,Optimizer,Learning_Rate
0,1,2.306312,2.318080,-0.011769,NeuralNet(\n (network): Sequential(\n (0):...,ASGD,0.01
1,2,,2.309096,,NeuralNet(\n (network): Sequential(\n (0):...,ASGD,0.01
2,3,,2.288600,,NeuralNet(\n (network): Sequential(\n (0):...,ASGD,0.01
3,4,,2.274558,,NeuralNet(\n (network): Sequential(\n (0):...,ASGD,0.01
4,5,2.242484,2.250037,-0.007553,NeuralNet(\n (network): Sequential(\n (0):...,ASGD,0.01
...,...,...,...,...,...,...,...
95,96,,0.374244,,NeuralNet(\n (network): Sequential(\n (0):...,SGD,0.01
96,97,,0.353902,,NeuralNet(\n (network): Sequential(\n (0):...,SGD,0.01
97,98,,0.357082,,NeuralNet(\n (network): Sequential(\n (0):...,SGD,0.01
98,99,,0.388141,,NeuralNet(\n (network): Sequential(\n (0):...,SGD,0.01


In [14]:
def plot_summary(df, y, percentage=False):
	"""Plot column ``y`` against the epoch, one line per optimizer.

	Args:
		df: summary frame with at least ``Epoch``, ``Optimizer`` and ``y`` columns;
			a copy is plotted, so the caller's frame is not modified.
		y: name of the column to draw; underscores are replaced by spaces in the title.
		percentage: multiply the ``y`` values by 100 before plotting.

	Returns:
		The plotly figure.
	"""
	plot_df = df.copy()

	x_col = "Epoch"
	colour_col = "Optimizer"
	sub_title = "Lower is better"

	if percentage:
		plot_df[y] *= 100

	heading = y.replace("_", " ") + f"<br><sup>{sub_title}</sup>"
	epochs = plot_df[x_col].values

	fig = px.line(
		data_frame=plot_df,
		x=x_col,
		y=y,
		color=colour_col,
		title=heading,
		range_x=[epochs.min(), epochs.max()],
		range_y=None,  # let plotly pick the y range
		markers=True,
	)

	fig.update_layout(xaxis_title="Epoch", yaxis_title="Loss")

	thin_style = {
		"marker": {"size": 5},
		"line": {"width": 1},
	}
	fig.update_traces(patch=thin_style)
	# Bridge epochs with missing values (dev metrics are not computed every epoch).
	fig.update_traces(connectgaps=True)

	return fig

In [15]:
# Mean training loss per epoch, one line per optimizer.
plot_summary(summaries, "Loss_mean_Train")

  sf: grouped.get_group(s if len(s) > 1 else s[0])


In [16]:
# Mean dev loss; only the epochs with a dev evaluation carry a value.
plot_summary(summaries, "Loss_mean_Dev")





In [17]:
# Dev mean loss minus train mean loss, per epoch.
plot_summary(summaries, "Generalization_Gap")



