# Import

In [33]:
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.utils.data import Dataset, DataLoader, RandomSampler, random_split

# import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

import copy

## Import Data

In [34]:
class CTDataset(Dataset):
	"""Dataset backed by a saved ``(inputs, labels)`` pair.

	Inputs are divided by 255.0 and labels are one-hot encoded over 10
	classes and converted with ``.to(float)``.
	"""

	def __init__(self, filepath):
		# NOTE(review): torch.load unpickles the file, so only point this at
		# files obtained from a trusted source.
		raw_inputs, raw_labels = torch.load(filepath)
		self.x = raw_inputs / 255.0
		one_hot_labels = nn.functional.one_hot(raw_labels, num_classes=10)
		self.y = one_hot_labels.to(float)

	def __len__(self):
		# The number of samples is the size of the leading dimension.
		return self.x.size(0)

	def __getitem__(self, ix):
		sample = self.x[ix]
		target = self.y[ix]
		return sample, target

In [35]:
# Data source archive: https://www.di.ens.fr/~lelarge/MNIST.tar.gz
# NOTE(review): presumably the archive must be extracted so that
# ./MNIST/training.pt exists before this cell runs — confirm.
train_ds = CTDataset("./MNIST/training.pt")
# test_ds = CTDataset('./MNIST/test.pt')

In [36]:
# Split 60/20/20 with a dedicated, seeded generator so the subset membership is
# identical on every run (and on every re-run of this cell), independent of
# the state of the global RNG.
split_generator = torch.Generator().manual_seed(42)
train, dev, valid = random_split(train_ds, [0.6, 0.2, 0.2], generator=split_generator)

In [37]:
# Cap both subsets at 1,000 samples: checking that the model can overfit a
# small training set is a quick way to confirm the network is effective.
subset_cap = 1_000
train_size = min(subset_cap, len(train))
dev_size = min(subset_cap, len(dev))

# Training batches hold at most 32 samples, at least 1, and are small enough
# that the capped training subset spans `min_training_batches` batches.
min_training_batches = 4
samples_per_minimum_batch = train_size // min_training_batches
train_batch_size = min(32, max(1, samples_per_minimum_batch))

# Evaluation batches are as large as possible, up to 1,024 samples.
evaluation_batch_size = min(1_024, dev_size)

In [38]:
# Each sampler draws a fixed number of random samples per pass, which is what
# limits an epoch to `train_size` / `dev_size` samples.
train_random_sampler = RandomSampler(data_source=train, num_samples=train_size)
dev_random_sampler = RandomSampler(data_source=dev, num_samples=dev_size)

# drop_last=True discards a trailing incomplete batch.
train_dl = DataLoader(
	dataset=train,
	batch_size=train_batch_size,
	sampler=train_random_sampler,
	drop_last=True,
)

dev_dl = DataLoader(
	dataset=dev,
	batch_size=evaluation_batch_size,
	sampler=dev_random_sampler,
	drop_last=True,
)

## Train

In [39]:
class NeuralNet(nn.Module):
	"""Network built from caller-supplied hidden layers plus a lazy linear output layer.

	The output layer is an ``nn.LazyLinear`` whose width is the last dimension
	of the targets in the first batch of ``init_dl``. That same batch is pushed
	through the network once during construction so that every lazy layer has
	its input size materialised.

	Args:
		init_dl: Iterable yielding ``(x, y)`` batches. Only the first batch is used.
		hidden_layers: Iterable of modules applied, in order, before the output layer.

	Raises:
		ValueError: If ``init_dl`` yields no batches.
	"""

	def __init__(self, init_dl, hidden_layers):
		super().__init__()

		# Only the first batch is needed to infer the sizes; fail loudly when
		# there is none instead of hitting a NameError further down.
		try:
			x, y = next(iter(init_dl))
		except StopIteration:
			raise ValueError("init_dl must yield at least one (x, y) batch") from None

		self.input_size = x.shape[-1]
		self.output_size = y.shape[-1]

		output_layer = nn.LazyLinear(self.output_size) # output layer

		# list(...) lets callers pass a tuple or any other iterable of modules.
		layers = list(hidden_layers) + [output_layer]

		self.network = nn.Sequential(
			*layers
		)

		# init lazy layers
		self.forward(x)

	def reshape(self, x):
		"""Insert a singleton channel dimension after the batch dimension."""
		# batch_size, no_of_channels, width, height
		return x.view(x.shape[0], 1, x.shape[1], x.shape[2])

	def forward(self, x):
		"""Run the network and squeeze singleton dimensions, keeping the batch dimension."""
		out = self.network(self.reshape(x))
		squeezed = out.squeeze()
		# A bare squeeze() also removes the batch dimension when the batch
		# holds a single sample, which breaks per-row reductions such as
		# argmax(axis=1) downstream. Put it back in that case.
		if out.dim() > 0 and out.shape[0] == 1:
			squeezed = squeezed.unsqueeze(0)
		return squeezed

In [40]:
def get_max_len(arrays):
	"""Return the length of the longest sequence in ``arrays``."""
	lengths = [len(sequence) for sequence in arrays]
	return max(lengths)

def pad(array, max_len):
	"""Right-pad a 1-D sequence with NaN until it has ``max_len`` elements.

	Args:
		array: 1-D sequence of numbers.
		max_len: Target length; must not be smaller than ``len(array)``.

	Returns:
		A list of ``max_len`` floating-point values: the original values
		followed by NaN padding.
	"""
	# NaN only exists for floating-point dtypes. np.pad casts the fill value
	# to the array's dtype, so padding an integer array would produce garbage
	# instead of NaN; convert to float first.
	values = np.asarray(array, dtype=float)
	return list(np.pad(
		values,
		pad_width = (0, max_len - len(values)),
		constant_values = np.nan
	))

# @torch.compile(mode="reduce-overhead")
def train_batch(model, optimizer, loss, x, y, train_dl_len, batch_idx, accum_iter=1, k_frac=None):
	"""Run forward/backward on one batch and step the optimizer when due.

	With ``accum_iter == 1`` every batch triggers an optimizer step. With a
	larger value, gradients are summed over ``accum_iter`` consecutive batches
	(each loss scaled by ``1 / accum_iter``) and the optimizer steps once per
	window, plus once on the final batch of the epoch.

	Args:
		model: Module to train; it is switched to train mode.
		optimizer: Optimizer whose gradients are reset and stepped.
		loss: Callable invoked as ``loss(model(x), y)``; its result is averaged
			with ``.mean()`` before the backward pass.
		x: Input batch.
		y: Target batch.
		train_dl_len: Number of batches per epoch, or a sized object (such as
			the loader itself) whose ``len()`` gives that number.
		batch_idx: Zero-based index of this batch within the epoch.
		accum_iter: Number of batches to accumulate gradients over.
		k_frac: Unused; kept for interface compatibility.
	"""
	model.train()

	# Reset gradients only at the start of an accumulation window. Resetting
	# on every batch would throw away what earlier batches in the window
	# accumulated.
	if batch_idx % accum_iter == 0:
		optimizer.zero_grad(set_to_none=True)

	# forward pass
	proba = model(x)
	loss_scalar = loss(proba, y).mean()
	if accum_iter != 1:
		# Keep the accumulated gradient on the scale of a single batch.
		loss_scalar = loss_scalar / accum_iter

	# backward pass
	loss_scalar.backward()

	# Accept either the batch count itself or something with a length.
	try:
		total_batches = int(train_dl_len)
	except TypeError:
		total_batches = len(train_dl_len)

	# weights update
	# if accum_iter != 1 -> gradient accumulation
	batch_num = batch_idx + 1
	if (
		(batch_num % accum_iter == 0)
		or
		(batch_num == total_batches)
	):
		optimizer.step()

# @torch.compile(mode="reduce-overhead")
def train_epoch(dl, model, optimizer, loss, train_dl_len, k_frac=None):
	"""Train on every batch of ``dl`` once and collect per-sample correctness.

	Each batch is scored with ``eval_batch`` right after the training step on
	that same batch. The returned list holds one entry per scored sample.
	"""
	collected = []
	for step, batch in enumerate(dl):
		inputs, targets = batch
		train_batch(model, optimizer, loss, inputs, targets, train_dl_len, step, accum_iter=1, k_frac=k_frac)

		# Scored after the update, on the batch that was just trained on.
		collected.extend(eval_batch(model, inputs, targets))

	return collected

# @torch.compile(mode="reduce-overhead")
def eval_batch(model, x, y):
	"""Return a boolean tensor marking which samples of the batch are classified correctly.

	The model is switched to eval mode and run under ``torch.inference_mode``.
	A sample counts as correct when the argmax of the model output along
	axis 1 equals the argmax of ``y`` along axis 1.
	"""
	model.eval()
	with torch.inference_mode(): # no history tracking
		scores = model(x)
		hits = scores.argmax(axis=1) == y.argmax(axis=1)
	return hits

# @torch.compile(mode="reduce-overhead")
def eval_epoch(dl, model):
	"""Score every batch of ``dl`` with ``eval_batch`` and return the per-sample results as one list."""
	collected = []
	for inputs, targets in dl:
		collected.extend(eval_batch(model, inputs, targets))
	return collected


def train_model(train_dl, dev_dl, model, loss, optimizer, n_epochs, eval_every=5, k_frac=None, agg=["mean"], log=False):
	"""Train ``model`` for ``n_epochs`` and summarise train/dev accuracy per epoch.

	Training accuracy is recorded every epoch. Dev accuracy is recorded on
	epoch 1 and on every epoch that is a multiple of ``eval_every``.

	Args:
		train_dl: Loader of training batches.
		dev_dl: Loader of dev batches.
		model: Module to train; left in eval mode on return.
		loss: Per-sample loss callable, forwarded to ``train_epoch``.
		optimizer: Optimizer over the model's parameters.
		n_epochs: Number of epochs to run.
		eval_every: Dev evaluation period, in epochs.
		k_frac: Forwarded to ``train_epoch``.
		agg: Aggregation(s) applied to the per-sample accuracies of each
			(epoch, subset) group: a name, a callable, or a list of them.
			A falsy value returns the raw per-sample rows instead.
		log: When true, print a line after each epoch.

	Returns:
		A DataFrame. Without aggregation it has the columns ``Epoch``,
		``Subset`` and ``Accuracy``, one row per sample. With aggregation it
		has one row per epoch and columns named
		``Accuracy_<aggregation>_<Subset>``, plus ``Generalization_Gap``
		(mean train accuracy minus mean dev accuracy) when both mean columns
		are present.
	"""
	model.train()

	summary_list = []

	train_dl_len = len(train_dl)

	for epoch in range(1, n_epochs + 1):
		epoch_train_accuracies = train_epoch(train_dl, model, optimizer, loss, train_dl_len, k_frac)

		if epoch % eval_every == 0 or epoch == 1:
			epoch_dev_accuracies = eval_epoch(dev_dl, model)
		else:
			epoch_dev_accuracies = []

		summary_list.extend([epoch, "Train", float(e)] for e in epoch_train_accuracies)
		summary_list.extend([epoch, "Dev", float(e)] for e in epoch_dev_accuracies)

		if log:
			print(f"Epoch {epoch}/{n_epochs} Completed")

	model.eval()

	summary = (
		pd.DataFrame(
			columns = ["Epoch", "Subset", "Accuracy"],
			data = summary_list
		)
	)

	if not agg:
		return summary

	# Honour the caller's aggregation instead of a hard-coded ["mean"].
	# Always hand pandas a list so the aggregated columns form a two-level
	# index that the '_'.join flattening below can handle.
	if agg is True:
		agg_funcs = ["mean"]
	elif isinstance(agg, str) or callable(agg):
		agg_funcs = [agg]
	else:
		agg_funcs = list(agg)

	summary = (
		summary
		.groupby(["Epoch", "Subset"])
		.agg(agg_funcs)
	)
	summary.columns = list(map('_'.join, summary.columns.values))
	summary = (
		summary
		.reset_index()
		.pivot(
			index="Epoch",
			columns="Subset",
		)
	)
	summary.columns = list(map('_'.join, summary.columns.values))

	# The gap is only defined when mean accuracy exists for both subsets.
	train_col = "Accuracy_mean_Train"
	dev_col = "Accuracy_mean_Dev"
	if train_col in summary.columns and dev_col in summary.columns:
		summary["Generalization_Gap"] = summary[train_col] - summary[dev_col]

	summary = summary.reset_index()
	return summary

In [41]:
# Namespace against which the optimizer names below are resolved.
optim_class = torch.optim

# Optimizers to compare, in the order they are trained.
# NOTE(review): LBFGS and SparseAdam are disabled, presumably because they do
# not fit the plain `optimizer.step()` / dense-gradient loop used here — confirm.
optimizer_names = [
    "ASGD",
    "Adadelta",
    "Adagrad",
    "Adam",
    "AdamW",
    "Adamax",
    # "LBFGS",
    "NAdam",
    "RAdam",
    "RMSprop",
    "Rprop",
    "SGD",
    # "SparseAdam",
]

In [42]:
# Display the optimizers selected for the comparison.
optimizer_names

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'NAdam',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD']

In [43]:
def train_models(loss, model, n_epochs, optimizer_names, lr=0.01, eval_every=5):
	"""Train an independent copy of ``model`` with each named optimizer and stack the summaries.

	Every optimizer starts from a deep copy of ``model``, so all runs share the
	same initial weights. Training uses the notebook-level ``train_dl`` and
	``dev_dl`` loaders and resolves optimizer names on ``optim_class``.

	Args:
		loss: Per-sample loss callable forwarded to ``train_model``.
		model: Template model; it is deep-copied, never trained itself.
		n_epochs: Number of epochs per optimizer.
		optimizer_names: Names of optimizer classes available on ``optim_class``.
		lr: Learning rate passed to every optimizer.
		eval_every: Dev evaluation period forwarded to ``train_model``.

	Returns:
		The per-epoch summaries of all runs concatenated row-wise, with an
		added ``Optimizer`` column. An empty DataFrame when
		``optimizer_names`` is empty.
	"""
	per_optimizer = []

	for optimizer_name in optimizer_names:
		model_copy = copy.deepcopy(model)
		optimizer = getattr(optim_class, optimizer_name)(model_copy.parameters(), lr=lr)

		summary = train_model(
			train_dl,
			dev_dl,
			model_copy,
			loss,
			optimizer,
			n_epochs,
			eval_every=eval_every,
			agg = ["mean"]
		)
		summary["Optimizer"] = optimizer_name
		per_optimizer.append(summary)

	# pd.concat rejects an empty list; keep returning an empty frame instead.
	if not per_optimizer:
		return pd.DataFrame()

	# Concatenate once instead of re-copying the growing frame per optimizer.
	return pd.concat(per_optimizer)

In [44]:
# Template network: flatten -> 100 units -> ReLU -> 10 units -> ReLU, followed
# by the output layer that NeuralNet appends itself.
template_model = NeuralNet(
	train_dl,
	hidden_layers=[
		nn.Flatten(),
		nn.LazyLinear(100),
		nn.ReLU(),
		nn.LazyLinear(10),
		nn.ReLU(),
		# nn.Sigmoid() not required
	],
)

# reduction="none" keeps per-sample losses; they are averaged during training.
summaries = train_models(
	loss=nn.CrossEntropyLoss(reduction="none"),
	model=template_model,
	n_epochs=100, # 3
	optimizer_names=optimizer_names,
)


Lazy modules are a new feature under heavy development so changes to the API or functionality can happen at any moment.



In [45]:
# Display the combined per-epoch summary of all optimizer runs.
summaries

Unnamed: 0,Epoch,Accuracy_mean_Dev,Accuracy_mean_Train,Optimizer
0,1,0.155,0.127016,ASGD
1,2,,0.186492,ASGD
2,3,,0.195565,ASGD
3,4,,0.172379,ASGD
4,5,0.201,0.183468,ASGD
...,...,...,...,...
95,96,,0.900202,SGD
96,97,,0.875000,SGD
97,98,,0.887097,SGD
98,99,,0.892137,SGD


In [46]:
def plot_summary(df, y, percentage=True):
	"""Draw column ``y`` against ``Epoch`` as a line chart, one line per ``Optimizer``.

	Args:
		df: Summary frame with ``Epoch`` and ``Optimizer`` columns and the
			column named by ``y``. It is copied, not modified.
		y: Name of the column to plot.
		percentage: When true, the plotted values are multiplied by 100.

	Returns:
		The plotly figure.
	"""
	x_col = "Epoch"
	colour_col = "Optimizer"
	plot_df = df.copy()

	# The gap gets a free y-range; accuracies are pinned to the full scale.
	is_gap = y == "Generalization_Gap"
	if is_gap:
		sub_title = "Lower is better"
		range_y = None
	else:
		sub_title = "Higher is better"
		range_y = [0, 100] if percentage else [0, 1]

	if percentage:
		plot_df[y] *= 100

	title = f"{y}<br><sup>{sub_title}</sup>"
	epochs = plot_df[x_col].values

	fig = px.line(
		data_frame=plot_df,
		x=x_col,
		y=y,
		color=colour_col,
		title=title,
		range_x=[epochs.min(), epochs.max()],
		range_y=range_y,
		markers=True,
	)

	fig.update_layout(xaxis_title="Epoch", yaxis_title="Accuracy")

	trace_style = {
		"marker": {"size": 5},
		"line": {"width": 1},
	}
	fig.update_traces(patch=trace_style)
	# Bridge the NaN gaps left by epochs that have no dev evaluation.
	fig.update_traces(connectgaps=True)

	return fig

In [47]:
# Mean training accuracy per epoch, one line per optimizer.
plot_summary(df=summaries, y="Accuracy_mean_Train")





In [48]:
# Mean dev accuracy per evaluated epoch, one line per optimizer.
plot_summary(df=summaries, y="Accuracy_mean_Dev")





In [49]:
# Train-minus-dev accuracy gap per evaluated epoch, one line per optimizer.
plot_summary(df=summaries, y="Generalization_Gap")



