Import all the required libraries

In [2]:
import torch

from torch import nn
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
from torchsummary import summary
from torch import optim

import re

import torch.nn.functional as F

import matplotlib.pyplot as plt

Define all hyperparameters in a common place

In [3]:
# Seed PyTorch's global RNG so that weight initialisation and batch shuffling
# are repeatable after "Restart Kernel -> Run All".
seed = 42
torch.manual_seed(seed)

# Alternative batch size kept for reference
# (NOTE(review): presumably sized to a ~2 GiB memory budget - confirm):
# batch_size = (2048-4) * 1024 * 1024 // (28*28*8)
batch_size = 128   # samples per mini-batch
alpha = .04        # SGD learning rate
n1 = 32            # width of the first hidden layer
n2 = 32            # width of the second hidden layer
loss = F.cross_entropy
# Human-readable loss name for the report; plain functions expose __name__,
# other callables (e.g. nn.Module losses) fall back to their class name.
loss_name = getattr(loss, "__name__", type(loss).__name__)
epochs = 20

# Alternative configuration (wider hidden layers, lower learning rate):
# batch_size = 128
# alpha = 0.01
# n1 = 256
# n2 = 128
# loss = F.cross_entropy
# epochs = 20

Define the network as two linear layers with ReLU, followed by a linear output layer that returns raw logits (no Softmax layer: `F.cross_entropy` applies log-softmax internally)

In [4]:
class FashionMNISTnn(nn.Module):
  """Fully connected classifier: Flatten -> Linear/ReLU -> Linear/ReLU -> Linear.

  The final layer returns raw logits. No Softmax is applied because the
  notebook trains with `F.cross_entropy`, which expects unnormalised scores.

  Args:
    n1: number of units in the first hidden layer.
    n2: number of units in the second hidden layer.
    in_features: flattened size of one input sample (default 28*28).
    num_classes: number of output logits (default 10).
  """

  def __init__(self, n1, n2, in_features=28*28, num_classes=10):
    super().__init__()
    self.network_stack = nn.Sequential(
        # Collapse every dimension after the batch dimension into one vector
        nn.Flatten(),
        nn.Linear(in_features, n1),
        nn.ReLU(),
        nn.Linear(n1, n2),
        nn.ReLU(),
        nn.Linear(n2, num_classes),
    )

  def forward(self, input):
    """Return the logits of shape (batch, num_classes) for a batch `input`."""
    return self.network_stack(input)

Compute average and standard deviation of the train set

In [5]:
# Load the training split once, only to measure its global pixel statistics.
trans = transforms.ToTensor()
dataset = datasets.FashionMNIST(
    "./data",
    train=True,
    download=True,
    transform=trans,
)

# Work on the stored image tensor directly; dividing by 255 rescales it
# (presumably from 0-255 pixel values to [0, 1] - TODO confirm).
dataset_as_np = dataset.data.numpy() / 255

# One scalar mean / std over every pixel of every image.
mean, std = dataset_as_np.mean(), dataset_as_np.std()

print(f"Avg: {mean:.6}, std: {std:.6}")

Avg: 0.286041, std: 0.353024


Create the train, dev and test datasets

In [6]:
# Convert images to tensors, then standardise them with the statistics
# measured on the training set.
trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(mean,), std=(std,))
])

# Note: the transform is applied on the fly, each time a sample is fetched
# from the dataset (for example by a DataLoader), not at construction time.
main_dataset = datasets.FashionMNIST("./data", train=True, download=True, transform=trans)
test_dataset = datasets.FashionMNIST("./data", train=False, download=True, transform=trans)

# Split the train dataset into train and dev.
# The train size is derived from the dataset length instead of being hard-coded,
# and a dedicated seeded generator keeps the partition identical between runs,
# so dev-set results stay comparable across experiments.
dev_size = 5000
train_size = len(main_dataset) - dev_size
train_dataset, dev_dataset = random_split(
    main_dataset,
    [train_size, dev_size],
    generator=torch.Generator().manual_seed(42),
)

Create the train, dev and test data loaders

In [7]:
# Only the training batches are shuffled: SGD benefits from a fresh order every
# epoch, while the dev/test loaders are only ever evaluated (summed loss and
# correct counts), so shuffling them would just burn time and RNG state.
training_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
# Display the training dataset summary (size, root, split and transform pipeline)
main_dataset

Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(np.float64(0.2860405969887955),), std=(np.float64(0.35302424451492237),))
           )

Define training function

In [None]:
def train(model, device, dataset, optimiz, epoch, costF):
  """Run one optimisation pass over every batch yielded by `dataset`.

  Args:
    model: module to optimise; it is switched to training mode first.
    device: device each batch is moved to before the forward pass.
    dataset: iterable yielding `(data, target)` batches.
    optimiz: optimizer; `zero_grad()` and `step()` are called once per batch.
    epoch: index of the current epoch. Accepted but not used by this function.
    costF: callable `costF(output, target)` returning a loss that supports
      `.backward()`.

  Returns:
    None. The model parameters are updated in place through `optimiz`.
  """
  model.train()

  for inputs, labels in dataset:
    # Batches must live on the same device as the model
    inputs, labels = inputs.to(device), labels.to(device)

    # Drop the gradients accumulated by the previous step
    optimiz.zero_grad()

    # Forward pass and loss
    batch_cost = costF(model(inputs), labels)

    # Backpropagate, then apply the parameter update
    batch_cost.backward()
    optimiz.step()

Define test function

In [10]:
def test(model, device, dataset, costF, setName):
  # Set model in evaluation mode
  model.eval()

  # Init total loss and correct predictions
  loss = 0.0
  correct_pred = 0

  ## DEBUG
  count = 0

  for data, target in dataset:
    # Move train data to the training device (likely GPU)
    data = data.to(device)
    target = target.to(device)

    # Forward propagation
    out = model(data)

    # Define the model prediction as the class with higher probability
    pred = out.argmax(dim=1, keepdim=True)

    # Try to reshape the arrays into what they already are
    # If there were errors in the setup, this will create errors

    # sanity check
    batch_size = data.shape[0]
    pred = pred.view(batch_size)  # [bs,]
    target = target.view(batch_size)  # [bs,]

    # Sum the loss of all inputs
    loss += costF(out, target, reduction='sum').item()

    # Sum the number of correct predictions
    correct_pred += pred.eq(target).sum().item()

  # Compute statistics
  num_samples = len(dataset.dataset)
  avg_loss = loss / num_samples
  accuracy = float(correct_pred) / num_samples

  # Print statistics
#   print(f"{setName} ==> Avg epoch loss: {avg_loss:2.04} - accuracy: {accuracy:2.04}")

  return avg_loss, accuracy


In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the network from the hyperparameters and move its weights to `dev`
model = FashionMNISTnn(n1=n1, n2=n2)
model.to(dev)

print(model)

FashionMNISTnn(
  (network_stack): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=32, bias=True)
    (2): ReLU()
    (3): Linear(in_features=32, out_features=32, bias=True)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=10, bias=True)
  )
)


In [12]:
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that appended a stray "None" to the output). The device is passed
# explicitly so the probe input is created where the model lives.
summary(model, (1, 28, 28), device=dev.type)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                   [-1, 32]          25,120
              ReLU-3                   [-1, 32]               0
            Linear-4                   [-1, 32]           1,056
              ReLU-5                   [-1, 32]               0
            Linear-6                   [-1, 10]             330
Total params: 26,506
Trainable params: 26,506
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.10
Estimated Total Size (MB): 0.11
----------------------------------------------------------------
None


Train the model, and test it on the dev set

In [13]:
def _save_loss_plot(epochs_seen, train_curve, dev_curve):
  """Redraw the train/dev loss curves and overwrite loss.png."""
  fig, ax = plt.subplots()
  ax.plot(epochs_seen, train_curve, label="Train", marker='o')
  ax.plot(epochs_seen, dev_curve, label="Dev", marker='o')

  ax.set_title("Average loss at epochs")

  ax.set_xlabel("Epoch")
  ax.set_xticks(epochs_seen)

  ax.set_ylabel("Loss")
  ax.set_ylim(bottom=0)

  ax.legend()
  fig.savefig("loss.png")
  # Close explicitly: one figure is created per epoch
  plt.close(fig)


# SGD with momentum over all model parameters
optimiz = optim.SGD(model.parameters(), lr = alpha, momentum=0.9)

# Per-epoch history used for the loss plot
epochs_list = []
train_losses = []
dev_losses = []

# Report the configuration of this run
for report_line in (
    "HYPERPARAMETERS:",
    f"\t- batch size: {batch_size}",
    f"\t- learning rate: {alpha}",
    f"\t- n1: {n1}",
    f"\t- n2: {n2}",
    f"\t- loss: {loss_name}",
    f"\t- epochs: {epochs}",
    "",
):
  print(report_line)

# Results table header
table_rule = "+-------+----------+----------+----------+----------+"
for header_line in (
    "+-------+---------------------+---------------------+",
    "|       |      TRAIN set      |       DEV set       |",
    "+ Epoch +----------+----------+----------+----------+",
    "|       | Avg loss | Accuracy | Avg loss | Accuracy |",
    table_rule,
):
  print(header_line)

for epoch in range(epochs):
  # One optimisation pass, then measure both splits with the updated weights
  train(model, dev, training_loader, optimiz, epoch, loss)

  train_loss, train_acc = test(model, dev, training_loader, loss, "TRAIN")
  dev_loss, dev_acc = test(model, dev, dev_loader, loss, "DEV")

  epochs_list.append(epoch)
  train_losses.append(train_loss)
  dev_losses.append(dev_loss)

  # One table row per epoch
  print(
      f"| {epoch:5} |"
      f"  {train_loss:6.3f}  | {100*train_acc:6.2f} % |"
      f"  {dev_loss:6.3f}  | {100*dev_acc:6.2f} % |"
  )

  # Refresh loss.png after every epoch so progress can be followed live
  _save_loss_plot(epochs_list, train_losses, dev_losses)
  print(table_rule)


HYPERPARAMETERS:
	- batch size: 128
	- learning rate: 0.04
	- n1: 32
	- n2: 32
	- loss: cross_entropy
	- epochs: 20

+-------+---------------------+---------------------+
|       |      TRAIN set      |       DEV set       |
+ Epoch +----------+----------+----------+----------+
|       | Avg loss | Accuracy | Avg loss | Accuracy |
+-------+----------+----------+----------+----------+


KeyboardInterrupt: 