Install Required Packages

In [None]:
# Install the notebook's dependencies into the environment of the running kernel
# (%pip targets the kernel's interpreter, unlike a bare shell `pip`).
# torch/torchvision come from the PyTorch wheel index for the cu129 (CUDA) build.
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu129
# Plotting and array libraries imported by the set-up cell of this notebook.
%pip install matplotlib
%pip install numpy
# TensorBoard backs the %load_ext tensorboard / %tensorboard magics used later.
%pip install tensorboard
# NOTE(review): scikit-learn and seaborn are not imported in this notebook itself;
# presumably the project modules (e.g. the confusion-matrix code) use them - confirm.
%pip install scikit-learn
%pip install seaborn

In [None]:
# Fix Tensorboard Ghost Processes
# Removes the '.tensorboard-info' directory from the system temp dir so that
# leftover entries from earlier TensorBoard sessions do not linger.
import tempfile
import shutil
import os
# ignore_errors=True: on a fresh machine, or when this cell is run twice, the
# directory does not exist and a plain rmtree would raise FileNotFoundError,
# aborting "Restart Kernel -> Run All" before any real work starts.
shutil.rmtree(os.path.join(tempfile.gettempdir(), '.tensorboard-info'), ignore_errors=True)

In [2]:
# Set Up
import os
import torch

import torchvision

import matplotlib.pyplot as plt
import numpy as np

from DataLoader import DataLoader
from MLP import MLP
from VGG16 import VGG16
from PreTrainedVGG16 import PreTrainedVGG16
from Trainer import Trainer

def img_show(img):
    """Render an image tensor with matplotlib.

    The tensor is rescaled with ``img / 2 + 0.5``, moved to the CPU, converted
    to a NumPy array and its axes are reordered from (0, 1, 2) to (1, 2, 0)
    before being handed to ``plt.imshow``.

    Args:
        img: tensor to display. Assumes a channel-first (C, H, W) image that was
            normalised with mean 0.5 / std 0.5 - TODO confirm against the
            project's DataLoader transforms.
    """
    unnormalized = img / 2 + 0.5  # undo the assumed (x - 0.5) / 0.5 scaling
    pixels = unnormalized.cpu().numpy()
    # matplotlib wants the channel axis last, so move axis 0 to the end.
    plt.imshow(pixels.transpose((1, 2, 0)))
    plt.show()

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


Setup MLP model for Training

In [None]:
# Train Model
%load_ext tensorboard
train_set = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/train", batch_size=32, shuffle=True, workers=2)
valid_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/valid", batch_size=32, shuffle=False, workers=2)
mlp_model = MLP(log_dir=os.getcwd()+"/MLP", lr=1e-04).to(device)
mlp_trainer = Trainer(log_dir=os.getcwd()+"/MLP", n_epochs=3, device=device)

Train MLP Model

In [None]:
%load_ext tensorboard
mlp_trainer.fit(model=mlp_model, train=train_set, valid=valid_loader, use_lr_scheduler=True)

In [None]:
# Plot the MLP's per-epoch loss and accuracy histories on one figure and save it.
plt.clf()  # start from an empty figure so earlier plots do not bleed in
plt.plot(mlp_trainer.avg_train_loss, label="Avg. Training Loss")
plt.plot(mlp_trainer.avg_valid_loss, label="Avg. Validation Loss")
plt.plot(mlp_trainer.training_accuracy, label="Training Accuracy (x100)%")
plt.plot(mlp_trainer.validation_accuracy, label="Validation Accuracy (x100)%")
plt.legend()
plt.savefig(f"{os.getcwd()}/MLP/MLP_Loss_Plot.jpg")

Write MLP Loss to Tensorboard

In [None]:
%load_ext tensorboard
for epoch in range(len(mlp_trainer.avg_train_loss)):
    mlp_model.writer.add_scalars('MLP', {'Avg_Training_Loss': mlp_trainer.avg_train_loss[epoch],
                                         'Avg_Validation_Loss': mlp_trainer.avg_valid_loss[epoch],
                                         'Training_Accuracy(x100)%': mlp_trainer.training_accuracy[epoch],
                                         'Validation_Accuracy(x100)%': mlp_trainer.validation_accuracy[epoch]}, epoch)

In [None]:
# Close the MLP model's writer once all scalars for this run have been added.
mlp_model.writer.close()

In [None]:
# Open the TensorBoard UI inline, pointed at the MLP log directory.
%tensorboard --logdir=./MLP

Test MLP Model

In [None]:
# Run the trained MLP on the first batch of test images and print its predictions.
# Resize to 224x224, matching the train/validation loaders: the model was trained
# on 224x224 inputs, so the previous 124x124 resize fed it a differently sized input.
test_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/test", batch_size=32, shuffle=False, workers=2)
# Class names come from the training dataset so predicted indices map to labels.
classes = train_set.dataset.classes
# NOTE(review): presumably hand-labelled class indices for the test images;
# not used in this cell - confirm whether it is still needed.
test_ground_truth = [4, 4, 2, 2, 7, 3, 5]
dataiter = iter(test_loader)
images, labels = next(dataiter)

images = images.to(device)

img_show(torchvision.utils.make_grid(images))
mlp_model.eval()
# Inference only: no_grad avoids recording autograd state for the forward pass.
# The output is already on the model's device, so no extra .to(device) is needed.
with torch.no_grad():
    output = mlp_model(images)
# Index of the largest score along dim 1 is the predicted class per image.
estimatedLabels = torch.max(output, 1).indices

print('Estimated Labels: ', ' '.join(f'{classes[estimatedLabels[j]]:5s}' for j in range(images.shape[0])))

In [None]:
# Optional: resume MLP training from a saved checkpoint (disabled by default).
# resume = False
# if resume:
#   epochs = mlp_model.load(dir="MLP/MLP_Epoch_11.tar")
#   mlp_trainer.load(dir=f"MLP/Epoch_{epochs}_LossAccuracy.tar")

Setup VGG16 Model

In [None]:
# Train VGG16 Model
%load_ext tensorboard
train_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/train", batch_size=32, shuffle=True, workers=2)
valid_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/valid", batch_size=32, shuffle=False, workers=2)
vgg16_model = VGG16(log_dir=os.getcwd()+"/VGG16", output_size=8, lr=1e-4).to(device)
vgg16_trainer = Trainer(log_dir=os.getcwd()+"/VGG16", n_epochs=3, device=device)

In [None]:
%load_ext tensorboard
vgg16_trainer.fit(model=vgg16_model, train=train_loader, valid=valid_loader, use_lr_scheduler=True)

In [None]:
# Plot VGG16's per-epoch loss and accuracy histories on one figure and save it.
plt.clf()  # start from an empty figure so earlier plots do not bleed in
plt.plot(vgg16_trainer.avg_train_loss, label="Avg. Training Loss")
plt.plot(vgg16_trainer.avg_valid_loss, label="Avg. Validation Loss")
plt.plot(vgg16_trainer.training_accuracy, label="Training Accuracy (x100)%")
plt.plot(vgg16_trainer.validation_accuracy, label="Validation Accuracy (x100)%")
plt.legend()
plt.savefig(f"{os.getcwd()}/VGG16/VGG16_Loss_Plot.jpg")

In [None]:
%load_ext tensorboard
for epoch in range(len(vgg16_trainer.avg_train_loss)):
    vgg16_model.writer.add_scalars('VGG16', {'Avg_Training_Loss': vgg16_trainer.avg_train_loss[epoch],
                                             'Avg_Validation_Loss': vgg16_trainer.avg_valid_loss[epoch],
                                             'Training_Accuracy(x100)%': vgg16_trainer.training_accuracy[epoch],
                                             'Validation_Accuracy(x100)%': vgg16_trainer.validation_accuracy[epoch]}, epoch)

In [None]:
# Close the VGG16 model's writer once all scalars for this run have been added.
vgg16_model.writer.close()

In [None]:
# Open the TensorBoard UI inline, pointed at the VGG16 log directory.
%tensorboard --logdir=./VGG16

In [None]:
# Run the trained VGG16 on the first batch of test images and print its predictions.
# The test loader uses the same 224x224 resize as the train/validation loaders.
test_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/test", batch_size=32, shuffle=False, workers=2)
# Class names come from the training dataset so predicted indices map to labels.
classes = train_loader.dataset.classes
dataiter = iter(test_loader)
images, labels = next(dataiter)

images = images.to(device)

img_show(torchvision.utils.make_grid(images))
vgg16_model.eval()
# Inference only: no_grad avoids recording autograd state for the forward pass,
# which matters for a network as deep as VGG16. The output is already on the
# model's device, so no extra .to(device) is needed.
with torch.no_grad():
    output = vgg16_model(images)
# Index of the largest score along dim 1 is the predicted class per image.
estimatedLabels = torch.max(output, 1).indices

print('Estimated Labels: ', ' '.join(f'{classes[estimatedLabels[j]]:5s}' for j in range(images.shape[0])))

Setup PreTrained VGG16 Model for Training (Just last layer of classifier)

In [None]:
%load_ext tensorboard
train_set = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/train", batch_size=32, shuffle=True, workers=2)
valid_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/valid", batch_size=32, shuffle=False, workers=2)
torch_vgg16_model = PreTrainedVGG16(log_dir=os.getcwd()+"/PreTrainedVGG16", lr=1e-4).to(device)
torch_vgg16_trainer = Trainer(log_dir=os.getcwd()+"/PreTrainedVGG16", n_epochs=3, device=device)

Train PreTrained VGG16 Model

In [None]:
%load_ext tensorboard
torch_vgg16_trainer.fit(model=torch_vgg16_model, train=train_set, valid=valid_loader, use_lr_scheduler=False)

In [None]:
# Plot the pre-trained VGG16's per-epoch loss and accuracy histories and save
# the figure.
plt.clf()  # start from an empty figure so earlier plots do not bleed in
plt.plot(torch_vgg16_trainer.avg_train_loss, label="Avg. Training Loss")
plt.plot(torch_vgg16_trainer.avg_valid_loss, label="Avg. Validation Loss")
plt.plot(torch_vgg16_trainer.training_accuracy, label="Training Accuracy (%)")
plt.plot(torch_vgg16_trainer.validation_accuracy, label="Validation Accuracy (%)")
plt.legend()
# Save next to this model's own logs. The previous path
# (/VGG16/VGG16_Loss_Plot.jpg) silently overwrote the from-scratch VGG16 plot.
# makedirs guards against the directory not existing yet.
os.makedirs(os.getcwd() + "/PreTrainedVGG16", exist_ok=True)
plt.savefig(os.getcwd() + "/PreTrainedVGG16/PreTrainedVGG16_Loss_Plot.jpg")

Write PreTrained VGG16 Loss to Tensorboard and Test PreTrained VGG16 Model

In [None]:
%load_ext tensorboard
for epoch in range(len(torch_vgg16_trainer.avg_train_loss)):
    torch_vgg16_model.writer.add_scalars('PreTrainedVGG16', {'Avg_Training_Loss': torch_vgg16_trainer.avg_train_loss[epoch],
                                                             'Avg_Validation_Loss': torch_vgg16_trainer.avg_valid_loss[epoch],
                                                             'Training_Accuracy(x100)%': torch_vgg16_trainer.training_accuracy[epoch],
                                                             'Validation_Accuracy(x100)%': torch_vgg16_trainer.validation_accuracy[epoch]}, epoch)

In [None]:
# Close the pre-trained VGG16 model's writer once all scalars have been added.
torch_vgg16_model.writer.close()

In [None]:
# Open the TensorBoard UI inline, pointed at the PreTrainedVGG16 log directory.
%tensorboard --logdir=./PreTrainedVGG16

In [None]:
# Run the fine-tuned pre-trained VGG16 on the first batch of test images and
# print its predictions.
# The test loader uses the same 224x224 resize as the train/validation loaders.
test_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/test", batch_size=32, shuffle=False, workers=2)
# Class names come from the training dataset so predicted indices map to labels.
classes = train_set.dataset.classes
dataiter = iter(test_loader)
images, labels = next(dataiter)

images = images.to(device)

img_show(torchvision.utils.make_grid(images))
torch_vgg16_model.eval()
# Inference only: no_grad avoids recording autograd state for the forward pass,
# which matters for a network as deep as VGG16. The output is already on the
# model's device, so no extra .to(device) is needed.
with torch.no_grad():
    output = torch_vgg16_model(images)
# Index of the largest score along dim 1 is the predicted class per image.
estimatedLabels = torch.max(output, 1).indices

print('Estimated Labels: ', ' '.join(f'{classes[estimatedLabels[j]]:5s}' for j in range(images.shape[0])))

Before I started tuning the models, I spent the majority of the time implementing the foundations of each model, the trainer, the data loader, and the confusion matrices.

To implement the trainer, I adapted the trainer from my lab work. I made modifications to allow me to calculate both generalization loss and training loss during the training phase, and added a learning rate scheduler as it is usually highly recommended. I decided to use an exponential learning rate scheduler as it was simple to implement. I also added the necessary code to save the required data at regular checkpoints.

I then created a function that I could use to plot and save a confusion matrix. Once I had that, I added the required code to the Trainer. The confusion matrix is computed during the validation pass of the last training epoch, to get an idea of how well the model is classifying each class. I compute it at that point because the test set does not contain at least one instance of every class. The downside is that, by this point, the model may be memorizing the validation set; however, using the graphs of generalization/training loss and accuracy, I can retrain to the point where it typically starts to converge, meaning the confusion matrix should give a good idea of the model's performance. Once I had all this in place, I began to implement the basic architecture of each model.

To implement the MLP model, I adapted my lab work as it was proven there to work well for image classification. I also added some dropout between each layer to prevent over-relying on specific neurons during training. I just used a learning rate of 1x10^-4, batch size of 32 and 3 epochs for testing to make sure the MLP and all my components were working correctly. This process went smoothly.

I then began to implement the VGG16 architecture from the 2015 research paper by Simonyan and Zisserman (https://arxiv.org/pdf/1409.1556). This is where progress halted and where the majority of my time was spent. I followed the architecture outlined in the paper but got some really strange results when I started testing it. I initially started with the same hyperparameters as the MLP, but when testing completed, I noticed that the training and validation curves were practically flat, and I knew I had a problem. I first tried increasing the number of epochs to 25, as I thought that, given how deep the architecture is, I simply hadn't trained it for long enough to give it time to learn, but I still had flat lines. So I tried increasing the learning rate because I thought maybe it was too low for the model to learn anything. I increased it from 1x10^-4 to 1x10^-3, which still gave me the same result. So I continued increasing it step by step until I reached a learning rate of 1, which resulted in the loss reading as NaN. I also tried some learning rates smaller than 1x10^-4 just out of curiosity to see if I was missing something, but there was still no change from the flat lines. Next I tried playing with the batch size. I tried some smaller batch sizes in the hope that the model would pick up more of the features in the images and learn something, but again the graph showed flat lines. I also tried some larger ones but, alas, there was no change.

![Alt Text](vgg16_issue.png)

The only thing I could think of at this point was that maybe the dataset was too small, so I tried testing the model on the CIFAR10 dataset from torchvision to try to narrow down the issue. I trained it on this dataset for 25 epochs with a learning rate starting at 1x10^-4. My graph still looked the same, so I had no clue what could be wrong. I had tried everything I could think of. Finally, I decided to compare the architecture I had implemented against the research paper again. Everything seemed fine, but I noticed that the architecture called for a softmax activation at the very end, which I had implemented, and I thought I would try training without it. The model began to train as expected. So I dove deeper to find out why this was the case, because I knew the architecture required the softmax, yet the model didn't work when I included it. I found that PyTorch's CrossEntropyLoss function already applies a (log-)softmax internally and expects raw, unnormalized scores (logits). With my own softmax layer in place, softmax was effectively applied twice: the loss received values already squashed into the range 0 to 1, which makes the resulting class probabilities nearly uniform and the gradients very small, so the model wasn't able to learn correctly.

Finally, I implemented the PreTrained VGG16 model from PyTorch. I initialised the weights using one of the pretrained-weight options that PyTorch provides for this model, to minimise the need for training as much as possible. I redefined the last layer of the model so that it outputs the correct number of classes and can be trained for this specific set of classes. I checked it using the same initial hyperparameters as the previous models, and everything was in place to begin training each model. However, time at this point was running out.