Install Required Packages

In [None]:
# Install the notebook's dependencies into the kernel's environment via %pip.
# torch and torchvision are fetched from the PyTorch wheel index passed with
# --index-url (the cu129 build); the remaining packages use the default index.
# NOTE(review): no versions are pinned, so a fresh install may resolve to
# different releases than the ones these experiments were run with.
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu129
%pip install matplotlib
%pip install numpy
%pip install tensorboard
%pip install scikit-learn
%pip install seaborn

In [None]:
# Fix Tensorboard Ghost Processes
import tempfile
import shutil
import os

# Remove the '.tensorboard-info' directory from the system temp directory.
# The directory only exists after TensorBoard has been launched at least once,
# so check for it first: an unconditional rmtree raises FileNotFoundError on a
# clean machine and breaks "Restart Kernel -> Run All".
tensorboard_info_dir = os.path.join(tempfile.gettempdir(), '.tensorboard-info')
if os.path.isdir(tensorboard_info_dir):
    shutil.rmtree(tensorboard_info_dir)

In [None]:
# Set Up
import os
import torch

import torchvision

import matplotlib.pyplot as plt
import numpy as np

from DataLoader import DataLoader
from MLP import MLP
from VGG16 import VGG16
from PreTrainedVGG16 import PreTrainedVGG16
from Trainer import Trainer

def img_show(img):
    """Render an image tensor with matplotlib.

    The tensor is rescaled with ``img / 2 + 0.5`` (undoing the normalisation),
    copied to the CPU as a NumPy array, and its axes are reordered to
    (1, 2, 0) so that axis 0 ends up last before being passed to ``imshow``.

    Args:
        img: tensor to display; assumed to be channel-first (C, H, W) -- TODO confirm.
    """
    unnormalized = img / 2 + 0.5
    pixels = unnormalized.cpu().numpy().transpose((1, 2, 0))
    plt.imshow(pixels)
    plt.show()

# Pick the compute device once for the whole notebook: CUDA when PyTorch
# reports it as available, otherwise the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

Setup MLP model for Training

In [None]:
# Train Model
%load_ext tensorboard
train_set = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/train", batch_size=32, shuffle=True, workers=2)
valid_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/valid", batch_size=32, shuffle=False, workers=2)
mlp_model = MLP(log_dir=os.getcwd()+"/MLP", lr=1e-4).to(device)
mlp_trainer = Trainer(log_dir=os.getcwd()+"/MLP", n_epochs=13, device=device)

Train MLP Model

In [None]:
%load_ext tensorboard
mlp_trainer.fit(model=mlp_model, data=train_set, valid=valid_loader, use_lr_scheduler=True)

In [None]:
# Plot the per-epoch histories stored on the MLP trainer and save the figure.
plt.clf()  # start from an empty figure so earlier plots do not bleed through

# (trainer attribute, legend label) for each curve, drawn in this order.
mlp_curves = (
    ("avg_train_loss", "Avg. Training Loss"),
    ("avg_valid_loss", "Avg. Validation Loss"),
    ("training_accuracy", "Training Accuracy (x100)%"),
    ("validation_accuracy", "Validation Accuracy (x100)%"),
)
for curve_attr, curve_label in mlp_curves:
    plt.plot(getattr(mlp_trainer, curve_attr), label=curve_label)

plt.legend()
plt.savefig(os.getcwd() + "/MLP/MLP_Loss_Plot.jpg")

Write MLP Loss to Tensorboard

In [None]:
%load_ext tensorboard
for epoch in range(len(mlp_trainer.avg_train_loss)):
    mlp_model.writer.add_scalars('MLP', {'Avg_Training_Loss': mlp_trainer.avg_train_loss[epoch],
                                         'Avg_Validation_Loss': mlp_trainer.avg_valid_loss[epoch],
                                         'Training_Accuracy(x100)%': mlp_trainer.training_accuracy[epoch],
                                         'Validation_Accuracy(x100)%': mlp_trainer.validation_accuracy[epoch]}, epoch)

In [None]:
# Close the writer attached to the MLP model once logging is finished
# (presumably a TensorBoard SummaryWriter -- confirm in MLP.py).
mlp_model.writer.close()

In [None]:
# Launch the TensorBoard UI inline, reading logs from ./MLP.
%tensorboard --logdir=./MLP

Test MLP Model

In [None]:
# Qualitative check of the MLP on the first batch of the test set.
test_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/test", batch_size=32, shuffle=False, workers=2)
# Class names are taken from the training loader's dataset so predicted indices
# are mapped through the same class list the model was trained against.
classes = train_set.dataset.classes
dataiter = iter(test_loader)
images, labels = next(dataiter)

images = images.to(device)

img_show(torchvision.utils.make_grid(images))
mlp_model.eval()
# Inference only: disable autograd so no computation graph is built for the
# batch. The output is already on the model's device, so no extra .to(device).
with torch.no_grad():
    output = mlp_model(images)
# Index of the largest score along dim 1 is the predicted class per image.
estimatedLabels = torch.max(output, 1).indices

print('Estimated Labels: ', ' '.join(f'{classes[estimatedLabels[j]]:5s}' for j in range(images.shape[0])))

Setup VGG16 Model

In [None]:
# Train VGG16 Model
%load_ext tensorboard
train_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/train", batch_size=32, shuffle=True, workers=2)
valid_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/valid", batch_size=32, shuffle=False, workers=2)
vgg16_model = VGG16(log_dir=os.getcwd()+"/VGG16", output_size=8, lr=1e-4).to(device)
vgg16_trainer = Trainer(log_dir=os.getcwd()+"/VGG16", n_epochs=12, device=device)

In [None]:
%load_ext tensorboard
vgg16_trainer.fit(model=vgg16_model, data=train_loader, valid=valid_loader, use_lr_scheduler=True)

In [None]:
# Plot the per-epoch histories stored on the VGG16 trainer and save the figure.
plt.clf()  # start from an empty figure so earlier plots do not bleed through

# (trainer attribute, legend label) for each curve, drawn in this order.
vgg16_curves = (
    ("avg_train_loss", "Avg. Training Loss"),
    ("avg_valid_loss", "Avg. Validation Loss"),
    ("training_accuracy", "Training Accuracy (x100)%"),
    ("validation_accuracy", "Validation Accuracy (x100)%"),
)
for curve_attr, curve_label in vgg16_curves:
    plt.plot(getattr(vgg16_trainer, curve_attr), label=curve_label)

plt.legend()
plt.savefig(os.getcwd() + "/VGG16/VGG16_Loss_Plot.jpg")

In [None]:
%load_ext tensorboard
for epoch in range(len(vgg16_trainer.avg_train_loss)):
    vgg16_model.writer.add_scalars('VGG16', {'Avg_Training_Loss': vgg16_trainer.avg_train_loss[epoch],
                                             'Avg_Validation_Loss': vgg16_trainer.avg_valid_loss[epoch],
                                             'Training_Accuracy(x100)%': vgg16_trainer.training_accuracy[epoch],
                                             'Validation_Accuracy(x100)%': vgg16_trainer.validation_accuracy[epoch]}, epoch)

In [None]:
# Close the writer attached to the VGG16 model once logging is finished
# (presumably a TensorBoard SummaryWriter -- confirm in VGG16.py).
vgg16_model.writer.close()

In [None]:
# Launch the TensorBoard UI inline, reading logs from ./VGG16.
%tensorboard --logdir=./VGG16

In [None]:
# Qualitative check of VGG16 on the first batch of the test set.
test_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/test", batch_size=32, shuffle=False, workers=2)
# Class names are taken from the training loader's dataset so predicted indices
# are mapped through the same class list the model was trained against.
classes = train_loader.dataset.classes
dataiter = iter(test_loader)
images, labels = next(dataiter)

images = images.to(device)

img_show(torchvision.utils.make_grid(images))
vgg16_model.eval()
# Inference only: disable autograd so no computation graph is built for the
# batch. The output is already on the model's device, so no extra .to(device).
with torch.no_grad():
    output = vgg16_model(images)
# Index of the largest score along dim 1 is the predicted class per image.
estimatedLabels = torch.max(output, 1).indices

print('Estimated Labels: ', ' '.join(f'{classes[estimatedLabels[j]]:5s}' for j in range(images.shape[0])))

Setup PreTrained VGG16 Model for Training (Just last layer of classifier)

In [None]:
%load_ext tensorboard
train_set = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/train", batch_size=32, shuffle=True, workers=2)
valid_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/valid", batch_size=32, shuffle=False, workers=2)
torch_vgg16_model = PreTrainedVGG16(log_dir=os.getcwd()+"/PreTrainedVGG16", lr=1e-4).to(device)
torch_vgg16_trainer = Trainer(log_dir=os.getcwd()+"/PreTrainedVGG16", n_epochs=3, device=device)

Train PreTrained VGG16 Model

In [None]:
%load_ext tensorboard
torch_vgg16_trainer.fit(model=torch_vgg16_model, train=train_set, valid=valid_loader, use_lr_scheduler=False)

In [None]:
# Plot the per-epoch histories stored on the pre-trained VGG16 trainer.
plt.clf()  # start from an empty figure so earlier plots do not bleed through
plt.plot(torch_vgg16_trainer.avg_train_loss, label="Avg. Training Loss")
plt.plot(torch_vgg16_trainer.avg_valid_loss, label="Avg. Validation Loss")
plt.plot(torch_vgg16_trainer.training_accuracy, label="Training Accuracy (%)")
plt.plot(torch_vgg16_trainer.validation_accuracy, label="Validation Accuracy (%)")
plt.legend()
# Save under the PreTrainedVGG16 directory so the from-scratch VGG16 plot at
# /VGG16/VGG16_Loss_Plot.jpg is not overwritten by this model's figure.
pretrained_plot_dir = os.getcwd() + "/PreTrainedVGG16"
os.makedirs(pretrained_plot_dir, exist_ok=True)
plt.savefig(pretrained_plot_dir + "/PreTrainedVGG16_Loss_Plot.jpg")

Test PreTrained VGG16 Model

In [None]:
# Close the writer attached to the pre-trained VGG16 model
# (presumably a TensorBoard SummaryWriter -- confirm in PreTrainedVGG16.py).
torch_vgg16_model.writer.close()

In [None]:
# Launch the TensorBoard UI inline, reading logs from ./PreTrainedVGG16.
%tensorboard --logdir=./PreTrainedVGG16

In [None]:
# Qualitative check of the pre-trained VGG16 on the first batch of the test set.
test_loader = DataLoader(data_dir=os.getcwd(), trans_width=224, trans_height=224).load(dataset="/data/test", batch_size=32, shuffle=False, workers=2)
# Class names are taken from the training loader's dataset so predicted indices
# are mapped through the same class list the model was trained against.
classes = train_set.dataset.classes
dataiter = iter(test_loader)
images, labels = next(dataiter)

images = images.to(device)

img_show(torchvision.utils.make_grid(images))
torch_vgg16_model.eval()
# Inference only: disable autograd so no computation graph is built for the
# batch. The output is already on the model's device, so no extra .to(device).
with torch.no_grad():
    output = torch_vgg16_model(images)
# Index of the largest score along dim 1 is the predicted class per image.
estimatedLabels = torch.max(output, 1).indices

print('Estimated Labels: ', ' '.join(f'{classes[estimatedLabels[j]]:5s}' for j in range(images.shape[0])))

Before I started tuning the models, I spent the majority of the time implementing the foundations of each model, the trainer, the dataloader and the confusion matrices.

To implement the trainer, I adapted the trainer from my lab work. I made modifications to allow me to calculate both generalization loss and training loss during the training phase, and added a learning rate scheduler as it is usually highly recommended. I decided to use an exponential learning rate scheduler as it was simple to implement. I also added the necessary code to save the required data at regular checkpoints.

I also then created a function that I could use to plot and save a confusion matrix. Once I had that, I added the required code to the Trainer. This is done during the validation of each epoch during training to get an idea of how well the model is classifying each class. I do it then as the test set does not have at least a single instance of every class. The downside to this is that at this point it may be memorizing the validation set, however using the graphs of generalization/training loss and accuracy, I can retrain to the point where it typically starts to converge, meaning the confusion matrix should give a good idea of the model performance. Once I had all this in place, I began to implement basic architecture of each model.

To implement the MLP model, I adapted my lab work as it was proven there to work well for image classification. I also added some dropout between each layer to prevent over-relying on specific neurons during training. I just used a learning rate of 1x10^-4, batch size of 32 and 3 epochs for testing to make sure the MLP and all my components were working correctly. This process went smoothly.

I then began to implement the VGG16 architecture from the 2015 research paper by Simonyan and Zisserman (https://arxiv.org/pdf/1409.1556). This is where progress was halted and the majority of my time was spent. I followed the architecture outlined in the paper but was getting some really strange results when I started testing it. I initially started with the same hyperparameters as the MLP, but at the completion of my testing, I noticed the lines for training and validation were practically flat and I knew I had a problem. I first tried adjusting the number of epochs to 25 as I thought maybe the issue was just that I didn't train it for long enough to give it time to learn, due to how deep the architecture is, but I still had flat lines. So I tried increasing the learning rate because I thought maybe it was too low for it to learn anything. I increased it from 1x10^-4 to 1x10^-3, which still gave me the same result. So I continued increasing it one order of magnitude at a time until I had a learning rate of 1, which resulted in the loss reading as NaN. I also tried some learning rates smaller than 1x10^-4 just out of curiosity to see if I was missing something, but still no change from the flat lines. Next I tried playing with the batch sizes. I tried some smaller batch sizes in the hope that it would pick up more on the features in the image and learn something, but again there were flat lines on the graph. I also tried some larger ones but, alas, there was no change.

![Alt Text](vgg16_issue.png)

The only thing I could think of at this point was that maybe the dataset was too small, so I tried testing it on the CIFAR10 dataset from torchvision to try and narrow down the issue. I tried training it on this dataset for 25 epochs with a learning rate starting at 1x10^-4. My graph still appeared the same, so I had no clue what could be wrong. I tried everything I could think of. Finally, I decided to examine the architecture I had implemented against the research paper again. Everything seemed fine, but I noticed that the architecture called for a softmax activation at the very end, which I had implemented, so I thought I would try without it. The model began to train as expected. So I dove deeper to try and find out why this was the case, because I knew the architecture required it, but for some reason it didn't work when I included it. I found that PyTorch's CrossEntropyLoss function already applies a softmax activation internally, therefore the two were conflicting and the model wasn't able to learn correctly.

Finally, I implemented the PreTrained VGG16 model from Pytorch. I initialised the weights using one of the variables from Pytorch for this model to minimise the need for training as much as possible. I redefined the last layer of the model to output the correct number of classes, and to be able to train it for this specific set of classes. I checked it using the same initial hyper parameters as the previous models and everything was in place to begin training each model. However, time at this point was running out.

For the test set, there was a non clothing image present, so I removed it from the dataset to be able to accurately test each model.

I started with training my MLP. I decided to go with a 3 layer architecture as it was recommended to use 3-4 layers, and used 1024, 512, 256 neurons in the appropriate layers. To keep things simple, I reshape each image to 224 * 224 * 3 as that is what is used for the VGG16 architecture later on. I'm starting with the initial learning rate from the labs of 1x10^-4, and a batch size of 32. I will start with a larger number of epochs, 25, to give it a chance to converge when graphed.

![image](MLP/Epochs_25_LR_1e-4/Screenshot%20(122).png)![image](MLP/Epochs_25_LR_1e-4/MLP_Loss_Plot.jpg)

The model began to converge around epoch 12, where the trajectory of the generalization loss began to climb. The training accuracy at this point was around 97% and the validation accuracy was around 89%. In the last epoch the validation accuracy had dropped to 88% but the training accuracy had climbed to 99%.

![image](MLP/Epochs_25_LR_1e-4/Epoch0_ConfusionMatrix.png)

After the first epoch, the confusion matrix shows that it could get the majority of the tees correct but was still having a bit of trouble with shirts, knitwear and jackets, which makes sense due to its lack of spatial awareness when the inputs are passed through the model.

![image](MLP/Epochs_25_LR_1e-4/Epoch12_ConfusionMatrix.png)

At the 12th epoch around where it converged, it shows a strong diagonal trend on the confusion matrix meaning it is getting a lot of the images correct. However, there is still a bit of confusion on tees, shirts, jackets, and knitwear.

![image](MLP/Epochs_25_LR_1e-4/Epoch24_ConfusionMatrix.png)

The last epoch shows similar results to the 12th epoch, and it appears that there is less confusion between tees, knitwear and jackets, but at this point it could have started to memorise the dataset.

![image](MLP/Epochs_25_LR_1e-4/Screenshot%20(126).png)

On the test set the model performed poorly. It managed to get both pairs of jeans correct, but clearly struggled with tees, shirts, jackets and knitwear, which makes sense from the Confusion Matrices. It also managed to get shoes confused with accessories, which did happen a little during training, but at this point should have been less likely which could indicate it started just memorising.

With the learning rate scheduler, I decided to increase the learning rate to 1x10^-1 so it learns more at the start to see if it converges any quicker.

![image](MLP/Epochs_25_LR_1e-1/Screenshot%20(127).png) ![image](MLP/Epochs_25_LR_1e-1/MLP_Loss_Plot.jpg)

As can be seen, the training was extremely unstable at this learning rate, meaning that the learning rate was probably too high.

![image](MLP/Epochs_25_LR_1e-1/Epoch24_ConfusionMatrix.png)

Everything was confused with tees.

![image](MLP/Epochs_25_LR_1e-1/Screenshot%20(128).png)

As expected from the confusion matrix, everything was predicted to be a tee.

Next I tried a learning rate of 1x10^-2, just to see if it will still be unstable.

![image](MLP/Epochs_25_LR_1e-2/Screenshot%20(130).png) ![image](MLP/Epochs_25_LR_1e-2/MLP_Loss_Plot.jpg)

I'm not 100% sure what is happening with the tensorboard output, but I don't think it is correct, but I have been also plotting the data with matplotlib and as you can see, it also couldn't stabilize so the learning rate must still be too high.

![image](MLP/Epochs_25_LR_1e-2/Epoch24_ConfusionMatrix.png)

The confusion matrix was basically identical.

Based on these results, I went back to the learning rate of 1x10^-4 as it seemed the best but decreased the epochs to 13 (0-12) as that is where it began to converge.

![image](MLP/Epochs_13_LR_1e-4/Screenshot%20(132).png) ![image](MLP/Epochs_13_LR_1e-4/MLP_Loss_Plot.jpg)

Tensorboard still seems to be incorrect, but as the other graph shows, it seems like it started to converge slightly at the last epoch compared to last time.

![image](MLP/Epochs_13_LR_1e-4/Epoch13_ConfusionMatrix.png)

As can be seen in the confusion matrix, it is getting a little more confused with tees, shirts, jackets, and knitwear compared to before. This difference in performance could be from different weight initialisations during the instantiation of the model as I perform no initialisation myself and leave it to randomness.

![image](MLP/Epochs_13_LR_1e-4/Screenshot%20(133).png)

It performs similarly on the test set compared to the first one.

Next I trained my VGG16 implementation. I started with the exact same hyper parameters as the MLP. The architecture should match what was written in the research paper, including some dropout between the fully connected layers. I have also left initialisation to randomness.

![image](VGG16/Epochs_25_LR_1e-4/Screenshot%20(134).png) ![image](VGG16/Epochs_25_LR_1e-4/VGG16_Loss_Plot.jpg)

During training, it seems as though the model started to converge around epoch 11, which is earlier than the MLP model did.

![image](VGG16/Epochs_25_LR_1e-4/Epoch0_ConfusionMatrix.png)

The VGG16 had a bit of a slower start. This was the confusion matrix at the end of the first epoch. It was getting everything confused with tees.

![image](VGG16/Epochs_25_LR_1e-4/Epoch11_ConfusionMatrix.png)

At the 11th epoch before it started to converge, it seemed to have an easier time with the shirts, jackets, knitwear and tees, however as can be seen in the confusion matrix, it was still predicting tees incorrectly a bit. At this point, the model had a 92% training accuracy and 89% validation accuracy.

![image](VGG16/Epochs_25_LR_1e-4/Epoch24_ConfusionMatrix.png)

At the last epoch, the model began having a little more difficulty with the torso items again. At this point, the model had a 98% training accuracy and a 90% validation accuracy.

![image](VGG16/Epochs_25_LR_1e-4/Screenshot%20(135).png)

On the test dataset, the VGG16 performed much better than the MLP, although it was not perfect. This is expected due to the VGG16 using convolution layers that help take into account the spatial context of each pixel in the image. However, it is strange that even though the model correctly predicted all shoes in both the 11th epoch and the last epoch, it managed to get them wrong in the test set. This could be because it is overtrained at this point.

I next tried increasing the learning rate to 1x10^-2 to see if it would also be unstable like the MLP, or whether the deep nature of the VGG16 would counteract it.

![image](VGG16/Epochs_25_LR_1e-2/Screenshot%20(136).png) ![image](VGG16/Epochs_25_LR_1e-2/VGG16_Loss_Plot.jpg)

As can be seen, it is just as unstable as it was for the MLP.

![image](VGG16/Epochs_25_LR_1e-2/Epoch24_ConfusionMatrix.png)

Just like the MLP, everything was confused with tees.

Next I tried decreasing the learning rate to 1x10^-6.

![image](VGG16/Epochs_25_LR_1e-6/Screenshot%20(137).png) ![image](VGG16/Epochs_25_LR_1e-6/VGG16_Loss_Plot.jpg)

Tensorboard started playing up again, but the other plot shows that it had a slow start over the first 10 epochs. The learning rate was either too low, or it needs to be trained for longer at this rate.

![image](VGG16/Epochs_25_LR_1e-6/Epoch24_ConfusionMatrix.png)

The confusion matrix shows it is struggling in the same areas as usual as well as shoes and accessories, and shorts and jeans. At the last epoch, the training accuracy was around 71% and the validation accuracy was around 69%.

I tried increasing the number of epochs to 50, and it showed a similar trend, meaning that the learning rate may be too small.

![image](VGG16/Epochs_50_LR_1e-6/Screenshot%20(138).png) ![image](VGG16/Epochs_50_LR_1e-6/VGG16_Loss_Plot.jpg)

Tensorboard is again playing up.

![image](VGG16/Epochs_50_LR_1e-6/Epoch49_ConfusionMatrix.png)

The model is still getting confused with shirts, tees, jackets, and knitwear. The training accuracy was around 77% and the validation accuracy was at 76%. This was higher than the 25 epochs at the same learning rate.

I tried increasing the learning rate to 1x10^-3 with 10 epochs because at 1x10^-4 it converged around the 12th epoch, so I thought that maybe increasing the learning rate a little would require fewer epochs.

![image](VGG16/Epochs_10_LR_1e-3/Screenshot%20(139).png) ![image](VGG16/Epochs_10_LR_1e-3/VGG16_Loss_Plot.jpg)

The model didn't train as well as I expected. I wouldn't say it seemed unstable like the higher learning rates, but maybe just needed more epochs to train at this learning rate. From these tests, I think the original learning rate performed best.

![image](VGG16/Epochs_10_LR_1e-3/Epoch9_ConfusionMatrix.png)

The confusion matrix shows that it didn't learn very much at all, and was still getting everything confused with tees.

Finally, I returned to the original learning rate, and changed the number of epochs to 12.

![image](VGG16/Epochs_12_LR_1e-4/Screenshot%20(140).png) ![image](VGG16/Epochs_12_LR_1e-4/VGG16_Loss_Plot.jpg)

The model trained much better with these parameters.

![image](VGG16/Epochs_12_LR_1e-4/Epoch11_ConfusionMatrix.png)

At the last epoch, the model had very little confusion, although some confusion between shirts, tees, knitwear and jackets was still present. The training accuracy at this point was 91% and validation accuracy was 90%.

![image](VGG16/Epochs_12_LR_1e-4/Screenshot%20(141).png)

On the test set, it performed similarly to the same model at 25 epochs. It couldn't get the knitwear or shoe image correct. It could be that maybe it learnt shoes very early and started to memorise, because it was getting 100% of them correct in validation.

Overall the VGG16 model performed much better on the test data when trained compared to the MLP model.