<a href="https://colab.research.google.com/github/AACRobinson/IMLO-Assessment/blob/main/IMLO_Assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Initialisation**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import scipy.io
import sklearn

from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import Compose, ToTensor, Resize, InterpolationMode
from torch import nn
from torch.utils.data import DataLoader


if torch.cuda.is_available() :
  torch.cuda.empty_cache()
  device = 'cuda'
elif torch.backends.mps.is_available() :
  torch.backends.mps.empty_cache()
  device = 'mps'
else:
  torch.cpu.empty_cache()
  device = 'cpu'

print("Using", device, "device\n")


trainFlowers = datasets.Flowers102 (
    root="data",
    split="train",
    download=True,
    transform=Compose([ToTensor(), Resize((500, 500))]) #Need to convert all images to the same tensor dimensions
                                                          ##we want these images to be large enough to for model to be able to classify with a reasonable degree of accuracy
                                                          ##yet small enough to make sure memory doesn't fill up/model trains in a reasonable amount of time
)

validateFlowers = datasets.Flowers102 (
    root="data",
    split="val",
    download=True,
    transform=Compose([ToTensor(), Resize((500, 500))])
)

testFlowers = datasets.Flowers102 (
    root="data",
    split="test",
    download=True,
    transform=Compose([ToTensor(), Resize((500, 500))])
)

trainDataLoader = DataLoader(trainFlowers, batch_size=16, shuffle=True)
validateDataLoader = DataLoader(validateFlowers, batch_size=16, shuffle=True)
testDataLoader = DataLoader(testFlowers, batch_size=16, shuffle=True)

Using cuda device

Downloading https://thor.robots.ox.ac.uk/datasets/flowers-102/102flowers.tgz to data/flowers-102/102flowers.tgz


100%|██████████| 344862509/344862509 [00:10<00:00, 31733312.45it/s]


Extracting data/flowers-102/102flowers.tgz to data/flowers-102
Downloading https://thor.robots.ox.ac.uk/datasets/flowers-102/imagelabels.mat to data/flowers-102/imagelabels.mat


100%|██████████| 502/502 [00:00<00:00, 622572.62it/s]


Downloading https://thor.robots.ox.ac.uk/datasets/flowers-102/setid.mat to data/flowers-102/setid.mat


100%|██████████| 14989/14989 [00:00<00:00, 21040302.09it/s]


# **Visualisation**

In [None]:
train_features, train_labels = next(iter(trainDataLoader))
print("Feature batch shape: ", train_features.size())
print("Labels batch shape: ", train_labels.size())
image = train_features[0].squeeze().T #Need to transpose the tensor for the dimensions to be correct (3, 500, 500) -> (500, 500, 3)
label = train_labels[0].item()
plt.imshow(image)
plt.show()
print("Label: ", label)


figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols*rows+1) :
  sample_index = torch.randint(len(trainFlowers), size=(1,)).item()
  image, label = trainFlowers[sample_index]
  figure.add_subplot(rows, cols, i)
  plt.title(trainFlowers[sample_index][1])
  plt.axis("off")
  plt.imshow(image.squeeze().T)
plt.show()

# **Building the NN - Initial Model 1, ~18% Accuracy**

In [2]:
class flowerNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(3*500*500, 1024), #About as large as is possible
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 102),
        )

    def forward(self, x):
        x = self.flatten(x).to(device)
        logits = self.linear_relu_stack(x).to(device)
        return logits

flowerNNModel = flowerNN().to(device)
modelLR = 0.0001
epochs = 100
lossFunct = nn.CrossEntropyLoss().to(device)
modelOptimiser = torch.optim.SGD(flowerNNModel.parameters(), modelLR)

def trainer(dataloader, nnModel, lossFunct, modelOptimiser) :
  nnModel.train()
  for modelBatch, (X, y) in enumerate(dataloader) :
    prediction = nnModel(X)
    y = y.to(device)
    loss = lossFunct(prediction, y)
    loss.backward()
    modelOptimiser.step()
    modelOptimiser.zero_grad()

def evaluator(dataloader, nnModel, lossFunct):
    nnModel.eval()
    dataSize = len(dataloader.dataset)
    numBatches = len(dataloader)
    loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            prediction = nnModel(X)
            y = y.to(device)
            loss += lossFunct(prediction, y).item()
            correct += (prediction.argmax(1) == y).type(torch.float).sum().item()

    loss /= numBatches
    correct /= dataSize
    print("Test Error: \n Accuracy: ", round((100*correct), 2), "\n Avg loss: ", round(loss, 2), "\n")

for t in range(epochs):
    print("Epoch ", t+1, "\n-------------------------------")
    trainer(trainDataLoader, flowerNNModel, lossFunct, modelOptimiser)
    evaluator(validateDataLoader, flowerNNModel, lossFunct)
print("Final Evaluation ", "\n-------------------------------")
evaluator(testDataLoader, flowerNNModel, lossFunct)
print("Done!\n\n")

Epoch  1 
-------------------------------
Test Error: 
 Accuracy:  1.96 
 Avg loss:  4.62 

Epoch  2 
-------------------------------
Test Error: 
 Accuracy:  1.57 
 Avg loss:  4.61 

Epoch  3 
-------------------------------
Test Error: 
 Accuracy:  2.65 
 Avg loss:  4.6 

Epoch  4 
-------------------------------
Test Error: 
 Accuracy:  3.14 
 Avg loss:  4.59 

Epoch  5 
-------------------------------
Test Error: 
 Accuracy:  3.33 
 Avg loss:  4.58 

Epoch  6 
-------------------------------
Test Error: 
 Accuracy:  3.92 
 Avg loss:  4.57 

Epoch  7 
-------------------------------
Test Error: 
 Accuracy:  4.31 
 Avg loss:  4.56 

Epoch  8 
-------------------------------
Test Error: 
 Accuracy:  5.2 
 Avg loss:  4.55 

Epoch  9 
-------------------------------
Test Error: 
 Accuracy:  5.0 
 Avg loss:  4.54 

Epoch  10 
-------------------------------
Test Error: 
 Accuracy:  6.57 
 Avg loss:  4.53 

Epoch  11 
-------------------------------
Test Error: 
 Accuracy:  6.27 
 Avg los

# **Initial Model 2**

Low accuracy caused by algorithm trying to classify into too many categories?


*   Split the dataset into smaller datasets with 5-10 types of flower each
*   Train each dataset individually, average result



In [23]:
def splitDataset(dataset, labelsPerSet) :
  splitSet = []
  currentSet = []
  increment = labelsPerSet

  print("Splitting Dataset...")
  for item in dataset :
    if item[1] < labelsPerSet :
      currentSet.append(item)
    else :
      splitSet.append(currentSet)
      print("Set ", (labelsPerSet//increment), ":", (labelsPerSet-increment), "-", (labelsPerSet-1), "| Length: ", len(currentSet))
      labelsPerSet += increment
      currentSet = []
      currentSet.append(item)

  splitSet.append(currentSet)
  print("Set ", (labelsPerSet//increment), ":", (labelsPerSet-increment), "-", (dataset[-1][1]), "| Length: ", len(currentSet))
  print("Done! \n")
  return splitSet

def DataLoaderPerSet(dataset, batch_size=16, shuffle=True) :
  dataloaders = []
  print("Initialising Dataloaders...")
  for item in dataset :
    dataloaders.append(DataLoader(item, batch_size, shuffle))
  print("Done! \n")
  return dataloaders


splitTrainFlowers = splitDataset(trainFlowers, 5)
splitValidateFlowers = splitDataset(validateFlowers, 5)
splitTestFlowers = splitDataset(testFlowers, 5)

splitTrainFlowersDataloaders = DataLoaderPerSet(splitTrainFlowers)
splitValidateFlowersDataloaders = DataLoaderPerSet(splitTrainFlowers)
splitTestFlowersDataloaders = DataLoaderPerSet(splitTrainFlowers)
completeTestFlowersDataloader = DataLoader(testFlowers, batch_size=16, shuffle=True)


class flowerNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(3*500*500, 1024), #About as large as is possible
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 102),
        )

    def forward(self, x):
        x = self.flatten(x).to(device)
        logits = self.linear_relu_stack(x).to(device)
        return logits

flowerNNModel = flowerNN().to(device)
modelLR = 0.0001
epochs = 20
lossFunct = nn.CrossEntropyLoss().to(device)
modelOptimiser = torch.optim.SGD(flowerNNModel.parameters(), modelLR)

Splitting Dataset...
Set  1 : 0 - 4 | Length:  50
Set  2 : 5 - 9 | Length:  50
Set  3 : 10 - 14 | Length:  50
Set  4 : 15 - 19 | Length:  10
Set  5 : 20 - 24 | Length:  90
Set  6 : 25 - 29 | Length:  50
Set  7 : 30 - 34 | Length:  50
Set  8 : 35 - 39 | Length:  50
Set  9 : 40 - 44 | Length:  50
Set  10 : 45 - 49 | Length:  50
Set  11 : 50 - 54 | Length:  50
Set  12 : 55 - 59 | Length:  50
Set  13 : 60 - 64 | Length:  50
Set  14 : 65 - 69 | Length:  50
Set  15 : 70 - 74 | Length:  50
Set  16 : 75 - 79 | Length:  50
Set  17 : 80 - 84 | Length:  50
Set  18 : 85 - 89 | Length:  50
Set  19 : 90 - 94 | Length:  50
Set  20 : 95 - 99 | Length:  50
Set  21 : 100 - 101 | Length:  20
Done! 

Splitting Dataset...
Set  1 : 0 - 4 | Length:  50
Set  2 : 5 - 9 | Length:  50
Set  3 : 10 - 14 | Length:  50
Set  4 : 15 - 19 | Length:  10
Set  5 : 20 - 24 | Length:  90
Set  6 : 25 - 29 | Length:  50
Set  7 : 30 - 34 | Length:  50
Set  8 : 35 - 39 | Length:  50
Set  9 : 40 - 44 | Length:  50
Set  10 : 45 -

In [24]:
def trainer(dataloader, nnModel, lossFunct, modelOptimiser) :
  nnModel.train()
  for modelBatch, (X, y) in enumerate(dataloader) :
    prediction = nnModel(X)
    y = y.to(device)
    loss = lossFunct(prediction, y)
    loss.backward()
    modelOptimiser.step()
    modelOptimiser.zero_grad()

def evaluator(dataloader, nnModel, lossFunct):
    nnModel.eval()
    dataSize = len(dataloader.dataset)
    numBatches = len(dataloader)
    loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            prediction = nnModel(X)
            y = y.to(device)
            loss += lossFunct(prediction, y).item()
            correct += (prediction.argmax(1) == y).type(torch.float).sum().item()

    loss /= numBatches
    correct /= dataSize
    print("Test Error: \n Accuracy: ", round((100*correct), 2), "\n Avg loss: ", round(loss, 2), "\n")

for i in range(len(splitTrainFlowersDataloaders)) :
  print("Training Set", i, ":\n-------------------------------")
  for j in range(epochs):
    print("Epoch ", j+1, "\n-------------------------------")
    trainer(splitTrainFlowersDataloaders[i], flowerNNModel, lossFunct, modelOptimiser)
    evaluator(splitValidateFlowersDataloaders[i], flowerNNModel, lossFunct)
  print("Set Evaluation ", "\n-------------------------------")
  evaluator(splitTestFlowersDataloaders[i], flowerNNModel, lossFunct)
print("Final Evaluation ", "\n-------------------------------")
evaluator(completeTestFlowersDataloader, flowerNNModel, lossFunct)
print("Done!\n\n")

Training Set 0 :
-------------------------------
Epoch  1 
-------------------------------
Test Error: 
 Accuracy:  46.0 
 Avg loss:  4.4 

Epoch  2 
-------------------------------
Test Error: 
 Accuracy:  58.0 
 Avg loss:  4.26 

Epoch  3 
-------------------------------
Test Error: 
 Accuracy:  48.0 
 Avg loss:  4.04 

Epoch  4 
-------------------------------
Test Error: 
 Accuracy:  40.0 
 Avg loss:  3.98 

Epoch  5 
-------------------------------
Test Error: 
 Accuracy:  46.0 
 Avg loss:  3.68 

Epoch  6 
-------------------------------
Test Error: 
 Accuracy:  48.0 
 Avg loss:  3.49 

Epoch  7 
-------------------------------
Test Error: 
 Accuracy:  50.0 
 Avg loss:  3.29 

Epoch  8 
-------------------------------
Test Error: 
 Accuracy:  34.0 
 Avg loss:  3.09 

Epoch  9 
-------------------------------
Test Error: 
 Accuracy:  74.0 
 Avg loss:  3.14 

Epoch  10 
-------------------------------
Test Error: 
 Accuracy:  38.0 
 Avg loss:  2.79 

Epoch  11 
--------------------