# Flower classifier 

*Author: Baccega Sandro*

In this notebook we classify the images of Oxford's `102 Category Flower Dataset`, which can be found [here](https://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html).

Another asset required to run this code is `Oxford-102_Flower_dataset_labels.txt` by JosephKJ, which can be found [here](https://gist.github.com/JosephKJ/94c7728ed1a8e0cd87fe6a029769cde1). This external file contains the flower names.

---

## Imports and constants

In [68]:
from scipy.io import loadmat
import pandas as pd
import numpy as np
import torch
import shutil
import time
import json
import os
import copy
import matplotlib.pyplot as plt
# import seaborn as sns
import numpy as np
from PIL import Image
from collections import OrderedDict
from torch import nn, optim
import torch.utils.data as data
from torch.utils.data import Dataset
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torchvision import datasets, models, transforms

# --- CONSTANTS ---

SEED = 151836

DATASET_SPLIT = 0.8         # Fraction of the dataset used for training; the rest is used for validation
BATCH_SIZE = 32

# Assets location

RAW_IMAGE_LABELS_MAT_FILE = "assets/imagelabels.mat"
RAW_DATASET_LABELS_FILE = "assets/Oxford-102_Flower_dataset_labels.txt"
RAW_DATASET_IMAGES_FOLDER = "assets/jpg"
RAW_SEGMENTED_IMAGES_FOLDER = "assets/segmim"

# Data folder location

DATASET_IMAGES_FOLDER = "data"

# Setting seed (torch and numpy, so both random sources are reproducible)
torch.manual_seed(SEED)
np.random.seed(SEED)

# Set device to use for computations.
# `torch.backends.mps` is missing on PyTorch builds without the MPS backend,
# so it is looked up defensively instead of being accessed directly.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"-----\nPyTorch version: {torch.__version__}\nDevice: {device}\n-----")


-----
PyTorch version: 1.13.0.dev20220608
Device: mps
-----


## Creating the sorted data folder

In [87]:
# --- Image -> category table ---

# Keep only .jpg files: a stray entry in the raw folder (e.g. macOS ".DS_Store")
# would otherwise shift the filename/label pairing or break it with a length mismatch.
image_files = sorted(
    name for name in os.listdir(RAW_DATASET_IMAGES_FOLDER)
    if name.lower().endswith(".jpg")
)

# Subtract 1 so categories start at 0 (assumes the .mat labels are 1-based - TODO confirm).
image_categories = loadmat(RAW_IMAGE_LABELS_MAT_FILE)['labels'][0] - 1

if len(image_files) != len(image_categories):
    raise ValueError(
        "Found {} images but {} labels".format(len(image_files), len(image_categories))
    )

# NOTE(review): pairing relies on the sorted filenames following the label order
# of the .mat file - verify against the dataset documentation.
df = pd.DataFrame({'Image': image_files, 'Category': image_categories})

# One list of image filenames per category
groups = df.groupby('Category')['Image'].apply(list)

# --- Sorted data folder: <DATASET_IMAGES_FOLDER>/<category>/<image> ---

# If data folder exists, do not create images folder.
# NOTE: a partially created folder (interrupted copy) is also skipped; delete it to rebuild.
if not os.path.isdir(DATASET_IMAGES_FOLDER):
    print("Creating data folder")
    os.mkdir(DATASET_IMAGES_FOLDER)

    for category, images in groups.items():
        category_folder = os.path.join(DATASET_IMAGES_FOLDER, str(category))
        os.mkdir(category_folder)
        for image in images:
            shutil.copyfile(
                os.path.join(RAW_DATASET_IMAGES_FOLDER, image),
                os.path.join(category_folder, image),
            )

    print("Done - data folder creation")
else:
    print("Skipping - data folder creation")


# --- Category -> label reference ---

# Read the label file line by line: recent NumPy versions reject a newline as
# `np.loadtxt` delimiter, so plain file reading is used instead.
with open(RAW_DATASET_LABELS_FILE, encoding="utf-8") as labels_file:
    rawLabelReferenceData = [line.rstrip("\r\n") for line in labels_file]
rawLabelReferenceData = [line for line in rawLabelReferenceData if line.strip()]

# Drop the first two characters and the last one of every line
# (presumably a leading space plus the surrounding quotes - verify against the labels file).
labelReferenceData = [line[2:-1] for line in rawLabelReferenceData]

if len(labelReferenceData) < len(groups):
    raise ValueError(
        "Label file has {} names but the dataset has {} categories".format(
            len(labelReferenceData), len(groups)
        )
    )

# The n-th label name belongs to the n-th category (categories are in groupby order).
# NOTE(review): torchvision's ImageFolder sorts class folders as strings, so its class
# indices are not these numeric categories - translate through `class_to_idx` if needed.
labelReference = dict(zip(groups.index, labelReferenceData))

print("Done - label reference creation")
    

Skipping - data folder creation
Done - label reference creation


## Creating the datasets

In [56]:
class TransformDataset(Dataset):
    """Dataset wrapper that applies a transform to the samples of another dataset.

    It allows several views of one underlying dataset (e.g. the subsets produced
    by a split) to each use their own preprocessing pipeline.

    Args:
        subset: indexable collection yielding ``(sample, target)`` pairs.
        transform: optional callable applied to the sample only; when it is
            falsy the sample is returned untouched.
    """

    def __init__(self, subset, transform=None):
        self.transform = transform
        self.subset = subset

    def __len__(self):
        """Return the number of items in the wrapped subset."""
        return len(self.subset)

    def __getitem__(self, index):
        """Return ``(sample, target)`` at ``index``, with the sample transformed if a transform is set."""
        sample, target = self.subset[index]
        if not self.transform:
            return sample, target
        return self.transform(sample), target


# Every sub-folder of the data folder becomes one class.
# NOTE(review): ImageFolder sorts the folder names as strings ("0", "1", "10", ...),
# so the integer targets it yields are not the numeric folder names. Use
# `dataset.class_to_idx` when translating predictions to `labelReference` keys.
dataset = datasets.ImageFolder(DATASET_IMAGES_FOLDER)

train_set_size = int(len(dataset) * DATASET_SPLIT)
valid_set_size = len(dataset) - train_set_size

# A dedicated generator seeded with SEED keeps the train/validation partition
# identical every time this cell runs, instead of depending on how much of the
# global RNG stream was consumed before (re-)executing it.
split_generator = torch.Generator().manual_seed(SEED)
untransformed_train_dataset, untransformed_validation_dataset = data.random_split(
    dataset, [train_set_size, valid_set_size], generator=split_generator
)

# Per-channel statistics used by torchvision's ImageNet-pretrained models
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Training pipeline: random augmentation, then tensor conversion and normalisation
train_transform = transforms.Compose([
    transforms.RandomRotation(45),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD)
])

# Validation pipeline: deterministic resize + centre crop, same normalisation
validation_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD)
])

train_dataset = TransformDataset(untransformed_train_dataset, transform=train_transform)
validation_dataset = TransformDataset(untransformed_validation_dataset, transform=validation_transform)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation only measures performance, so the sample order does not need shuffling
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("Train dataset size: {}".format(len(train_dataset)))
print("Validation dataset size: {}".format(len(validation_dataset)))


Train dataset size: 6551
Validation dataset size: 1638


## Creating the model

In [89]:
# Load VGG19 with pretrained weights and display its stock architecture
model = models.vgg19(pretrained=True)
print(model)

# New classification head ending in 102 outputs, one per flower category.
# Its input size (25088) has to match the flattened output of the VGG19 feature extractor.
# LogSoftmax yields log-probabilities (presumably paired with NLLLoss later - verify).
head_layers = [
    ('fc1', nn.Linear(25088, 4096)),
    ('relu', nn.ReLU()),
    ('fc2', nn.Linear(4096, 102)),
    ('output', nn.LogSoftmax(dim=1)),
]
classifier = nn.Sequential(OrderedDict(head_layers))

# Freeze every weight of the loaded network; this runs before the new head is
# attached, so the head's own parameters stay trainable.
for weight in model.parameters():
    weight.requires_grad_(False)

# Swap the stock classifier for the new head
model.classifier = classifier

Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /Users/sandrobaccega/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [07:11<00:00, 1.33MB/s] 


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd