In [None]:
''' all the imports necessary are done here.'''

import torch
import torch.nn as nn
from torchvision import datasets, transforms
from math import ceil as ceil
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from torchvision.models import ViT_B_16_Weights
import torchvision

# Download the nature_12K archive (about 3.6 GB according to the wget log below) and unzip it quietly.
# The later cells read the images from the "inaturalist_12K" folder.
# NOTE(review): re-running this cell repeats the full download — consider guarding it.
!wget https://storage.googleapis.com/wandb_datasets/nature_12K.zip
!unzip -q nature_12K.zip

--2023-04-10 18:22:57--  https://storage.googleapis.com/wandb_datasets/nature_12K.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.219.128, 209.85.147.128, 142.250.125.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.219.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3816687935 (3.6G) [application/zip]
Saving to: ‘nature_12K.zip’


2023-04-10 18:23:16 (193 MB/s) - ‘nature_12K.zip’ saved [3816687935/3816687935]



We are using the Vision Transformer (ViT) model.
- It appears to be more data hungry than ResNet, yet it gives a good validation accuracy (better than ResNet).
- It has had better test accuracy on ImageNet compared to the other pretrained models.
- However, being based on the Transformer architecture that originated in NLP, it can be slower to train.
- A dropout layer (torch.nn.Dropout) has been added to reduce overfitting.

In [None]:
# Pretrained ViT-B/16 weights and the preprocessing transforms bundled with them.
# The transforms are used to preprocess the images so that they are compatible
# with the pretrained network.
weights = ViT_B_16_Weights.DEFAULT
auto_transforms = weights.transforms()

# Device selection: if a GPU is available choose it, else stick to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained model. It is moved to the device once, after the head
# has been replaced, so that the new head ends up on the same device.
model = torchvision.models.vit_b_16(weights=weights)

# Freeze every pretrained parameter.
for params in model.parameters():
    params.requires_grad = False

# Input width of the original classification head.
lastLayer = model.heads.head.in_features

# Replace the head so that it fits the 10-class output space, with Dropout in
# front of the linear layer to reduce overfitting. The freshly created layers
# are not frozen by the loop above, so they are the only trainable parameters.
model.heads.head = nn.Sequential(nn.Dropout(p=0.5), nn.Linear(lastLayer, 10))
model = model.to(device)

# Set the loss function.
loss_fn = nn.CrossEntropyLoss()

# Set the optimizer to Adam with a learning rate of 0.0001. Only parameters
# that still require gradients are handed to the optimizer; the frozen
# backbone never receives gradients, so there is no reason to track it.
trainableParams = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainableParams, lr=0.0001)


Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:04<00:00, 80.4MB/s]


Using datasets.ImageFolder from torchvision to import the dataset.
- The auto transforms that come with the pretrained weights are used to preprocess the data so that it resembles the ImageNet data on which the pretrained network was trained.
- The same is done for the test data; both datasets read their images from the folders given by their `root` paths.

In [None]:
# Build the ImageFolder datasets. Every image goes through the pretrained
# weights' transforms, and the label of an image comes from its class folder.
trainData = datasets.ImageFolder(
    "inaturalist_12K/train",
    transform=auto_transforms,
)
testData = datasets.ImageFolder(
    "inaturalist_12K/val",
    transform=auto_transforms,
)

print(f"train data : {trainData} and test data : {testData}")

train data : Dataset ImageFolder
    Number of datapoints: 9999
    Root location: inaturalist_12K/train
    StandardTransform
Transform: ImageClassification(
               crop_size=[224]
               resize_size=[256]
               mean=[0.485, 0.456, 0.406]
               std=[0.229, 0.224, 0.225]
               interpolation=InterpolationMode.BILINEAR
           ) and test data : Dataset ImageFolder
    Number of datapoints: 2000
    Root location: inaturalist_12K/val
    StandardTransform
Transform: ImageClassification(
               crop_size=[224]
               resize_size=[256]
               mean=[0.485, 0.456, 0.406]
               std=[0.229, 0.224, 0.225]
               interpolation=InterpolationMode.BILINEAR
           )


In [None]:
# Extract the class names from the training ImageFolder.
# This is read here, before the split cell further down rebinds `trainData`.
classLabels = trainData.classes

We divide (logically, for now) the dataset into 80% training and 20% validation. We ensure the split is random, for better accuracy during testing.

In [None]:
# Split the training data into 80% training and 20% validation.

# If this cell is re-run, `trainData` is already the Subset produced by an
# earlier split. Recover the full dataset first so the 80/20 split is never
# applied to an already-reduced subset.
if isinstance(trainData, torch.utils.data.Subset):
    trainData = trainData.dataset

trainSplit = ceil(0.8 * len(trainData))

# Use random_split with a seeded generator so that the same train/validation
# partition is produced on every run.
splitGenerator = torch.Generator().manual_seed(42)
trainData, valData = torch.utils.data.random_split(
    trainData,
    [trainSplit, len(trainData) - trainSplit],
    generator=splitGenerator,
)

Using the torch.utils.data.DataLoader(...) functionality to wrap the datasets.

- This helps in loading the data efficiently, one batch at a time.
- The shuffle option adds randomness (hence some regularization) to the training process.
- Separate data loaders are created for training and validation, and the batch_size parameter takes care of dividing the data into batches of the appropriate size.

In [None]:
# Wrap each dataset in a DataLoader so that it can be consumed in mini-batches
# during training, validating and testing.
BATCH_SIZE = 32

trainDataLoader = torch.utils.data.DataLoader(trainData, batch_size=BATCH_SIZE, shuffle=True)
valDataLoader = torch.utils.data.DataLoader(valData, batch_size=BATCH_SIZE, shuffle=True)

# The test loader is the only one that is not shuffled.
testDataLoader = torch.utils.data.DataLoader(testData, batch_size=BATCH_SIZE, shuffle=False)

The fit(...) function takes care of training the model.
The eval(...) function takes care of evaluating the model.


In [None]:
def accuracy(y_true, y_pred):
    ''' Percentage of positions where y_true[i] == y_pred[i].
        args : y_true ---> torch.Tensor holding the actual label(s) of the input(s).
             : y_pred ---> torch.Tensor holding the predicted label(s), compared
                           element-wise against y_true.
        return : accuracy ---> float [0,100] The accuracy of the batch
                 (0.0 when the batch is empty).
    '''
    total = len(y_true)
    if total == 0:
        # Nothing to score; return 0.0 instead of raising ZeroDivisionError.
        return 0.0
    correct = torch.eq(y_true, y_pred).sum().item()
    return correct / total * 100

In [None]:
# train function.
def fit(trainDataLoader, valDataLoader, epochs):
    ''' Train the module-level `model` and print train/validation metrics after every epoch.

        args : trainDataLoader -> torch.utils.data.DataLoader contains wrapped up training data.
             : valDataLoader   -> torch.utils.data.DataLoader contains wrapped up validation data.
             : epochs          -> number of epochs for which to run the training.

        return : None. The module-level `model` is updated in place through the
                 module-level `optimizer`; the loss is computed with `loss_fn`.
    '''
    # Guard against empty loaders so the averages below never divide by zero.
    num_train_batches = max(len(trainDataLoader), 1)
    num_val_batches = max(len(valDataLoader), 1)

    for epoch in tqdm(range(epochs)):
        train_loss = 0.0
        train_acc = 0.0
        # Switch to training mode once per epoch; the validation pass below
        # puts the model into eval mode, so it has to be reset here.
        model.train()
        for batch, (X, y) in enumerate(trainDataLoader):
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            loss = loss_fn(y_pred, y)
            # Accumulate a plain float: adding the loss tensor itself would keep
            # a reference to every batch's autograd history for the whole epoch.
            train_loss += loss.item()
            train_acc += accuracy(y_true=y, y_pred=y_pred.argmax(dim=1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Progress report every 50 batches (the counter is a batch index).
            if batch % 50 == 0:
                print(f"processed {batch}/{len(trainDataLoader)} batches")

        # Mean of the per-batch values.
        train_loss /= num_train_batches
        train_acc /= num_train_batches

        val_loss = 0.0
        val_acc = 0.0
        model.eval()
        with torch.inference_mode():
            for X, y in valDataLoader:
                X, y = X.to(device), y.to(device)
                val_pred = model(X)
                val_loss += loss_fn(val_pred, y).item()
                val_acc += accuracy(y_true=y, y_pred=val_pred.argmax(dim=1))
        # Average over the validation loader that was actually iterated.
        val_acc /= num_val_batches
        val_loss /= num_val_batches

        print(f"Train loss: {train_loss}, Train accuracy: {train_acc}, validation loss: {val_loss}, validation accuracy: {val_acc}\n")


In [None]:
def eval(testLoader):
    ''' Evaluate the module-level `model` on `testLoader` and print the mean loss and accuracy.

        args : testLoader -> wrapper type (torch.utils.data.DataLoader) for the
                             testing data which is unseen.

        return : None. The averaged per-batch loss and accuracy are printed.

        NOTE: this function's name shadows Python's built-in eval() in the notebook.
    '''
    test_loss = 0.0
    test_acc = 0.0
    model.eval()
    with torch.inference_mode():
        # Iterate the loader passed in by the caller, not a module-level loader.
        for X, y in testLoader:
            X, y = X.to(device), y.to(device)
            test_pred = model(X)
            # .item() keeps the running totals as plain Python floats.
            test_loss += loss_fn(test_pred, y).item()
            test_acc += accuracy(y_true=y, y_pred=test_pred.argmax(dim=1))
    # Average over the batches of the same loader; guard against an empty one.
    num_batches = max(len(testLoader), 1)
    test_acc /= num_batches
    test_loss /= num_batches
    print(f"Test Loss: {test_loss}, Test accuracy: {test_acc}")


The following fine-tuning is done with Adam with a learning rate of 0.001 (note: the optimizer cell above sets lr = 0.0001, so one of the two values is out of date).
The learning rate seems to be too high, as the model appears to get stuck in a local minimum and cannot get out of it.

In [None]:
# Fine-tune the model for 30 epochs; fit() prints the train/validation metrics after every epoch.

fit(trainDataLoader, valDataLoader,30)

  0%|          | 0/30 [00:00<?, ?it/s]

Train loss: 0.9238288998603821, Train accuracy: 74.6625, validation loss: 0.5547844171524048, validation accuracy: 84.79497354497354

Train loss: 0.5593219995498657, Train accuracy: 84.3125, validation loss: 0.4824058711528778, validation accuracy: 86.30291005291005

Train loss: 0.521395206451416, Train accuracy: 85.05, validation loss: 0.46531957387924194, validation accuracy: 86.33928571428571

Train loss: 0.48913222551345825, Train accuracy: 85.7375, validation loss: 0.45911088585853577, validation accuracy: 86.2037037037037

Train loss: 0.47368288040161133, Train accuracy: 85.95, validation loss: 0.446982204914093, validation accuracy: 86.94775132275132

Train loss: 0.45796260237693787, Train accuracy: 86.325, validation loss: 0.44957101345062256, validation accuracy: 86.08465608465607

Train loss: 0.44056427478790283, Train accuracy: 86.9625, validation loss: 0.44494733214378357, validation accuracy: 86.65013227513226

Train loss: 0.45684802532196045, Train accuracy: 86.4875, vali

In [None]:
# Evaluate the model by passing the test data loader to eval(), which prints the loss and accuracy.

eval(testDataLoader)

Test Loss: 0.45687350630760193, Test accuracy: 86.59391534391536


In [None]:
# Save the model. The model object itself (not only its state_dict) is passed to torch.save.
# NOTE(review): loading such a file presumably requires the same model class definitions
# to be importable — confirm before sharing the checkpoint.

torch.save(model, "vit_16_model.pth")

An accuracy function (over tensors) to calculate the accuracy (per batch).