In [2]:
import torch
import pandas as pd
import torch.nn as nn
import pandas as pd
import numpy as np
import pickle

from torchvision.models import vit_b_32
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchvision.models import ViT_B_32_Weights
from torch.utils.data import DataLoader, Dataset,DataLoader,random_split
from torch.optim import Adam
from PIL import Image
from sklearn.model_selection import KFold
from torchvision.transforms import Resize,Compose, ToTensor

In [3]:
# -------------------------------------------------------- Global settings --------------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The f-string is already fully interpolated; the former trailing `.format()`
# was redundant (and would raise if the text ever contained literal braces).
print(f"Currently running on {device}")


TRAIN_PERCENT = 0.8        # fraction of the training CSV used for the training subset
VALIDATION_PERCENT = 0.1   # NOTE(review): not referenced in the visible cells - confirm whether still needed
BATCH_SIZE = 64            # batch size of the train / validation loaders
RESHAPE_SIZE = 512         # NOTE(review): not referenced in the visible cells - confirm whether still needed
NUM_EPOCHS = 25            # number of passes over the training subset
LR = 0.0001                # Adam learning rate
# Alternative learning rate that was tried: LR = 0.00001

Currently running on cuda


In [4]:
# -------------------------------------------------------- Custom Transforms --------------------------------------------------------------

class ExpandDimension(object):
    """Callable transform that turns a single-channel tensor into a 3-channel one.

    The input is expected to be a 3-D tensor whose first dimension is the
    channel axis (it is used after ``ToTensor()`` in this notebook). When that
    first dimension has size 1 the tensor is tiled three times along it;
    any other input is handed back unchanged.
    """

    def __call__(self, sample):
        is_single_channel = sample.shape[0] == 1
        # repeat(3, 1, 1) copies the lone channel into three identical channels.
        return sample.repeat(3, 1, 1) if is_single_channel else sample

In [5]:
# Preprocessing preset that ships with the ImageNet-1k ViT-B/32 weights.
content_transform = Compose(
    [ViT_B_32_Weights.IMAGENET1K_V1.transforms()]
)
# PIL image -> tensor, then replicate single-channel images to three channels.
expand_dims_transform = Compose(
    [
        ToTensor(),
        ExpandDimension(),
    ]
)

In [6]:
class DifficultyDataset(Dataset):
    """Dataset of (image, difficulty score, image path) triples read from a CSV.

    The CSV must provide a ``path`` column (image file locations) and a
    ``score`` column (regression targets).

    Args:
        csv_file_path: location of the CSV file, passed to ``pd.read_csv``.
        transform: optional callable applied to each image tensor. When it is
            ``None`` the raw ``PIL.Image`` is returned unchanged.
    """

    def __init__(self, csv_file_path, transform=None):
        self.scores_df = pd.read_csv(csv_file_path)
        self.image_paths = self.scores_df['path'].tolist()
        self.scores = self.scores_df['score'].tolist()
        # Keep the transform on the instance. Previously the argument was
        # dropped and __getitem__ silently picked up whatever notebook-level
        # variable happened to be called `transform`.
        self.transform = transform

    def __len__(self):
        """Number of rows (samples) in the CSV."""
        return len(self.scores)

    def __getitem__(self, idx):
        """Return ``(image, score, img_path)`` for the sample at ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_path = self.image_paths[idx]
        image = Image.open(img_path)

        if self.transform is not None:
            # Convert to a tensor and force three channels before handing the
            # image to the user-supplied transform.
            image = expand_dims_transform(image)
            image = self.transform(image)

        # Explicit float32 so the target dtype matches the model output even
        # if the score column was parsed as integers.
        score = torch.tensor(self.scores[idx], dtype=torch.float32)

        return image, score, img_path

In [7]:
# Training-split scores: images are preprocessed with the ViT preset.
train_dataset = DifficultyDataset(
    csv_file_path='/../../Results/deepretrieval-caltech101_700_train-p@100.csv',
    transform=content_transform,
)
# Test-split scores, same preprocessing.
test_dataset = DifficultyDataset(
    csv_file_path='/../../Results/deepretrieval-caltech101_700-p@100.csv',
    transform=content_transform,
)
# The same test CSV as a plain DataFrame; used later to recover the row order.
gt_file = pd.read_csv(
    '/../../Results/deepretrieval-caltech101_700-p@100.csv'
)

In [8]:
# ViT-B/32 backbone initialised from torchvision's default pretrained weights.
vit_model = vit_b_32(weights=ViT_B_32_Weights.DEFAULT)

# Replace the classification head with a single sigmoid-activated unit so the
# network outputs one bounded score per image.
regression_head = nn.Sequential(
    nn.Linear(in_features=768, out_features=1),
    nn.Sigmoid(),
)
vit_model.heads = regression_head

# Kept as a notebook-level name: other cells may refer to `transform` directly.
transform = ViT_B_32_Weights.IMAGENET1K_V1.transforms()

vit_model = vit_model.to(device)

In [9]:
# Hold out (1 - TRAIN_PERCENT) of the training CSV for validation.
train_size = int(TRAIN_PERCENT * len(train_dataset))
validation_size = len(train_dataset) - train_size

# NOTE: `train_dataset` is rebound to the training subset below, so re-running
# this cell without re-creating the dataset first would split the subset again.
train_dataset, validation_dataset = random_split(
    train_dataset,
    lengths=[train_size, validation_size],
    generator=torch.Generator().manual_seed(420),  # fixed seed -> reproducible split
)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation data is not shuffled: shuffling adds nothing for validation and
# only makes the batch composition change from one epoch to the next.
validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
# One image per batch, in file order. Predictions are stored per path, so
# shuffling the evaluation data served no purpose and is disabled.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [11]:
# Mean-squared-error objective between predicted and reference scores.
criterion = nn.MSELoss()
# Adam over every parameter of the model (backbone and new head), step size LR.
optimizer = Adam(params=vit_model.parameters(), lr=LR)

In [12]:
def compute_loss(model, loader, loss_fn=None):
    """Evaluate ``model`` on ``loader`` and return the mean per-batch loss.

    The model is switched to evaluation mode for the duration of the call and
    its previous train/eval mode is restored afterwards, so the function can be
    called from inside a training loop. No gradients are tracked.

    Args:
        model: ``torch.nn.Module`` invoked as ``model(images)``.
        loader: sized iterable yielding ``(images, scores, path)`` batches.
        loss_fn: optional callable ``loss_fn(outputs, targets)``. Defaults to
            the notebook-level ``criterion``.

    Returns:
        float: sum of the per-batch loss values divided by the number of batches.

    Raises:
        ValueError: if ``loader`` contains no batches.
    """
    if loss_fn is None:
        loss_fn = criterion  # notebook-level default

    num_batches = len(loader)
    if num_batches == 0:
        raise ValueError("compute_loss received an empty loader")

    # Feed the data to wherever the model lives; fall back to the notebook's
    # global device only for a model without parameters.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = device

    was_training = model.training
    model.eval()  # disable train-only behaviour (e.g. dropout) while scoring
    total_loss = 0.0
    try:
        with torch.no_grad():
            for images, scores, _path in loader:
                images = images.to(target_device)
                # (batch,) -> (batch, 1) to line up with the model output.
                scores = scores.to(target_device).unsqueeze(1)

                outputs = model(images)
                total_loss += loss_fn(outputs, scores).item()
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    return total_loss / num_batches

In [13]:
def train_model(model, train_dataloader, validation_dataloader, optimizer, criterion, num_epochs=None):
    """Train ``model`` and print the train / validation loss after every epoch.

    Args:
        model: ``torch.nn.Module`` to optimise; updated in place.
        train_dataloader: sized iterable yielding ``(images, scores, path)`` batches.
        validation_dataloader: loader passed to ``compute_loss`` after each epoch.
        optimizer: optimiser already bound to ``model``'s parameters.
        criterion: loss callable, invoked as ``criterion(outputs, targets)``.
        num_epochs: number of epochs to run. Defaults to the notebook-level
            ``NUM_EPOCHS``.

    Returns:
        None. Progress is reported through ``print``.
    """
    if num_epochs is None:
        num_epochs = NUM_EPOCHS

    # Move each batch to wherever the model's parameters live.
    model_device = next(model.parameters()).device

    for epoch in range(num_epochs):
        print("Epoch num {}/{}".format(epoch + 1, num_epochs))

        # Make sure training mode is active at the start of every epoch,
        # regardless of what the previous validation pass left behind.
        model.train()
        epoch_train_loss = 0

        for images, scores, _path in train_dataloader:
            images = images.to(model_device)
            # (batch,) -> (batch, 1) to line up with the model output.
            scores = scores.to(model_device).unsqueeze(1)

            optimizer.zero_grad()

            outputs = model(images)
            # PyTorch losses take (input, target); the arguments used to be
            # swapped, which only went unnoticed because MSE is symmetric.
            loss = criterion(outputs, scores)

            loss.backward()
            optimizer.step()

            epoch_train_loss += loss.item()

        epoch_train_loss /= len(train_dataloader)
        validation_loss = compute_loss(model, validation_dataloader)

        print("Epoch train loss {}".format(epoch_train_loss))
        print("Epoch validation loss {}".format(validation_loss))

In [14]:
# Put the network in training mode, then run the full training schedule.
vit_model.train()
train_model(
    model=vit_model,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    optimizer=optimizer,
    criterion=criterion,
)

Epoch num 1/25
Epoch train loss 0.08827982304824723
Epoch validation loss 0.05152933423717817
Epoch num 2/25
Epoch train loss 0.05785697574416796
Epoch validation loss 0.055617623031139374
Epoch num 3/25
Epoch train loss 0.05545650629533662
Epoch validation loss 0.05751607194542885
Epoch num 4/25
Epoch train loss 0.054071752147542104
Epoch validation loss 0.05942155917485555
Epoch num 5/25
Epoch train loss 0.053265479703744255
Epoch validation loss 0.07292788351575534
Epoch num 6/25
Epoch train loss 0.0508059598505497
Epoch validation loss 0.05418908968567848
Epoch num 7/25
Epoch train loss 0.051250393191973366
Epoch validation loss 0.06263039509455363
Epoch num 8/25
Epoch train loss 0.04543604681061374
Epoch validation loss 0.058888014405965805
Epoch num 9/25
Epoch train loss 0.02703450744350751
Epoch validation loss 0.04318238546450933
Epoch num 10/25
Epoch train loss 0.02033371799108055
Epoch validation loss 0.04568938662608465
Epoch num 11/25
Epoch train loss 0.012489384557637904
E

In [15]:
# Maps image path -> predicted score for every test image.
score_dict = {}
vit_model.eval()
# Inference only: without no_grad() every forward pass would record an
# autograd graph that is never used.
with torch.no_grad():
    for (image, score, path) in test_dataloader:
        image = image.to(device)
        output = vit_model(image)
        # The test loader yields one image per batch, so `path` holds a single
        # entry and `output` a single value.
        score_dict[path[0]] = output.item()

In [16]:
# Image paths in ground-truth file order. The column is selected by name (the
# same 'path' column the dataset reads, and therefore the keys of score_dict)
# rather than by assuming it is the first column of the CSV.
paths = gt_file['path'].tolist()

In [17]:
# Predicted score for each path, in the same order as `paths`.
scores = [score_dict[path] for path in paths]

In [18]:
# Assemble the predictions in ground-truth order and write them out as CSV
# (no index column).
result_df = pd.DataFrame(
    {
        'path': paths,
        'score': scores,
    }
)
result_df.to_csv(
    '/../../Results/vitregressor-deepretrieval-caltech101_700-p@100.csv',
    index=False,
)