## FINE TUNING VIT ON STYLE

In [1]:
import os
os.chdir('..')

In [2]:
from torchvision import datasets, transforms, models
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
import os
import pandas as pd
from tqdm import tqdm
from timm import create_model
from sklearn.model_selection import train_test_split
from PIL import Image
from torchvision import transforms
from torch.utils.data import WeightedRandomSampler
import torch.nn as nn
import copy
import torch.optim as optim
from torch.optim import lr_scheduler


# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook can still be executed (slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## CUSTOM DATASET

## LOAD PRE-TRAINED MODEL

In [3]:
# Instantiate the pretrained timm "vit_base_patch16_224" with a 32-output
# head and move it to the selected device (Module.to returns the module itself).
model = create_model(
    "vit_base_patch16_224",
    pretrained=True,
    num_classes=32,
).to(device)
# Last expression of the cell: display the architecture.
model

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

### REQUIRE GRAD JUST FOR HEAD LAYER

In [4]:
# Freeze every parameter of the network; selected layers are re-enabled afterwards.
for param in model.parameters():
    param.requires_grad_(False)

In [5]:
# Re-enable gradients for the classification head only.
for param in model.head.parameters():
    param.requires_grad_(True)

In [6]:
# Sanity check: distinct requires_grad flags among the patch-embedding parameters.
{param.requires_grad for param in model.patch_embed.parameters()}

{False}

In [7]:
# Sanity check: distinct requires_grad flags among the parameters of block 11.
{param.requires_grad for param in model.blocks[11].parameters()}

{False}

In [8]:
# Sanity check: distinct requires_grad flags among the head parameters.
{param.requires_grad for param in model.head.parameters()}

{True}

## SPLIT IN TRAINING AND VALIDATION SET

## BUILDING DATA LOADERS

In [9]:
from src.data import AIxIADataset

# Preprocessing applied to every image: resize to 224x224, convert to a tensor
# and normalise each channel with mean 0.5 / std 0.5.
# CenterCrop(224) leaves a 224x224 image unchanged; it is kept for parity with
# the original pipeline.
data_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

batch_size = 512


def _build_style_split(split_csv):
    """Return the AIxIADataset for one split CSV of the style data.

    Both splits share the mapping file, image directory and preprocessing;
    only the CSV file name differs. A fresh ``mapping_kwargs`` dict is built
    on every call so the two datasets never share a mutable argument.
    """
    return AIxIADataset(
        dataset=f"./data/processed/normal/artgraph_clip_style/{split_csv}",
        mapping="./data/external/artgraph2bestemotions/mapping/style_entidx2name.csv",
        mapping_kwargs={"names": ["idx", "name"]},
        img_dir='./data/raw/images-resized',
        preprocess=data_transforms,
    )


train_set = _build_style_split("train.csv")
train_loader = DataLoader(
    train_set, batch_size=batch_size, shuffle=True, drop_last=False
)
validation_set = _build_style_split("val.csv")
# Validation batches are only evaluated, never trained on, so shuffling them
# brings nothing; a fixed order keeps evaluation runs reproducible.
validation_loader = DataLoader(
    validation_set, batch_size=batch_size, shuffle=False, drop_last=False
)

## TRAINING

In [10]:
from src.utils.early_stop import EarlyStopping
from src.data import DataDict
def fine_tune(model, train_loader, validation_loader, criterion, optimizer, scheduler, early_stop, num_epochs = 100):
    """Train ``model`` with a validation pass after every epoch.

    Args:
        model: network to optimise in place; inputs are moved to the device of
            its first parameter.
        train_loader: iterable of batches used in the training phase. Each
            batch is indexed with ``DataDict.IMAGE`` and ``DataDict.GTS``.
        validation_loader: iterable of batches used in the validation phase,
            same batch layout as ``train_loader``.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimiser stepped once per training batch.
        scheduler: stepped once per epoch with the validation loss.
        early_stop: callable invoked as ``early_stop(val_loss, model=model)``
            after each validation phase; training ends once its
            ``early_stop`` attribute is truthy.
        num_epochs: maximum number of epochs.

    Returns:
        A deep copy of the model taken at the epoch with the highest
        validation accuracy (a copy of the initial model if accuracy never
        rises above 0).
    """
    # Follow the model's own device rather than depending on a notebook global.
    run_device = next(model.parameters()).device
    loaders = {'train': train_loader, 'val': validation_loader}

    best_model = copy.deepcopy(model)
    best_acc = 0.0
    best_epoch = 0

    for epoch in range(1, num_epochs + 1):
        print(f'Epoch {epoch}/{num_epochs}')
        print('-'*120)

        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is equivalent to eval()
            data_loader = loaders[phase]

            running_loss = 0.0
            running_corrects = 0
            seen_samples = 0

            for data_dict in tqdm(data_loader):
                inputs = data_dict[DataDict.IMAGE].to(run_device)
                labels = data_dict[DataDict.GTS].to(run_device)

                # Track history only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    # Softmax is monotonic, so the arg-max of the raw logits
                    # is the predicted class.
                    preds = outputs.argmax(dim=1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                batch_len = inputs.size(0)
                running_loss += loss.item() * batch_len
                running_corrects += (preds == labels).sum().item()
                seen_samples += batch_len

            # Normalise by the samples actually seen: the last batch may be
            # smaller than batch_size, so len(loader) * batch_size would
            # overcount and bias both metrics downwards. max(..., 1) guards
            # against an empty loader.
            denominator = max(seen_samples, 1)
            epoch_loss = running_loss / denominator
            epoch_acc = running_corrects / denominator

            if not is_train:
                scheduler.step(epoch_loss)

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            if is_train:
                continue

            # Keep a snapshot of the best model seen so far on validation.
            if epoch_acc > best_acc:
                best_acc = epoch_acc
                best_epoch = epoch
                best_model = copy.deepcopy(model)

            early_stop(epoch_loss, model=model)
            print('-'*120, end = '\n\n')

        if early_stop.early_stop:
            break

    print(f'Best val Acc: {best_acc:.4f}')
    print(f'Best epoch: {best_epoch:03d}')

    return best_model

### Train the head layer to optimize the task

In [11]:
# Train the head only when its checkpoint does not exist yet. The check uses
# the same path that is handed to EarlyStopping; looking for the bare file name
# in the current directory would not see a file stored under models/.
head_ckpt_path = "models/vit_just_head.pt"
# Defined outside the branch so it exists even when training is skipped.
criterion = nn.CrossEntropyLoss()
if not os.path.exists(head_ckpt_path):
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr = 1e-6, verbose = True, factor = .1, patience = 1,
                                               threshold = 1e-3)
    early_stop= EarlyStopping(path=head_ckpt_path, patience = 5)
    best_model_head=fine_tune(model, train_loader, validation_loader, criterion, optimizer, scheduler, early_stop, num_epochs = 30)
else:
    # assumes the checkpoint is a state_dict, like models/vit.pt which this
    # notebook restores with load_state_dict — TODO confirm
    model.load_state_dict(torch.load(head_ckpt_path, map_location=device))
    best_model_head = model

Epoch 1/30
------------------------------------------------------------------------------------------------------------------------


  0%|          | 0/160 [00:00<?, ?it/s]

  2%|▎         | 4/160 [00:17<14:22,  5.53s/it]

## UNFREEZE THE LAST TRANSFORMER BLOCK SO THE NETWORK CAN ADAPT ITS FEATURES TO THIS TASK

Fine-tune the last feature-extraction block (and the final normalization layer) together with the head, so that the extracted features are adapted to the style-classification task.

In [None]:
# Unfreeze the last block of the network that the next training stage
# optimises. fine_tune returns a deep copy, so flipping the flags on `model`
# would leave `best_model_head` fully frozen apart from its head.
for p in best_model_head.blocks[11].parameters():#last feature extraction layer
    p.requires_grad=True

In [None]:
# Unfreeze the final norm layer on `best_model_head`, the object that the next
# training stage optimises (it is a deep copy returned by fine_tune, so
# changes made to `model` would not reach it).
for p in best_model_head.norm.parameters():
    p.requires_grad=True

In [None]:
# Run the second stage only when its checkpoint is missing. The check uses the
# path given to EarlyStopping; the bare file name in the current directory
# would not see a file stored under models/.
stage2_ckpt_path = 'models/vit.pt'
if not os.path.exists(stage2_ckpt_path):
    # Built here so this cell does not rely on a name that an earlier cell
    # only defines inside a conditional branch.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(best_model_head.parameters(), lr=1e-3)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr = 1e-6, verbose = True, factor = .1, patience = 1,
                                               threshold = 1e-3)
    early_stop= EarlyStopping(path=stage2_ckpt_path, patience = 5)
    best_model=fine_tune(best_model_head, train_loader, validation_loader, criterion, optimizer, scheduler, early_stop, num_epochs = 30)

Epoch 1/1
------------------------------------------------------------------------------------------------------------------------


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.79it/s]


train Loss: 2.5527 Acc: 0.0820


100%|██████████| 1/1 [00:00<00:00,  1.63it/s]


val Loss: 2.5094 Acc: 0.1250
------------------------------------------------------------------------------------------------------------------------

Best val Acc: 0.125000
Best epoch: 001


In [None]:
# Restore the fine-tuned weights. map_location remaps the stored tensors to the
# active device, so a checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('models/vit.pt', map_location=device))

<All keys matched successfully>

In [None]:
# Freeze the whole network again before using it for feature extraction.
for param in model.parameters():
    param.requires_grad_(False)

In [None]:
# Turn the model into a feature extractor by deleting the classification head
# (reset_classifier with num_classes=0).
model.reset_classifier(num_classes=0)
# Last expression of the cell: display the resulting architecture.
model

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [None]:
#custom class which can manage a dataset and return just the input, since the task is a feature extraction
class UnsupervisedDataSet(Dataset):
    """Label-free image dataset used for feature extraction.

    Each item is a ``(transformed_image, file_name)`` tuple; no target is
    returned.
    """

    def __init__(self, main_dir, list_files, transform):
        """Store the dataset configuration.

        Args:
            main_dir: directory that contains the image files.
            list_files: sequence of file names relative to ``main_dir``.
            transform: callable applied to the RGB PIL image of each item.
        """
        self.main_dir = main_dir
        self.transform = transform
        self.total_imgs = list_files

    def __len__(self):
        """Return the number of listed files."""
        return len(self.total_imgs)

    def __getitem__(self, idx):
        """Load image ``idx``, force RGB mode and apply the transform.

        Returns:
            ``(transformed_image, file_name)``.
        """
        file_name = self.total_imgs[idx]
        img_path = f'{self.main_dir}/{file_name}'
        # The context manager closes the underlying file handle once the
        # transform has consumed the pixel data, instead of leaving it open
        # until garbage collection.
        with Image.open(img_path) as raw_image:
            image = raw_image if raw_image.mode == 'RGB' else raw_image.convert('RGB')
            tensor_image = self.transform(image)
        return (tensor_image, file_name)

In [None]:
# Preprocessing for feature extraction: resize to 224x224, centre-crop to 224,
# convert to a tensor and normalise each channel with mean 0.5 / std 0.5.
data_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# The mapping CSV is read without a header row; its two columns are named
# "idx" and "names" here.
df = pd.read_csv('./data/external/artgraph2bestemotions/mapping/artwork_entidx2name.csv', names=["idx", "names"])
# Values of the "names" column, in file order; used as image file names below.
list_images = df["names"].tolist()
# NOTE(review): not referenced by the cells visible here — confirm whether it is still needed.
list_features = []

In [None]:
# Batch size used by the feature-extraction loader.
batch_size = 32#you can change the batch size depending on the artwork
# Dataset over every file name in list_images, read from the resized-image directory.
extraction_dataset = UnsupervisedDataSet('./data/raw/images-resized', list_images, transform=data_transforms)
# shuffle=False keeps batches in list_images order.
# NOTE(review): this rebinds `train_loader`, the name used earlier for the training loader.
train_loader = DataLoader(extraction_dataset, batch_size=batch_size, shuffle=False, 
                               drop_last=False)

In [None]:
#extracting features for all the artworks
model.eval()
# One entry per batch: the collated file names of that batch.
name_images = []
with torch.no_grad():
    # One 768-wide row per listed image; the loader is not shuffled, so row i
    # belongs to list_images[i].
    x = torch.zeros((len(list_images), 768))
    # Running row offset taken from the real batch lengths, so the writes stay
    # correct even if the global `batch_size` no longer matches the loader.
    row_offset = 0
    for idx, image in tqdm(enumerate(train_loader), total=len(train_loader)):
        img, img_name = image
        name_images.append(img_name)
        features = model(img.to(device)).cpu()
        x[row_offset : row_offset + features.shape[0]] = features
        row_offset += features.shape[0]

 24%|██▍       | 869/3640 [01:30<04:48,  9.59it/s]


KeyboardInterrupt: 

# Dimensionality reduction

In [None]:
# Convert the feature matrix to NumPy. .cpu() is a no-op for a CPU tensor and
# keeps the conversion valid should `x` ever live on the GPU.
x_num = x.detach().cpu().numpy()

In [None]:
# Reduce the feature vectors to 128 dimensions with PCA.
import pickle
from sklearn.decomposition import PCA
pca = PCA(n_components=128)
# Fit the PCA on the extracted features and project them in one step.
x_num_128 = pca.fit_transform(x_num)
x_128 = torch.tensor(x_num_128)
# Persist the fitted PCA object to 'pca.pk' (relative to the current working directory).
with open('pca.pk','wb') as file:
    pickle.dump(pca, file)

# A brief qualitative analysis

In [None]:
#x_128 = torch.load("visual-features-vit-fine-tuning-128.pt")

In [None]:
#saving
node_feat_dir = './data/external/artgraph2bestemotions/raw/node-feat/artwork'
# exist_ok=True creates the directory tree when missing and is a no-op when it
# already exists, so no separate existence check is needed.
os.makedirs(node_feat_dir, exist_ok=True)

# NOTE(review): this writes the full-width `x_num`, not the PCA-reduced
# `x_num_128` computed earlier — confirm which matrix is meant to be stored.
x_df = pd.DataFrame(x_num)
# One row per artwork, no index column and no header line.
x_df.to_csv(os.path.join(node_feat_dir, "node-feat-vit-fine-tuning.csv"), index=False, header=False)