In [1]:
import timm
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold

from PIL import Image
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
Image.MAX_IMAGE_PIXELS = 1000000000

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
# define functions for training (fine-tuning) and testing model
        
def finetune_ViT(model, trainloader, optimizer, criterion, num_epochs, scheduler=None):
    
    m = nn.Sigmoid()
    print("Starting training...")
    
    for epoch in range(num_epochs):
        
        running_loss = 0
        for i, data in enumerate(trainloader, start=0):
            
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            
            optimizer.zero_grad()
            
            outputs = model(images)
            outputs = m(outputs)
            
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            # print statistics
            running_loss += loss.item()
            if i % 20 == 19:    # print every 20 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 20))
                running_loss = 0.0
         
        if scheduler != None:
            scheduler.step()
            

def test_ViT(model, testloader):
    correct = 0
    total = 0
    
    m = nn.Sigmoid()
    
    with torch.no_grad():
        for data in testloader:
            
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            #print("True values: ", labels)
            
            outputs = our_ViT(images)
            probabilities = m(outputs)
            #print("Probabilities: ", probabilities)
            
            predicted = torch.argmax(outputs,1)
            #print("Predicted: ", predicted)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            #print("Correct: ", correct)
        
        acc = correct/total * 100
        
    return acc

In [3]:
# define validation testing function for determining optimal learning rates and batch sizes

def perform_validation_testing(model, L, validationset, lr_array, batchsize_array, criterion, num_epochs=5, scheduler=None):

    # create stratified folds to preserve class percentage representation within folds
    skf = StratifiedKFold(n_splits = L, shuffle = True, random_state = 1)

    # split validationset tuples into data and labels
    val_data, val_labels = zip(*validationset)

    accuracies = np.empty([len(lr_array), len(batchsize_array)])

    # run validation testing

    for i, lr in enumerate(lr_array):
        
        optimizer = optim.SGD(filter(lambda layer: layer.requires_grad, model.parameters()), lr=lr, momentum = .9)

        for j, batch_size in enumerate(batchsize_array):

            fold_val_acc = []
            print("LEARNING RATE: ", lr)
            print("BATCH SIZE: ", batch_size)

            for fold, (train_ids, test_ids) in enumerate(skf.split(val_data, val_labels)):

                # restore original weights for ViT and reset optimizer
                model.load_state_dict(torch.load("/projectnb/dl523/students/kjv/EC520_Project/ViT/Saved_Models/untrained_ViT.pth"))

                print("---STARTING NEW FOLD---")

                trainfold_sampler = torch.utils.data.SubsetRandomSampler(train_ids)
                testfold_sampler = torch.utils.data.SubsetRandomSampler(test_ids)

                # define data loaders for training and testing data in this fold
                val_trainloader = torch.utils.data.DataLoader(validationset, batch_size=batch_size, sampler=trainfold_sampler)
                val_testloader = torch.utils.data.DataLoader(validationset, batch_size=batch_size, sampler=testfold_sampler)

                # train ViT on L-1 folds
                finetune_ViT(model, trainloader=val_trainloader, optimizer=optimizer, criterion=criterion, num_epochs=num_epochs, scheduler=scheduler)

                # test validation accuracy on remaining fold and keep track of accuracy per fold
                result = test_ViT(model, val_testloader)
                fold_val_acc.append(result)

            # take average accuracy across all folds for given learning rate
            print("---ALL FOLD ACCURACIES FOR CURRENT LR/BATCH_SIZE---")
            print(fold_val_acc)
            accuracies[i,j] = sum(fold_val_acc)/len(fold_val_acc)
            print(f"Average = {accuracies[i,j]}\n")


    # choose learning rate with best validation accuracy
    print("---FINAL AVG ACCURACIES PER LEARNING RATE/BATCH SIZE COMBO---")
    print("Accuracy Matrix: \n", accuracies)
    best_lr_ind, best_bs_ind = np.unravel_index(np.argmax(accuracies, axis=None), accuracies.shape)
    
    optimal_lr = learning_rates[best_lr_ind]
    optimal_batch_size = batchsize_array[best_bs_ind]

    print(f"\nBest learning rate: {optimal_lr}")
    print(f"Best batch size: {optimal_batch_size}")
    return optimal_lr, optimal_batch_size

In [4]:
# fine-tuning helper methods

def freeze_layers(model):
    """
    Freeze all model layers
    """
    for param in our_ViT.parameters():
        param.requires_grad = False
            
def unfreeze_layers(model):
    """
    Reset requires_grad for all params
    """
    for param in our_ViT.parameters():
        param.requires_grad = True


In [5]:
# testing for function of finetuning variable num of layers
#layer_names = []

#for name, _ in our_ViT.named_modules():
#    layer_names.append(name)
    
#layers_to_freeze = layer_names[:-2-1]

#print(layers_to_freeze)

In [6]:
# ----- timm Implemenation of ViT -----

In [7]:
# create default model if needed
ViT = timm.create_model('vit_huge_patch14_224_in21k', pretrained=True)

# output the config info for the default pre-trained model
ViT.default_cfg

{'url': 'https://storage.googleapis.com/vit_models/imagenet21k/ViT-H_14.npz',
 'num_classes': 21843,
 'input_size': (3, 224, 224),
 'pool_size': None,
 'crop_pct': 0.9,
 'interpolation': 'bicubic',
 'fixed_input_size': True,
 'mean': (0.5, 0.5, 0.5),
 'std': (0.5, 0.5, 0.5),
 'first_conv': 'patch_embed.proj',
 'classifier': 'head',
 'hf_hub': 'timm/vit_huge_patch14_224_in21k',
 'architecture': 'vit_huge_patch14_224_in21k'}

In [8]:
# create ViT for art style classification
our_ViT = timm.create_model('vit_huge_patch14_224_in21k', pretrained = True, num_classes = 25)

# confirm changes in classifier output
our_ViT.get_classifier()

Removing representation layer for fine-tuning.


Linear(in_features=1280, out_features=25, bias=True)

In [9]:
# basic pre-processing tasks for proper ViT data ingestion

config = resolve_data_config({}, model=our_ViT)
transform = create_transform(**config)

In [10]:
# pre-training model setup

# freeze all layers of the ViT
freeze_layers(our_ViT)

# unfreeze desired layers for fine-tuning
our_ViT.head.bias.requires_grad = True
our_ViT.head.weight.requires_grad = True

# check that correct layers are frozen
for name, param in our_ViT.named_parameters():
    if param.requires_grad:
        print(name)


head.weight
head.bias


In [11]:
# move model to GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# save starting state for ViT
torch.save(our_ViT.state_dict(), "/projectnb/dl523/students/kjv/EC520_Project/ViT/Saved_Models/untrained_ViT.pth")

our_ViT.to(device)

Device: cuda:0


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1280, out_features=3840, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU()
        (drop1): Dropout(p=0.0, inplace=False)
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm

In [12]:
# initialize validation dataset

validationpath = "/projectnb/dl523/students/kjv/EC520_Project/Data/wikipaintings_full/wikipaintings_val"
validationset = datasets.ImageFolder(validationpath, transform=transform)

In [None]:
# perform stratified L-Fold Cross Validation testing to determine optimal learning rates and batch sizes for training

L = 5
num_epochs = 5
criterion = nn.CrossEntropyLoss()
learning_rates = [0.003, 0.01, 0.03, 0.06] # from ViT paper
batch_sizes = [32, 64, 128, 256, 512]

optimal_lr, optimal_batch_size = perform_validation_testing(model=our_ViT, L=L, validationset=validationset, criterion=criterion, lr_array=learning_rates, batchsize_array=batch_sizes, num_epochs=num_epochs)

LEARNING RATE:  0.003
BATCH SIZE:  32
---STARTING NEW FOLD---
Starting training...
[1,    20] loss: 3.218
[1,    40] loss: 3.214
[1,    60] loss: 3.210
[1,    80] loss: 3.205
[1,   100] loss: 3.202
[1,   120] loss: 3.199
[1,   140] loss: 3.196
[1,   160] loss: 3.191
[1,   180] loss: 3.186
[2,    20] loss: 3.182
[2,    40] loss: 3.179
[2,    60] loss: 3.175
[2,    80] loss: 3.171
[2,   100] loss: 3.166
[2,   120] loss: 3.164
[2,   140] loss: 3.162
[2,   160] loss: 3.162
[2,   180] loss: 3.148
[3,    20] loss: 3.153
[3,    40] loss: 3.152
[3,    60] loss: 3.145
[3,    80] loss: 3.139
[3,   100] loss: 3.132
[3,   120] loss: 3.135
[3,   140] loss: 3.137
[3,   160] loss: 3.130
[3,   180] loss: 3.131
[4,    20] loss: 3.127
[4,    40] loss: 3.126
[4,    60] loss: 3.124
[4,    80] loss: 3.105
[4,   100] loss: 3.124
[4,   120] loss: 3.108
[4,   140] loss: 3.114
[4,   160] loss: 3.112
[4,   180] loss: 3.104
[5,    20] loss: 3.107
[5,    40] loss: 3.101
[5,    60] loss: 3.104
[5,    80] loss: 3.0

---STARTING NEW FOLD---
Starting training...
[1,    20] loss: 3.217
[1,    40] loss: 3.213
[2,    20] loss: 3.207
[2,    40] loss: 3.203
[3,    20] loss: 3.197
[3,    40] loss: 3.194
[4,    20] loss: 3.189
[4,    40] loss: 3.184
[5,    20] loss: 3.180
[5,    40] loss: 3.177
---STARTING NEW FOLD---
Starting training...
[1,    20] loss: 3.217
[1,    40] loss: 3.213
[2,    20] loss: 3.208
[2,    40] loss: 3.203
[3,    20] loss: 3.197
[3,    40] loss: 3.194
[4,    20] loss: 3.188
[4,    40] loss: 3.184
[5,    20] loss: 3.180
[5,    40] loss: 3.176
---STARTING NEW FOLD---
Starting training...
[1,    20] loss: 3.217
[1,    40] loss: 3.212
[2,    20] loss: 3.208
[2,    40] loss: 3.203
[3,    20] loss: 3.197
[3,    40] loss: 3.194
[4,    20] loss: 3.189
[4,    40] loss: 3.184
[5,    20] loss: 3.178
[5,    40] loss: 3.178
---STARTING NEW FOLD---
Starting training...
[1,    20] loss: 3.217
[1,    40] loss: 3.213
[2,    20] loss: 3.208
[2,    40] loss: 3.203
[3,    20] loss: 3.197
[3,    40] loss

In [None]:
# define paths to training and testing folders, and create loaders using optimal batch size

trainpath = "/projectnb/dl523/students/kjv/EC520_Project/Data/wikipaintings_full/wikipaintings_train"
train_set = datasets.ImageFolder(trainpath, transform=transform)
trainloader = torch.utils.data.DataLoader(train_set, batch_size=optimal_batch_size, shuffle = True)

# initialize test dataset

testpath = "/projectnb/dl523/students/kjv/EC520_Project/Data/wikipaintings_full/wikipaintings_test"

testset = datasets.ImageFolder(testpath, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=optimal_batch_size)

In [None]:
# get random training images to verify import worked
dataiter = iter(trainloader)
images, labels = dataiter.next()

print(images.shape)

# show images
def imshow(img):
    img = img * our_ViT.default_cfg['std'][0] + our_ViT.default_cfg['mean'][0]  # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))

imshow(torchvision.utils.make_grid(images))

# show labels
print("Class: ", labels)

In [None]:
# call finetune_ViT on our_ViT using optimal learning rate
optimizer = optim.SGD(filter(lambda layer: layer.requires_grad, our_ViT.parameters()), lr=optimal_lr, momentum = .9)

# set ViT to initial untrained weights
our_ViT.load_state_dict(torch.load("/projectnb/dl523/students/kjv/EC520_Project/ViT/Saved_Models/untrained_ViT.pth"))

# train the ViT
finetune_ViT(our_ViT, trainloader=trainloader, optimizer=optimizer, criterion=criterion, num_epochs=num_epochs)

# save final model
torch.save(our_ViT.state_dict(), "/projectnb/dl523/students/kjv/EC520_Project/ViT/Saved_Models/v1.pth")

In [None]:
# test finetuned ViT
acc = test_ViT(our_ViT, testloader)

print(f"Results: {acc}% accuracy")

In [None]:
# Print top categories per image (need to put into for loop if we want to use)

#top5_prob, top5_catid = torch.topk(probabilities, 5)
#for i in range(top5_prob.size(0)):
#    print(categories[top5_catid[i]], top5_prob[i].item())