# **Preprocess the dataset and stores tensors to Google Driver**

In [13]:
from google.colab import drive
drive.mount('/content/drive')
import os
path = os.path.join('/content/drive/MyDrive/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##**Read data files** ##

Read dataframes from csv files

In [14]:
import pandas as pd
import os

path = '/content/drive/MyDrive/ViLT'
train_file = os.path.join(path, 'combined_train.csv')
val_file = os.path.join(path, 'combined_val.csv')
test_file = os.path.join(path, 'combined_test.csv')

train_df = pd.read_csv(train_file)
val_df = pd.read_csv(val_file)
test_df = pd.read_csv(test_file)

Read train, validation, test data

In [15]:
# !pip install lightning
!pip install transformers



Load pretrained ViLT model

In [16]:
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

Preprocess the images and comments to encodings and stores them to the storage

##**Finetune the ViLT model** ##

create dataloaders

In [17]:
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_image
from torchvision.transforms import Resize
import torchvision
import numpy as np

class ViltDataset(Dataset):
  def __init__(self, dataframes):
    self.base_path = '/content/drive/MyDrive/ViLT'
    self.img_paths = dataframes['filename']
    self.comments = dataframes['clean_comments']
    self.resize = Resize((384, 512))
    self.labels = dataframes['scenic']

  def __len__(self):
    return len(self.img_paths)

  def __getitem__(self, idx):
    img_path = self.img_paths[idx]
    img = self.resize(read_image(os.path.join(self.base_path, img_path), mode=torchvision.io.ImageReadMode.RGB))
    comment = self.comments[idx]
    encoding = processor(img, comment, return_tensors='pt', padding='max_length', truncation=True)
    label = torch.nn.functional.one_hot(torch.tensor(self.labels[idx]), num_classes=2)

    return [encoding, label]

In [18]:
# create dataloaders
training_data = ViltDataset(train_df[:])
test_data = ViltDataset(test_df[:])
val_data = ViltDataset(val_df[:])

In [19]:
train_dataloader = DataLoader(training_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=10, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=10, shuffle=True)

Define a classifier model

In [37]:
class ScenicClassifier(torch.nn.Module):
  def __init__(self, enc_features: int):
    super().__init__()
    self.embedding = model.vilt
    self.classifier = torch.nn.Sequential(
        torch.nn.Linear(enc_features, 64, bias=False),
        # torch.nn.ReLU(),
        torch.nn.Linear(64, 16, bias=False),
        torch.nn.Linear(16, 2, bias=False),
        # torch.nn.ReLU()
        )

  def forward(self, encodings):
    embeds = self.embedding(**encodings)['pooler_output']
    logits = self.classifier(embeds)

    return logits

In [21]:
class ScenicClassifier2(torch.nn.Module):
  def __init__(self, enc_features: int):
    super().__init__()
    self.embedding = model.vilt
    self.classifier = torch.nn.Linear(enc_features, 2, bias=False)
    torch.nn.init.xavier_uniform(self.classifier.weight)

  def forward(self, encodings):
    embeds = self.embedding(**encodings)['pooler_output']
    logits = self.classifier(embeds)

    return logits

In [38]:
device='cuda:0'
# device='cpu'
classifier = ScenicClassifier(768)
# classifier = ScenicClassifier2(768)
classifier.to(device)

criterion = torch.nn.MSELoss()
criterion.to(device)
# optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)
optimizer = torch.optim.SGD(classifier.parameters(), lr=0.0001, momentum=0.8)

## Train the model

In [39]:
classifier.train()
for epoch in range(4):
  running_loss = 0.0
  for i, data in enumerate(train_dataloader, 0):
    encodings, labels = data
    encodings.to(device)
    labels.to(device)
    for key in encodings.keys():
      encodings[key] = torch.squeeze(encodings[key], 1)
    optimizer.zero_grad()
    outputs = classifier(encodings)
    labels = torch.tensor(labels, dtype=torch.float).cuda()
    # new_labels = []
    # for label in labels:
    #   new_labels.append(torch.nn.functional.one_hot(torch.squeeze(torch.tensor(label, dtype=torch.int)), num_classes=2))
    # new_labels = torch.tensor(new_labels)

    # print(outputs)
    # print(new_labels)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
    if i % 10 == 9:
      print(f'[{epoch+1}, {i+1:5d}] loss : {running_loss / 10:.3f}')
      running_loss=0.0

print('Finished training')

  labels = torch.tensor(labels, dtype=torch.float).cuda()


[1,    10] loss : 0.497
[1,    20] loss : 0.417
[1,    30] loss : 0.365
[1,    40] loss : 0.297
[1,    50] loss : 0.255
[1,    60] loss : 0.217
[1,    70] loss : 0.188
[1,    80] loss : 0.163
[1,    90] loss : 0.144
[1,   100] loss : 0.128
[1,   110] loss : 0.111
[1,   120] loss : 0.102
[1,   130] loss : 0.092
[1,   140] loss : 0.087
[1,   150] loss : 0.093
[1,   160] loss : 0.088
[1,   170] loss : 0.083
[1,   180] loss : 0.085
[1,   190] loss : 0.092
[1,   200] loss : 0.079
[1,   210] loss : 0.072
[1,   220] loss : 0.075
[1,   230] loss : 0.089
[1,   240] loss : 0.086
[1,   250] loss : 0.086
[2,    10] loss : 0.072
[2,    20] loss : 0.067
[2,    30] loss : 0.077
[2,    40] loss : 0.084
[2,    50] loss : 0.067
[2,    60] loss : 0.079
[2,    70] loss : 0.073
[2,    80] loss : 0.070
[2,    90] loss : 0.075
[2,   100] loss : 0.065
[2,   110] loss : 0.076
[2,   120] loss : 0.079
[2,   130] loss : 0.064
[2,   140] loss : 0.079
[2,   150] loss : 0.074
[2,   160] loss : 0.069
[2,   170] loss 

## Test the result

In [52]:
from sklearn.metrics import f1_score

def f1_score_eval(dataloader, dataname):
  correct = 0
  total = 0
  predicts = []
  groundtruth = []
  with torch.no_grad():
    for testdata in dataloader:
      encodings, labels = testdata
      encodings = encodings.to(device)
      for key in encodings.keys():
        encodings[key] = torch.squeeze(encodings[key], 1)
      output = classifier(encodings)
      _, predicted = torch.max(output.data, 1)
      predicts.append(predicted.to('cpu'))
      total+= labels.size(0)
      groundtruth.append(torch.argmax(labels.to('cpu'), 1))
      correct+=(predicted.to('cpu')==torch.argmax(labels.to('cpu'), 1)).sum().item()

  print(f'Accuracy of the network on {dataname} images : {100*correct//total}%')
  print(predicts)
  print(groundtruth)
  f1 = f1_score(np.asarray(predicts), np.asarray(groundtruth))
  print(f'F1 Score of the network on {dataname} images: {f1 * 100}%')

In [53]:
f1_score_eval(val_dataloader, 'validation')
f1_score_eval(test_dataloader, 'test')



Accuracy of the network on validation images : 91%
[tensor([0, 0, 0, 0, 1, 0, 1, 1, 0, 0]), tensor([1, 0, 1, 0, 1, 1, 0, 0, 1, 0]), tensor([1, 1, 0, 0, 0, 1, 1, 1, 0, 0]), tensor([0, 0, 0, 1, 1, 0, 0, 0, 1, 0]), tensor([1, 0, 0, 1, 0, 1, 0, 1, 1, 1]), tensor([0, 0, 1, 1, 0, 1, 0, 0, 0, 1]), tensor([1, 1, 1, 1, 0, 1, 0, 0, 1, 0]), tensor([0, 1, 1, 1, 1, 0, 0, 0, 0, 0]), tensor([0, 0, 1, 1, 1, 1, 0, 0, 1, 1]), tensor([0, 1, 0, 1, 0, 0, 1, 0, 1, 1]), tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 1]), tensor([1, 0, 1, 0, 1, 0, 0, 0, 1, 1]), tensor([1, 0, 0, 1, 0, 1, 1, 1, 0, 1]), tensor([0, 1, 1, 1, 0, 0, 1, 0, 1, 1]), tensor([1, 0, 1, 0, 1, 1, 1, 1, 1, 0]), tensor([0, 1, 0, 0, 0, 0, 0, 0, 1, 0]), tensor([1, 1, 0, 1, 0, 0, 1, 0, 0, 1]), tensor([1, 1, 1, 0, 1, 0, 1, 1, 1, 0]), tensor([1, 1, 1, 0, 0, 0, 0, 0, 1, 1]), tensor([1, 0, 1, 0, 1, 1, 1, 0, 1, 0]), tensor([0, 0, 1, 0, 0, 0, 1, 1, 0, 1]), tensor([1, 0, 0, 1, 0, 1, 1, 1, 0, 1]), tensor([1, 0, 1, 1, 1, 0, 0, 0, 0, 1]), tensor([0, 0, 1, 1, 0, 1, 0,

  f1 = f1_score(np.asarray(predicts), np.asarray(groundtruth))
  f1 = f1_score(np.asarray(predicts), np.asarray(groundtruth))


ValueError: ignored

In [32]:
correct = 0
total = 0
predicts = []
groundtruth = []
with torch.no_grad():
  for testdata in val_dataloader:
    encodings, labels = testdata
    encodings = encodings.to(device)
    for key in encodings.keys():
      encodings[key] = torch.squeeze(encodings[key], 1)
    output = classifier(encodings)
    _, predicted = torch.max(output.data, 1)
    predicts.append(predicted)
    total+= labels.size(0)
    # print(predicted)
    # print(labels)
    # print(torch.argmax(labels, 1))
    # break
    groundtruth.append(labels.to('cpu'), 1))
    correct+=(predicted.to('cpu')==torch.argmax(labels.to('cpu'), 1)).sum().item()

print(f'Accuracy of the network : {100*correct//total}%')



Accuracy of the network : 91%


In [None]:
from sklearn.metrics import f1_score

f1 = f1_score(np.asarray(predicts), np.asarray(groundtruth), average='binary')
print(f'F1 Score of the network on validation images: {f1 * 100}%')

In [33]:
torch.cuda.empty_cache()
!nvidia-smi -caa
!nvidia-smi

Cleared Accounted PIDs for GPU 00000000:00:04.0.
All done.
Thu Aug 24 00:22:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    36W /  70W |   2093MiB / 15360MiB |     68%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+----------------------------



### TRASH BELOW ###



In [None]:
# text_embeds = model.vilt.embeddings.text_embeddings(encoding['input_ids'])
# print(encoding['input_ids'])
# encoding = encoding['input_ids', 'token_type_ids', 'pixel_values', 'pixel_mask']

# output = model(**encoding)
# print(output)
# text_emb = model.vilt.embeddings.text_embeddings(**encoding)
# print(output.keys())

In [None]:
# img_embedding = processor.image_processor(image)
# print(img_embedding)
# encoding = processor(image, text, return_tensors='pt')
# print(encoding.keys())