# **Preprocess the dataset and stores tensors to Google Driver**

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
path = os.path.join('/content/drive/MyDrive/')

Mounted at /content/drive


##**Read data files** ##

Read dataframes from csv files

In [2]:
import pandas as pd
import os

path = '/content/drive/MyDrive/ViLT'
train_file = os.path.join(path, 'combined_train.csv')
val_file = os.path.join(path, 'combined_val.csv')
test_file = os.path.join(path, 'combined_test.csv')

train_df = pd.read_csv(train_file)
val_df = pd.read_csv(val_file)
test_df = pd.read_csv(test_file)

Read train, validation, test data

In [3]:
# !pip install lightning
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m46.1 MB/s[0m eta [36m0:00:0

Load pretrained ViLT model

In [4]:
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

Downloading (…)rocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

Preprocess the images and comments to encodings and stores them to the storage

In [5]:
import os
from torchvision.io import read_image
from torchvision.transforms import Resize
import torchvision
import numpy as np

def preprocess_df(dataframes):
  base_path = '/content/drive/MyDrive/ViLT'
  img_paths = dataframes['filename']
  comments = dataframes['clean_comments']
  labels = dataframes['scenic']
  img_resize = Resize((384, 512))
  proc_count = 0

  for idx in range(len(img_paths)):
    out_path = os.path.join(base_path, img_paths[idx]+'.pt')
    if not os.path.exists(out_path):

      img = img_resize(read_image(os.path.join(base_path, img_paths[idx]), mode=torchvision.io.ImageReadMode.RGB))
      comment = comments[idx]
      encoding = processor(img, comment, return_tensors='pt', padding='max_length', truncation=True)
      for key in encoding.keys():
        encoding[key] = torch.squeeze(encoding[key], 0)

      label = torch.nn.functional.one_hot(torch.tensor(labels[idx]), num_classes=2)
      output = [encoding, label]
      torch.save(output, out_path)
      # print(out_path)
      proc_count+=1
      if proc_count%500==0:
        print(f'Num processed : {proc_count}')

In [6]:
preprocess_df(train_df[:200])
preprocess_df(test_df[:40])
preprocess_df(val_df[:40])



##**Finetune the ViLT model** ##

crate dataloaders

In [7]:
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_image
from torchvision.transforms import Resize
import torchvision
import numpy as np

class ViltDataset(Dataset):
  def __init__(self, dataframes):
    self.base_path = '/content/drive/MyDrive/ViLT'
    self.img_paths = dataframes['filename']

  def __len__(self):
    return len(self.img_paths)

  def __getitem__(self, idx):
    encoding_path = os.path.join(self.base_path, self.img_paths[idx]+'.pt')
    encoding = torch.load(encoding_path)
    encoding, label = encoding

    return [encoding, label]

In [8]:
# create dataloaders
training_data = ViltDataset(train_df[:200])
test_data = ViltDataset(test_df[:40])
val_data = ViltDataset(val_df[:40])

In [9]:
train_dataloader = DataLoader(training_data, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=2, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=2, shuffle=True)

Define a classifier model

In [10]:
class ScenicClassifier(torch.nn.Module):
  def __init__(self, enc_features: int):
    super().__init__()
    self.embedding = model.vilt
    self.classifier = torch.nn.Sequential(
        torch.nn.Linear(enc_features, 32, bias=False),
        torch.nn.ReLU(),
        torch.nn.Linear(32, 2, bias=False),
        # torch.nn.ReLU()
        )

  def forward(self, encodings):
    embeds = self.embedding(**encodings)['pooler_output']
    logits = self.classifier(embeds)

    return logits

In [11]:
class ScenicClassifier2(torch.nn.Module):
  def __init__(self, enc_features: int):
    super().__init__()
    self.embedding = model.vilt
    self.classifier = torch.nn.Linear(enc_features, 2, bias=False)
    torch.nn.init.xavier_uniform(self.classifier.weight)

  def forward(self, encodings):
    embeds = self.embedding(**encodings)['pooler_output']
    logits = self.classifier(embeds)

    return logits

In [12]:
# device='cuda:0'
device='cpu'
classifier = ScenicClassifier(768)
# classifier = ScenicClassifier2(768)
classifier.to(device)

criterion = torch.nn.MSELoss()
criterion.to(device)
optimizer = torch.optim.SGD(classifier.parameters(), lr=0.001, momentum=0.8)

## Train the model

In [14]:
classifier.train()
for epoch in range(4):
  running_loss = 0.0
  for i, data in enumerate(train_dataloader, 0):
    encodings, labels = data
    encodings.to(device)
    labels.to(device)
    optimizer.zero_grad()
    outputs = classifier(encodings)
    labels = torch.tensor(labels, dtype=torch.float)#.cuda()
    # new_labels = []
    # for label in labels:
    #   new_labels.append(torch.nn.functional.one_hot(torch.squeeze(torch.tensor(label, dtype=torch.int)), num_classes=2))
    # new_labels = torch.tensor(new_labels)

    # print(outputs)
    # print(new_labels)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
    if i % 10 == 9:
      print(f'[{epoch+1}, {i+1:5d}] loss : {running_loss / 10:.3f}')
      running_loss=0.0

print('Finished training')

  labels = torch.tensor(labels, dtype=torch.float)#.cuda()


[1,    10] loss : 0.457
[1,    20] loss : 0.280


KeyboardInterrupt: ignored

In [16]:
!del classifier
!del optimizer

torch.cuda.empty_cache()
!nvidia-smi -caa
!nvidia-smi


/bin/bash: line 1: del: command not found
/bin/bash: line 1: del: command not found
Cleared Accounted PIDs for GPU 00000000:00:04.0.
All done.
Wed Aug 23 17:57:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    28W /  70W |  15037MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                         

## Test the result

In [None]:
correct = 0
total = 0

with torch.no_grad():
  for testdata in val_dataloader:
    encodings, labels = testdata
    encodings = encodings.to(device)
    output = classifier(encodings)
    _, predicted = torch.max(output.data, 1)
    total+= labels.size(0)
    # print(predicted)
    # print(labels)
    # print(torch.argmax(labels, 1))
    # break
    correct+=(predicted.to('cpu')==torch.argmax(labels.to('cpu'), 1)).sum().item()

print(f'Accuracy of the network : {100*correct//total}%')

Accuracy of the network : 93%




### TRASH BELOW ###



In [None]:
# text_embeds = model.vilt.embeddings.text_embeddings(encoding['input_ids'])
# print(encoding['input_ids'])
# encoding = encoding['input_ids', 'token_type_ids', 'pixel_values', 'pixel_mask']

# output = model(**encoding)
# print(output)
# text_emb = model.vilt.embeddings.text_embeddings(**encoding)
# print(output.keys())

In [None]:
# img_embedding = processor.image_processor(image)
# print(img_embedding)
# encoding = processor(image, text, return_tensors='pt')
# print(encoding.keys())