<a href="https://colab.research.google.com/github/AlbertoPaM/VQA-CLIP/blob/main/VQA_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install transformers

In [2]:
import torch
import numpy as np
import pandas as pd
import transformers
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from PIL import Image
from torch import nn
from tqdm import tqdm

In [3]:
class VQA_Dataset(Dataset):
    def __init__(self,df,vocab,is_test=False,gen_path=''):
        self.df = df
        self.vocab = vocab
        self.is_test = is_test
        self.gen_path = gen_path
        
    def __len__(self):
        return len(self.df)-1
    
    def __getitem__(self,idx):
        answer = self.df['answer'][idx].split(',')[0]
        answer = self.vocab.index(answer)
        quetion = self.df['question'][idx]
        path = self.gen_path + '/' + self.df['image_id'][idx] +'.png'
        image = Image.open(path)
        if not self.is_test:
            return image,quetion,answer
        else:
            return image,quetion

In [4]:
from transformers import CLIPProcessor, CLIPVisionModel

class Visual_Encoder(nn.Module):
    def __init__(self):
        super(Visual_Encoder,self).__init__()
        self.model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch16")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
        
    def forward(self,image,device='cuda'):
        image = self.processor(images=image,return_tensors="pt")['pixel_values']
        return self.model(pixel_values=image.to(device)).pooler_output

In [5]:
from transformers import AutoTokenizer, RobertaModel
class Text_Encoder(nn.Module):
    def __init__(self):
        super(Text_Encoder,self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        self.model = RobertaModel.from_pretrained("roberta-base")
    
    def forward(self,text,device='cuda'):
        text = self.tokenizer(text, return_tensors="pt")
        return self.model(**text.to(device)).pooler_output

In [6]:
class Classifier(nn.Module):
    def __init__(self,input_size=768*2,output_size=582):
        super(Classifier,self).__init__()
        self.lstm = nn.LSTM(input_size,1024)
        self.dropout = nn.Dropout(0.35)
        self.fc1 = nn.Linear(1024,output_size)
    
    def forward(self,x):
        x, _ = self.lstm(x)
        x = self.dropout(x)
        x = self.fc1(x)
        return x

In [7]:
class VQA_Model(nn.Module):
    def __init__(self):
        super(VQA_Model,self).__init__()
        self.visual_encoder = Visual_Encoder()
        self.textual_encoder = Text_Encoder()
        self.classifier = Classifier()
    
    def forward(self,image,answer,device='cuda'):
        text_out = self.textual_encoder(answer).to(device)
        image_out = self.visual_encoder(image).to(device)
        x = torch.cat((text_out,image_out),dim=1)
        x = self.classifier(x)
        return x
    def freeze(self,visual=True,textual=False,clas=False):
        if visual:
            for n,p in self.visual_encoder.named_parameters():
                p.requires_grad = False
        if textual:
            for n,p in self.textual_encoder.named_parameters():
                p.requires_grad = False
        if clas:
            for n,p in self.classifier.named_parameters():
                p.requires_grad = False

In [8]:
def train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch, device,verbose=False):
    model.train()
    running_loss = 0
    prog_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
    for batch, (image,quetion,answer) in prog_bar:
        if batch > 9972:
            break
        optimizer.zero_grad()
        answer = torch.tensor([answer]).to(device)
        preds = model(image,quetion)
        loss = loss_fn(preds, answer)
        loss.backward()
        optimizer.step()
        loss_item = loss.item()
        running_loss += loss_item
        prog_bar.set_description(f"loss: {loss_item:.4f}")
        if verbose == True and batch % 20 == 0:
            print(f"Batch: {batch}, Loss: {loss_item}")
    
    avg_loss = running_loss / len(train_dataloader)
    
    return avg_loss

In [9]:
@torch.no_grad()
def valid_one_epoch(model, valid_dataloader, loss_fn, epoch, device, log_wandb=True, verbose=False):
    model.eval()
    running_loss = 0
    prog_bar = tqdm(enumerate(valid_dataloader), total=len(valid_dataloader))
    for batch, (image,quetion,answer) in prog_bar:
        if batch >= 2492:
            break
        answer = torch.tensor([answer]).to(device)
        preds = model(image,quetion)
        loss = loss_fn(preds, answer)
        
        loss_item = loss.item()
        running_loss += loss_item
        
        prog_bar.set_description(f"val_loss: {loss_item:.4f}")        
        if verbose == True and batch % 10 == 0:
            print(f"Batch: {batch}, Loss: {loss_item}")
    
    avg_val_loss = running_loss / len(valid_dataloader)
    
    return avg_val_loss

In [10]:
train_df = pd.read_csv('/content/drive/MyDrive/Tesis/CLIP/dataset/data_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Tesis/CLIP/dataset/data_eval.csv')
gen_path = '/content/drive/MyDrive/Tesis/CLIP/dataset/images'
with open('/content/drive/MyDrive/Tesis/CLIP/dataset/answer_space.txt') as f:
    vocab = f.read().splitlines()

In [11]:
trainset = VQA_Dataset(train_df,vocab,gen_path=gen_path)
testset = VQA_Dataset(test_df,vocab,gen_path=gen_path)
#train_loader = DataLoader(trainset,batch_size=8, shuffle=True)
#test_loader = DataLoader(trainset,batch_size=8, shuffle=True)

In [12]:
model = VQA_Model()
model = model.to('cuda')

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of the model checkpoint at openai/clip-vit-base-patch16 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_mode

Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
optimizer = transformers.AdamW(model.parameters(),lr=1e-5)
criterion = nn.CrossEntropyLoss()



In [14]:
model.freeze(visual=True,textual=True)

In [15]:
len(train_df)

9974

In [16]:
for epoch in range(5):
    train_loss = train_one_epoch(model, trainset, optimizer, criterion, epoch=epoch, device='cuda')
    print(f'train_loss - {train_loss}')
    valid_loss = valid_one_epoch(model, testset, criterion, epoch=epoch, device='cuda')
    print(f'valid_loss - {valid_loss}')
    torch.save(model.state_dict(),'/content/drive/MyDrive/Tesis/CLIP/modelVQA.pth')

loss: 4.9483: 100%|██████████| 9973/9973 [32:46<00:00,  5.07it/s]


train_loss - 5.088579839609648


val_loss: 5.2937: 100%|█████████▉| 2492/2493 [02:15<00:00, 18.41it/s]


valid_loss - 4.86863976781549


loss: 4.4521: 100%|██████████| 9973/9973 [09:39<00:00, 17.21it/s]


train_loss - 4.699060420041862


val_loss: 4.8592: 100%|█████████▉| 2492/2493 [02:08<00:00, 19.44it/s]


valid_loss - 4.720833508546602


loss: 3.8020: 100%|██████████| 9973/9973 [09:36<00:00, 17.31it/s]


train_loss - 4.515851901214173


val_loss: 4.3562: 100%|█████████▉| 2492/2493 [02:10<00:00, 19.13it/s]


valid_loss - 4.629041238716698


loss: 3.5401: 100%|██████████| 9973/9973 [09:41<00:00, 17.14it/s]


train_loss - 4.391164717279321


val_loss: 4.1347: 100%|█████████▉| 2492/2493 [02:14<00:00, 18.48it/s]


valid_loss - 4.575279909302994


loss: 3.3988: 100%|██████████| 9973/9973 [09:50<00:00, 16.88it/s]


train_loss - 4.293799960361038


val_loss: 3.9770: 100%|█████████▉| 2492/2493 [02:19<00:00, 17.85it/s]


valid_loss - 4.5291110936102505


In [17]:
valid_loss = valid_one_epoch(model, testset, criterion, epoch=epoch, device='cuda')

val_loss: 3.9770: 100%|█████████▉| 2492/2493 [02:23<00:00, 17.36it/s]


In [18]:
valid_loss

4.5291110936102505

In [19]:
preds = []
gt = []
prog_bar = tqdm(enumerate(testset), total=len(testset))
for batch, (image,quetion,answer) in prog_bar:
    if batch >= 2492:
        break
    answer = torch.tensor([answer]).to('cuda')
    gt += [answer]
    preds += [model(image,quetion).argmax(dim=-1).to('cpu').flatten().numpy()]

100%|█████████▉| 2492/2493 [02:08<00:00, 19.38it/s]


In [20]:
ggt = []
for i in gt:
    ggt += i.tolist()

In [21]:
ggt

[387,
 483,
 512,
 333,
 51,
 352,
 39,
 14,
 510,
 133,
 483,
 51,
 149,
 256,
 0,
 88,
 150,
 251,
 419,
 150,
 483,
 236,
 483,
 0,
 414,
 352,
 236,
 14,
 58,
 416,
 14,
 377,
 493,
 467,
 43,
 53,
 157,
 564,
 114,
 176,
 497,
 106,
 117,
 136,
 483,
 283,
 512,
 156,
 175,
 510,
 381,
 10,
 404,
 333,
 19,
 536,
 447,
 56,
 176,
 377,
 387,
 43,
 333,
 174,
 376,
 564,
 16,
 61,
 498,
 422,
 568,
 43,
 383,
 414,
 419,
 483,
 18,
 182,
 13,
 227,
 387,
 551,
 14,
 81,
 359,
 383,
 70,
 0,
 156,
 81,
 483,
 1,
 10,
 414,
 55,
 41,
 414,
 551,
 10,
 107,
 483,
 497,
 15,
 56,
 564,
 10,
 251,
 476,
 63,
 416,
 56,
 320,
 377,
 62,
 0,
 17,
 251,
 106,
 377,
 452,
 332,
 79,
 55,
 204,
 410,
 505,
 16,
 10,
 260,
 531,
 388,
 0,
 32,
 568,
 379,
 308,
 10,
 288,
 498,
 176,
 106,
 61,
 551,
 55,
 10,
 106,
 527,
 544,
 501,
 58,
 564,
 445,
 544,
 499,
 76,
 422,
 404,
 10,
 0,
 568,
 332,
 521,
 344,
 18,
 114,
 236,
 508,
 568,
 61,
 383,
 469,
 302,
 483,
 152,
 308,
 383,
 337,


In [22]:
from sklearn.metrics import accuracy_score, f1_score
f1_score(ggt, pp, average='weighted'), accuracy_score(ggt,pp)

NameError: ignored