In [22]:
import argparse
import builtins
import os
import pickle
import datetime
import torch

from torch.utils.data import DataLoader, ConcatDataset
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

import numpy as np
import pandas as pd

from dataset import Dictionary
from dataset import load_tsv, _load_dataset_pvqa
# , VQAFeatureDataset, VisualGenomeFeatureDataset, Flickr30kFeatureDataset
from dataset import PVQAFeatureDataset, PretrainDataset, _load_dataset_pvqa
from dataset import question_types, get_q_type
from modeling import BanModel, instance_bce_with_logits, compute_score_with_logits
from finetune_main import adjust_learning_rate

from tqdm import tqdm
import utils
from dataset import tfidf_from_questions

## Helper functions

In [30]:
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '51243'
    
    
def adjust_learning_rate(optimizer, epoch, epochs, base_lr=0.01):
    """Apply a cosine-annealed learning rate to every parameter group.

    The rate is ``base_lr * 0.5 * (1 + cos(pi * epoch / epochs))``: it equals
    ``base_lr`` at ``epoch == 0`` and reaches 0 at ``epoch == epochs``.

    Note: this definition shadows the ``adjust_learning_rate`` imported from
    ``finetune_main`` at the top of the notebook.

    Args:
        optimizer: object exposing ``param_groups``, an iterable of dicts
            whose ``'lr'`` entry is overwritten in place.
        epoch: index of the current epoch.
        epochs: total number of epochs in the schedule (must be non-zero).
        base_lr: learning rate at epoch 0. Defaults to 0.01, the value that
            was previously hard-coded.
    """
    # The former step-schedule branch was unreachable (guarded by `if True`)
    # and referenced an undefined `args`, so only the cosine schedule remains.
    lr = base_lr * (0.5 * (1. + np.cos(np.pi * epoch / epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    
def train(train_loader: DataLoader, model, optimizer, epoch, arg_gpu = 0):
    """Run one training pass over ``train_loader`` and return the train score.

    For every batch the inputs are moved to GPU ``arg_gpu``, the model is run,
    the loss from ``instance_bce_with_logits`` is back-propagated, gradients
    are clipped to a norm of 0.25 and the optimizer takes a step.

    Args:
        train_loader: loader yielding ``(v, b, q, a)`` batches.
        model: called as ``model(v, b, q, a)``; must return a pair whose
            first element is the prediction (the second is ignored here).
        optimizer: optimizer stepping the model's parameters.
        epoch: current epoch index; accepted for interface compatibility but
            not used inside this function.
        arg_gpu: CUDA device index the batch tensors are moved to.

    Returns:
        The summed per-batch score divided by ``len(train_loader.dataset)``.
        The mean per-batch loss is printed together with the score.
    """
    model.train()
    grad_clip = .25
    total_loss = 0.0
    train_score = 0
    for (v, b, q, a) in tqdm(train_loader):
        v = v.cuda(arg_gpu)
        b = b.cuda(arg_gpu)
        q = q.cuda(arg_gpu)
        a = a.cuda(arg_gpu)

        pred, _ = model(v, b, q, a)
        loss = instance_bce_with_logits(pred, a)
        optimizer.zero_grad()
        loss.backward()

        # Clip in place. The returned norm used to be accumulated into
        # variables that were never read, so it is simply discarded now.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        total_loss += loss.item()

        optimizer.step()

        batch_score = compute_score_with_logits(pred, a.data).sum()
        train_score += batch_score.item()

    # Guard the averages so an empty loader/dataset reports zeros instead of
    # raising ZeroDivisionError.
    num_batches = len(train_loader)
    if num_batches:
        total_loss /= num_batches
    num_examples = len(train_loader.dataset)
    if num_examples:
        train_score /= num_examples
    print('total_loss=', total_loss, '; train_score=', train_score)
    return train_score

## Dictionary

In [2]:
# Location of the pickled PVQA dictionary file loaded in the next cell.
dict_path = 'data/pvqa/pvqa_dictionary.pkl'

In [3]:
# Load the dictionary from disk; the trailing expression displays its length.
dictionary = Dictionary.load_from_file(dict_path)
len(dictionary)

loading dictionary from data/pvqa/pvqa_dictionary.pkl


4631

## Answer candidates

In [4]:
# Root directory of the PVQA data, and the split name used to build the
# file names in the cells below.
dataroot = 'data/pvqa'
name = 'train'

In [5]:
# Lookup tables between answers and label ids (as the file names suggest),
# stored under the `qas` folder of the data root.
ans2label_path = os.path.join(dataroot, 'qas', 'trainval_ans2label.pkl')
label2ans_path = os.path.join(dataroot, 'qas', 'trainval_label2ans.pkl')

# Context managers close the files deterministically instead of leaking the
# handles. NOTE: pickle can execute arbitrary code on load; only use files
# from a trusted source.
with open(ans2label_path, 'rb') as fh:
    ans2label = pickle.load(fh)
with open(label2ans_path, 'rb') as fh:
    label2ans = pickle.load(fh)

print(len(ans2label))

4092


## Images

In [6]:
# Mapping stored in `<name>_img_id2idx.pkl` under the data root. The file is
# opened in a context manager so the handle is closed right after loading.
# NOTE: pickle can execute arbitrary code on load; only use trusted files.
with open(os.path.join(dataroot, '%s_img_id2idx.pkl' % name), 'rb') as fh:
    img_id2idx = pickle.load(fh)
len(img_id2idx)

3021

In [7]:
# Tab-separated file holding the bounding boxes and features of the images
FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes', 'features']

tsv_file = os.path.join(dataroot, 'images/{}{}.csv'.format(name, ''))

# Preview the raw table; the file has no header row, so column names are supplied.
pd.read_csv(tsv_file, sep='\t', names=FIELDNAMES)

Unnamed: 0,image_id,image_w,image_h,num_boxes,boxes,features
0,0,526,422,36,b'AAAAYP3wcEAAAABgIo5rQAAAAACzvHNAAAAAYHX2cEAA...,b'dk8pPwAAAAB0dbE/dJdRP7ItNT6cRQw+M/lbPwAAAADt...
1,1,492,357,36,b'AAAAADA7ekAAAAAArRFkQAAAAAB+yXxAAAAAoAoIakAA...,b'zZ1aPgAAAAA5K+A/IPgAQAeMlj6Iagw87Le0PmW7Bz34...
2,2,300,358,36,b'AAAA4DMhUUAAAACAu95OQAAAAKASFXJAAAAAwP/OdUAA...,b'ICXWPgAAAACMoIQ8AAAAAAAAAABUkgU+t7HNPwAAAABk...
3,3,350,576,36,b'AAAAoIKsJEAAAABgKBdOQAAAACCtTHRAAAAAADxUgUAA...,b'AAAAACmhfD8gPxQ+AAAAAJLhOD/AoCg9yGLfPwAAAAAA...
4,4,1340,742,36,b'AAAAYAJiYUAAAAAALpNSQAAAAOCj6pRAAAAA4KqghUAA...,b'AAAAADTl4EAAAAAAAAAAAAAAAACU+wg/hoONPgAAAAAR...
...,...,...,...,...,...,...
3016,3016,792,526,36,b'AAAAgPzTcEAAAADAzYFmQAAAAMB0lnNAAAAAAImcakAA...,b'0hSWPwAAAAA1IhA/hvqsP7F2kT8AAAAA6iWgPgAAAADU...
3017,3017,792,549,36,b'AAAAYBF4eEAAAAAABcVSQAAAAOB5q4JAAAAAAKxMYkAA...,b'AAAAAAAAAAAAAAAAFmDXPqBijj48T4s+AAAAAAAAAAAF...
3018,3018,792,528,36,b'AAAAoJpTTkAAAAAAAAAAAAAAACCJhIhAAAAAIL+xfEAA...,b'AAAAAAAAAAD0CfM9AAAAAAAAAABOsj1ARsQMQQAAAAA8...
3019,3019,792,528,36,b'AAAAoHQqNUAAAABgDgk4QAAAAEA6boNAAAAAYNhjeUAA...,b'AAAAAAAAAAD7Ajo+AAAAAAAAAAAJiiZAxzy6QAAAAABe...


In [8]:
# Parse the TSV with the project loader and show the array shapes of the
# `boxes` and `features` entries for the first image.
image_data = load_tsv(tsv_file)
print(image_data[0]['boxes'].shape, image_data[0]['features'].shape)

(36, 4) (36, 2048)


In [9]:
# Shape of the `features` arrays stacked over all images
print(np.array([datum['features'] for datum in image_data]).shape)

# Shape of the `boxes` arrays (spatial input) stacked over all images
print(np.array([datum['boxes'] for datum in image_data]).shape)

(3021, 36, 2048)
(3021, 36, 4)


### Input data

In [10]:
"""
vqa = pickle.load(open(os.path.join(dataroot, 'qas/%s_vqa.pkl' % name), 'rb'))
entry = {'question_id': qa['question_id'],
                 'image_id': qa['img_id'],
                 'image': imd_id2val[qa['img_id']],
                 'question': qa['sent'],
                 'answer': answer,
                 'ans_sent': ans,
                 'ans_valid': ans_valid}
"""

# Build the question/answer entries for the chosen split with the project
# loader, then display the first one as a sample.
entries = _load_dataset_pvqa(dataroot, name, img_id2idx, label2ans, ans2label)
entries[0]

{'question_id': 100001000,
 'image_id': 'train_0001',
 'image': 1,
 'question': 'What form a submucosal nodule composed of tumor cells embedded in dense fibrous tissue?',
 'answer': {'labels': [345], 'scores': [1]},
 'ans_sent': 'carcinoid tumors',
 'ans_valid': 1}

## Preparing the training set

In [11]:
# Feature datasets for the training and validation splits, both sharing the
# dictionary loaded above.
train_dset = PVQAFeatureDataset('train', dictionary, adaptive=False)
val_dset = PVQAFeatureDataset('val', dictionary, adaptive=False)

In [12]:
# Number of answer candidates exposed by the training dataset; it is passed
# to the model constructor further down.
train_dset.num_ans_candidates

4092

In [32]:
# Training loader: no custom sampler is configured, so the loader shuffles.
train_sampler = None
train_loader = DataLoader(
    train_dset,
    batch_size=128,
    shuffle=train_sampler is None,
    num_workers=0,
    pin_memory=True,
)

## Preparing BAN Model

In [33]:
# Instantiate the BAN model, sized from the training set: vocabulary size,
# number of answer candidates and visual feature dimension.
# NOTE(review): num_hid, op, gamma and qa_bl are passed through as-is; their
# meaning is defined in `modeling.BanModel` - confirm there before tuning.
model = BanModel(ntoken=train_dset.dictionary.ntoken,
                     num_ans_candidates=train_dset.num_ans_candidates,
                     num_hid=1280, v_dim=train_dset.v_dim,
                     op='c',
                     gamma=8, qa_bl=True)

In [34]:
# Embedding initialisation
#   Model: Glove300d
#   Each word is a vector of dim = 300
w_emb_path = 'data/pvqa/glove_pvqa_300d.npy'

# No TF-IDF matrix or extra weights are supplied for this run.
tfidf = weights = None

model.w_emb.init_embedding(w_emb_path, tfidf, weights)

weight_init.shape: torch.Size([4631, 300])
self ntoken:  4631


## Train

In [35]:
# Select the target GPU first, then move the model there exactly once.
# (Previously `model.cuda()` ran before `set_device`, copying the model to
# the current default device and then a second time to `arg_gpu`.)
arg_gpu = 0
torch.cuda.set_device(arg_gpu)
model.cuda(arg_gpu)

# Optimise only the trainable parameters. The optimizer's default learning
# rate is overwritten each epoch by adjust_learning_rate() in the train loop.
optimizer = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()))

In [36]:
best_eval_score = 0
EPOCHS = 5  # length of the cosine learning-rate schedule

# NOTE(review): only the first epoch of the EPOCHS-long schedule is run here
# (range(0, 1)); widen the range to train for the full schedule.
for epoch in range(0, 1):
    adjust_learning_rate(optimizer, epoch, EPOCHS)
    # Pass the configured device explicitly so the batches land on the same
    # GPU as the model instead of relying on train()'s default of 0.
    train_score = train(train_loader, model, optimizer, epoch, arg_gpu)
    print('train_score=', train_score)

100%|████████████████████████████████████████████████████████████████████████████████| 155/155 [00:35<00:00,  4.42it/s]

total_loss= 25.76991543616018 ; train_score= 0.2507719564667173
eval_score= 0.2507719564667173



