In [31]:
import os
from glob import glob

import transformers
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

import torchvision
from skimage import io
import cv2

import copy, json

In [70]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [66]:
class ProcessQuestions:
    def __init__(self, ques_list):
        self.ques_list = ques_list
        
    def get_questions(self):
        
        for i, q in enumerate(self.ques_list):
            question_type = q['question_type']
            question = q['question']
            
            if question_type == "descriptive":
                pass
            elif question_type == "explanatory":
                pass
            elif question_type == "predictive":
                pass
            elif question_type == "counterfactual":
                pass 
            else:
                pass
        
        
        return # Tokenized Question answer Pairs
        
        
class CLEVRERDataset(Dataset):
    
    def __init__(self, data_dir, frame_dir, img_transform=None):
        # TODO load annotations
        assert os.path.isdir(data_dir)
        assert os.path.isdir(frame_dir)
        
        with open(os.path.join(data_dir, data_dir.split("/")[-1] + ".json"), "r") as f:
            self.json_data = json.load(f)
        self.frame_dir = frame_dir
        
        self.img_transform = img_transform     
        
    
    def __len__(self):
        # get length from directory
        return len(self.json_data)
    
    def __getitem__(self, idx):
        """
        TODO: 
        1. Change here hardcoded path in frame_paths to os.path.join(self.frame_dir, f"sim_{vid_id}", "*.png")
        2. Check normalization mean and std values used in image transform
        3. Add tokenized questions + concatinate options (where applicable) and answer token
        """
        
        vid_json = self.json_data[idx]
        vid_id = vid_json['scene_index']
        frame_paths = glob(os.path.join("../../../CLEVRER/frames", "sim_" + "00005", "*.png"))
        frames = torch.cat([self.img_transform(io.imread(img)).unsqueeze(0) for img in frame_paths])        
        
        process_questions = ProcessQuestions(vid_json['questions'])
        
        return {'frames': frames}

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-cased')
img_transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                torchvision.transforms.Normalize([0.485, 0.456, 0.406],
                                                                     [0.229, 0.224, 0.225])])
train_dataset = CLEVRERDataset(data_dir="../../../data/train", frame_dir="../../../CLEVRER/frames", img_transform=img_transform)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4)
for i, data in enumerate(train_loader):
    frames = data['frames'].squeeze() #torch.Size([128, 3, 320, 480]) 
#     print(frames.shape)

In [126]:
class PositionalEmbedding(nn.Module):

    def __init__(self, dim_y, dim_x, max_len=300, p=0.2):
        super().__init__()
        
        # Compute the positional encodings once in log space.
        self.pe = torch.zeros(max_len, dim_y, dim_x)

        pos = torch.arange(0,max_len).unsqueeze(1).unsqueeze(2)

        div_term_x = torch.exp(torch.arange(0, dim_x, 2).expand((dim_y//2,dim_x//2)) * -(np.log(10000.0) / dim_x))
        div_term_y = torch.exp(torch.arange(0, dim_y, 2).unsqueeze(1).expand((dim_y//2,dim_x//2)) * -(np.log(10000.0) / dim_y))

        self.pe[:, 0::2, 0::2] = (torch.sin(pos * div_term_x) + torch.sin(pos * div_term_y))/2
        self.pe[:, 1::2, 1::2] = (torch.cos(pos * div_term_x) + torch.cos(pos * div_term_y))/2
        self.pe = self.pe.unsqueeze(0).repeat(3,1,1,1).transpose(0,1).reshape((3*max_len,dim_y,dim_x))
        # assert((self.pe[0] == self.pe[1]) & (self.pe[1] == self.pe[2])).all()
        # self.pe = self.pe.unsqueeze(0)
        # self.register_buffer("pe", self.pe)

        # self.dropout = nn.Dropout(p)

    def forward(self, x):
        print(x.shape)
        print(self.pe.shape)
        return x+self.pe[:x.shape[1],:,:].requires_grad_(False)

In [126]:
class BertCNNModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.cnn = torchvision.models.resnet50(pretrained=True)
        self.bert = transformers.BertModel.from_pretrained('bert-base-cased')
        
        self.pos_emb = PositionalEmbedding(320, 480)
        
    def forward(self, frames, tokens):
        
        bert_output = self.bert(**tokens)
        cnn_output = self.cnn(self.pos_emb(frames))
        
        # feature vector - 1768-dimensional
        features = torch.hstack([cnn_output, bert_output.pooler_output])
        
        return features
 

In [133]:
class DescriptiveTaskHead(nn.Module):
	
	def __init__(self, n_classes=21, p=0.2):
		super().__init__()
		self.clf = nn.Sequential(
			nn.Linear(1768, 1024),
			nn.Dropout(p=0.2),
			nn.ReLU(),
			nn.Dropout(p=0.2),
			nn.Linear(1024, 1024),
			nn.Dropout(p=0.2),
			nn.ReLU(),
			nn.Linear(1024, n_classes)
		)

	def forward(self, features):
		# features: (b,1768)
		return self.clf(features)

class ExplanatoryTaskHead(nn.Module):
	
	def __init__(self, p=0.2):
		super().__init__()
		self.clf = nn.Sequential(
			nn.Linear(1768, 1024),
			nn.Dropout(p=0.2),
			nn.ReLU(),
			nn.Dropout(p=0.2),
			nn.Linear(1024, 1024),
			nn.Dropout(p=0.2),
			nn.ReLU(),
			nn.Linear(1024, 1),
            nn.Sigmoid()
		)

	def forward(self, features):
		# features: (b,1768)
		return self.clf(features).squeeze()

class PredictiveTaskHead(nn.Module):
	
	def __init__(self, n_classes=2, p=0.2):
		super().__init__()
		self.clf = nn.Sequential(
			nn.Linear(1768, 1024),
			nn.Dropout(p=0.2),
			nn.ReLU(),
			nn.Dropout(p=0.2),
			nn.Linear(1024, 1024),
			nn.Dropout(p=0.2),
			nn.ReLU(),
			nn.Linear(1024, n_classes)
		)

	def forward(self, features):
		# features: (b,1768)
		return self.clf(features)

class CounterfactualTaskHead(nn.Module):
	
	def __init__(self, p=0.2):
		super().__init__()
		self.clf = nn.Sequential(
			nn.Linear(1768, 1024),
			nn.Dropout(p=0.2),
			nn.ReLU(),
			nn.Dropout(p=0.2),
			nn.Linear(1024, 1024),
			nn.Dropout(p=0.2),
			nn.ReLU(),
			nn.Linear(1024, 1),
            nn.Sigmoid()
		)

	def forward(self, features):
		# features: (b,1768)
		return self.clf(features).squeeze()

In [134]:
options = ['0', '1', '2', '3', '4', '5', 'yes', 'no', 'rubber', 'metal', 'sphere', 'cube', 'cylinder', 'gray', 'brown', 'green', 'red', 'blue', 'purple', 'yellow', 'cyan']
option_id_map = {
    o:i for i,o in enumerate(options)
}

In [None]:
# TODO create dataloader


In [135]:
model = BertCNNModel()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [128]:
model = model.to(device)

In [39]:
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-cased')

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [129]:
model.pos_emb.pe = model.pos_emb.pe.to(device)

In [130]:
toks = tokenizer(['How many collisions happen in this frame?', 'What would happen if the blue ball disappeared?'], return_tensors='pt').to(device)
img = torch.randn((2,3,320,480)).to(device)
model(img, toks)

torch.Size([2, 3, 320, 480])
torch.Size([900, 320, 480])


tensor([[-0.8280, -0.1636, -0.5451,  ...,  0.9999, -0.9072,  0.9909],
        [-0.9929, -0.5160, -0.4730,  ...,  0.9999, -0.9261,  0.9942]],
       device='cuda:0', grad_fn=<CatBackward0>)

In [4]:
with open("../../../data/train/train.json", "r") as f:
    json_data = json.load(f)

list

In [33]:
img_transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                torchvision.transforms.Normalize((0.4914, 0.4822, 0.4465),
                                                                     (0.2023, 0.1994, 0.2010))])

In [34]:
vid_json = json_data[0]
vid_id = vid_json['scene_index']
frame_paths = glob(os.path.join("../../../CLEVRER/frames", "sim_" + "00005", "*.png"))
frames = [img_transform(io.imread(img)) for img in frame_paths]

In [38]:
frames[0].shape

torch.Size([3, 320, 480])