In [1]:
import torch
from transformers import BertModel, BertConfig
from transformers import BertTokenizer

In [2]:
import sys

# Make the parent directory importable so the project-local `dgs` package
# resolves. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if "../" not in sys.path:
    sys.path.append("../")

# Create the BERT tokenizer and the pretrained BERT model

In [3]:
# Tokenizer for the 'bert-base-uncased' checkpoint; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [4]:
# Two example sentences of different lengths, used as the text batch below.
text = [
    "Hello all! This is a test for the BERT model!",
    "First time with HuggingFace's Bert",
]
text

['Hello all! This is a test for the BERT model!',
 "First time with HuggingFace's Bert"]

In [5]:
# Tokenize the whole batch in one call: add the special tokens, truncate
# over-long inputs, pad the shorter sentence, and return PyTorch tensors
# together with the attention mask.
encoding = tokenizer(
    text,
    add_special_tokens=True,
    truncation=True,
    padding=True,
    return_attention_mask=True,
    return_tensors="pt",
)

In [6]:
# Start from the configuration published with the checkpoint instead of the
# library defaults, so the config is guaranteed to match the weights loaded
# from 'bert-base-uncased'; only `output_hidden_states` is overridden.
bert_config = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=False)

In [7]:
# Load the pretrained BERT weights using the configuration object `bert_config`.
bert_model = BertModel.from_pretrained(
    'bert-base-uncased',
    config=bert_config,
)

In [8]:
# Hidden size of the loaded BERT model; reused below as the ViT dimension and
# as `d_model` of both transformer configurations.
bert_model.config.hidden_size

768

In [9]:
# Inspect the tokenizer output: `input_ids`, `token_type_ids` and
# `attention_mask`. The shorter sentence is padded with zeros and masked out.
encoding

{'input_ids': tensor([[  101,  7592,  2035,   999,  2023,  2003,  1037,  3231,  2005,  1996,
         14324,  2944,   999,   102],
        [  101,  2034,  2051,  2007, 17662, 12172,  1005,  1055, 14324,   102,
             0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}

# Create the Vision Transformer

In [10]:
from dgs.blocks.vision.vit import Vit

In [11]:
# Side length of the square input images.
image_size = 256

# Give the ViT the same embedding size as BERT's hidden size so that the image
# and text features have matching dimensions.
vit_model = Vit(
    image_size=image_size,
    vit_dim=bert_model.config.hidden_size,
    classifier_type=None,
)

# Create the configurations for coattention_transformers and the output_transformer

In [12]:
# Parameters for the GeneralTransformerEncoderLayer, which takes the same
# arguments as nn.TransformerEncoderLayer.
coattention_transformer_config = dict(
    d_model=bert_model.config.hidden_size,
    nhead=8,
    dim_feedforward=1024,
    dropout=0.1,
    activation="gelu",
)

In [13]:
# Parameters for the output transformer; the same argument set as
# nn.TransformerEncoderLayer.
output_transformer_config = dict(
    d_model=bert_model.config.hidden_size,
    nhead=8,
    dim_feedforward=1024,
    dropout=0.1,
    activation="gelu",
)

# Create the TILBERT model

In [14]:
from dgs.models import TilBert

In [15]:
# Assemble TilBert from the ViT and BERT models and the two transformer
# configurations defined in the earlier cells.
tilbert = TilBert(
    vit_model=vit_model,
    bert_model=bert_model,
    coattention_transformer_config=coattention_transformer_config,
    output_transformer_config=output_transformer_config,
    num_of_combined_coattention_and_output_transformers=3,
    classifier_type="token",
    merge_mode="mul",
)

In [16]:
# Random stand-in images: one 3-channel, image_size x image_size tensor per
# sentence in `text`, so the image and text batches have the same length.
image_tensor = torch.randn((len(text), 3, image_size, image_size))
image_tensor.shape

torch.Size([2, 3, 256, 256])

In [17]:
# Shape of the token-id batch: (number of sentences, padded sequence length).
encoding["input_ids"].shape

torch.Size([2, 14])

In [18]:
# Run a single forward pass through TilBert with the random images and the
# tokenized text batch.
multimodular_output = tilbert(
    image=image_tensor,
    text_input_ids=encoding["input_ids"],
    text_attention_mask=encoding["attention_mask"],
    text_token_type_ids=encoding["token_type_ids"],
)

In [19]:
# Shape of the fused output: one vector per (image, sentence) pair in the batch.
multimodular_output.shape

torch.Size([2, 768])

In [20]:
from torch.utils.tensorboard import SummaryWriter

In [21]:
# Write the TilBert graph to ./tb_logs for TensorBoard, using the same four
# tensors as the forward pass.
# NOTE(review): the inputs are passed positionally here — confirm that this
# order matches the parameter order of TilBert.forward.
with SummaryWriter("./tb_logs") as writer:
    writer.add_graph(
        tilbert,
        [
            image_tensor,
            encoding["input_ids"],
            encoding["attention_mask"],
            encoding["token_type_ids"],
        ],
    )

  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors
  assert image_embedding.shape[1] == text_embedding.shape[1]
