In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import CLIPVisionModel

import torch
import numpy as np
import pandas as pd
import PIL.Image

from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import typing as tp

import albumentations as albu
from albumentations.pytorch.transforms import ToTensorV2

In [2]:
DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
LLM_NAME: str = 'gpt2'
VIT_NAME: str = 'openai/clip-vit-large-patch14'

VIT_EMB_SIZE = 1000
LLM_INP_EMB_SIZE = 768

DATA_PATH: str = "./"
BATCH_SIZE = 16

In [57]:
llm = GPT2LMHeadModel.from_pretrained(LLM_NAME).to(DEVICE)
tokenizer = GPT2Tokenizer.from_pretrained(LLM_NAME, pad_token="<|PAD|>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
vit = CLIPVisionModel.from_pretrained(VIT_NAME).to(DEVICE)

Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.5.s

In [5]:
data = pd.read_csv(DATA_PATH+"train"+"_dataframe.csv")

In [58]:
class CLEVRDataset(Dataset):
    def __init__(
        self,
        tokenizer: GPT2Tokenizer,
        data_type: str = "train",
        transform: tp.Optional[albu.Compose | None] = None
) -> None:
        super().__init__()
        self.data_type = data_type
        self.data = pd.read_csv(DATA_PATH+data_type+"_dataframe.csv")
        self.transform = transform
        self.tokenizer = tokenizer
        
        self.max_question_length = max([
            len(ques) for ques in self.data["Question"]
        ])
        self.max_answer_length = max([
            len(ans) for ans in self.data["Answer"]
        ])
    
    def get_batch(self, batch_size: int) -> tp.Tuple:
        rand_idxes = np.random.randint(0, len(self.data), batch_size)
        
        img_paths = self.data.loc[rand_idxes, "Path"].to_list()
        questions = self.data.loc[rand_idxes, "Question"].to_list()
        answers = self.data.loc[rand_idxes, "Answer"].to_list()
        
        imgs = torch.cat([
            self._get_image(img_path)[None, ...] for img_path in img_paths
        ])
        
        questions_tokens = self.tokenizer(questions, return_tensors="pt", padding=True)
        answers_tokens = self.tokenizer(answers, return_tensors="pt", padding=True)
        
        return (imgs, questions_tokens, answers_tokens)
    
    def _get_image(self, image_path: str) -> torch.Tensor:
        img = np.array(PIL.Image.open(image_path).convert("RGB"))
        if self.transform is None:
            raise ValueError("Transformation must be at least ToTensor, but None recieved")
        img = self.transform(image=img)["image"].float()
        return img
    
    def __getitem__(self, index) -> tp.Tuple:
        img_path, question, answer = data.iloc[index]
        
        img = self._get_image(img_path)
        question_tokens = self.tokenizer(question, return_tensors="pt")
        answer_tokens = self.tokenizer(answer, return_tensors="pt")
    
        return (
            img,
            question_tokens,
            answer_tokens
        )
        
    def __len__(self):
        return len(self.data)

In [61]:
CLEVRDataset(
    tokenizer=tokenizer,
    data_type="train",
    transform=transform
).get_batch(3)

(tensor([[[[105., 105., 105.,  ..., 102., 102., 102.],
           [105., 105., 105.,  ..., 102., 102., 103.],
           [105., 105., 105.,  ..., 103., 101., 101.],
           ...,
           [120., 121., 120.,  ..., 150., 149., 148.],
           [119., 120., 120.,  ..., 150., 149., 149.],
           [120., 120., 120.,  ..., 151., 149., 149.]],
 
          [[105., 105., 105.,  ..., 102., 102., 102.],
           [105., 105., 104.,  ..., 102., 102., 103.],
           [104., 105., 105.,  ..., 103., 101., 101.],
           ...,
           [119., 120., 119.,  ..., 147., 146., 145.],
           [119., 119., 119.,  ..., 147., 146., 147.],
           [119., 119., 119.,  ..., 148., 147., 147.]],
 
          [[105., 104., 104.,  ..., 102., 102., 101.],
           [104., 105., 104.,  ..., 102., 102., 103.],
           [104., 105., 105.,  ..., 103., 101., 101.],
           ...,
           [118., 118., 118.,  ..., 143., 142., 141.],
           [117., 118., 118.,  ..., 142., 142., 142.],
           

In [44]:
transform = albu.Compose([
    albu.Resize(
        height=224, 
        width=224
    ),
    ToTensorV2()
])

train_dataset = CLEVRDataset(
    tokenizer=tokenizer,
    data_type="train",
    transform=transform
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

In [57]:
for imgs, qs, anss in train_dataloader:
    print(imgs.size())
    print(qs["input_ids"].size())
    print(anss["input_ids"].size())
    
    with torch.no_grad():
        vit_output = vit(imgs.to(DEVICE))
        llm_output = llm(**qs.to(DEVICE), output_hidden_states =True)
    
    break

ValueError: 203 is not a valid PaddingStrategy, please select one of ['longest', 'max_length', 'do_not_pad']

In [38]:
llm_output["hidden_states"][-1][:, 0, -2, :]

tensor([[-0.5499,  0.3904, -1.2417,  ...,  0.0848,  0.1040, -0.3367],
        [-0.5499,  0.3904, -1.2417,  ...,  0.0848,  0.1040, -0.3367],
        [-0.5499,  0.3904, -1.2417,  ...,  0.0848,  0.1040, -0.3367],
        ...,
        [-0.5499,  0.3904, -1.2417,  ...,  0.0848,  0.1040, -0.3367],
        [-0.5499,  0.3904, -1.2417,  ...,  0.0848,  0.1040, -0.3367],
        [-0.5499,  0.3904, -1.2417,  ...,  0.0848,  0.1040, -0.3367]],
       device='cuda:0')

In [17]:
class MegaModel(nn.Module):
    def __init__(
        self
    ) -> None:
        self.tokenizer = GPT2Tokenizer.from_pretrained(LLM_NAME)
        self.llm = GPT2LMHeadModel.from_pretrained(LLM_NAME).to(DEVICE)
        self.vit = CLIPVisionModel.from_pretrained(VIT_NAME).to(DEVICE)
        
        self.vit2token = nn.Linear()
        
    def forward(
        self,
        img,
        question
    ):
        

SyntaxError: incomplete input (1536146756.py, line 10)

In [9]:
# generated = llm.generate(**(q.to(DEVICE)), max_length=40)
# tokenizer.decode(generated[0].cpu().tolist())