In [1]:
import os
import json

In [2]:
image_dir = r'E:\VQA\floodnet\Images'
training_questions_path = r'E:\VQA\floodnet\Questions\Training Question.json'
validation_questions_path = r'E:\VQA\floodnet\Questions\Valid Question.json'
test_questions_path = r'E:\VQA\floodnet\Questions\Test_Question.json'

def load_questions_and_images(json_file_path, image_dir, split='Train_Image'):
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    
    questions = []
    answers = []
    image_paths = []
    question_types = []

    for item in data:
        questions.append(data[item]['Question'])
        answers.append(data[item]['Ground_Truth'])
        question_types.append(data[item]['Question_Type'])
        image_paths.append(os.path.join(image_dir, split, data[item]['Image_ID']))

    return questions, answers, question_types, image_paths

In [3]:
train_questions, train_answers, Question_Types, train_image_paths = load_questions_and_images(training_questions_path, image_dir)

print("Training Questions:", train_questions[:5])
print("Training Answers:", train_answers[:5])
print("Training Question Types:", Question_Types[:5])
print("Training Image Paths:", train_image_paths[:5])

Training Questions: ['What is the overall condition of the given image?', 'What is the overall condition of the given image?', 'What is the overall condition of the given image?', 'How many non flooded buildings can be seen in this image?', 'How many buildings can be seen in the image?']
Training Answers: ['flooded', 'flooded', 'non flooded', 3, 3]
Training Question Types: ['Condition_Recognition', 'Condition_Recognition', 'Condition_Recognition', 'Complex_Counting', 'Simple_Counting']
Training Image Paths: ['E:\\VQA\\floodnet\\Images\\Train_Image\\10165.JPG', 'E:\\VQA\\floodnet\\Images\\Train_Image\\10166.JPG', 'E:\\VQA\\floodnet\\Images\\Train_Image\\10168.JPG', 'E:\\VQA\\floodnet\\Images\\Train_Image\\10168.JPG', 'E:\\VQA\\floodnet\\Images\\Train_Image\\10168.JPG']


In [4]:
import pandas as pd

train_df = pd.DataFrame({
    'Question': train_questions,
    'Answer': train_answers,
    'Question_Type': Question_Types,
    'Image_Path': train_image_paths
})

train_df.head()

Unnamed: 0,Question,Answer,Question_Type,Image_Path
0,What is the overall condition of the given image?,flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10165.JPG
1,What is the overall condition of the given image?,flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10166.JPG
2,What is the overall condition of the given image?,non flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10168.JPG
3,How many non flooded buildings can be seen in ...,3,Complex_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG
4,How many buildings can be seen in the image?,3,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG


In [5]:
train_df['Question_Type'].value_counts()

Question_Type
Condition_Recognition    2315
Yes_No                    867
Complex_Counting          693
Simple_Counting           636
Name: count, dtype: int64

In [6]:
train_df = train_df[train_df['Question_Type'] == 'Simple_Counting']

In [7]:
train_df.head()

Unnamed: 0,Question,Answer,Question_Type,Image_Path
4,How many buildings can be seen in the image?,3,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG
6,How many buildings can be seen in this image?,4,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10170.JPG
13,How many buildings are in this image?,4,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10171.JPG
19,How many buildings can be seen in this image?,7,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10172.JPG
23,How many buildings can be seen in the image?,1,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10175.JPG


In [8]:
print("Number of Simple Counting Questions:", len(train_df))

Number of Simple Counting Questions: 636


In [19]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision.transforms import transforms

class FloodNetVQADataset(Dataset):
    def __init__(self, dataframe, transform=None):
        """
        Args:
            dataframe (DataFrame): Pandas DataFrame containing the data.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_path = self.dataframe.iloc[idx, 3] 
        image = Image.open(img_path).convert('RGB')
        question = self.dataframe.iloc[idx, 0]  
        answer = self.dataframe.iloc[idx, 1]

        if self.transform:
            image = self.transform(image)

        sample = {'image': image, 'question': question, 'answer': answer}

        return sample

In [20]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = FloodNetVQADataset(train_df, transform=transform)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)

In [22]:
for i, batch in enumerate(data_loader):
    images = batch['image']
    questions = batch['question']
    answers = batch['answer'].float()

    print("Batch:", i)
    print("Images Shape:", images.shape)
    print("Questions Shape:", questions)
    print("Answers Shape:", answers)
    break

Batch: 0
Images Shape: torch.Size([4, 3, 224, 224])
Questions Shape: ['How many buildings are in this image?', 'How many buildings are in this image?', 'How many buildings can be seen in this image?', 'How many buildings are in the image?']
Answers Shape: tensor([1., 3., 5., 2.])


In [27]:
import torch
from torchvision.models import resnet50

model = resnet50(pretrained=True)

def extract_features(images): 
    with torch.no_grad():
        features = model(images)
    return features



In [10]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def text_features(question):
    inputs = tokenizer(question, return_tensors='pt')
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
import torch.nn as nn
import torch

class VQAModel(nn.Module):
    def __init__(self):
        super(VQAModel, self).__init__()
        self.fc1 = nn.Linear(2048 + 768, 512) 
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 1)  

    def forward(self, img_features, text_features):
        combined_features = torch.cat((img_features, text_features), dim=1)
        x = self.fc1(combined_features)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [15]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [29]:
num_epochs = 2

for epoch in range(num_epochs):
    for i, batch in enumerate(data_loader):
        images = batch['image']
        questions = batch['question']
        answers = batch['answer'].float()  

        img_features = extract_features(images) 
        ques_features = text_features(questions)  
        counts = answers.view(-1, 1).type(torch.float)  

        optimizer.zero_grad()

        outputs = model(img_features, ques_features)
        loss = criterion(outputs, counts)

        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).