In [19]:
import os
import json

In [20]:
# Directory holding the image folders; a split name such as 'Train_Image' is
# appended to it when image paths are built.
image_dir = r'E:\VQA\floodnet\Images'

# Question/answer annotation files, one JSON document per split.
(
    training_questions_path,
    validation_questions_path,
    test_questions_path,
) = (
    r'E:\VQA\floodnet\Questions\Training Question.json',
    r'E:\VQA\floodnet\Questions\Valid Question.json',
    r'E:\VQA\floodnet\Questions\Test_Question.json',
)

def load_questions_and_images(json_file_path: str, image_dir: str, split: str = 'Train_Image') -> tuple:
    """Load one VQA annotation file into four parallel lists.

    The JSON document must be a mapping whose values are records carrying the
    keys ``'Question'``, ``'Ground_Truth'``, ``'Question_Type'`` and
    ``'Image_ID'``. Records are read in the order they appear in the file.

    Args:
        json_file_path: Path of the annotation JSON file to read.
        image_dir: Directory that contains the image folders.
        split: Sub-folder of ``image_dir`` the images live in. Defaults to
            ``'Train_Image'``.

    Returns:
        A tuple ``(questions, answers, question_types, image_paths)`` of
        equally long lists, where ``image_paths[i]`` is
        ``os.path.join(image_dir, split, <Image_ID of record i>)``.

    Raises:
        KeyError: If a record lacks one of the four expected keys.
    """
    # Pin the encoding: without it open() falls back to the platform locale
    # (e.g. cp1252 on Windows), which can mis-decode non-ASCII JSON text.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    questions = []
    answers = []
    image_paths = []
    question_types = []

    # The mapping keys are never used, so walk the records directly.
    for record in data.values():
        questions.append(record['Question'])
        answers.append(record['Ground_Truth'])
        question_types.append(record['Question_Type'])
        image_paths.append(os.path.join(image_dir, split, record['Image_ID']))

    return questions, answers, question_types, image_paths

In [21]:
# Read the training annotations into four parallel lists.
train_questions, train_answers, Question_Types, train_image_paths = load_questions_and_images(
    training_questions_path, image_dir
)

# Sanity check: show the first five entries of each list.
for caption, sample in (
    ("Training Questions:", train_questions),
    ("Training Answers:", train_answers),
    ("Training Question Types:", Question_Types),
    ("Training Image Paths:", train_image_paths),
):
    print(caption, sample[:5])

Training Questions: ['What is the overall condition of the given image?', 'What is the overall condition of the given image?', 'What is the overall condition of the given image?', 'How many non flooded buildings can be seen in this image?', 'How many buildings can be seen in the image?']
Training Answers: ['flooded', 'flooded', 'non flooded', 3, 3]
Training Question Types: ['Condition_Recognition', 'Condition_Recognition', 'Condition_Recognition', 'Complex_Counting', 'Simple_Counting']
Training Image Paths: ['E:\\VQA\\floodnet\\Images\\Train_Image\\10165.JPG', 'E:\\VQA\\floodnet\\Images\\Train_Image\\10166.JPG', 'E:\\VQA\\floodnet\\Images\\Train_Image\\10168.JPG', 'E:\\VQA\\floodnet\\Images\\Train_Image\\10168.JPG', 'E:\\VQA\\floodnet\\Images\\Train_Image\\10168.JPG']


In [23]:
import pandas as pd

# One row per question; each column comes from one of the parallel training lists.
train_df = pd.DataFrame(
    data=dict(
        Question=train_questions,
        Answer=train_answers,
        Question_Type=Question_Types,
        Image_Path=train_image_paths,
    )
)

train_df.head()

Unnamed: 0,Question,Answer,Question_Type,Image_Path
0,What is the overall condition of the given image?,flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10165.JPG
1,What is the overall condition of the given image?,flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10166.JPG
2,What is the overall condition of the given image?,non flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10168.JPG
3,How many non flooded buildings can be seen in ...,3,Complex_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG
4,How many buildings can be seen in the image?,3,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG


In [24]:
# Number of training rows in each question category.
train_df.Question_Type.value_counts()

Question_Type
Condition_Recognition    2315
Yes_No                    867
Complex_Counting          693
Simple_Counting           636
Name: count, dtype: int64

In [27]:
# Frequency of each distinct question wording in the training set.
train_df.loc[:, 'Question'].value_counts()

Question
What is the overall condition of the given image?            1448
What is the condition of road?                                452
Is the entire road non flooded?                               441
Is the entire road flooded?                                   426
What is the condition of the road in this image?              415
How many buildings are non flooded?                           183
How many non flooded buildings can be seen in this image?     179
How many buildings are non flooded in this image?             179
How many buildings can be seen in this image?                 173
How many buildings are in this image?                         169
How many buildings can be seen in the image?                  151
How many buildings are in the image?                          143
How many flooded buildings can be seen in this image?          55
How many buildings are flooded?                                49
How many buildings are flooded in this image?                  48
N

In [29]:
# First rows whose question type is 'Simple_Counting'.
train_df.loc[train_df['Question_Type'].eq('Simple_Counting')].head()

Unnamed: 0,Question,Answer,Question_Type,Image_Path
4,How many buildings can be seen in the image?,3,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG
6,How many buildings can be seen in this image?,4,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10170.JPG
13,How many buildings are in this image?,4,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10171.JPG
19,How many buildings can be seen in this image?,7,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10172.JPG
23,How many buildings can be seen in the image?,1,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10175.JPG


In [31]:
# First rows whose question type is 'Complex_Counting'.
train_df.loc[train_df['Question_Type'].eq('Complex_Counting')].head()

Unnamed: 0,Question,Answer,Question_Type,Image_Path
3,How many non flooded buildings can be seen in ...,3,Complex_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG
9,How many buildings are non flooded?,4,Complex_Counting,E:\VQA\floodnet\Images\Train_Image\10170.JPG
14,How many buildings are non flooded?,4,Complex_Counting,E:\VQA\floodnet\Images\Train_Image\10171.JPG
17,How many buildings are non flooded in this image?,7,Complex_Counting,E:\VQA\floodnet\Images\Train_Image\10172.JPG
21,How many buildings are non flooded in this image?,1,Complex_Counting,E:\VQA\floodnet\Images\Train_Image\10175.JPG


In [32]:
# First rows whose question type is 'Yes_No'.
train_df.loc[train_df['Question_Type'].eq('Yes_No')].head()

Unnamed: 0,Question,Answer,Question_Type,Image_Path
7,Is the entire road non flooded?,Yes,Yes_No,E:\VQA\floodnet\Images\Train_Image\10170.JPG
10,Is the entire road flooded?,No,Yes_No,E:\VQA\floodnet\Images\Train_Image\10171.JPG
18,Is the entire road flooded?,No,Yes_No,E:\VQA\floodnet\Images\Train_Image\10172.JPG
24,Is the entire road non flooded?,Yes,Yes_No,E:\VQA\floodnet\Images\Train_Image\10175.JPG
27,Is the entire road non flooded?,Yes,Yes_No,E:\VQA\floodnet\Images\Train_Image\10176.JPG


In [33]:
# First rows whose question type is 'Condition_Recognition'.
train_df.loc[train_df['Question_Type'].eq('Condition_Recognition')].head()

Unnamed: 0,Question,Answer,Question_Type,Image_Path
0,What is the overall condition of the given image?,flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10165.JPG
1,What is the overall condition of the given image?,flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10166.JPG
2,What is the overall condition of the given image?,non flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10168.JPG
5,What is the overall condition of the given image?,non flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10170.JPG
8,What is the condition of the road in this image?,non flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10170.JPG


In [1]:
import torch
from PIL import Image
from torchvision import transforms
from torchvision.models import vgg16
from torch.utils.data import Dataset, DataLoader


class VQADataset(Dataset):
    """Map-style dataset yielding ``(image, question, answer)`` triples.

    The three input sequences are parallel: position ``i`` of each describes
    the same sample. Questions and answers are returned exactly as stored;
    no tokenisation or label encoding happens here.

    Args:
        questions: Sequence of questions, one per sample.
        answers: Sequence of answers, one per sample.
        image_paths: Sequence of image file paths, one per sample.
        transform: Optional callable applied to the RGB image before it is
            returned. Defaults to ``None`` (image returned unchanged).

    Raises:
        ValueError: If the three sequences do not have the same length.
    """

    def __init__(self, questions, answers, image_paths, transform=None):
        # Fail early on misaligned inputs instead of raising an IndexError
        # (or silently pairing the wrong items) in the middle of an epoch.
        if not (len(questions) == len(answers) == len(image_paths)):
            raise ValueError(
                "questions, answers and image_paths must have the same length, "
                f"got {len(questions)}, {len(answers)} and {len(image_paths)}"
            )
        self.questions = questions
        self.answers = answers
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return the ``(image, question, answer)`` triple at position ``idx``."""
        # Close the underlying file as soon as the RGB copy exists, rather than
        # leaving the handle to be released whenever the object is collected.
        with Image.open(self.image_paths[idx]) as source:
            image = source.convert("RGB")

        # Compare against None so a callable that happens to be falsy
        # (e.g. an empty container of transforms) is still applied.
        if self.transform is not None:
            image = self.transform(image)

        question = self.questions[idx]
        answer = self.answers[idx]

        return image, question, answer

In [2]:
# Preprocessing applied to every image: resize to 224x224, then convert to a tensor.
transform = transforms.Compose(
    [transforms.Resize(size=(224, 224)), transforms.ToTensor()]
)

# Wrap the training lists in a dataset and serve shuffled mini-batches of 16.
dataset = VQADataset(
    questions=train_questions,
    answers=train_answers,
    image_paths=train_image_paths,
    transform=transform,
)
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)

NameError: name 'train_questions' is not defined

In [3]:
import torch.nn as nn
import torch.nn.functional as F

class ImageFeatureExtractor(nn.Module):
    """Image encoder built from the convolutional part of VGG16.

    A randomly initialised VGG16 is created and every child of its
    ``features`` stack except the last one is kept. ``forward`` runs an image
    batch through those layers and flattens each sample to one vector.

    NOTE(review): in VGG16 the dropped last child is presumably the final
    max-pool, so a 224x224 input would flatten to 512*14*14 = 100352 values
    per sample rather than 25088 - confirm against whatever consumes these
    features.
    """

    def __init__(self):
        super().__init__()
        # Calling vgg16() with no arguments yields untrained weights on both
        # old and new torchvision, and avoids the deprecated `pretrained`
        # keyword (superseded by `weights`).
        vgg_model = vgg16()
        # Keep the whole feature stack except its final layer.
        self.feature_extractor = nn.Sequential(*list(vgg_model.features.children())[:-1])

    def forward(self, x):
        """Return the flattened feature map for each sample in ``x``."""
        x = self.feature_extractor(x)
        # Collapse every non-batch dimension into a single feature axis.
        x = torch.flatten(x, start_dim=1)
        return x

class QuestionFeatureExtractor(nn.Module):
    """Encode an already-embedded question sequence with a two-layer LSTM.

    Args:
        embedding_dim: Size of each input vector (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state and of the returned vector.
    """

    def __init__(self, embedding_dim, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
        )

    def forward(self, x):
        """Return the final hidden state of the top LSTM layer for ``x``."""
        # nn.LSTM returns (output, (h_n, c_n)); only h_n is needed.
        final_states = self.lstm(x)[1]
        hidden = final_states[0]
        # h_n is stacked per layer, so the last entry belongs to the top layer.
        return hidden[-1]

class VQAModel(nn.Module):
    """Fuse image and question features and map them to class logits.

    Image features from ``ImageFeatureExtractor`` and question features from
    ``QuestionFeatureExtractor`` are concatenated, passed through a hidden
    linear layer with ReLU, and projected to ``num_classes`` logits.

    Args:
        num_classes: Number of output logits. Defaults to 4.
        image_size: Side length of the square probe image used once, at
            construction, to measure how many values the image extractor
            produces per sample. Defaults to 224. Images passed to
            ``forward`` must yield the same flattened size.
    """

    def __init__(self, num_classes=4, image_size=224):
        super().__init__()
        self.img_feature_extractor = ImageFeatureExtractor()
        self.question_feature_extractor = QuestionFeatureExtractor(embedding_dim=300, hidden_dim=1024)
        # Measure the image-feature width instead of hard-coding 25088: the
        # hard-coded value made `fc` reject the concatenated features whenever
        # the extractor's flattened output had a different size.
        img_feature_dim = self._measure_img_feature_dim(image_size)
        self.fc = nn.Linear(img_feature_dim + 1024, 1024)
        self.classifier = nn.Linear(1024, num_classes)  # Output layer for 4 question types by default

    def _measure_img_feature_dim(self, image_size):
        """Return the per-sample feature count the image extractor emits."""
        extractor = self.img_feature_extractor
        was_training = extractor.training
        # Probe in eval mode and without gradients so the dry run cannot
        # touch running statistics or build an autograd graph.
        extractor.eval()
        with torch.no_grad():
            probe = torch.zeros(1, 3, image_size, image_size)
            feature_dim = extractor(probe).shape[1]
        extractor.train(was_training)
        return feature_dim

    def forward(self, image, question):
        """Return class logits for a batch of images and embedded questions."""
        img_features = self.img_feature_extractor(image)
        question_features = self.question_feature_extractor(question)
        # Join both modalities along the feature axis.
        combined_features = torch.cat((img_features, question_features), dim=1)
        combined_features = F.relu(self.fc(combined_features))
        output = self.classifier(combined_features)
        return output


In [4]:
# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the model and move its parameters to the chosen device.
model = VQAModel()
model = model.to(device)

# Plain SGD with a fixed learning rate, trained against a cross-entropy loss.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()



In [5]:
# Echo the model so the notebook displays its layer hierarchy.
model

VQAModel(
  (img_feature_extractor): ImageFeatureExtractor(
    (feature_extractor): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

In [None]:
num_epochs = 10

for epoch in range(num_epochs):
    # Make sure the model is in training mode at the start of every epoch.
    model.train()
    for batch in dataloader:
        # Each batch is an (images, questions, answers) triple; move all three
        # parts to the target device.
        # NOTE(review): VQADataset returns questions and answers exactly as
        # stored, and the previewed training data holds raw strings and mixed
        # str/int answers, so they presumably need to be encoded as tensors
        # before `.to(device)` can work - confirm.
        images, questions, answers = (part.to(device) for part in batch)

        # Standard optimisation step: clear old gradients, forward, loss,
        # backward, parameter update.
        optimizer.zero_grad()
        outputs = model(images, questions)
        loss = criterion(outputs, answers)
        loss.backward()
        optimizer.step()