In [1]:
import os
import json
import pandas as pd

In [2]:
image_dir = r'E:\VQA\floodnet\Images'
training_questions_path = r'E:\VQA\floodnet\Questions\Training Question.json'
validation_questions_path = r'E:\VQA\floodnet\Questions\Valid Question.json'
test_questions_path = r'E:\VQA\floodnet\Questions\Test_Question.json'

def load_questions_and_images(json_file_path, image_dir, split='Train_Image'):
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    
    questions = []
    answers = []
    image_paths = []
    question_types = []

    for item in data:
        questions.append(data[item]['Question'])
        answers.append(data[item]['Ground_Truth'])
        question_types.append(data[item]['Question_Type'])
        image_paths.append(os.path.join(image_dir, split, data[item]['Image_ID']))

    return questions, answers, question_types, image_paths

In [3]:
# Assuming load_questions_and_images function loads data correctly
questions, answers, Question_Types, image_paths = load_questions_and_images(training_questions_path, image_dir)

df = pd.DataFrame({'questions': questions, 'answers': answers, 'question_types': Question_Types, 'image_paths': image_paths})
df.head()

Unnamed: 0,questions,answers,question_types,image_paths
0,What is the overall condition of the given image?,flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10165.JPG
1,What is the overall condition of the given image?,flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10166.JPG
2,What is the overall condition of the given image?,non flooded,Condition_Recognition,E:\VQA\floodnet\Images\Train_Image\10168.JPG
3,How many non flooded buildings can be seen in ...,3,Complex_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG
4,How many buildings can be seen in the image?,3,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG


In [4]:
df = df[df['question_types'] == 'Simple_Counting']
df.head()

Unnamed: 0,questions,answers,question_types,image_paths
4,How many buildings can be seen in the image?,3,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10168.JPG
6,How many buildings can be seen in this image?,4,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10170.JPG
13,How many buildings are in this image?,4,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10171.JPG
19,How many buildings can be seen in this image?,7,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10172.JPG
23,How many buildings can be seen in the image?,1,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\10175.JPG


In [5]:
#splitting the data into training, validation and testing (70-15-15)
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [6]:
train_df.head()

Unnamed: 0,questions,answers,question_types,image_paths
2085,How many buildings can be seen in this image?,3,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\7345.JPG
472,How many buildings are in the image?,5,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\6528.JPG
4198,How many buildings can be seen in the image?,2,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\9073.JPG
4279,How many buildings can be seen in this image?,4,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\9091.JPG
1131,How many buildings can be seen in the image?,5,Simple_Counting,E:\VQA\floodnet\Images\Train_Image\6854.JPG


In [7]:
print("Training Number of Simple Counting Questions:", len(train_df))
print("Validation Number of Simple Counting Questions:", len(val_df))
print("Test Number of Simple Counting Questions:", len(test_df))

Training Number of Simple Counting Questions: 445
Validation Number of Simple Counting Questions: 95
Test Number of Simple Counting Questions: 96


In [8]:
import torch
import torch.nn as nn
from PIL import Image
import torch.optim as optim
from torchvision.models import resnet50
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [9]:
class FloodNetVQADataset(Dataset):
    def __init__(self, dataframe, transform=None):
        """
        Args:
            dataframe (DataFrame): Pandas DataFrame containing the data.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_path = self.dataframe.iloc[idx, 3] 
        image = Image.open(img_path).convert('RGB')
        question = self.dataframe.iloc[idx, 0]  
        answer = self.dataframe.iloc[idx, 1]

        if self.transform:
            image = self.transform(image)

        sample = {'image': image, 'question': question, 'answer': answer}

        return sample

In [10]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = FloodNetVQADataset(train_df, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)

val_dataset = FloodNetVQADataset(val_df, transform=transform)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True, num_workers=0)

test_dataset = FloodNetVQADataset(test_df, transform=transform)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, num_workers=0)

In [11]:
for i, batch in enumerate(train_dataloader):
    images = batch['image']
    questions = batch['question']
    answers = batch['answer'].float()

    print("Batch:", i)
    print("Images Shape:", images.shape)
    print("Questions Shape:", questions)
    print("Answers Shape:", answers)
    break

Batch: 0
Images Shape: torch.Size([32, 3, 224, 224])
Questions Shape: ['How many buildings are in the image?', 'How many buildings are in this image?', 'How many buildings can be seen in this image?', 'How many buildings are in the image?', 'How many buildings are in the image?', 'How many buildings can be seen in this image?', 'How many buildings can be seen in the image?', 'How many buildings are in the image?', 'How many buildings are in the image?', 'How many buildings can be seen in the image?', 'How many buildings can be seen in the image?', 'How many buildings are in the image?', 'How many buildings can be seen in the image?', 'How many buildings can be seen in the image?', 'How many buildings are in this image?', 'How many buildings can be seen in the image?', 'How many buildings can be seen in the image?', 'How many buildings are in the image?', 'How many buildings are in the image?', 'How many buildings can be seen in this image?', 'How many buildings can be seen in the image

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

feature_extractor = resnet50(weights=True)
feature_extractor = feature_extractor.to(device)
feature_extractor = nn.Sequential(*list(feature_extractor.children())[:-2]) 
feature_extractor.eval()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)
bert_model.eval()

def extract_features(images):
    with torch.no_grad():
        features = feature_extractor(images)
        features = torch.nn.functional.adaptive_avg_pool2d(features, (1, 1))
        features = features.view(features.size(0), -1)
    return features


def text_features(questions):
    inputs = tokenizer(questions, return_tensors='pt', padding=True, truncation=True, max_length=45)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :]



In [13]:
class VQAModel(nn.Module):
    def __init__(self):
        super(VQAModel, self).__init__()
        self.fc1 = nn.Linear(2048 + 768, 512)  
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 1)  

    def forward(self, img_features, ques_features):
        img_features = img_features.view(img_features.size(0), -1)
        ques_features = ques_features.view(ques_features.size(0), -1)
        combined_features = torch.cat((img_features, ques_features), dim=1)
        x = self.fc1(combined_features)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [14]:
vqa_model = VQAModel()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vqa_model = vqa_model.to(device)

optimizer = torch.optim.Adam(vqa_model.parameters(), lr=0.001)
criterion = nn.MSELoss()

In [15]:
print(vqa_model)

VQAModel(
  (fc1): Linear(in_features=2816, out_features=512, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=512, out_features=1, bias=True)
)


In [17]:
num_epochs = 5

for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        images = batch['image'].to(device)
        questions = batch['question']  
        answers = batch['answer'].float().to(device)

        img_features = extract_features(images)
        ques_features = text_features(questions)
        counts = answers.view(-1, 1)

        optimizer.zero_grad()

        outputs = vqa_model(img_features, ques_features)
        loss = criterion(outputs, counts)

        loss.backward()
        optimizer.step()

    #validation loss
    with torch.no_grad():
        for val_batch in val_dataloader:
            images = val_batch['image'].to(device)
            questions = val_batch['question']
            answers = val_batch['answer'].float().to(device)

            img_features = extract_features(images)
            ques_features = text_features(questions)
            counts = answers.view(-1, 1)

            outputs = vqa_model(img_features, ques_features)
            val_loss = criterion(outputs, counts)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')

Epoch 1/5, Loss: 15.339031219482422, Val Loss: 8.820296287536621
Epoch 2/5, Loss: 5.899046421051025, Val Loss: 20.66608238220215
Epoch 3/5, Loss: 12.975815773010254, Val Loss: 20.170452117919922
Epoch 4/5, Loss: 3.8404624462127686, Val Loss: 6.488979816436768
Epoch 5/5, Loss: 6.888874053955078, Val Loss: 18.440916061401367


In [18]:
#test loss
with torch.no_grad():
    for test_batch in test_dataloader:
        images = test_batch['image'].to(device)
        questions = test_batch['question']
        answers = test_batch['answer'].float().to(device)

        img_features = extract_features(images)
        ques_features = text_features(questions)
        counts = answers.view(-1, 1)

        outputs = vqa_model(img_features, ques_features)
        test_loss = criterion(outputs, counts)

print(f'Test Loss: {test_loss.item()}')

Test Loss: 9.134334564208984


In [21]:
#prediction
def predict(image_path, question):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    img_features = extract_features(image)

    ques_features = text_features([question])
    output = vqa_model(img_features, ques_features)
    return output.item()

image_path = r"E:\VQA\floodnet\Images\Train_Image\6693.JPG"
question = 'How many people are there in the image?'

prediction = predict(image_path, question)
prediction = round(prediction)

print(f'Prediction: {prediction}')

Prediction: 8
