In [None]:
# Mount Google Drive into the Colab runtime so the weights and images stored
# under /content/drive are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic: switch the kernel's working directory to the MyDrive folder.
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


In [None]:
%pip install ultralytics transformers
import ultralytics
ultralytics.checks()

Ultralytics YOLOv8.2.36 🚀 Python-3.10.12 torch-2.3.0+cu121 CUDA:0 (Tesla T4, 15102MiB)
Setup complete ✅ (2 CPUs, 12.7 GB RAM, 30.2/201.2 GB disk)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2LMHeadModel
from ultralytics import YOLO
from collections import defaultdict
from PIL import Image
from tqdm import tqdm
import json
import csv
import os

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [None]:
# Location of the fine-tuned YOLOv8 detector weights on the mounted Drive
YOLO_WEIGHTS_PATH = '/content/drive/MyDrive/00_PFE/Object_Detection/Training_Results/Yolov8-V4/Results/runs/train/experiment/weights/best.pt'

# Build the detector from those weights and place it on the selected device
yolo_model = YOLO(YOLO_WEIGHTS_PATH).to(device)

In [None]:
# Answer vocabulary: index i of this list is the answer string for class id i.
# The first five entries are the categorical answers; they are followed by the
# counting answers "0" through "50" in ascending order.
label_mapping = ["flooded", "non flooded", "flooded,non flooded", "Yes", "No"]
label_mapping.extend(str(count) for count in range(51))

In [None]:
# Turn YOLOv8 detections into a per-box feature matrix
def extract_yolo_features(image_path, model, device):
    """Run the detector on an image and pack every detected box into one tensor.

    Args:
        image_path: Image source handed straight to ``model``.
        model: Callable detector; each result it returns must expose ``boxes``
            (or ``None``) with ``xyxy``, ``conf`` and ``cls`` tensors.
        device: Device the extracted tensors are moved to.

    Returns:
        A tensor with one row per detected box and six columns: the four
        ``xyxy`` box coordinates, the confidence score and the class value.
        When no result carries boxes, an empty ``(0, 6)`` tensor on ``device``.
    """
    rows_per_result = []

    for detection in model(image_path):
        found = detection.boxes
        if found is None:
            continue
        # Lay out each box as [x, y, x, y, confidence, class].
        rows_per_result.append(
            torch.cat(
                [
                    found.xyxy.to(device),
                    found.conf.to(device).unsqueeze(1),
                    found.cls.to(device).unsqueeze(1),
                ],
                dim=1,
            )
        )

    if not rows_per_result:
        return torch.empty((0, 6), device=device)

    # Stack the boxes of all results on top of each other.
    return torch.cat(rows_per_result)

In [None]:
# VQAModel class
class VQAModel(nn.Module):
    """Scores answers by fusing detector features with an encoded question.

    Each image feature vector is projected by ``fc_yolo``, paired with the
    pooled output of ``bert_model`` for every question asked about that image,
    projected to the embedding width of ``gpt2_model`` and fed to it as a
    single-step sequence. The logits of that step are returned.

    Args:
        bert_model: Text encoder; its output must expose ``pooler_output``.
        gpt2_model: Language model exposing ``config.n_embd`` and accepting
            ``inputs_embeds``; its output must expose ``logits``.
        yolo_input_dim: Width of one image feature vector.
        hidden_dim: Width of the projected image feature.
        vocab_size: Number of answer classes. Stored on the module only; the
            forward pass does not use it.
    """

    def __init__(self, bert_model, gpt2_model, yolo_input_dim, hidden_dim, vocab_size):
        super().__init__()
        self.bert_model = bert_model
        self.gpt2_model = gpt2_model
        self.fc_yolo = nn.Linear(yolo_input_dim, hidden_dim)
        # 768 is the width assumed for the pooled question vector that is
        # concatenated with the image feature (bert-base-uncased in this notebook).
        self.fc_proj = nn.Linear(hidden_dim + 768, gpt2_model.config.n_embd)
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size

    def forward(self, image_features, questions, attention_masks, num_questions_per_image):
        """Return one logit row per question.

        Args:
            image_features: Iterable of per-image feature vectors.
            questions: Sequence of token-id tensors, one per question.
            attention_masks: Sequence of attention-mask tensors aligned with
                ``questions``.
            num_questions_per_image: For each image, how many consecutive
                questions belong to it.

        Returns:
            The last-position logits produced by ``gpt2_model``, one row per
            question.
        """
        visual = torch.stack([self.fc_yolo(vector) for vector in image_features])
        target_device = visual.device

        # Encode the questions one at a time, each as a batch of size one.
        pooled = []
        for token_ids, mask in zip(questions, attention_masks):
            encoded = self.bert_model(
                token_ids.unsqueeze(0).to(target_device),
                attention_mask=mask.unsqueeze(0).to(target_device),
            )
            pooled.append(encoded.pooler_output)
        textual = torch.cat(pooled, dim=0)

        # Repeat every image row once per question asked about that image so
        # the image rows line up with the question rows.
        tiled = torch.cat(
            [row.repeat(count, 1) for row, count in zip(visual, num_questions_per_image)],
            dim=0,
        )

        fused = self.fc_proj(torch.cat((tiled, textual), dim=1))

        # Feed each fused vector as a length-one sequence of input embeddings.
        lm_output = self.gpt2_model(inputs_embeds=fused.unsqueeze(1), return_dict=True)
        return lm_output.logits[:, -1, :]

In [None]:
# Pretrained checkpoints for the question encoder and the language-model head
BERT_CHECKPOINT = 'bert-base-uncased'
GPT2_CHECKPOINT = 'gpt2'

# Tokenizers and backbones, each placed on the selected device
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT).to(device)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
gpt2_model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT).to(device)

# One class per entry of the answer vocabulary
num_classes = len(label_mapping)
hidden_dim = 256

# yolo_input_dim=6 matches the six columns produced by extract_yolo_features
vqa_model = VQAModel(
    bert_model=bert_model,
    gpt2_model=gpt2_model,
    yolo_input_dim=6,
    hidden_dim=hidden_dim,
    vocab_size=num_classes,
).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Load the saved model state.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint that was saved on a GPU can still be restored when the notebook
# falls back to the CPU.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
vqa_model.load_state_dict(
    torch.load('/content/drive/MyDrive/00_PFE/VQA/Code-V3/VQAModel_Best.pth', map_location=device)
)

<All keys matched successfully>

In [None]:
# Image preprocessing: resize to 224x224, then convert the PIL image to a tensor
_preprocessing_steps = [transforms.Resize((224, 224)), transforms.ToTensor()]
transform = transforms.Compose(_preprocessing_steps)

In [None]:
# Function to predict the answer given an image and question
def predict_answer(image_path, question):
    """Predict the answer to one question about one image.

    The image is summarised by the mean of its per-box YOLO feature rows, the
    question is tokenised with the BERT tokenizer (padded/truncated to 64
    tokens), and both are passed through ``vqa_model``.

    Args:
        image_path: Path of the image to analyse; passed to the YOLO detector.
        question: Natural-language question about the image.

    Returns:
        The entry of ``label_mapping`` with the highest score.
    """
    vqa_model.eval()

    inputs = bert_tokenizer.encode_plus(
        question,
        add_special_tokens=True,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=64
    )
    question_input = inputs['input_ids'].squeeze(0).to(device)
    attention_mask = inputs['attention_mask'].squeeze(0).to(device)

    with torch.no_grad():
        detections = extract_yolo_features(image_path, yolo_model, device)
        if detections.shape[0] == 0:
            # No boxes were detected. Averaging zero rows would yield NaNs, so
            # fall back to an all-zero feature vector instead.
            image_features = torch.zeros((1, 6), device=device)
        else:
            image_features = detections.mean(dim=0).unsqueeze(0)

        outputs = vqa_model(image_features, [question_input], [attention_mask], [1])

        # The model's logits can be wider than the answer vocabulary; only the
        # first len(label_mapping) positions correspond to valid answers, so
        # restrict the argmax to them to keep the index in range.
        answer_logits = outputs[:, :len(label_mapping)]
        predicted_idx = torch.argmax(answer_logits, dim=1).item()

    return label_mapping[predicted_idx]

In [None]:
# Example usage
image_path = '/content/drive/MyDrive/00_PFE/VQA/Image_Predction_VQA/7.jpg'

question_01 = "How many buildings are flooded in this image?"
question_04 = "How many buildings are in this image?"
question_05 = "How many buildings are non flooded in this image?"
question_11 = "Is the entire road flooded?"
question_12 = "Is the entire road non flooded?"
question_14 = "What is the condition of the road in this image?"

# Ask the questions in order, printing each question/answer pair as soon as
# its prediction is available.
answers = []
for question in (question_01, question_04, question_05, question_11, question_12, question_14):
    answer = predict_answer(image_path, question)
    answers.append(answer)
    print(f"Question: {question}")
    print(f"Answer: {answer}")

# Expose each answer under its per-question name as well.
answer_01, answer_04, answer_05, answer_11, answer_12, answer_14 = answers


image 1/1 /content/drive/MyDrive/00_PFE/VQA/Image_Predction_VQA/7.jpg: 448x640 18 Building-floodeds, 6 Building-non-floodeds, 3 Road-floodeds, 7 Waters, 27 Trees, 8 Vehicles, 2 Pools, 63.6ms
Speed: 3.0ms preprocess, 63.6ms inference, 1.6ms postprocess per image at shape (1, 3, 448, 640)
Question: How many buildings are flooded in this image?
Answer: 13

image 1/1 /content/drive/MyDrive/00_PFE/VQA/Image_Predction_VQA/7.jpg: 448x640 18 Building-floodeds, 6 Building-non-floodeds, 3 Road-floodeds, 7 Waters, 27 Trees, 8 Vehicles, 2 Pools, 7.0ms
Speed: 3.0ms preprocess, 7.0ms inference, 2.1ms postprocess per image at shape (1, 3, 448, 640)
Question: How many buildings are in this image?
Answer: 13

image 1/1 /content/drive/MyDrive/00_PFE/VQA/Image_Predction_VQA/7.jpg: 448x640 18 Building-floodeds, 6 Building-non-floodeds, 3 Road-floodeds, 7 Waters, 27 Trees, 8 Vehicles, 2 Pools, 7.0ms
Speed: 3.0ms preprocess, 7.0ms inference, 2.2ms postprocess per image at shape (1, 3, 448, 640)
Question: H

In [None]:
# Example usage
image_path = '/content/drive/MyDrive/00_PFE/VQA/Image_Predction_VQA/9109.JPG'

question_01 = "How many buildings are flooded in this image?"
question_04 = "How many buildings are in this image?"
question_05 = "How many buildings are non flooded in this image?"
question_11 = "Is the entire road flooded?"
question_12 = "Is the entire road non flooded?"
question_14 = "What is the condition of the road in this image?"

# Ask the questions in order, printing each question/answer pair as soon as
# its prediction is available.
answers = []
for question in (question_01, question_04, question_05, question_11, question_12, question_14):
    answer = predict_answer(image_path, question)
    answers.append(answer)
    print(f"Question: {question}")
    print(f"Answer: {answer}")

# Expose each answer under its per-question name as well.
answer_01, answer_04, answer_05, answer_11, answer_12, answer_14 = answers


image 1/1 /content/drive/MyDrive/00_PFE/VQA/Image_Predction_VQA/9109.JPG: 480x640 1 Background, 11 Building-non-floodeds, 6 Road-non-floodeds, 10 Trees, 6 Vehicles, 11 Grasss, 132.9ms
Speed: 15.6ms preprocess, 132.9ms inference, 1033.9ms postprocess per image at shape (1, 3, 480, 640)
Question: How many buildings are flooded in this image?
Answer: 3

image 1/1 /content/drive/MyDrive/00_PFE/VQA/Image_Predction_VQA/9109.JPG: 480x640 1 Background, 11 Building-non-floodeds, 6 Road-non-floodeds, 10 Trees, 6 Vehicles, 11 Grasss, 8.1ms
Speed: 3.4ms preprocess, 8.1ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)
Question: How many buildings are in this image?
Answer: 3

image 1/1 /content/drive/MyDrive/00_PFE/VQA/Image_Predction_VQA/9109.JPG: 480x640 1 Background, 11 Building-non-floodeds, 6 Road-non-floodeds, 10 Trees, 6 Vehicles, 11 Grasss, 7.1ms
Speed: 3.8ms preprocess, 7.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Question: How many buildings ar