<a href="https://colab.research.google.com/github/ArkS0001/VQA--Visual-Question-Answering/blob/main/VQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import BertTokenizer, BertModel

# Define the Visual Question Answering model
class VQAModel(nn.Module):
    def __init__(self, num_answers):
        super(VQAModel, self).__init__()
        # Visual feature extractor: ResNet50 pre-trained on ImageNet.
        self.cnn = models.resnet50(pretrained=True)
        # Remove the final classification layer
        self.cnn = nn.Sequential(*list(self.cnn.children())[:-1])
        self.cnn_out_dim = 2048

        # Textual feature extractor: BERT model for encoding the question.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert_out_dim = 768

        # Fusion layer: combine image and text features.
        self.fusion = nn.Linear(self.cnn_out_dim + self.bert_out_dim, 512)
        self.classifier = nn.Linear(512, num_answers)

    def forward(self, image, question):
        # Extract visual features from the image.
        image_features = self.cnn(image)  # shape: (batch, 2048, 1, 1)
        image_features = image_features.view(image_features.size(0), -1)  # shape: (batch, 2048)

        # Encode the question using BERT.
        # Note: If processing a batch of questions, ensure proper tokenization and device placement.
        inputs = self.tokenizer(question, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs["input_ids"].to(image.device)
        attention_mask = inputs["attention_mask"].to(image.device)
        text_outputs = self.bert(input_ids, attention_mask=attention_mask)
        text_features = text_outputs.pooler_output  # shape: (batch, 768)

        # Combine the visual and textual features.
        combined = torch.cat((image_features, text_features), dim=1)
        fused = torch.relu(self.fusion(combined))
        output = self.classifier(fused)  # Final prediction for answer classes.
        return output

# Example dataset class for VQA
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms

class VQADataset(Dataset):
    def __init__(self, image_paths, questions, answers, transform=None):
        self.image_paths = image_paths
        self.questions = questions
        self.answers = answers  # Assumes answers are provided as class indices.
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load and transform the image.
        image = Image.open(self.image_paths[idx]).convert("RGB")
        if self.transform:
            image = self.transform(image)
        question = self.questions[idx]
        answer = self.answers[idx]
        return image, question, answer

# Define transformations for the image input.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Assume you have lists of image paths, questions, and answers.
image_paths = ['/content/graphy.jpg']  # etc.
questions = ["What is the highest value?"]
answers = [3]  # Example answer indices corresponding to your answer vocabulary.

# Create the dataset and dataloader.
dataset = VQADataset(image_paths, questions, answers, transform)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Initialize the model, loss function, and optimizer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_answers = 1000  # Adjust according to your dataset’s answer classes.
model = VQAModel(num_answers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Sample training loop
num_epochs = 10
for epoch in range(num_epochs):
    for images, questions, answers in dataloader:
        images = images.to(device)
        answers = answers.to(device)  # Expected to be tensor of class indices.
        optimizer.zero_grad()
        outputs = model(images, questions)
        loss = criterion(outputs, answers)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 176MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/10, Loss: 6.9400
Epoch 2/10, Loss: 6.1390
Epoch 3/10, Loss: 5.3251
Epoch 4/10, Loss: 4.9281
Epoch 5/10, Loss: 3.9583
Epoch 6/10, Loss: 3.3413
Epoch 7/10, Loss: 2.7269
Epoch 8/10, Loss: 2.1178
Epoch 9/10, Loss: 1.5472
Epoch 10/10, Loss: 1.0478


In [4]:
from transformers import pipeline
from PIL import Image

# Load the pre-trained Visual Question Answering pipeline.
# This example uses the VILT model fine-tuned on VQA tasks.
vqa_pipeline = pipeline("vqa", model="dandelin/vilt-b32-finetuned-vqa")

# Load your image.
image = Image.open("/content/graphy.jpg")

# Define your question.
question = "What is the highest value in the chart?"

# Get the answer from the pipeline.
result = vqa_pipeline(image, question)

# Print the result.
print("Predicted Answer:", result[0])


Device set to use cuda:0


Predicted Answer: {'score': 0.1059919223189354, 'answer': '500'}


In [6]:
pip install pytesseract pillow

Collecting pytesseract
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [7]:
#!/usr/bin/env python
"""
Production-Level Visual Question Answering (VQA) Pipeline

This script implements a VQA pipeline that processes an input image through:
  1. Visual Feature Extraction via a pre-trained CNN (ResNet50)
  2. Optical Character Recognition (OCR) to extract text from the image
  3. A Rule-Based Reasoning module that uses the extracted OCR text
     to answer questions (e.g., "What is the highest value?")

Requirements:
  - Python 3.x
  - PyTorch and torchvision
  - pytesseract (Tesseract must be installed on your system)
  - Pillow

Usage:
  python vqa_pipeline.py --image path_to_image.jpg --question "What is the highest value?"
"""

import argparse
import logging
import re
from PIL import Image
import pytesseract
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class VisualFeatureExtractor(nn.Module):
    """
    Uses a pre-trained ResNet50 model (without its final classification layer)
    to extract visual features from an image.
    """
    def __init__(self):
        super(VisualFeatureExtractor, self).__init__()
        # Load pre-trained ResNet50
        model = models.resnet50(pretrained=True)
        # Remove the classification head
        self.feature_extractor = nn.Sequential(*(list(model.children())[:-1]))
        self.feature_extractor.eval()

    def forward(self, image_tensor: torch.Tensor) -> torch.Tensor:
        """
        Extract features from the image tensor.
        :param image_tensor: Tensor of shape (batch_size, 3, 224, 224)
        :return: Feature tensor of shape (batch_size, 2048)
        """
        with torch.no_grad():
            features = self.feature_extractor(image_tensor)  # shape: (batch, 2048, 1, 1)
            features = features.view(features.size(0), -1)     # flatten to (batch, 2048)
        return features

class OCRModule:
    """
    Performs OCR on an image using Tesseract to extract text.
    """
    def extract_text(self, image: Image.Image) -> str:
        """
        Extract text from the provided image.
        :param image: PIL Image object.
        :return: Extracted text as a string.
        """
        try:
            text = pytesseract.image_to_string(image)
            return text
        except Exception as e:
            logger.error("Error during OCR extraction: %s", e)
            return ""

    def extract_numbers(self, text: str) -> list:
        """
        Extract all numerical values from a text string.
        :param text: Input text.
        :return: List of numbers (as floats).
        """
        numbers = re.findall(r'\d+\.\d+|\d+', text)
        try:
            return [float(num) for num in numbers]
        except Exception as e:
            logger.error("Error converting numbers: %s", e)
            return []

class ReasoningModule:
    """
    A rule-based reasoning module that provides answers based on the question
    and OCR extracted text. For example, it can answer queries like:
      - "What is the highest value?"
      - "What is the lowest value?"
      - "What is the average value?"
    """
    def reason(self, question: str, ocr_text: str) -> str:
        # Extract numbers from OCR text
        ocr_module = OCRModule()
        numbers = ocr_module.extract_numbers(ocr_text)
        if not numbers:
            return "No numerical data detected in the image."

        question_lower = question.lower()
        if "highest" in question_lower or "max" in question_lower:
            answer = max(numbers)
            return f"The highest value is {answer}."
        elif "lowest" in question_lower or "min" in question_lower:
            answer = min(numbers)
            return f"The lowest value is {answer}."
        elif "average" in question_lower or "mean" in question_lower:
            answer = sum(numbers) / len(numbers)
            return f"The average value is {answer}."
        else:
            # Default behavior: return the raw OCR text if no rule applies.
            return f"Extracted text: {ocr_text.strip()}"

class VQAPipeline:
    """
    The main pipeline that ties together visual feature extraction,
    OCR, and reasoning to answer a question about an image.
    """
    def __init__(self, device: str = "cpu"):
        self.device = device
        self.visual_extractor = VisualFeatureExtractor().to(self.device)
        self.ocr_module = OCRModule()
        self.reasoning_module = ReasoningModule()
        # Define image transformations matching the visual extractor's expectations.
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def process(self, image_path: str, question: str) -> str:
        """
        Process the input image and question, and output an answer.
        :param image_path: File path to the image.
        :param question: A question regarding the image.
        :return: The predicted answer as a string.
        """
        # Load image
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            logger.error("Error loading image: %s", e)
            return "Error loading image."

        # Visual Feature Extraction (currently for logging; can be used in more advanced fusion)
        image_tensor = self.transform(image).unsqueeze(0).to(self.device)
        visual_features = self.visual_extractor(image_tensor)
        logger.info("Extracted visual features shape: %s", visual_features.shape)

        # OCR Extraction
        ocr_text = self.ocr_module.extract_text(image)
        logger.info("Extracted OCR text: %s", ocr_text.strip())

        # Reasoning: fuse the OCR output (and visual features if needed) with the question.
        answer = self.reasoning_module.reason(question, ocr_text)
        return answer

def main():
    parser = argparse.ArgumentParser(description="Production-Level Visual Question Answering Pipeline")
    parser.add_argument("--image", type=str, required=True, help="Path to the input image")
    parser.add_argument("--question", type=str, required=True, help="Question about the image")
    args = parser.parse_args()

    # Use GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info("Using device: %s", device)

    pipeline = VQAPipeline(device=device)
    answer = pipeline.process(args.image, args.question)
    print("Predicted Answer:", answer)

if __name__ == "__main__":
    main()


usage: colab_kernel_launcher.py [-h] --image IMAGE --question QUESTION
colab_kernel_launcher.py: error: the following arguments are required: --image, --question


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [8]:
# Install Tesseract OCR and required Python packages.
!apt-get update
!apt-get install -y tesseract-ocr

!pip install pytesseract torch torchvision transformers


0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Waiting for headers] [1 InRelease 1,140 B/129 kB 1%] [Waiting for headers] [Connected to r2u.sta                                                                                                    Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,374 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa

In [1]:
import logging
import re
from PIL import Image
import pytesseract
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Visual Feature Extraction using pre-trained ResNet50
class VisualFeatureExtractor(nn.Module):
    def __init__(self):
        super(VisualFeatureExtractor, self).__init__()
        model = models.resnet50(pretrained=True)
        # Remove the final classification layer
        self.feature_extractor = nn.Sequential(*(list(model.children())[:-1]))
        self.feature_extractor.eval()

    def forward(self, image_tensor: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            features = self.feature_extractor(image_tensor)  # shape: (batch, 2048, 1, 1)
            features = features.view(features.size(0), -1)     # flatten to (batch, 2048)
        return features

# OCR Module using Tesseract
class OCRModule:
    def extract_text(self, image: Image.Image) -> str:
        try:
            text = pytesseract.image_to_string(image)
            return text
        except Exception as e:
            logger.error("Error during OCR extraction: %s", e)
            return ""

    def extract_numbers(self, text: str) -> list:
        numbers = re.findall(r'\d+\.\d+|\d+', text)
        try:
            return [float(num) for num in numbers]
        except Exception as e:
            logger.error("Error converting numbers: %s", e)
            return []

# Rule-based Reasoning Module
class ReasoningModule:
    def reason(self, question: str, ocr_text: str) -> str:
        ocr_module = OCRModule()
        numbers = ocr_module.extract_numbers(ocr_text)
        if not numbers:
            return "No numerical data detected in the image."

        question_lower = question.lower()
        if "highest" in question_lower or "max" in question_lower:
            answer = max(numbers)
            return f"The highest value is {answer}."
        elif "lowest" in question_lower or "min" in question_lower:
            answer = min(numbers)
            return f"The lowest value is {answer}."
        elif "average" in question_lower or "mean" in question_lower:
            answer = sum(numbers) / len(numbers)
            return f"The average value is {answer}."
        else:
            return f"Extracted text: {ocr_text.strip()}"

# Main VQA Pipeline
class VQAPipeline:
    def __init__(self, device: str = "cpu"):
        self.device = device
        self.visual_extractor = VisualFeatureExtractor().to(self.device)
        self.ocr_module = OCRModule()
        self.reasoning_module = ReasoningModule()
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def process(self, image_path: str, question: str) -> str:
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            logger.error("Error loading image: %s", e)
            return "Error loading image."

        # Visual Feature Extraction (for demonstration; can be used for advanced fusion)
        image_tensor = self.transform(image).unsqueeze(0).to(self.device)
        visual_features = self.visual_extractor(image_tensor)
        logger.info("Extracted visual features shape: %s", visual_features.shape)

        # OCR Extraction
        ocr_text = self.ocr_module.extract_text(image)
        logger.info("Extracted OCR text: %s", ocr_text.strip())

        # Reasoning: fuse OCR output with the question
        answer = self.reasoning_module.reason(question, ocr_text)
        return answer


In [5]:
# For file upload in Colab:
from google.colab import files
uploaded = files.upload()  # Upload your image file here

# Get the first uploaded image file name
image_path = list(uploaded.keys())[0]

# Define your question
question = "What is project manager?"

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Instantiate and run the VQA pipeline
pipeline = VQAPipeline(device=device)
answer = pipeline.process(image_path, question)
print("Predicted Answer:", answer)


Saving worked5.png to worked5 (2).png
Using device: cuda




Predicted Answer: Extracted text: Ersteller: Meyer, Prozessstandard

Matthias
OE: EOZ/2
Version: 01.00
Seite 11 von 11 Anmeldepackage erzeugen

PS_2.1_011_1075_05

Relevante EingangsgroBen fur P3+

 

10. Ablaufplan

 

 

 

 

 

 

 

 

   

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

— = oe ee
= t= = =
aa

 

 

Prowse

 

 

 

 

 

 

 

ee Bean an

 

Einstufung: Intern KSU Unterlagenklasse 4.2 / Aufbewahrungsfrist: 15 Jahre ab Ereignis
PS - 02 20 © Volkswagen Aktiengesellschaft. Alle Rechte vorbehalten. (PC 01/2022)

INTERNAL
