<a href="https://colab.research.google.com/github/ArkS0001/VQA--Visual-Question-Answering/blob/main/VQA__2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install Tesseract OCR and required Python packages.
!apt-get update
!apt-get install -y tesseract-ocr

!pip install pytesseract torch torchvision transformers


Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,692 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [3,798 kB]
Fetched 6,874 kB in 3s (2,722 kB/s)
Reading package lists... Done
W: Skipping acqu

In [7]:
import logging
import re
from PIL import Image
import pytesseract
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn

# Configure logging
# Root logging is requested at INFO level; `logger` is the module-level logger
# that the classes below use for their info and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# -----------------------------
# Visual Feature Extraction
# -----------------------------
class VisualFeatureExtractor(nn.Module):
    """Frozen ResNet-50 backbone that maps image batches to flat feature vectors.

    An ImageNet-pretrained ResNet-50 is loaded and its last child module (the
    classification layer) is dropped. The remaining stack is used purely for
    inference: it is kept in eval mode and evaluated without gradients.
    """

    def __init__(self):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision >= 0.13 in favour of
        # the `weights` enum. IMAGENET1K_V1 is the checkpoint the legacy flag
        # resolved to, so the extracted features stay the same. Releases that
        # predate the enum fall back to the legacy flag.
        weights_enum = getattr(models, "ResNet50_Weights", None)
        if weights_enum is not None:
            model = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:
            model = models.resnet50(pretrained=True)
        # Remove the final classification layer
        self.feature_extractor = nn.Sequential(*list(model.children())[:-1])
        self.feature_extractor.eval()

    def train(self, mode: bool = True):
        """Set this wrapper's mode while keeping the backbone in eval mode.

        ``nn.Module.train`` recurses into every child module, so without this
        override a ``.train()`` call on this module (or on a parent that
        contains it) would put the backbone's normalisation layers back into
        training behaviour even though ``forward`` runs under ``no_grad``.

        Args:
            mode: Requested training flag for the wrapper itself.

        Returns:
            ``self``, matching the ``nn.Module.train`` contract.
        """
        super().train(mode)
        self.feature_extractor.eval()
        return self

    def forward(self, image_tensor: torch.Tensor) -> torch.Tensor:
        """Extract one flat feature vector per image in the batch.

        Args:
            image_tensor: Batch of preprocessed images accepted by the backbone.

        Returns:
            A 2-D tensor of shape (batch, features) computed without gradient
            tracking; with the ResNet-50 backbone this is (batch, 2048).
        """
        with torch.no_grad():
            features = self.feature_extractor(image_tensor)  # shape: (batch, 2048, 1, 1)
            # Collapse everything after the batch axis -> (batch, 2048)
            return torch.flatten(features, start_dim=1)

# -----------------------------
# OCR Module
# -----------------------------
class OCRModule:
    """Wrapper around pytesseract OCR plus a parser for numeric tokens."""

    def extract_text(self, image: Image.Image) -> str:
        """Run OCR on ``image`` and return the recognised text.

        Any exception raised by the OCR call is logged and swallowed; an empty
        string is returned in that case.
        """
        try:
            return pytesseract.image_to_string(image)
        except Exception as exc:
            logger.error("Error during OCR extraction: %s", exc)
        return ""

    def extract_numbers(self, text: str) -> list:
        """Return every unsigned integer or decimal found in ``text`` as a float.

        The decimal alternative is listed first in the pattern so that a token
        such as "3.14" is captured whole instead of as two integers. If the
        conversion fails, the error is logged and an empty list is returned.
        """
        tokens = re.findall(r'\d+\.\d+|\d+', text)
        try:
            values = list(map(float, tokens))
        except Exception as exc:
            logger.error("Error converting numbers: %s", exc)
            values = []
        return values

# -----------------------------
# Diagram Understanding Module
# -----------------------------
class DiagramUnderstandingModule:
    """Keyword-based diagram classifier working on OCR text."""

    def classify_diagram(self, ocr_text: str) -> str:
        """
        Guess the diagram type from keywords found in the OCR text.

        Matching is case-insensitive and substring-based. Rules are evaluated
        in order and the first hit wins:
          1. both "start" and "end" present            -> "flowchart"
          2. "gantt", "project" or a d/m/y-style date  -> "gantt"
          3. "bar", "chart" or "graph"                 -> "chart"
          4. otherwise                                 -> "diagram"
        """
        haystack = ocr_text.lower()

        def mentions(*keywords: str) -> bool:
            # True when at least one keyword occurs anywhere in the text.
            return any(keyword in haystack for keyword in keywords)

        if mentions("start") and mentions("end"):
            return "flowchart"

        has_date = re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', haystack) is not None
        if mentions("gantt", "project") or has_date:
            return "gantt"

        if mentions("bar", "chart", "graph"):
            return "chart"

        return "diagram"

    def extract_structure(self, diagram_type: str, ocr_text: str) -> dict:
        """
        Bundle the diagram type with the whitespace-trimmed OCR text.

        No layout analysis is performed; the result only has the keys
        "diagram_type" and "content".
        """
        content = ocr_text.strip()
        return dict(diagram_type=diagram_type, content=content)

# -----------------------------
# Extended Reasoning Module
# -----------------------------
class ExtendedReasoningModule:
    """Rule-based answerer that combines a question with OCR text.

    Numeric questions (highest / lowest / average) are answered from the
    numbers parsed out of the OCR text. Every other question falls back to
    diagram classification and a truncated excerpt of the OCR text.
    """

    # Whole-word patterns for the numeric intents. Plain substring tests are
    # too loose for short stems: "min" occurs inside "determine" and "minutes",
    # which made unrelated questions return a "lowest value" answer.
    _MAX_PATTERN = re.compile(r"\b(?:highest|max(?:imum|imal)?)\b")
    _MIN_PATTERN = re.compile(r"\b(?:lowest|min(?:imum|imal)?)\b")
    _MEAN_PATTERN = re.compile(r"\b(?:average|mean)\b")

    def __init__(self):
        self.ocr_module = OCRModule()
        self.diagram_module = DiagramUnderstandingModule()

    def reason(self, question: str, ocr_text: str) -> str:
        """Produce an answer string for ``question`` given the OCR text.

        Args:
            question: Natural-language question; matched case-insensitively.
            ocr_text: Text extracted from the image.

        Returns:
            A sentence with the max/min/mean of the numbers in ``ocr_text``
            when the question asks for one and numbers exist; otherwise a
            diagram-type statement or an excerpt of the OCR text.
        """
        numbers = self.ocr_module.extract_numbers(ocr_text)
        question_lower = question.lower()

        # Numeric reasoning if applicable (checked in highest/lowest/average order)
        if numbers:
            if self._MAX_PATTERN.search(question_lower):
                return f"The highest value is {max(numbers)}."
            if self._MIN_PATTERN.search(question_lower):
                return f"The lowest value is {min(numbers)}."
            if self._MEAN_PATTERN.search(question_lower):
                return f"The average value is {sum(numbers) / len(numbers)}."

        # Diagram/structural understanding. These keywords keep substring
        # matching on purpose so that e.g. "workflow" still triggers "flow".
        diagram_type = self.diagram_module.classify_diagram(ocr_text)
        structure = self.diagram_module.extract_structure(diagram_type, ocr_text)
        if "type" in question_lower or "kind" in question_lower:
            return f"This appears to be a {structure['diagram_type']}."
        if "flow" in question_lower or "process" in question_lower:
            # A placeholder for more detailed flow extraction
            return f"Flow details (placeholder): {structure['content'][:200]}..."
        if "schedule" in question_lower or "timeline" in question_lower:
            # For gantt charts; extend this with more specific parsing
            return f"Gantt chart details (placeholder): {structure['content'][:200]}..."
        # Default: return extracted OCR text summary.
        return f"Extracted text summary: {structure['content'][:300]}"

# -----------------------------
# Main VQA Pipeline
# -----------------------------
class VQAPipeline:
    """End-to-end pipeline: load image -> visual features -> OCR -> rule-based answer."""

    def __init__(self, device: str = "cpu"):
        """Build the pipeline components.

        Args:
            device: Torch device string on which the visual extractor runs.
        """
        self.device = device
        self.visual_extractor = VisualFeatureExtractor().to(self.device)
        self.ocr_module = OCRModule()
        self.reasoning_module = ExtendedReasoningModule()
        # Preprocessing for the visual extractor: fixed 224x224 input and
        # per-channel normalisation (presumably the ImageNet statistics the
        # pretrained backbone expects - confirm if the backbone changes).
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def process(self, image_path: str, question: str) -> str:
        """Answer ``question`` about the image stored at ``image_path``.

        Args:
            image_path: Path of the image file to analyse.
            question: Natural-language question about the image.

        Returns:
            The answer produced by the reasoning module, or the string
            "Error loading image." when the file cannot be opened or decoded.
        """
        try:
            # The context manager closes the underlying file handle as soon as
            # the RGB copy exists; `convert` returns an independent, fully
            # loaded image, so nothing below needs the file to stay open.
            with Image.open(image_path) as source_image:
                image = source_image.convert("RGB")
        except Exception as e:
            logger.error("Error loading image: %s", e)
            return "Error loading image."

        # Visual Feature Extraction (currently for logging; can be used for fusion in advanced versions)
        image_tensor = self.transform(image).unsqueeze(0).to(self.device)
        visual_features = self.visual_extractor(image_tensor)
        logger.info("Extracted visual features shape: %s", visual_features.shape)

        # OCR Extraction
        ocr_text = self.ocr_module.extract_text(image)
        logger.info("Extracted OCR text: %s", ocr_text.strip())

        # Reasoning: fuse OCR output with the question.
        return self.reasoning_module.reason(question, ocr_text)


In [9]:
# For file upload in Colab:
from google.colab import files
uploaded = files.upload()  # Upload your image file here

# Fail with a clear message instead of an IndexError further down when
# nothing was uploaded (e.g. the upload dialog was dismissed).
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose an image.")

# Get the first uploaded image file name
image_path = next(iter(uploaded))

# Define your question
# Try different questions like:
# "What is the highest value?"
# "What type of diagram is this?"
# "Describe the process flow."
# "What is the schedule?"
question = "describe process flow"

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Instantiate and run the VQA pipeline
pipeline = VQAPipeline(device=device)
answer = pipeline.process(image_path, question)
print("Predicted Answer:", answer)


Saving worked5.png to worked5 (4).png
Using device: cuda




Predicted Answer: Flow details (placeholder): Ersteller: Meyer, Prozessstandard

Matthias
OE: EOZ/2
Version: 01.00
Seite 11 von 11 Anmeldepackage erzeugen

PS_2.1_011_1075_05

Relevante EingangsgroBen fur P3+

 

10. Ablaufplan

 

 

 

 

 

 
...
