In [None]:
!pip install PymuPDF

Collecting PymuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PymuPDF
Successfully installed PymuPDF-1.26.3


In [None]:
import fitz
import re
import os
import json
from IPython.display import Image, display

# --- Configuration ---------------------------------------------------------
pdf_path = '/content/IMO class 1 Maths Olympiad Sample Paper 1 for the year 2024-25.pdf'
image_folder = "images"
output_json = "questions_with_images.json"

# A line opens a new question when it starts with "<digits>. " (e.g. "12. ").
QUESTION_START = re.compile(r"\d+\.\s")

# Heuristic used below: when a question owns at least this many images, the
# trailing ones are taken to be the answer-option pictures.
OPTIONS_PER_QUESTION = 4


def _find_question_lines(page):
    """Return the question-opening lines on `page`, ordered top to bottom.

    Each entry is ``{"text": <joined span text>, "y": <top edge of the line>}``.

    The anchor is the bbox of the matching *line*, not of its enclosing block:
    a single text block can hold several questions, and using the block's top
    edge would give all of them the same ``y`` and collapse their image ranges.
    """
    found = []
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # only text blocks (type 0) carry "lines"
            continue
        for line in block["lines"]:
            text = " ".join(span["text"] for span in line["spans"]).strip()
            if QUESTION_START.match(text):
                found.append({"text": text, "y": line["bbox"][1]})

    # Blocks come back in file order, which is not guaranteed to be visual
    # order; the y-range assignment below needs ascending anchors.
    found.sort(key=lambda q: q["y"])
    return found


def _save_page_images(doc, page, page_num, folder):
    """Write every image referenced by `page` into `folder`.

    Returns a list of ``{"path": <saved file>, "y": <top edge on the page>}``.
    An image for which no placement rectangle is reported gets ``y = 0``.
    """
    saved = []
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        base_image = doc.extract_image(xref)
        img_ext = base_image["ext"]
        img_filename = f"page{page_num+1}_img{img_index+1}.{img_ext}"
        img_path = os.path.join(folder, img_filename)

        with open(img_path, "wb") as f:
            f.write(base_image["image"])

        # Only the first placement of the image on the page is considered.
        rects = page.get_image_rects(xref)
        saved.append({"path": img_path, "y": rects[0].y0 if rects else 0})
    return saved


def _split_question_and_option_images(paths):
    """Split a question's images into ``(question_imgs, option_imgs)``.

    With OPTIONS_PER_QUESTION or more images, the last OPTIONS_PER_QUESTION
    are the options and the rest belong to the question stem; with fewer,
    everything belongs to the stem and there are no option images.
    """
    if len(paths) >= OPTIONS_PER_QUESTION:
        return paths[:-OPTIONS_PER_QUESTION], paths[-OPTIONS_PER_QUESTION:]
    return paths, []


# --- Extraction ------------------------------------------------------------
# `doc` is intentionally left open so it stays usable from other cells;
# call doc.close() once you are done with it.
doc = fitz.open(pdf_path)
os.makedirs(image_folder, exist_ok=True)

output_data = []

for page_num, page in enumerate(doc):
    print(f"📄 Processing page {page_num + 1}")

    question_blocks = _find_question_lines(page)
    image_data = _save_page_images(doc, page, page_num, image_folder)

    for i, q in enumerate(question_blocks):
        # A question owns the vertical band from its own first line down to
        # the next question's first line (or the end of the page). Images
        # located above the first question on a page are attached to nothing.
        q_start_y = q["y"]
        q_end_y = question_blocks[i + 1]["y"] if i + 1 < len(question_blocks) else float('inf')

        related_images = [
            img["path"] for img in image_data
            if q_start_y <= img["y"] < q_end_y
        ]
        question_imgs, option_imgs = _split_question_and_option_images(related_images)

        output_data.append({
            "question": q["text"],
            # NOTE(review): only the first stem image is kept, as before;
            # any further stem images are saved to disk but not referenced.
            "images": question_imgs[0] if question_imgs else None,
            "option_images": option_imgs
        })

with open(output_json, "w") as f:
    json.dump(output_data, f, indent=2)


📄 Processing page 1
📄 Processing page 2
📄 Processing page 3
📄 Processing page 4
📄 Processing page 5
📄 Processing page 6
📄 Processing page 7
📄 Processing page 8
📄 Processing page 9
📄 Processing page 10
📄 Processing page 11
📄 Processing page 12
📄 Processing page 13
📄 Processing page 14


In [None]:
import shutil

# Bundle the extracted images directory into images.zip in the working
# directory; the returned archive path is shown as the cell's output.
shutil.make_archive(base_name="images", format="zip", root_dir="images")

'/content/images.zip'