In [1]:
import fitz  # PyMuPDF
from pathlib import Path

In [2]:


# Path to your PDF
PDF_PATH = Path("spiderman_comic.pdf")

# Folder where extracted images will be saved
OUT_DIR = Path("story_pages")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Your manually identified story pages (1-based indexing)
story_pages = [1,3,4,5,6,7,8,10,11,13,15,16,17,18,19,20,21,22,24,26,27,29,31,32]

# Open the PDF
doc = fitz.open(PDF_PATH)

print(f"PDF has {len(doc)} pages total.")

# Extract each selected page
for p in story_pages:
    if 1 <= p <= len(doc):
        page = doc[p - 1]           # convert to 0-based index
        pix = page.get_pixmap(dpi=200)   # render at good quality
        out_path = OUT_DIR / f"page_{p:02d}.png"
        pix.save(out_path)
        print(f"Saved page {p} → {out_path}")
    else:
        print(f"Page {p} is out of range, skipping.")


PDF has 36 pages total.
Saved page 1 → story_pages\page_01.png
Saved page 3 → story_pages\page_03.png
Saved page 4 → story_pages\page_04.png
Saved page 5 → story_pages\page_05.png
Saved page 6 → story_pages\page_06.png
Saved page 7 → story_pages\page_07.png
Saved page 8 → story_pages\page_08.png
Saved page 10 → story_pages\page_10.png
Saved page 11 → story_pages\page_11.png
Saved page 13 → story_pages\page_13.png
Saved page 15 → story_pages\page_15.png
Saved page 16 → story_pages\page_16.png
Saved page 17 → story_pages\page_17.png
Saved page 18 → story_pages\page_18.png
Saved page 19 → story_pages\page_19.png
Saved page 20 → story_pages\page_20.png
Saved page 21 → story_pages\page_21.png
Saved page 22 → story_pages\page_22.png
Saved page 24 → story_pages\page_24.png
Saved page 26 → story_pages\page_26.png
Saved page 27 → story_pages\page_27.png
Saved page 29 → story_pages\page_29.png
Saved page 31 → story_pages\page_31.png
Saved page 32 → story_pages\page_32.png


In [1]:
pip install pillow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from PIL import Image
from pathlib import Path
import json

In [None]:
# story_pages = [1,3,4,5,6,7,8,10,11,13,15,16,17,18,19,20,21,22,24,26,27,29,31,32,]

In [14]:
!pip install ultralytics
!pip install opencv-python pillow


Collecting ultralytics
  Downloading ultralytics-8.3.231-py3-none-any.whl.metadata (37 kB)
Collecting polars>=0.20.0 (from ultralytics)
  Downloading polars-1.35.2-py3-none-any.whl.metadata (10 kB)
Collecting ultralytics-thop>=2.0.18 (from ultralytics)
  Downloading ultralytics_thop-2.0.18-py3-none-any.whl.metadata (14 kB)
Collecting polars-runtime-32==1.35.2 (from polars>=0.20.0->ultralytics)
  Downloading polars_runtime_32-1.35.2-cp39-abi3-win_amd64.whl.metadata (1.5 kB)
Downloading ultralytics-8.3.231-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ------------------- -------------------- 0.5/1.1 MB 2.4 MB/s eta 0:00:01
   ----------------------------- ---------- 0.8/1.1 MB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 1.1/1.1 MB 2.2 MB/s eta 0:00:00
Downloading polars-1.35.2-py3-none-any.whl (783 kB)
   ---------------------------------------- 0.0/783.6 kB ? eta -:--:--
   -------------------------- ------------- 

In [None]:
from ultralytics import YOLO
import cv2
from PIL import Image
from pathlib import Path
import json

# -----------------------------
# 1. Paths + Setup
# -----------------------------
MODEL_PATH = "best.pt"         # your fine-tuned panel detector
PAGES_DIR  = Path("story_pages")   # story pages extracted earlier
PANELS_DIR = Path("panels")
PANELS_DIR.mkdir(exist_ok=True)

# -----------------------------
# 2. Load the YOLO model
# -----------------------------
model = YOLO(MODEL_PATH)

# -----------------------------
# 3. Function to detect panels
# -----------------------------
def extract_panels_from_page(page_path):
    img = cv2.imread(str(page_path))

    # Run prediction
    results = model.predict(source=page_path, verbose=False)[0]

    page_id = int(page_path.stem.split("_")[1])
    page_panels = []

    # Loop through detected boxes
    for i, box in enumerate(results.boxes):
        x1, y1, x2, y2 = box.xyxy[0].tolist()
        x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])

        # Crop the panel
        crop = img[y1:y2, x1:x2]

        # Panel ID & save path
        panel_id = f"p{page_id:02d}_{i+1:02d}"
        save_path = PANELS_DIR / f"{panel_id}.png"
        cv2.imwrite(str(save_path), crop)

        # Record metadata
        page_panels.append({
            "page_id": page_id,
            "panel_id": panel_id,
            "image_path": str(save_path),
            "bbox": [x1, y1, x2, y2],
            "confidence": float(box.conf)
        })

    return page_panels

# -----------------------------
# 4. Process all story pages
# -----------------------------
all_panels = []
for page in sorted(PAGES_DIR.glob("*.png")):
    panels = extract_panels_from_page(page)
    print(f"{page.name} → {len(panels)} panels detected")
    all_panels.extend(panels)

# -----------------------------
# 5. Save metadata
# -----------------------------
meta_path = "/mnt/data/panel_metadata_basic.json"
with open(meta_path, "w") as f:
    json.dump(all_panels, f, indent=2)

print(f"\nSaved metadata for {len(all_panels)} panels → {meta_path}")


In [17]:
from pathlib import Path
import re
import json

PANELS_DIR   = Path("panels")   # change to "panels_auto" if that’s your folder
OUT_META_PATH = Path("panel_metadata.json")

records = []

for img_file in sorted(PANELS_DIR.glob("p*.png")):
    # Expect names like p04_01.png
    m = re.match(r"p(\d{2})_(\d{2})\.png", img_file.name)
    if not m:
        print("Skipping unexpected file name:", img_file.name)
        continue

    page_id = int(m.group(1))          # 4 from p04_01
    panel_idx = int(m.group(2)) - 1    # make 0-based

    records.append({
        "panel_id": img_file.stem,             # "p04_01"
        "page_id": page_id,
        "panel_index": panel_idx,
        "image_path": str(img_file),

        # bbox / confidence unknown here (can be added later if you have them)
        "bbox": None,
        "confidence": None,

        # Text fields to be filled/updated
        "short_caption": "",
        "speech_text": "",
        "story_context": "",
        "characters_in_panel": [],

        # global story position
        "order_index": None
    })

# Sort in reading order
records.sort(key=lambda r: (r["page_id"], r["panel_index"]))

# Assign global order_index
for i, rec in enumerate(records):
    rec["order_index"] = i

with OUT_META_PATH.open("w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"Saved {len(records)} panels → {OUT_META_PATH}")


Skipping unexpected file name: p05_04png.png
Saved 83 panels → panel_metadata.json


In [18]:
!pip install open_clip_torch faiss-cpu pillow tqdm


Collecting open_clip_torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting timm>=1.0.17 (from open_clip_torch)
  Downloading timm-1.0.22-py3-none-any.whl.metadata (63 kB)
Downloading open_clip_torch-3.2.0-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 2.1 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 1.9 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.0 MB/s eta 0:00:00
Downloading timm-1.0.22-py3-none-any.whl (2.5 MB)
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.5 MB ? eta -:--:--
   ------------ --------------------------- 0.8/2.5 MB 2.4 MB/s eta 0:00:01
   ----

In [None]:
# import json
# from pathlib import Path

# import torch
# from PIL import Image
# import numpy as np
# import open_clip
# from tqdm import tqdm

# # ---- 1. Load CLIP model ----
# model_name = "ViT-B-32"
# pretrained = "openai"

# model, _, preprocess = open_clip.create_model_and_transforms(
#     model_name, 
#     pretrained=pretrained
# )
# tokenizer = open_clip.get_tokenizer(model_name)

# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device)
# model.eval()

# # ---- 2. Load your enriched panel metadata ----
# META_PATH = Path("panel_metadata.json")  # change if needed

# with META_PATH.open("r", encoding="utf-8") as f:
#     panels = json.load(f)

# print("Loaded", len(panels), "panels")


open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [4]:
import json

In [5]:
with open("updated_metadata.json", "r") as f:
    meta = json.load(f) 

In [6]:
meta

[{'panel_id': 'p01_01',
  'short_caption': 'Spider-Man is ambushed by multiple illusions of Mysterio in a chaotic office scene, with papers and objects flying everywhere.',
  'speech_text': 'Is this one of Mysterio’s illusions— or am I seeing two of him?',
  'story_context': "This is the cover of the issue and sets up the central conflict: Mysterio has returned and is using elaborate illusions to confuse and overwhelm Spider-Man. The scene hints that Spider-Man will be forced to confront not just physical threats but deceptive tricks. The chaotic environment reinforces that Mysterio’s illusions disrupt Spider-Man's senses and surroundings.",
  'characters_in_panel': ['Spider-Man', 'Mysterio'],
  'text': "[speech_text] : Is this one of Mysterio’s illusions— or am I seeing two of him? And [short_caption] : Spider-Man is ambushed by multiple illusions of Mysterio in a chaotic office scene, with papers and objects flying everywhere. And [story_context] : This is the cover of the issue and 

In [None]:
# def ensure_image_paths(meta, base_dir="panels"):
#     """
#     Ensures each metadata entry has an 'image_path' key.
#     If missing, it auto-generates one using:
#         f"{base_dir}/{panel_id}.png"
    
#     Args:
#         meta (list[dict]): metadata list loaded from JSON
#         base_dir (str): folder containing panel images
#     Returns:
#         meta (list[dict]): updated metadata list
#     """

#     for entry in meta:
#         panel_id = entry.get("panel_id")  # e.g., "p01_01"
#         if panel_id is None:
#             print("⚠️  Missing panel_id in entry:", entry)
#             continue

#         # Construct the expected image path
#         expected_path = f"{base_dir}/{panel_id}.png"

#         # Add only if missing or empty
#         if "image_path" not in entry or not entry["image_path"]:
#             entry["image_path"] = expected_path

#     return meta


In [66]:
# Suppose `meta` is already loaded
meta = ensure_image_paths(meta)



In [71]:
import json

output_path = "updated_metadata.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=4, ensure_ascii=False)

print("Saved JSON to:", output_path)


Saved JSON to: updated_metadata.json


In [8]:
import torch

In [9]:
emb = torch.load("panel_clip_embeddings.pt")
panel_ids      = emb["panel_ids"]       # list[str]
image_paths    = emb["image_paths"]     # list[str]
texts          = emb["texts"]           # list[str]
image_features = emb["image_features"]  # torch.Size([N, D])
text_features  = emb["text_features"]   # torch.Size([N, D])

# 2) Load your metadata JSON
  # list of dicts, each has "panel_id"


  emb = torch.load("panel_clip_embeddings.pt")


In [10]:
print(len(panel_ids), len(image_features), len(meta))


82 82 83


In [11]:
import torch.nn.functional as F

image_features_norm = F.normalize(image_features, dim=1)
text_features_norm  = F.normalize(text_features, dim=1)


In [12]:
import faiss
import numpy as np

In [13]:
img_mat = image_features_norm.numpy().astype("float32")
text_mat = text_features_norm.numpy().astype("float32")

In [14]:
d = img_mat.shape[1]
d

512

In [15]:
# building the FAISS index
image_index = faiss.IndexFlatIP(d)
image_index.add(img_mat)

text_index = faiss.IndexFlatIP(d)
text_index.add(text_mat)

In [17]:
import numpy as np

panel_ids   = np.array(emb["panel_ids"])
image_paths = np.array(emb["image_paths"])
texts       = np.array(emb["texts"])   # enriched text field you built


In [81]:
import torch
from PIL import Image
import clip   # or from open_clip import create_model_and_transforms, etc.

device = "cuda" if torch.cuda.is_available() else "cpu"

# make sure you load the same CLIP model+preprocess you used for the DB
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

def search_images_by_text(query, k=5):
    # 1) encode query text with CLIP
    with torch.no_grad():
        tokens = clip.tokenize([query]).to(device)
        q_feat = model.encode_text(tokens)
        q_feat = q_feat / q_feat.norm(dim=-1, keepdim=True)

    # 2) convert to numpy for FAISS
    q_vec = q_feat.cpu().numpy().astype("float32")

    # 3) search in image index
    sims, idxs = image_index.search(q_vec, k)   # (1, k)

    # 4) build pretty results
    results = []
    for rank, (score, idx) in enumerate(zip(sims[0], idxs[0]), start=1):
        pid   = panel_ids[idx]
        ipath = image_paths[idx]
        base  = {
            "rank": rank,
            "score": float(score),
            "panel_id": str(pid),
            "image_path": str(ipath),
            "db_text": str(texts[idx]),
        }
        # attach rich metadata if available
        if pid in panel_meta:
            base["metadata"] = panel_meta[pid]
        results.append(base)

    return results


ModuleNotFoundError: No module named 'clip'

In [83]:
import torch
import clip

device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

print("Loaded model:", type(model))


100%|███████████████████████████████████████| 338M/338M [00:32<00:00, 11.1MiB/s]


Loaded model: <class 'clip.model.CLIP'>


In [18]:
import torch
from PIL import Image
import clip   # or from open_clip import create_model_and_transforms, etc.

device = "cuda" if torch.cuda.is_available() else "cpu"

# make sure you load the same CLIP model+preprocess you used for the DB
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

def search_images_by_text(query, k=5):
    # 1) encode query text with CLIP
    with torch.no_grad():
        tokens = clip.tokenize([query]).to(device)
        q_feat = model.encode_text(tokens)
        q_feat = q_feat / q_feat.norm(dim=-1, keepdim=True)

    # 2) convert to numpy for FAISS
    q_vec = q_feat.cpu().numpy().astype("float32")

    # 3) search in image index
    sims, idxs = image_index.search(q_vec, k)   # (1, k)

    # 4) build pretty results
    results = []
    for rank, (score, idx) in enumerate(zip(sims[0], idxs[0]), start=1):
        pid   = panel_ids[idx]
        ipath = image_paths[idx]
        base  = {
            "rank": rank,
            "score": float(score),
            "panel_id": str(pid),
            "image_path": str(ipath),
            "db_text": str(texts[idx]),
        }
        # attach rich metadata if available
        if pid in panel_meta:
            base["metadata"] = panel_meta[pid]
        results.append(base)

    return results


In [19]:
res = search_images_by_text("Spider-Man fights Mysterio with smoke everywhere", k=5)
for r in res:
    print(r["rank"], r["score"], r["panel_id"], "->", r["image_path"])


: 