In [2]:
# PDF Metadata Reader
# Install dependencies if needed (uncomment on first run)
# %pip install PyMuPDF

import os
from typing import Any, Dict, List

# Import PyMuPDF under the name ``fitz``. The legacy module name is tried
# first; if it is missing, ``pymupdf`` is imported and bound to the same
# ``fitz`` alias so the rest of the notebook can use a single name.
try:
    import fitz  # PyMuPDF
except ImportError:
    try:
        import pymupdf as fitz
    except ImportError as exc:
        # Neither module name is importable: raise one actionable error,
        # chained to the underlying ImportError.
        raise RuntimeError("PyMuPDF is required. Install with `pip install PyMuPDF`.\n" ) from exc

# Print the module docstring as a version banner; falls back to the literal
# "loaded" when the module has no ``__doc__`` attribute.
print("PyMuPDF version:", getattr(fitz, "__doc__", "loaded"))


PyMuPDF version: PyMuPDF 1.26.3: Python bindings for the MuPDF 1.26.3 library (rebased implementation).
Python 3.11 running on win32 (64-bit).



In [4]:
from datetime import datetime
from typing import Optional, Tuple
import os

def _safe_parse_pdf_date(raw: Optional[str]) -> Optional[str]:
    """Parse PDF date strings like D:YYYYMMDDHHmmSSZ and return ISO 8601 string.
    Returns the original string if parsing fails.
    """
    if not raw:
        return None
    try:
        s = raw
        if s.startswith("D:"):
            s = s[2:]
        # strip timezone suffixes like Z or -05'00'
        tz_pos = None
        for ch in ["Z", "+", "-"]:
            p = s.find(ch)
            if p != -1:
                tz_pos = p
                break
        core = s if tz_pos is None else s[:tz_pos]
        # pad to seconds
        core = core.ljust(14, "0")
        dt = datetime.strptime(core[:14], "%Y%m%d%H%M%S")
        return dt.isoformat()
    except Exception:
        return raw


def extract_pdf_metadata(path: str) -> dict:
    """Collect the info dictionary, XMP packet, page geometry and attachments of a PDF.

    Parameters
    ----------
    path : str
        Location of the PDF on disk.

    Returns
    -------
    dict
        - ``basic_info``: title, author, subject, keywords, creator, producer,
          creation_date, mod_date (both passed through
          ``_safe_parse_pdf_date``), trapped, format, encryption.
        - ``page_count``: number of pages.
        - ``xmp_present`` / ``xmp_xml``: whether an XMP packet was returned,
          and the raw XML string (``None`` when absent or unreadable).
        - ``pages``: one dict per page with ``index``, ``width_pt`` and
          ``height_pt`` (taken from the page's mediabox), ``rotation`` and
          ``approx_colorspace``.
        - ``colorspaces_detected``: sorted list of the distinct
          ``approx_colorspace`` values.
        - ``attachments_count`` / ``embedded_files``: embedded file names.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)

    doc = fitz.open(path)
    try:
        info = doc.metadata or {}

        # XMP is optional; treat any failure to read it as "no XMP".
        try:
            xmp_xml = doc.get_xml_metadata() or None
        except Exception:
            xmp_xml = None

        # Per-page metrics
        pages = []
        colorspaces_seen = set()
        for i in range(doc.page_count):
            page = doc.load_page(i)
            mediabox = page.mediabox
            # Label the page by the channel count of a tiny rendered pixmap.
            # NOTE(review): get_pixmap() renders into the colorspace it is
            # asked for (presumably RGB when none is passed), so this likely
            # reflects the render target rather than the colorspaces used by
            # the page content -- confirm against the PyMuPDF docs before
            # relying on it.
            try:
                pix = page.get_pixmap(dpi=36, alpha=False)
                mode = "RGB" if pix.n in (3,) else "GRAY" if pix.n == 1 else "CMYK" if pix.n == 4 else f"N{pix.n}"
                colorspaces_seen.add(mode)
            except Exception:
                mode = None
            pages.append({
                "index": i,
                "width_pt": float(mediabox.width),
                "height_pt": float(mediabox.height),
                "rotation": int(page.rotation),
                "approx_colorspace": mode,
            })

        # Attachments / embedded files.
        # ``embfile_names`` is the current (snake_case) PyMuPDF name; the
        # camelCase ``embeddedFileNames`` alias is deprecated and may be
        # missing. Calling only the old name inside a blanket ``except``
        # would silently report zero attachments, so prefer the current name
        # and fall back to the alias for old installations.
        list_embedded = (
            getattr(doc, "embfile_names", None)
            or getattr(doc, "embeddedFileNames", None)
        )
        attachments = []
        if list_embedded is not None:
            try:
                attachments = list(list_embedded())
            except Exception:
                # Best-effort: a failure to enumerate attachments must not
                # discard the rest of the metadata.
                attachments = []

        return {
            "basic_info": {
                "title": info.get("title"),
                "author": info.get("author"),
                "subject": info.get("subject"),
                "keywords": info.get("keywords"),
                "creator": info.get("creator"),
                "producer": info.get("producer"),
                "creation_date": _safe_parse_pdf_date(info.get("creationDate")),
                "mod_date": _safe_parse_pdf_date(info.get("modDate")),
                "trapped": info.get("trapped"),
                "format": info.get("format"),
                "encryption": info.get("encryption"),
            },
            "page_count": doc.page_count,
            "xmp_present": bool(xmp_xml),
            "xmp_xml": xmp_xml,
            "pages": pages,
            "colorspaces_detected": sorted(colorspaces_seen),
            "attachments_count": len(attachments),
            "embedded_files": attachments,
        }
    finally:
        doc.close()


In [5]:
# Example: point PDF_PATH at a local PDF, then run the extraction.
# Windows paths are easiest as raw strings, e.g. r"C:\Users\yourname\Documents\file.pdf"

PDF_PATH = r"C:\Users\burag\Downloads\cut line.pdf"  # <- change to your file

if not os.path.exists(PDF_PATH):
    print("Update PDF_PATH to a valid file path.")
else:
    # `data` is reused by the preview and export cells further down.
    data = extract_pdf_metadata(PDF_PATH)
    print("Page count:", data["page_count"])
    print("Has XMP:", data["xmp_present"])


Page count: 1
Has XMP: True


In [6]:
# Pretty-print full metadata as JSON
import json

try:
    # Probe for the extraction result; raises NameError when it was never produced.
    data
except NameError:
    print("Run the previous cell after setting PDF_PATH.")
else:
    # The XMP packet can be very long, so only the first 5000 characters are shown.
    print(json.dumps(data, indent=2, ensure_ascii=False)[:5000])  # print first 5000 chars


{
  "basic_info": {
    "title": "cut line",
    "author": "",
    "subject": "",
    "keywords": "",
    "creator": "Adobe Illustrator 29.7 (Macintosh)",
    "producer": "Adobe PDF library 17.00",
    "creation_date": "2025-08-25T19:07:48",
    "mod_date": "2025-08-25T19:07:48",
    "trapped": "",
    "format": "PDF 1.4",
    "encryption": null
  },
  "page_count": 1,
  "xmp_present": true,
  "xmp_xml": "<?xpacket begin=\"﻿\" id=\"W5M0MpCehiHzreSzNTczkc9d\"?>\n<x:xmpmeta xmlns:x=\"adobe:ns:meta/\" x:xmptk=\"Adobe XMP Core 9.1-c003 1.000000, 0000/00/00-00:00:00        \">\n   <rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">\n      <rdf:Description rdf:about=\"\"\n            xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n            xmlns:xmp=\"http://ns.adobe.com/xap/1.0/\"\n            xmlns:xmpGImg=\"http://ns.adobe.com/xap/1.0/g/img/\"\n            xmlns:xmpMM=\"http://ns.adobe.com/xap/1.0/mm/\"\n            xmlns:stRef=\"http://ns.adobe.com/xap/1.0/sType/Resourc

In [7]:
# Optional export, written next to the source PDF:
#   <name>_metadata.json - the full metadata dict as JSON
#   <name>_xmp.xml       - the raw XMP packet (only when one was found)

OUTPUT_JSON, OUTPUT_XMP = (
    os.path.splitext(PDF_PATH)[0] + suffix
    for suffix in ("_metadata.json", "_xmp.xml")
)

if 'data' not in globals():
    print("No data to export. Run extraction first.")
else:
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print("Saved:", OUTPUT_JSON)

    if not data.get('xmp_xml'):
        print("No XMP metadata found; nothing saved for XML.")
    else:
        with open(OUTPUT_XMP, 'w', encoding='utf-8') as f:
            f.write(data['xmp_xml'])
        print("Saved:", OUTPUT_XMP)


Saved: C:\Users\burag\Downloads\cut line_metadata.json
Saved: C:\Users\burag\Downloads\cut line_xmp.xml


In [None]:
# pip install pypdf reportlab
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.units import mm
from reportlab.lib.colors import SpotColor
import io

def cutcontour_overlay(width_pt, height_pt,
                       inset_mm=5, corner_radius_mm=0,
                       line_w_pt=0.2,
                       spot_name="CutContour", tint=1.0):
    """Build a one-page PDF holding a rectangular die line stroked in a spot colour.

    Parameters
    ----------
    width_pt, height_pt : float
        Page size of the overlay, in points.
    inset_mm : float
        Distance of the die line from every page edge, in millimetres.
    corner_radius_mm : float
        Corner radius in millimetres; ``0`` (or less) draws square corners.
        Values larger than half the shorter side of the die line are clamped
        to that half.
    line_w_pt : float
        Stroke width in points.
    spot_name : str
        Name of the spot colour (case-sensitive).
    tint : float
        Tint (density) of the spot colour, passed as ``density``.

    Returns
    -------
    bytes
        The serialized single-page PDF.

    Raises
    ------
    ValueError
        If the inset leaves no drawable area on the page.
    """
    inset = inset_mm * mm
    x = y = inset
    w = width_pt - 2 * inset
    h = height_pt - 2 * inset
    if w <= 0 or h <= 0:
        raise ValueError(
            f"inset_mm={inset_mm} leaves no room for a die line on a "
            f"{width_pt} x {height_pt} pt page"
        )
    # A radius above half the shorter side would make the corner arcs overlap.
    r = min(corner_radius_mm * mm, w / 2, h / 2)

    # Imported here so this function does not depend on the cell-level colour
    # import. NOTE(review): ReportLab's documented spot-colour classes are
    # CMYKColorSep / PCMYKColorSep; `reportlab.lib.colors` does not appear to
    # provide the `SpotColor` name imported at the top of this cell, so that
    # import line presumably needs removing -- confirm against the installed
    # ReportLab.
    from reportlab.lib.colors import CMYKColorSep

    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=(width_pt, height_pt))
    c.setLineWidth(line_w_pt)

    # Spot colour named exactly as the RIP expects (case-sensitive). The CMYK
    # values are only the alternate (preview) colour; 100% magenta is an
    # arbitrary, easy-to-spot choice.
    cut_spot = CMYKColorSep(0, 1, 0, 0, spotName=spot_name, density=tint)
    c.setStrokeColor(cut_spot)

    p = c.beginPath()
    if r > 0:
        # Rounded rectangle traced counter-clockwise from the bottom edge:
        # each straight side is followed by a 90-degree arc whose bounding
        # box is the 2r x 2r square sitting in that corner.
        p.moveTo(x + r, y)
        p.lineTo(x + w - r, y)
        p.arcTo(x + w - 2*r, y, x + w, y + 2*r, startAng=270, extent=90)
        p.lineTo(x + w, y + h - r)
        p.arcTo(x + w - 2*r, y + h - 2*r, x + w, y + h, startAng=0, extent=90)
        p.lineTo(x + r, y + h)
        p.arcTo(x, y + h - 2*r, x + 2*r, y + h, startAng=90, extent=90)
        p.lineTo(x, y + r)
        p.arcTo(x, y, x + 2*r, y + 2*r, startAng=180, extent=90)
        p.close()
    else:
        # Simple rectangle
        p.rect(x, y, w, h)

    # Stroke only, no fill
    c.drawPath(p, stroke=1, fill=0)

    c.showPage()
    c.save()
    return buf.getvalue()

def add_cutcontour_to_pdf(in_pdf, out_pdf, inset_mm=5, corner_radius_mm=0,
                          line_w_pt=0.2, spot_name="CutContour", tint=1.0):
    """Stamp a spot-colour die line onto every page of a PDF and save the result.

    For each page an overlay sized to the page's MediaBox is produced by
    ``cutcontour_overlay`` and merged on top of the page content.

    Parameters
    ----------
    in_pdf : str or file-like
        Source PDF, passed to ``PdfReader``.
    out_pdf : str
        Path the resulting PDF is written to.
    inset_mm, corner_radius_mm, line_w_pt, spot_name, tint
        Forwarded unchanged to ``cutcontour_overlay``.
    """
    reader = PdfReader(in_pdf)
    writer = PdfWriter()

    # Overlay bytes keyed by page size, so documents with many equally sized
    # pages build each overlay only once.
    overlay_cache = {}

    for page in reader.pages:
        box = page.mediabox
        size = (float(box.width), float(box.height))
        if size not in overlay_cache:
            overlay_cache[size] = cutcontour_overlay(
                size[0], size[1], inset_mm, corner_radius_mm,
                line_w_pt, spot_name, tint,
            )

        # A fresh reader per page keeps each merged overlay page independent.
        overlay_page = PdfReader(io.BytesIO(overlay_cache[size])).pages[0]

        # The overlay is drawn from (0, 0), but a MediaBox may start elsewhere;
        # shift the overlay so the die line stays inset from the real edges.
        left, bottom = float(box.left), float(box.bottom)
        if left or bottom:
            from pypdf import Transformation
            overlay_page.add_transformation(
                Transformation().translate(tx=left, ty=bottom)
            )

        page.merge_page(overlay_page)
        writer.add_page(page)

    with open(out_pdf, "wb") as f:
        writer.write(f)

# Example:
# add_cutcontour_to_pdf("input.pdf", "with_die_line.pdf",
#                       inset_mm=5, corner_radius_mm=3, line_w_pt=0.2,
#                       spot_name="CutContour", tint=1.0)
