In [1]:
# Colab: install packages
!pip -q install pymupdf pillow pytesseract

# Optional OCR engine (required only if enabling OCR fallback below)
# Comment out if you do not need OCR for scanned PDFs
!apt-get -y install tesseract-ocr


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [2]:
import io, json, zipfile, pathlib
import fitz  # PyMuPDF
from google.colab import files

# Set to True to fall back to OCR for pages whose direct text extraction
# comes back empty (e.g. scanned PDFs); set to False to skip OCR entirely.
OCR_ENABLED = True

# Pillow and pytesseract are imported only when OCR is enabled; they are
# used solely by the OCR fallback path of page_text_with_optional_ocr.
if OCR_ENABLED:
    from PIL import Image
    import pytesseract

def page_text_with_optional_ocr(page):
    """Return the stripped text of one PDF page, with OCR as a fallback.

    The page's embedded text layer is tried first. If it is empty (or only
    whitespace) and ``OCR_ENABLED`` is true, the page is rendered at 200 dpi
    and passed through Tesseract.

    Args:
        page: A PyMuPDF page object (anything exposing ``get_text`` and
            ``get_pixmap`` with PyMuPDF's signatures).

    Returns:
        The extracted text with surrounding whitespace removed; an empty
        string when nothing could be extracted.
    """
    # Try direct text extraction first (PyMuPDF plain-text mode).
    txt = (page.get_text("text") or "").strip()
    if txt or not OCR_ENABLED:
        return txt

    # Render explicitly as RGB without an alpha channel so the sample buffer
    # is exactly 3 bytes per pixel, which is what PIL's "RGB" mode expects.
    # (Converting via fitz.Pixmap(fitz.csRGB, pix) changes the colorspace
    # only; it does not strip alpha.)
    pix = page.get_pixmap(dpi=200, colorspace=fitz.csRGB, alpha=False)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return (pytesseract.image_to_string(img) or "").strip()

def pdf_to_json_bytes(filename: str, file_bytes: bytes) -> bytes:
    """Convert an in-memory PDF into a UTF-8 encoded JSON document.

    Args:
        filename: Name recorded under ``"file_name"`` in the output; it is
            not used to read anything from disk.
        file_bytes: Raw bytes of the PDF file.

    Returns:
        Pretty-printed JSON (indent 2, non-ASCII characters kept as-is),
        encoded as UTF-8, with keys ``"file_name"``, ``"page_count"`` and
        ``"pages"`` (a list of ``{"page_number", "text"}`` objects, pages
        numbered from 1).

    Raises:
        Whatever ``fitz.open`` or the per-page extraction raises, e.g. when
        the bytes are not a valid PDF.
    """
    # Open as a context manager so the document is closed even if a page
    # fails to extract.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        pages = [
            {"page_number": number, "text": page_text_with_optional_ocr(page)}
            for number, page in enumerate(doc, start=1)
        ]
        # len(doc) must be read while the document is still open.
        data = {
            "file_name": filename,
            "page_count": len(doc),
            "pages": pages,
        }
    return json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")


In [6]:
# 1) Prompt for one or more PDFs
uploaded = files.upload()  # returns dict: {filename: bytes}

# 2) Convert each upload to JSON in memory and add it to a ZIP archive
zip_buf = io.BytesIO()
with zipfile.ZipFile(zip_buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    for name, content in uploaded.items():
        # Computed before the try so the error branch always names the
        # file that actually failed.
        stem = pathlib.Path(name).stem
        try:
            json_bytes = pdf_to_json_bytes(name, content)
            zf.writestr(f"{stem}.json", json_bytes)
        except Exception as e:
            # Best effort: one bad PDF must not abort the whole batch.
            # Record the failure (with exception type, since str(e) may be
            # empty) next to the successful conversions.
            zf.writestr(f"{stem}__ERROR.txt", f"{type(e).__name__}: {e}")

# 3) Save ZIP to disk and trigger download
zip_path = "converted_jsons.zip"
with open(zip_path, "wb") as f:
    f.write(zip_buf.getvalue())

files.download(zip_path)  # prompts browser download


Saving gsde1ps.pdf to gsde1ps.pdf
Saving gsde101.pdf to gsde101.pdf
Saving gsde102.pdf to gsde102.pdf
Saving gsde103.pdf to gsde103.pdf
Saving gsde104.pdf to gsde104.pdf
Saving gsde105.pdf to gsde105.pdf
Saving gsde106.pdf to gsde106.pdf
Saving gsde107.pdf to gsde107.pdf
Saving gsde108.pdf to gsde108.pdf
Saving gsde109.pdf to gsde109.pdf
Saving gsde110.pdf to gsde110.pdf
Saving gsde111.pdf to gsde111.pdf
Saving gsde112.pdf to gsde112.pdf
Saving gsde113.pdf to gsde113.pdf
Saving gsde114.pdf to gsde114.pdf
Saving gsde115.pdf to gsde115.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>