In [2]:
%load_ext autoreload
%autoreload 2

import os, shutil
import fitz  
import tqdm

import polars as pl
import pandas as pd
from pymupdf4llm import to_markdown

from scripts.ocr_text import (
    is_scanned_pdf, 
    process_pdf_pipeline, 
    analyze_markdown_header_hierarchy
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def analyze_pdf(pdf_path):
    """Analyze a single PDF file and return a dict with its properties"""
    record = {
        'filename': os.path.basename(pdf_path),
        'file_size_mb': round(os.path.getsize(pdf_path) / (1024*1024), 3)
    }
    try:
        doc = fitz.open(pdf_path)
        record['page_count'] = len(doc)
        toc = doc.get_toc()
        record['has_toc'] = bool(toc)
        record['toc_entries'] = len(toc) if toc else 0
        record['is_scanned'] = is_scanned_pdf(pdf_path)
        # --- Header structure analysis ---
        tmp_pdf_path = './temp_ocr/tmp.pdf'
        shutil.copy(pdf_path, tmp_pdf_path)
        md_full = process_pdf_pipeline(tmp_pdf_path)
        record['markdown_status'] = 'success' if md_full.strip() else 'empty'

        # Add header analysis results to record
        record.update(analyze_markdown_header_hierarchy(md_full))
    except Exception as e:
        record['page_count'] = None
        record['has_toc'] = None
        record['toc_entries'] = None
        record['is_scanned'] = None
        record['markdown_status'] = 'error'
        record['header_level_counts'] = None
        record['header_assessment'] = str(e)
    return record
# PDF Folder with 383 pdfs
pdf_folder = "383-pdfs"  

In [11]:
# List all PDFs in the folder, analyze each, store results
records = []
file_list = sorted([f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')])

for fname in tqdm.tqdm(file_list):
    fpath = os.path.join(pdf_folder, fname)
    rec = analyze_pdf(fpath)
    records.append(rec)

 99%|█████████▉| 381/383 [27:21<00:08,  4.11s/it]

📄 Detected born-digital PDFd margins saved: output_markdown/tmp.md

100%|█████████▉| 382/383 [27:27<00:04,  4.51s/it]

📄 Detected born-digital PDFd margins saved: output_markdown/tmp.md

100%|██████████| 383/383 [27:29<00:00,  4.31s/it]

✅ Markdown with hierarchy and margins saved: output_markdown/tmp.md




In [25]:
df = pd.DataFrame(records)
df.to_csv("pdfs_analysis_table.csv", index=False)
print("✅ Saved detail table to pdfs_analysis_table.csv")
df.shape

✅ Saved detail table to pdfs_analysis_table.csv


(383, 15)

In [26]:
df.describe(include="all")

Unnamed: 0,filename,file_size_mb,page_count,has_toc,toc_entries,is_scanned,markdown_status,hdr_level_2,hdr_level_3,assessment,hdr_level_1,hdr_level_4,hdr_level_5,header_level_counts,header_assessment
count,383,383.0,381.0,381,381.0,381,383,282.0,176.0,381,274.0,31.0,1.0,0.0,2
unique,383,,,2,,1,3,,,6,,,,,1
top,12936_2015_Article_885_pdf.pdf,,,True,,False,success,,,Header hierarchy appears reasonable.,,,,,"Choose only one of --force-ocr, --skip-text, -..."
freq,1,,,203,,381,370,,,201,,,,,2
mean,,0.946833,12.984252,,13.15748,,,7.87234,9.193182,,4.354015,4.612903,2.0,,
std,,2.646645,14.468287,,14.243979,,,6.255362,13.501626,,5.270924,2.800922,,,
min,,0.033,1.0,,0.0,,,1.0,1.0,,1.0,1.0,2.0,,
25%,,0.243,7.0,,0.0,,,1.0,2.0,,1.0,2.5,2.0,,
50%,,0.477,9.0,,8.0,,,7.0,7.0,,1.0,4.0,2.0,,
75%,,0.801,13.0,,26.0,,,12.0,12.0,,9.0,6.0,2.0,,


In [53]:
# We can use this as error counts and try to tune the parameters to reduce these counts
df.groupby('has_toc')["assessment"].value_counts().sort_index()

has_toc  assessment                                                                                   
False    Header hierarchy appears reasonable.                                                              91
         No headers found.                                                                                 24
         No top-level (#) header found. Headers do not start at top level.                                  4
         No top-level (#) header found. Headers do not start at top level. Only one header level used.      1
         Only one header level used.                                                                       58
True     Header hierarchy appears reasonable.                                                             110
         Header levels are too deeply nested.                                                              12
         No headers found.                                                                                 11
         No top-l

In [None]:
# Which header structure is most frequent among the docs
# Something like this is faulty: True         False        True         False        False

df[["hdr_level_1", "hdr_level_2", "hdr_level_3", "hdr_level_4", "hdr_level_5"]].dropna(how='all').fillna(0).map(bool).value_counts()

hdr_level_1  hdr_level_2  hdr_level_3  hdr_level_4  hdr_level_5
True         True         False        False        False          103
                          True         False        False           95
             False        False        False        False           61
False        True         True         False        False           48
                                       True         False           19
True         True         True         True         False           10
False        True         False        False        False            5
True         False        True         False        False            3
             True         False        True         False            1
                          True         True         True             1
Name: count, dtype: int64