### Architecture 

In [1]:
# Prerequisites: `pip install diagrams`, plus the Graphviz system package (needed for rendering).
# macOS: brew install graphviz   |  Ubuntu: sudo apt-get install -y graphviz
from pathlib import Path

from diagrams import Cluster, Diagram, Edge
from diagrams.generic.blank import Blank
from diagrams.generic.compute import Rack
from diagrams.generic.storage import Storage
from diagrams.onprem.client import Users

# Shared colour palette: every edge and cluster below looks its colour up here
# so the diagram stays visually consistent.
PAL = {
    # base colours
    "ink": "#1f2937",
    "bg": "white",
    # edge colours, keyed by pipeline stage
    "ingest": "#2563eb",       # blue: EDGAR -> raw -> converter
    "convert": "#fb923c",      # orange: converter -> pdf
    "ours": "#10b981",         # green: pdf -> our extractors
    "json": "#38bdf8",         # cyan: json out
    "csv": "#38bdf8",          # same colour as "json"
    "docling": "#f43f5e",      # red: pdf -> docling
    "docling_out": "#fb7185",  # pink: docling outputs
    "compare": "#6366f1",      # indigo: to comparator
    "store": "#16a34a",        # green: to storage
    "report": "#0ea5e9",       # sky: to report
    # cluster colours: "<name>" is the border, "<name>_bg" the fill
    "c_blue": "#60a5fa",
    "c_blue_bg": "#e5f2ff",
    "c_green": "#34d399",
    "c_green_bg": "#ecfdf5",
    "c_pink": "#f472b6",
    "c_pink_bg": "#fff1f2",
    "c_indigo": "#818cf8",
    "c_indigo_bg": "#eef2ff",
}

def _cluster_style(border: str, fill: str) -> dict:
    """Return the Graphviz attributes shared by every cluster box.

    Args:
        border: Colour of the cluster outline (Graphviz ``pencolor``).
        fill: Colour of the cluster background (Graphviz ``bgcolor``).

    Returns:
        A new dict to pass as ``Cluster(graph_attr=...)``.
    """
    return {
        "style": "rounded,filled",
        "pencolor": border,
        "bgcolor": fill,
        "labelloc": "t",   # cluster title at the top ...
        "labeljust": "l",  # ... and left-justified
    }


# Output location. The directory below is the one this script originally wrote
# to; it is still used whenever it exists so existing runs keep producing the
# file in the same place. On any other machine the image is written to the
# current working directory instead of failing on a path that is not there.
_ORIGINAL_OUT_DIR = Path("/Users/RiyanshiKedia/Desktop/pdf-parser/scripts")
_OUT_DIR = _ORIGINAL_OUT_DIR if _ORIGINAL_OUT_DIR.is_dir() else Path.cwd()

# Build and render the architecture diagram. Nodes and edges are declared in
# the same order as before, since declaration order feeds the Graphviz layout.
with Diagram(
    "SEC EDGAR → PDF → Parallel Extraction (JSON & CSV) with Docling Comparison",
    # `filename` is the output path without extension; `outformat` adds it.
    filename=str(_OUT_DIR / "sec_edgar_architecture_lr_pretty"),
    outformat="png",
    show=False,
    graph_attr={
        "rankdir": "LR",  # left-to-right flow
        "fontsize": "12",
        "bgcolor": PAL["bg"],
        "pad": "0.25",
        "splines": "spline",
        "fontname": "Helvetica",
    },
    node_attr={
        "shape": "box",
        "style": "rounded,filled",
        "fontname": "Helvetica",
        "fontsize": "11",
        "color": PAL["ink"],
        "fillcolor": "white",
        "penwidth": "1.6",
    },
    edge_attr={"penwidth": "2"},
):
    # NOTE(review): this node is never connected to anything in this block, so
    # it is drawn unattached -- confirm that is intended.
    dev = Users("Developer")

    # ---------- DATA LOAD ----------
    with Cluster(
        "DATA LOAD",
        graph_attr=_cluster_style(PAL["c_blue"], PAL["c_blue_bg"]),
    ):
        edgar = Storage("EDGAR Library\n(edgartools / sec-edgar-downloader)")
        raw = Storage("Raw Filing\n(Data_load.ipynb)")
        converter = Rack("PDF Converter\n(HTML/TXT → PDF)")
        pdfdoc = Storage("PDF Document")

        # Linear ingest chain: EDGAR -> raw filing -> converter -> PDF.
        (
            edgar
            >> Edge(color=PAL["ingest"], minlen="2") >> raw
            >> Edge(color=PAL["ingest"], minlen="2") >> converter
            >> Edge(color=PAL["convert"], minlen="2") >> pdfdoc
        )

    # ---------- OUR EXTRACTION (parallel) ----------
    with Cluster(
        "OUR EXTRACTION (parallel)",
        graph_attr=_cluster_style(PAL["c_green"], PAL["c_green_bg"]),
    ):
        text_ext = Rack("Text Extraction → JSON\n(pdfplumber)\nfile: simple_text_extractor.ipynb")
        json_out = Storage("parsed/MSFT")
        table_ext = Rack("Table Extraction → CSV\n(geometry + rulings)\nfile: tabula_extraction.ipynb")
        csv_out = Storage("our_tables.csv")

        # The PDF fans out to two branches: text -> JSON and tables -> CSV.
        (
            pdfdoc
            >> Edge(color=PAL["ours"], minlen="2") >> text_ext
            >> Edge(color=PAL["json"], minlen="2") >> json_out
        )
        (
            pdfdoc
            >> Edge(color=PAL["ours"], minlen="2") >> table_ext
            >> Edge(color=PAL["csv"], minlen="2") >> csv_out
        )

    # ---------- DOCLING BASELINE ----------
    with Cluster(
        "DOCLING BASELINE",
        graph_attr=_cluster_style(PAL["c_pink"], PAL["c_pink_bg"]),
    ):
        docling = Rack("Docling Engine")
        docling_csv = Storage("docling_tables.csv")
        docling_json = Storage("docling.json")

        # The edge into the Docling engine is dashed; its two output edges are solid.
        pdfdoc >> Edge(color=PAL["docling"], style="dashed", minlen="2") >> docling
        docling >> Edge(color=PAL["docling_out"]) >> docling_csv
        docling >> Edge(color=PAL["docling_out"]) >> docling_json

    # ---------- Comparator / Evaluator ----------
    with Cluster(
        "Comparator / Evaluator",
        graph_attr=_cluster_style(PAL["c_indigo"], PAL["c_indigo_bg"]),
    ):
        comparator = Rack(
            "Compare OUR JSON/CSV\nvs DOCLING JSON/CSV\n→ Diffs & Quality Metrics"
        )

        # All four artefacts feed the comparator; the Docling inputs are dashed.
        json_out >> Edge(color=PAL["compare"], minlen="2") >> comparator
        csv_out >> Edge(color=PAL["compare"], minlen="2") >> comparator
        docling_json >> Edge(color=PAL["compare"], style="dashed") >> comparator
        docling_csv >> Edge(color=PAL["compare"], style="dashed") >> comparator

    # ---------- Outputs ----------
    storage = Storage("Object Storage (S3 / GCS / Azure)\n• PDFs • JSON • CSV • Reports")
    report = Storage("Comparison Report (HTML/JSON)")

    comparator >> Edge(color=PAL["store"], minlen="2") >> storage
    comparator >> Edge(color=PAL["report"], minlen="2") >> report
