In [None]:
"""
This project parses a PDF file and converts it into a **structured JSON** format.  
It preserves the **page hierarchy**, detects **sections, sub-sections, paragraphs, bullet points, tables, and charts/images**, 
and organizes them in a machine-readable structure.
"""

In [2]:
import fitz
import pdfplumber
import json


def extract_text_and_tables(pdf_path, page_num):
    """
    Extract one page's text and tables as a hierarchy of sections.

    Every non-blank text line is classified by the first matching rule:

    1. starts with a bullet marker ("•", "●" or "-")   -> bullet point
    2. all upper-case and at most four words            -> new section
    3. contains "|" or ":"                              -> new sub-section
    4. anything else                                    -> paragraph text

    Consecutive paragraph lines are joined with single spaces into one
    paragraph; a blank line or any other line kind ends the paragraph.
    Bullets are tested first so that a bulleted line that happens to contain
    a colon or be upper-case (e.g. "• Revenue: 20%") is not promoted to a
    heading. Content found before any section heading is stored under a
    section named "DEFAULT". Tables are appended after the text, to the last
    sub-section of the page if there is one, otherwise to the last section.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        page_num: 1-based number of the page to process.

    Returns:
        A list of section dicts of the form
        ``{"section": str, "sub_sections": [...], "content": [...]}``.
        Each sub-section is ``{"sub_section": str, "content": [...]}``.
        Content items are ``{"type": "paragraph" | "bullet_point", "text": str}``
        or ``{"type": "table", "description": str, "table_data": ...}`` where
        ``table_data`` is the table as returned by ``page.extract_tables()``.

    Raises:
        IndexError: If ``page_num`` is smaller than 1 or larger than the
            number of pages in the document.
    """
    if page_num < 1:
        # pdf.pages[page_num - 1] would otherwise wrap around and silently
        # return a page counted from the end of the document.
        raise IndexError(f"page_num must be >= 1, got {page_num}")

    bullet_markers = ("•", "●", "-")
    sections = []
    current_section = None
    current_sub_section = None
    paragraph_buffer = []  # lines waiting to be merged into one paragraph

    def ensure_section():
        """Return the current section, creating the DEFAULT one if needed."""
        nonlocal current_section
        if current_section is None:
            current_section = {
                "section": "DEFAULT",
                "sub_sections": [],
                "content": []
            }
            sections.append(current_section)
        return current_section

    def content_target():
        """Return the dict (sub-section if any, else section) receiving content."""
        section = ensure_section()
        return current_sub_section if current_sub_section is not None else section

    def flush_paragraph():
        """Save the accumulated lines as one paragraph and reset the buffer."""
        if paragraph_buffer:
            content_target()["content"].append({
                "type": "paragraph",
                "text": " ".join(paragraph_buffer)
            })
            paragraph_buffer.clear()

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num - 1]

        # Extract once: the call re-analyses the page layout every time.
        text = page.extract_text()
        lines = text.split("\n") if text else []

        for line in lines:
            clean_line = line.strip()
            if not clean_line:
                flush_paragraph()
                continue

            # ---- Bullets (checked before headings, see docstring) ----
            if clean_line.startswith(bullet_markers):
                flush_paragraph()
                content_target()["content"].append({
                    "type": "bullet_point",
                    "text": clean_line.lstrip("•●- ").strip()
                })
                continue

            # ---- Detect Section ----
            if clean_line.isupper() and len(clean_line.split()) <= 4:
                flush_paragraph()
                current_section = {
                    "section": clean_line,
                    "sub_sections": [],
                    "content": []
                }
                sections.append(current_section)
                current_sub_section = None
                continue

            # ---- Detect Sub-Section ----
            if "|" in clean_line or ":" in clean_line:
                flush_paragraph()
                parent = ensure_section()
                current_sub_section = {
                    "sub_section": clean_line,
                    "content": []
                }
                parent["sub_sections"].append(current_sub_section)
                continue

            # ---- Add to paragraph buffer ----
            paragraph_buffer.append(clean_line)

        # ---- Flush any remaining paragraph ----
        flush_paragraph()

        # ---- Tables ----
        for idx, table in enumerate(page.extract_tables(), start=1):
            content_target()["content"].append({
                "type": "table",
                "description": f"Table {idx} detected",
                "table_data": table
            })

    return sections


def extract_images(doc, page_num):
    """
    List the image blocks found on one page of a PyMuPDF document.

    Args:
        doc: Opened document; indexed with the 0-based page index
            ``page_num - 1``.
        page_num: 1-based page number.

    Returns:
        One dict per block whose ``"type"`` is 1 in the page's
        ``get_text("dict")`` output, in block order. Each dict has the keys
        ``"type"`` ("chart"), ``"description"`` and ``"bbox"``
        (``[x0, y0, x1, y1]`` copied from the block's bounding box).
    """
    layout = doc[page_num - 1].get_text("dict")

    # Only blocks of type 1 (image blocks) are reported; others are skipped
    # before their bbox is ever read.
    image_boxes = (blk["bbox"] for blk in layout["blocks"] if blk["type"] == 1)

    return [
        {
            "type": "chart",
            "description": "Image/Chart detected",
            "bbox": [left, top, right, bottom],
        }
        for left, top, right, bottom in image_boxes
    ]


def pdf_to_structured_json(pdf_path, output_json):
    """
    Convert a whole PDF into structured JSON and write it to disk.

    For every page the result holds the 1-based page number, the sections
    returned by ``extract_text_and_tables`` and the image blocks returned by
    ``extract_images``. The collected data is written as UTF-8 JSON with
    4-space indentation and non-ASCII characters kept unescaped, and a
    confirmation line is printed afterwards.

    Args:
        pdf_path: Path of the PDF file to read.
        output_json: Path of the JSON file to create or overwrite.
    """
    data = {"pages": []}

    # Use the document as a context manager so it is closed even when
    # extraction of some page raises; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num in range(1, len(doc) + 1):
            page_dict = {
                "page_number": page_num,
                "sections": extract_text_and_tables(pdf_path, page_num),
                "charts": extract_images(doc, page_num)
            }
            data["pages"].append(page_dict)

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"✅ Structured JSON saved to {output_json}")


if __name__ == "__main__":
    # Script entry point: convert the sample PDF at a fixed relative path and
    # write the result next to the working directory.
    pdf_file, output_file = "ASSIGNMENT/Sample_pdf.pdf", "structured_output.json"
    pdf_to_structured_json(pdf_path=pdf_file, output_json=output_file)

✅ Structured JSON saved to structured_output.json
