<a href="https://colab.research.google.com/github/Debkanta837/adobe-hackathon/blob/main/problem1A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ==============================================================================
#  SETUP: INSTALL LIBRARIES
# ==============================================================================
# We use PyMuPDF (fitz) for its speed and detailed access to document layout,
# including the coordinates (bounding box) of text.
!pip install pymupdf -q

# ==============================================================================
#  IMPORT NECESSARY MODULES
# ==============================================================================
import fitz  # PyMuPDF library
import json
from google.colab import files
from collections import Counter
import statistics

# ==============================================================================
#  CORE LOGIC: ADVANCED PDF OUTLINE EXTRACTION
# ==============================================================================
def extract_structured_outline_advanced(pdf_path):
    """
    Analyzes a PDF using layout and style heuristics to extract its structure.

    This advanced logic identifies headings based on a combination of rules:
    1.  Word Count: A heading must have 10 words or fewer.
    2.  Vertical Spacing: It must have significant space both above and below it,
        unless it's the first line on a page (then only space below is required).
    3.  Style and Size: Font size and boldness are used as positive signals and
        to classify the heading level (Title, H1, H2, H3).
    """
    try:
        document = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error: Could not open PDF '{pdf_path}'. Reason: {e}")
        return None

    all_lines = []
    # --- Step 1: Extract all lines with their properties ---
    for page_num, page in enumerate(document):
        page_dict = page.get_text("dict")
        for block in page_dict.get("blocks", []):
            if "lines" in block:
                for line in block["lines"]:
                    line_text = "".join(span["text"] for span in line["spans"]).strip()
                    if not line_text:
                        continue

                    font_size = round(line["spans"][0]["size"])
                    is_bold = "bold" in line["spans"][0]["font"].lower()

                    all_lines.append({
                        "text": line_text,
                        "size": font_size,
                        "page": page_num + 1,
                        "bold": is_bold,
                        "bbox": line["bbox"]
                    })

    if not all_lines:
        print("Warning: No text could be extracted.")
        return None

    # --- Step 2: Calculate the typical line spacing for the document ---
    vertical_spacings = []
    for i in range(len(all_lines) - 1):
        current_line = all_lines[i]
        next_line = all_lines[i+1]
        if current_line["page"] == next_line["page"]:
            spacing = next_line["bbox"][1] - current_line["bbox"][3]
            if 0 < spacing < 25:
                vertical_spacings.append(spacing)

    if not vertical_spacings:
        typical_spacing = 5
    else:
        try:
            typical_spacing = statistics.mode(round(s) for s in vertical_spacings)
        except statistics.StatisticsError:
            typical_spacing = statistics.median(vertical_spacings)

    # --- Step 3: Identify heading candidates based on the combined rules ---
    headings = []
    for i in range(len(all_lines)):
        line = all_lines[i]

        # RULE 1: Check word count. If it's too long, it cannot be a heading.
        if len(line["text"].split()) > 10:
            continue

        # RULE 2: Check for vertical spacing.
        space_before = float('inf')
        space_after = float('inf')
        is_first_on_page = (i == 0) or (all_lines[i-1]["page"] != line["page"])
        if not is_first_on_page:
            space_before = line["bbox"][1] - all_lines[i-1]["bbox"][3]
        is_last_on_page = (i == len(all_lines) - 1) or (all_lines[i+1]["page"] != line["page"])
        if not is_last_on_page:
            space_after = all_lines[i+1]["bbox"][1] - line["bbox"][3]

        has_space_after = space_after > (typical_spacing * 1.6)
        has_space_before = space_before > (typical_spacing * 1.6)

        # A line must have space after, and space before (unless it's first on the page).
        is_spatially_isolated = has_space_after and (has_space_before or is_first_on_page)

        if not is_spatially_isolated:
            continue

        # RULE 3: Check for positive style signals.
        # If the line is spatially isolated, it's a strong candidate. We'll accept it
        # if it has at least one other heading-like feature.
        is_bold = line["bold"]
        is_all_caps = line["text"].isupper() and len(line["text"]) > 1
        is_large_font = line["size"] > (typical_spacing * 1.5)

        if is_bold or is_all_caps or is_large_font:
            headings.append(line)

    if not headings:
        print("Warning: No headings could be identified with the current rules.")
        return {"title": "", "outline": []}

    # --- Step 4: Classify the identified headings into Title, H1, H2, H3 ---
    heading_font_sizes = sorted(list(set(h["size"] for h in headings)), reverse=True)

    title_text = ""
    if heading_font_sizes:
        title_size = heading_font_sizes[0]
        for h in headings:
            if h["size"] == title_size:
                title_text = h["text"]
                break

    heading_level_map = {}
    heading_types = ["H1", "H2", "H3"]
    for i, size in enumerate(heading_font_sizes[1:]):
        if i < len(heading_types):
            heading_level_map[size] = heading_types[i]

    outline_list = []
    for h in headings:
        if h["size"] in heading_level_map:
            outline_list.append({
                "level": heading_level_map[h["size"]],
                "text": h["text"],
                "page": h["page"]
            })

    # --- Step 5: Format the final JSON output ---
    final_output = {
        "title": title_text,
        "outline": outline_list
    }

    return final_output

# ==============================================================================
#  MAIN EXECUTION BLOCK (FOR GOOGLE COLAB)
# ==============================================================================
print("Please upload the PDF file you want to analyze.")

try:
    uploaded_files = files.upload()

    if uploaded_files:
        pdf_filename = next(iter(uploaded_files))
        print(f"\nProcessing file: '{pdf_filename}'...")

        structured_data = extract_structured_outline_advanced(pdf_filename)

        if structured_data:
            json_result = json.dumps(structured_data, indent=4)
            print("\n--- ✅ EXTRACTION SUCCESSFUL ---")
            print("Structured Outline (JSON):")
            print(json_result)

            output_filename = pdf_filename.rsplit('.', 1)[0] + '.json'
            with open(output_filename, 'w') as f:
                f.write(json_result)
            print(f"\nResult has been saved to '{output_filename}'.")
    else:
        print("\nOperation cancelled. No file was uploaded.")

except Exception as e:
    print(f"\nAn error occurred: {e}")

Please upload the PDF file you want to analyze.


Saving Adobe.pdf to Adobe (1).pdf

Processing file: 'Adobe (1).pdf'...

--- ✅ EXTRACTION SUCCESSFUL ---
Structured Outline (JSON):
{
    "title": "Welcome to the\u202f\u201cConnecting the Dots\u201d Challenge",
    "outline": [
        {
            "level": "H1",
            "text": "Rethink Reading. Rediscover Knowledge",
            "page": 2
        },
        {
            "level": "H1",
            "text": "The Journey Ahead",
            "page": 2
        },
        {
            "level": "H2",
            "text": "\u2022 Round 1:",
            "page": 2
        },
        {
            "level": "H2",
            "text": "\u2022 Round 2:",
            "page": 2
        },
        {
            "level": "H1",
            "text": "Why This Matters",
            "page": 2
        },
        {
            "level": "H1",
            "text": "Are you in?",
            "page": 2
        },
        {
            "level": "H1",
            "text": "Challenge Theme: Connecting the Dots Th