# Load file

In [1]:
from pathlib import Path

file = Path("../data/engineering/engineering_master_doc.md")
report = file.read_text(encoding="utf-8", errors="ignore")
report = report.splitlines()


In [18]:
def get_section(line):
    # Clean up whitespace
    line = line.strip()

    # Skip the main markdown title like '# Marketing Report ...'
    if line.startswith("# "):
        return line[2:].strip()

    return None

def get_subsection(line):
    # Clean up whitespace
    line = line.strip()

    if line.startswith("##") and not line.startswith("###"):
        return line.lstrip("##")

    return None

def get_subsubsection(line):

    # clean up whitespace
    line = line.strip()

    if line.startswith("###"):
        return line.lstrip("###")

    return None


In [19]:
lines = report
for i, line in enumerate(lines):

    section = get_section(line)
    subsection = get_subsection(line)
    subsubsection = get_subsubsection(line)

    if section:
        print(f"Section: {section}")
    if subsection:
        print(f" Subsection: {subsection}")
    if subsubsection:
        print(f"  SubSubsection: {subsubsection}")

Section: FinSolve Technologies Engineering Document
 Subsection:  1. Introduction
  SubSubsection:  1.1 Company Overview
  SubSubsection:  1.2 Purpose
  SubSubsection:  1.3 Scope
  SubSubsection:  1.4 Document Control
 Subsection:  2. System Architecture
  SubSubsection:  2.1 Overview
  SubSubsection:  2.2 High-Level Architecture
  SubSubsection:  2.3 Key Components
  SubSubsection:  2.3.1 Client Applications
  SubSubsection:  2.3.2 API Gateway
  SubSubsection:  2.3.3 Microservices
  SubSubsection:  2.3.4 Data Layer
  SubSubsection:  2.3.5 Infrastructure
  SubSubsection:  2.4 Scalability Architecture
  SubSubsection:  2.4.1 Horizontal Scaling
  SubSubsection:  2.4.2 Database Scalability
  SubSubsection:  2.4.3 Caching Strategy
  SubSubsection:  2.5 Resilience and Fault Tolerance
  SubSubsection:  2.5.1 High Availability
  SubSubsection:  2.5.2 Circuit Breakers
  SubSubsection:  2.5.3 Disaster Recovery
  SubSubsection:  2.5.4 Data Consistency
 Subsection:  3. Technology Stack
  SubSubse

In [22]:
def get_subsection_content(lines):

    current_section = None
    current_subsection = None
    current_subsub = None
    collecting = False
    content = []
    results = []

    def save_content():
        nonlocal collecting, content, current_section, current_subsection, current_subsub, results

        has_real_content = any(line.strip() for line in content)

        if not collecting or not has_real_content or current_section is None:
            content = []
            return

        results.append({
            "section": current_section,
            "subsection": current_subsection,
            "subsubsection": current_subsub,
            "content": content[:]
        })
        content = []

    for i, line in enumerate(lines):
        stripped = line.strip()

        new_section = get_section(stripped)
        if new_section is not None:
            if collecting:
                save_content()
                collecting = False

            current_section = new_section
            continue

        new_subsection = get_subsection(stripped)
        if new_subsection is not None:
            if collecting:
                save_content()

            current_subsection = new_subsection
            collecting = True
            continue

        new_subsubsection = get_subsubsection(stripped)
        if new_subsubsection is not None:
            if collecting:
                save_content()

            current_subsub = new_subsubsection
            collecting = True
            continue

        if collecting:
            content.append(line)


    if collecting:
        save_content()

    return results

master_chunked = get_subsection_content(report)
print(f"Found {len(master_chunked)} chunks")
master_chunked



Found 81 chunks


[{'section': 'FinSolve Technologies Engineering Document',
  'subsection': ' 1. Introduction',
  'subsubsection': ' 1.1 Company Overview',
  'content': ['FinSolve Technologies is a leading FinTech company headquartered in Bangalore, India, with operations across North America, Europe, and Asia-Pacific. Founded in 2018, FinSolve provides innovative financial solutions, including digital banking, payment processing, wealth management, and enterprise financial analytics, serving over 2 million individual users and 10,000 businesses globally.',
   '']},
 {'section': 'FinSolve Technologies Engineering Document',
  'subsection': ' 1. Introduction',
  'subsubsection': ' 1.2 Purpose',
  'content': ['This engineering document outlines the technical architecture, development processes, and operational guidelines for FinSolve\'s product ecosystem. It serves as a comprehensive guide for engineering teams, stakeholders, and partners to ensure alignment with FinSolve\'s mission: "To empower financia

In [23]:
import json
# output json
output_folder = Path("../data/engineering/chunked_reports")
output_folder.mkdir(parents=True, exist_ok=True)

output_file = output_folder / f"chunked_master.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(master_chunked, f, ensure_ascii=False, indent=2)
print(f"Wrote {output_file}")

Wrote ..\data\engineering\chunked_reports\chunked_master.json
