# Load file

In [2]:
from pathlib import Path

file = Path("../data/finance/financial_summary.md")
report = file.read_text(encoding="utf-8", errors="ignore")
report = report.splitlines()

In [3]:
def get_section(line):
    # Clean up whitespace
    line = line.strip()

    # Skip the main markdown title like '# Marketing Report ...'
    if line.startswith("# "):
        return line[2:].strip()

    return None

def get_subsection(line, next_line):
    # Clean up whitespace
    line = line.strip()
    next_line = (next_line or "").strip()

    # Check if line ends with ':' and next line is dashes
    if line.endswith(":") and next_line.startswith("---"):
        return line[:-1].strip()  # remove the colon

    return None

lines = report
for i, line in enumerate(lines):
    next_line = lines[i + 1] if i + 1 < len(lines) else None
    
    section = get_section(line)
    subsection = get_subsection(line, next_line)
    
    if section:
        print(f"Section: {section}")
    if subsection:
        print(f" Subsection: {subsection}")

Section: Financial Report for FinSolve Technologies Inc. - 2024
 Subsection: Executive Summary
 Subsection: Year-Over-Year (YoY) Analysis
 Subsection: Expense Breakdown by Category
 Subsection: Cash Flow Analysis
 Subsection: Key Financial Ratios and Metrics
 Subsection: Operational Insights
 Subsection: Risk Analysis and Mitigation Strategies
 Subsection: Appendices


In [5]:
def get_subsection_content(lines):
    
    current_section = None
    current_subsection = None
    collecting = False
    content = []
    results = []

    def save_content():
        nonlocal collecting, content, current_section, current_subsection, results

        if not collecting or not content or current_section is None:
            return

        results.append({
            "section": current_section,
            "subsection": current_subsection,
            "content": content[:]
        })
        content = []

    for i, line in enumerate(lines):
        next_line = lines[i + 1] if i + 1 < len(lines) else None
        stripped = line.strip()

        new_section = get_section(stripped)
        if new_section is not None:
            if collecting:
                save_content()
                collecting = False

            current_section = new_section
            continue

        new_subsection = get_subsection(stripped, next_line)
        if new_subsection is not None:
            if collecting:
                save_content()

            current_subsection = new_subsection
            collecting = True
            continue

        if collecting:
            content.append(line)

    if collecting:
        save_content()

    return results

# Test it
summary_chunked = get_subsection_content(report)
print(f"Found {len(summary_chunked)} subsections")
summary_chunked

Found 8 subsections


[{'section': 'Financial Report for FinSolve Technologies Inc. - 2024',
  'subsection': 'Executive Summary',
  'content': ['-------------------------------------------',
   '2024 marked a year of both opportunity and challenge for FinSolve Technologies. Despite a robust revenue increase, we saw significant pressure in certain expense categories, notably vendor-related costs and software subscriptions. However, these pressures were balanced by cost-saving measures in operational efficiency, strong gross margin performance, and strategic investment in growth areas. The company is well-positioned to continue scaling its core offerings, but focused attention on cost optimization will be essential for maintaining profitability in the coming years.',
   '']},
 {'section': 'Financial Report for FinSolve Technologies Inc. - 2024',
  'subsection': 'Year-Over-Year (YoY) Analysis',
  'content': ['-------------------------------------------',
   "FinSolve Technologies's revenue grew by 25% in 2024,

In [6]:
import json
# output json
output_folder = Path("../data/finance/chunked_reports")
output_folder.mkdir(parents=True, exist_ok=True)

output_file = output_folder / f"chunked_summary.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(summary_chunked, f, ensure_ascii=False, indent=2)
print(f"Wrote {output_file}")

Wrote ..\data\finance\chunked_reports\chunked_summary.json


# quarterly financial report

In [10]:
from pathlib import Path

file = Path("../data/finance/quarterly_financial_report.md")
report = file.read_text(encoding="utf-8", errors="ignore")
report = report.splitlines()

In [13]:
def get_section_f2(line):
    # Clean up whitespace
    line = line.strip()

    # Skip the main markdown title like '# Marketing Report ...'
    if line.startswith("# "):
        return line[2:].strip()

    return None

def get_subsection_f2(line):
    line = line.strip()
    prefix = "## "
    if line.startswith("## "):
        return line[len(prefix):].strip()
    return None

def get_subsubsection_f2(line):
    line = line.strip()
    prefix = "### "
    if line.startswith(prefix):
        return line[len(prefix):].strip()
    return None


lines = report
for i, line in enumerate(lines):
    next_line = lines[i + 1] if i + 1 < len(lines) else None
    
    section = get_section_f2(line)
    subsection = get_subsection_f2(line)
    subsubsection = get_subsubsection_f2(line)
    
    if section:
        print(f"Section: {section}")
    if subsection:
        print(f" Subsection: {subsection}")
    if subsubsection:
        print(f"  SubSubsection: {subsubsection}")

Section: Quarterly Financial Report - FinSolve Technologies Inc. 2024
 Subsection: Executive Summary
 Subsection: Q1 - January to March 2024
  SubSubsection: Quarterly Financial Overview
  SubSubsection: Quarterly Expense Breakdown
  SubSubsection: Cash Flow Analysis
  SubSubsection: Risks & Mitigation
 Subsection: Q2 - April to June 2024
  SubSubsection: Quarterly Financial Overview
  SubSubsection: Quarterly Expense Breakdown
  SubSubsection: Cash Flow Analysis
  SubSubsection: Risks & Mitigation
 Subsection: Q3 - July to September 2024
  SubSubsection: Quarterly Financial Overview
  SubSubsection: Quarterly Expense Breakdown
  SubSubsection: Cash Flow Analysis
  SubSubsection: Risks & Mitigation
 Subsection: Q4 - October to December 2024
  SubSubsection: Quarterly Financial Overview
  SubSubsection: Quarterly Expense Breakdown
  SubSubsection: Cash Flow Analysis
  SubSubsection: Risks & Mitigation
 Subsection: 2024 Annual Summary
 Subsection: Recommendations for 2025
 Subsection: Co

In [23]:
def get_subsection_content_f2(lines):

    current_section = None
    current_subsection = None
    current_subsub = None

    collecting = False
    content = []
    results = []

    def save_content():
        nonlocal collecting, content, current_section, current_subsection, current_subsub, results

        if not collecting or not content or current_section is None:
            return
        
        if len(content) == 1:
            return

        results.append({
            "section": current_section,
            "subsection": current_subsection,
            "subsubsection": current_subsub,
            "content": content[:]
        })

        content = []

    for i, line in enumerate(lines):
        stripped = line.strip()
        next_line = lines[i + 1] if i + 1 < len(lines) else None

        
        new_section = get_section_f2(stripped)
        if new_section is not None:
            if collecting:
                save_content()

            current_section = new_section
            current_subsection = None
            current_subsub = None
            collecting = False
            continue

        
        new_subsection = get_subsection_f2(stripped)
        if new_subsection is not None:
            if collecting:
                save_content()

            current_subsection = new_subsection
            current_subsub = None
            collecting = True
            continue

        
        new_subsub = get_subsubsection_f2(stripped)
        if new_subsub is not None:
            if collecting:
                save_content()

            current_subsub = new_subsub
            collecting = True
            continue

        
        if collecting:
            content.append(line)

    
    if collecting:
        save_content()

    return results

chunked_report = get_subsection_content_f2(report)

print(f"Found {len(chunked_report)} chunks")

print("\nFirst chunk:")
chunked_report

Found 19 chunks

First chunk:


[{'section': 'Quarterly Financial Report - FinSolve Technologies Inc. 2024',
  'subsection': 'Executive Summary',
  'subsubsection': None,
  'content': ['In 2024, FinSolve Technologies Inc. delivered exceptional financial performance, achieving significant year-over-year (YoY) growth across all quarters. With a strategic focus on market expansion, customer acquisition, and operational efficiency, the company saw revenue increase from $2.1 billion in Q1 to $2.6 billion in Q4, alongside consistent improvements in gross margin, operating income, and net income. This report provides a comprehensive overview of FinSolve Technologies’s financial results, expense breakdowns, cash flow analyses, and risk mitigation strategies for each quarter of 2024, underscoring our commitment to sustainable growth and shareholder value.',
   '',
   '---',
   '']},
 {'section': 'Quarterly Financial Report - FinSolve Technologies Inc. 2024',
  'subsection': 'Q1 - January to March 2024',
  'subsubsection': 'Qu

In [24]:
import json
# output json
output_folder = Path("../data/finance/chunked_reports")
output_folder.mkdir(parents=True, exist_ok=True)

output_file = output_folder / f"quarterly_financial_report.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(chunked_report, f, ensure_ascii=False, indent=2)
print(f"Wrote {output_file}")

Wrote ..\data\finance\chunked_reports\quarterly_financial_report.json
