# Load reports

In [134]:
from pathlib import Path

folder = Path("../data/marketing")

# display all the files in the folder
files = list(folder.glob("*"))
print("Files in the folder:")
for file in files:
    print(file.name)

Files in the folder:
marketing_report_2024.md
marketing_report_q1_2024.md
marketing_report_q2_2024.md
marketing_report_q3_2024.md
market_report_q4_2024.md


# Annual Marketing Report

In [135]:
file = Path("../data/marketing/marketing_report_2024.md")
report = file.read_text(encoding="utf-8", errors="ignore")
report = report.splitlines()

In [136]:
def get_section(line):
    # Clean up whitespace
    line = line.strip()

    # Skip the main markdown title like '# Marketing Report ...'
    if line.startswith("# "):
        return line[2:].strip()

    return None

def get_subsection(line, next_line):
    # Clean up whitespace
    line = line.strip()
    next_line = (next_line or "").strip()

    # Check if line ends with ':' and next line is dashes
    if line.endswith(":") and next_line.startswith("---"):
        return line[:-1].strip()  # remove the colon

    return None

lines = report
for i, line in enumerate(lines):
    next_line = lines[i + 1] if i + 1 < len(lines) else None
    
    section = get_section(line)
    subsection = get_subsection(line, next_line)
    
    if section:
        print(f"Section: {section}")
    if subsection:
        print(f" Subsection: {subsection}")

Section: Marketing Report for FinSolve Technologies Inc. - 2024
 Subsection: Executive Summary
 Subsection: Year-Over-Year (YoY) Performance
 Subsection: Campaign Analysis
 Subsection: Vendor Performance
 Subsection: Customer Insights
 Subsection: Marketing Budget Breakdown
 Subsection: Key Metrics & KPIs
 Subsection: Recommendations for Improvement
 Subsection: Appendices


In [137]:
def get_subsection_content(lines):
    
    current_section = None
    current_subsection = None
    collecting = False
    content = []
    results = []

    def save_content():
        nonlocal collecting, content, current_section, current_subsection, results

        if not collecting or not content or current_section is None:
            return

        results.append({
            "section": current_section,
            "subsection": current_subsection,
            "content": content[:]
        })
        content = []

    for i, line in enumerate(lines):
        next_line = lines[i + 1] if i + 1 < len(lines) else None
        stripped = line.strip()

        new_section = get_section(stripped)
        if new_section is not None:
            if collecting:
                save_content()
                collecting = False

            current_section = new_section
            continue

        new_subsection = get_subsection(stripped, next_line)
        if new_subsection is not None:
            if collecting:
                save_content()

            current_subsection = new_subsection
            collecting = True
            continue

        if collecting:
            content.append(line)

    if collecting:
        save_content()

    return results

# Test it
annual_chunked = get_subsection_content(report)
print(f"Found {len(annual_chunked)} subsections")
annual_chunked

Found 9 subsections


[{'section': 'Marketing Report for FinSolve Technologies Inc. - 2024',
  'subsection': 'Executive Summary',
  'content': ['-------------------------------------------',
   "2024 was a pivotal year for the Marketing Department at FinSolve Technologies, marked by ambitious campaigns, strategic partnerships, and a focus on increasing customer engagement across digital and physical channels. The department’s efforts resulted in a 20% increase in new customer acquisition, a 15% growth in brand awareness, and enhanced customer loyalty. Despite a competitive market environment, FinSolve Technologies's marketing initiatives proved to be cost-effective, delivering a solid return on investment (ROI) and positioning the company for further expansion.",
   '']},
 {'section': 'Marketing Report for FinSolve Technologies Inc. - 2024',
  'subsection': 'Year-Over-Year (YoY) Performance',
  'content': ['-------------------------------------------',
   "FinSolve Technologies's marketing team saw a solid 

# Scrape all the files

In [138]:
file = Path("../data/marketing/marketing_report_q1_2024.md")
report = file.read_text(encoding="utf-8", errors="ignore")
report = report.splitlines()

In [139]:
def get_section_f2(line):
    # Clean up whitespace
    line = line.strip()

    # Skip the main markdown title like '# Marketing Report ...'
    if line.startswith("# "):
        return line[2:].strip()

    return None

def get_subsection_f2(line):
    line = line.strip()
    prefix = "## "
    if line.startswith("## "):
        return line[len(prefix):].strip()
    return None

lines = report
for i, line in enumerate(lines):
    next_line = lines[i + 1] if i + 1 < len(lines) else None
    
    section = get_section_f2(line)
    subsection = get_subsection_f2(line)
    
    if section:
        print(f"Section: {section}")
    if subsection:
        print(f" Subsection: {subsection}")

Section: Comprehensive Marketing Report - Q1 2024
 Subsection: Executive Summary
 Subsection: Q1 - Marketing Overview
 Subsection: Q1 - Projections & Targets
 Subsection: Q1 - Benchmarks
 Subsection: Q1 - Strategic Objectives
 Subsection: Campaign Highlights
 Subsection: Performance Analysis
 Subsection: Recommendations for Q2 2024
 Subsection: Conclusion


In [140]:
def get_subsection_content_f2(lines):
    
    current_section = None
    current_subsection = None
    collecting = False
    content = []
    results = []

    def save_content():
        nonlocal collecting, content, current_section, current_subsection, results

        if not collecting or not content or current_section is None:
            return

        results.append({
            "section": current_section,
            "subsection": current_subsection,
            "content": content[:]
        })
        content = []

    for i, line in enumerate(lines):
        next_line = lines[i + 1] if i + 1 < len(lines) else None
        stripped = line.strip()

        new_section = get_section_f2(stripped)
        if new_section is not None:
            if collecting:
                save_content()
                collecting = False

            current_section = new_section
            continue

        new_subsection = get_subsection_f2(stripped)
        if new_subsection is not None:
            if collecting:
                save_content()

            current_subsection = new_subsection
            collecting = True
            continue

        if collecting:
            content.append(line)

    if collecting:
        save_content()

    return results

# Test it
chunked_report = get_subsection_content_f2(report)
print(f"Found {len(chunked_report)} subsections")
print("\nFirst subsection:")
chunked_report

Found 9 subsections

First subsection:


[{'section': 'Comprehensive Marketing Report - Q1 2024',
  'subsection': 'Executive Summary',
  'content': ['Q1 2024 marked a foundational quarter for FinNova, as we focused on building robust marketing infrastructure to support aggressive expansion and enhance customer acquisition channels. This report details our marketing strategies, performance metrics, and strategic objectives, emphasizing our efforts to expand into Europe, launch the InstantPay feature, and boost social media engagement. With a $2 million marketing spend, we achieved significant milestones, setting a strong trajectory for the remainder of 2024.',
   '']},
 {'section': 'Comprehensive Marketing Report - Q1 2024',
  'subsection': 'Q1 - Marketing Overview',
  'content': ['In Q1 2024, FinNova prioritized establishing a scalable framework for growth, with a focus on strengthening customer acquisition channels and enhancing brand visibility. Key initiatives included:',
   '',
   '- **European Market Entry**: Launched ta

In [141]:
reports = []
files.remove(next(f for f in files if f.name == "marketing_report_2024.md"))
for file in files:
    report = file.read_text(encoding="utf-8", errors="ignore")
    report = report.splitlines()
    reports.append(report)

In [142]:
chunked_reports = []
for i in range(len(reports)):
    print(f"Processing report {i+1}/{len(reports)}")
    chunked_report = get_subsection_content_f2(reports[i])
    print(f" Report {i+1} has {len(chunked_report)} subsections")
    chunked_reports.append(chunked_report)

Processing report 1/4
 Report 1 has 9 subsections
Processing report 2/4
 Report 2 has 9 subsections
Processing report 3/4
 Report 3 has 9 subsections
Processing report 4/4
 Report 4 has 9 subsections


In [145]:
# export as json files
import json

output_folder = Path("../data/marketing/chunked_reports")
output_folder.mkdir(parents=True, exist_ok=True)

for i, chunked_report in enumerate(chunked_reports):
    output_file = output_folder / f"chunked_report_q{i+1}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(chunked_report, f, ensure_ascii=False, indent=2)
    print(f"Wrote {output_file}")

output_file = output_folder / f"chunked_report_annual.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(annual_chunked, f, ensure_ascii=False, indent=2)
print(f"Wrote {output_file}")

Wrote ..\data\marketing\chunked_reports\chunked_report_q1.json
Wrote ..\data\marketing\chunked_reports\chunked_report_q2.json
Wrote ..\data\marketing\chunked_reports\chunked_report_q3.json
Wrote ..\data\marketing\chunked_reports\chunked_report_q4.json
Wrote ..\data\marketing\chunked_reports\chunked_report_annual.json
