In [9]:
from docx import Document
import json
import re

In [13]:
doc = Document("../synthetic_data/2.docx")

# Read the paragraph list once and reuse it for every step below.
paragraphs = doc.paragraphs

# Non-blank paragraphs (including headings), joined into one string for parsing
docx_text = "\n".join(p.text for p in paragraphs if p.text.strip())

# Print every paragraph, blank ones included, for visual inspection
for p in paragraphs:
    print(p.text)

# Tables → nested lists (one inner list of cell texts per row)
for table in doc.tables:
    rows = [[cell.text for cell in row.cells] for row in table.rows]
    print(rows)

# Access styles, runs, etc.
# Guarded: the document may contain no paragraphs, and the first paragraph
# may contain no runs; indexing [0] unconditionally would raise IndexError.
first_run = None
if paragraphs and paragraphs[0].runs:
    first_run = paragraphs[0].runs[0]
    print(first_run.text, first_run.bold)
else:
    print("First paragraph has no runs to inspect")

1. Digital Marketing and Social Media Strategy
Chapter 1: Introduction to Digital Marketing
1.1 Digital Marketing Landscape Overview
1.2 Consumer Behavior in Digital Channels
1.3 Marketing Funnel Optimization
1.4 Cross-Channel Integration Strategies
1.5 Performance Measurement Frameworks
1.6 Budget Allocation and ROI Analysis
1.7 Competitive Intelligence Gathering
1.8 Brand Positioning in Digital Space
1.9 Customer Journey Mapping
1.10 Digital Transformation Impact
1.11 Emerging Platform Opportunities
1.12 Privacy Regulations and Compliance
1.13 Marketing Technology Stack
1.14 Data-Driven Decision Making
1.15 Future Trends and Predictions
Chapter 2: Social Media Platform Strategies
2.1 Platform Selection and Optimization
2.2 Content Creation and Curation
2.3 Community Management Best Practices
2.4 Influencer Partnership Development
2.5 Social Media Advertising Campaigns
2.6 Hashtag Strategy and Research
2.7 User-Generated Content Leveraging


1. Digital Marketing and Social Media Strat

In [None]:
def parse_structured_toc(text):
    """Parse a plain-text table of contents into a nested dictionary.

    Recognised line shapes (surrounding whitespace and blank lines are
    ignored):

        1. Document title          <- optional, first line only
        Chapter 1: Chapter title   <- "Chapter" is matched case-insensitively
        1.1 Subchapter title

    Lines matching none of these shapes are skipped, as are subchapter
    lines that appear before the first chapter heading.

    Args:
        text: The table of contents as one newline-separated string.

    Returns:
        A dict with a "chapters" list. Each chapter is a dict with
        "number" (int), "title" (str) and "subchapters" (a list of dicts
        with "number" as a "<major>.<minor>" string and "title").
        When the first line is a numbered document title, the dict also
        contains "id" (int) and "name" (str). Empty or whitespace-only
        input yields {"chapters": []}.
    """
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    toc = {
        "chapters": []
    }

    # Nothing to parse: avoid indexing into an empty list below.
    if not lines:
        return toc

    # Try to extract optional ID and name from the first line.
    # The (?!\d) lookahead keeps a leading subchapter line such as
    # "1.1 Foo" from being misread as document id 1 with name "1 Foo".
    first_line = lines[0]
    if match := re.match(r"^(\d+)\.(?!\d)\s*(.+)", first_line):
        toc["id"] = int(match.group(1))
        toc["name"] = match.group(2).strip()
        lines = lines[1:]  # Remove title line

    current_chapter = None
    chapter_re = re.compile(r"^Chapter\s+(\d+):\s+(.+)$", re.IGNORECASE)
    subchapter_re = re.compile(r"^(\d+)\.(\d+)\s+(.+)$")

    for line in lines:
        if (match := chapter_re.match(line)):
            # A chapter heading opens a new chapter; later subchapter
            # lines attach to it until the next heading.
            current_chapter = {
                "number": int(match.group(1)),
                "title": match.group(2).strip(),
                "subchapters": []
            }
            toc["chapters"].append(current_chapter)

        elif (match := subchapter_re.match(line)) and current_chapter is not None:
            # Keep the number as a string so "1.10" is not collapsed to 1.1.
            current_chapter["subchapters"].append({
                "number": f"{match.group(1)}.{match.group(2)}",
                "title": match.group(3).strip()
            })

    return toc


# Parse the text extracted from the document, then pretty-print the result.
parsed = parse_structured_toc(docx_text)
print(
    json.dumps(
        parsed,
        indent=2,
        ensure_ascii=False,  # emit non-ASCII characters as-is instead of \u escapes
    )
)

{
  "chapters": [
    {
      "number": 1,
      "title": "Introduction to Digital Marketing",
      "subchapters": [
        {
          "number": "1.1",
          "title": "Digital Marketing Landscape Overview"
        },
        {
          "number": "1.2",
          "title": "Consumer Behavior in Digital Channels"
        },
        {
          "number": "1.3",
          "title": "Marketing Funnel Optimization"
        },
        {
          "number": "1.4",
          "title": "Cross-Channel Integration Strategies"
        },
        {
          "number": "1.5",
          "title": "Performance Measurement Frameworks"
        },
        {
          "number": "1.6",
          "title": "Budget Allocation and ROI Analysis"
        },
        {
          "number": "1.7",
          "title": "Competitive Intelligence Gathering"
        },
        {
          "number": "1.8",
          "title": "Brand Positioning in Digital Space"
        },
        {
          "number": "1.9",
          "