In [36]:
from docx import Document
import json
import re
from collections import OrderedDict
import os

In [35]:
doc = Document("../synthetic_data/2_full.docx")

# Every paragraph (including headings)
docx_text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())

# Every paragraph (including headings)
for p in doc.paragraphs:
    print(p.text)
    


1. Digital Marketing and Social Media Strategy
Chapter 1: Introduction to Digital Marketing
1.1 Digital Marketing Landscape Overview
1.2 Consumer Behavior in Digital Channels
1.3 Marketing Funnel Optimization
1.4 Cross-Channel Integration Strategies
1.5 Performance Measurement Frameworks
1.6 Budget Allocation and ROI Analysis
1.7 Competitive Intelligence Gathering
1.8 Brand Positioning in Digital Space
1.9 Customer Journey Mapping
1.10 Digital Transformation Impact
1.11 Emerging Platform Opportunities
1.12 Privacy Regulations and Compliance
1.13 Marketing Technology Stack
1.14 Data-Driven Decision Making
1.15 Future Trends and Predictions
Chapter 2: Social Media Platform Strategies
2.1 Platform Selection and Optimization
2.2 Content Creation and Curation
2.3 Community Management Best Practices
2.4 Influencer Partnership Development
2.5 Social Media Advertising Campaigns
2.6 Hashtag Strategy and Research
2.7 User-Generated Content Leveraging
2.8 Crisis Management and Response
2.9 Social

In [31]:
def parse_structured_toc(text):
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    books = []
    
    current_book = None
    current_chapter = None
    
    # Regex patterns
    book_title_re = re.compile(r"^(\d+)\.\s+(.+)", re.IGNORECASE)
    chapter_re = re.compile(r"^•?\s*Chapter\s+(\d+):\s+(.+)$", re.IGNORECASE)
    subchapter_re = re.compile(r"^(\d+)\.(\d+)\s+(.+)$")

    for line in lines:
        # Check if this is a new book title
        if (match := book_title_re.match(line)):
            # Save previous book if exists
            if current_book:
                books.append(current_book)
            
            # Start new book
            current_book = OrderedDict()
            current_book["id"] = int(match.group(1))
            current_book["name"] = match.group(2).strip()
            current_book["chapters"] = []
            current_chapter = None

        # Check if this is a chapter
        elif (match := chapter_re.match(line)) and current_book:
            number = int(match.group(1))
            title = match.group(2).strip()
            current_chapter = {
                "number": number,
                "title": title,
                "subchapter_count": 0,
                "subchapters": []
            }
            current_book["chapters"].append(current_chapter)

        # Check if this is a subchapter
        elif (match := subchapter_re.match(line)) and current_chapter:
            sub_number = f"{match.group(1)}.{match.group(2)}"
            title = match.group(3).strip()
            current_chapter["subchapters"].append({
                "number": sub_number,
                "title": title
            })
            # Update subchapter count
            current_chapter["subchapter_count"] = len(current_chapter["subchapters"])
    # Don't forget to add the last book
    if current_book:
        books.append(current_book)

    # If only one book, return it directly (maintains backward compatibility)
    if len(books) == 1:
        return books[0]
    
    # If multiple books, return list
    return books


parsed = parse_structured_toc(docx_text)
print(json.dumps(parsed, indent=2, ensure_ascii=False))

[
  {
    "id": 1,
    "name": "Digital Marketing and Social Media Strategy",
    "chapters": [
      {
        "number": 1,
        "title": "Introduction to Digital Marketing",
        "subchapter_count": 15,
        "subchapters": [
          {
            "number": "1.1",
            "title": "Digital Marketing Landscape Overview"
          },
          {
            "number": "1.2",
            "title": "Consumer Behavior in Digital Channels"
          },
          {
            "number": "1.3",
            "title": "Marketing Funnel Optimization"
          },
          {
            "number": "1.4",
            "title": "Cross-Channel Integration Strategies"
          },
          {
            "number": "1.5",
            "title": "Performance Measurement Frameworks"
          },
          {
            "number": "1.6",
            "title": "Budget Allocation and ROI Analysis"
          },
          {
            "number": "1.7",
            "title": "Competitive Intelligence Ga

### Process text and store in 

In [46]:
data_dir = "../synthetic_data"
output_dir = "../synthetic_data/json"
files = [file for file in os.listdir(data_dir) if not file.startswith("~")]

full_results = []
chapters_results = []

for file in files:
    if file.endswith("_full.docx"):
        cur_path = os.path.join(data_dir, file)
        doc = Document(cur_path)
        docx_text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        result = parse_structured_toc(docx_text)
        full_results.append(result)
    elif file.endswith("_chapters.docx"):
        cur_path = os.path.join(data_dir, file)
        doc = Document(cur_path)
        docx_text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        result = parse_structured_toc(docx_text)
        chapters_results.append(result)

with open(os.path.join(output_dir, "all_full.json"), "w", encoding="utf-8") as f:
    json.dump(full_results, f, indent=2, ensure_ascii=False)

with open(os.path.join(output_dir, "all_chapters.json"), "w", encoding="utf-8") as f:
    json.dump(chapters_results, f, indent=2, ensure_ascii=False)

In [45]:
for file in files:
    if file.endswith("_full.docx"):
        print(os.path.join(data_dir, file))

../synthetic_data/5_full.docx
../synthetic_data/3_full.docx
../synthetic_data/4_full.docx
../synthetic_data/2_full.docx
../synthetic_data/1_full.docx
