In [7]:
import json
import re
import os

In [8]:
# Load extracted orders
input_file = "orders.json"
if not os.path.exists(input_file):
    print(f"Error: {input_file} not found. Please run orders.ipynb first.")
else:
    with open(input_file, 'r', encoding='utf-8') as f:
        orders = json.load(f)
    print(f"Loaded {len(orders)} orders.")

Loaded 218 orders.


In [9]:
def chunk_order_text(text):
    """
    Splits text by 'Section X.' or 'Sec. X.' headers.
    Returns a list of dictionaries: [{'label': 'Section 1', 'text': '...'}]
    """
    if not text:
        return []
    
    # Regex to find headers like "Section 1." or "Sec. 1."
    # Captures the header itself to use as a label
    # Updated to handle newlines/whitespace before the final period (e.g. "Section 1\n\n.")
    pattern = r"((?:Section|Sec\.)\s+\d+\s*\.)"
    
    # Split by the pattern. capture=True keeps the separator (the header)
    parts = re.split(pattern, text)
    
    chunks = []
    current_label = "Preamble" # Text before first section
    current_text = ""
    
    # parts[0] is text before first match (Preamble if exists)
    if parts[0].strip():
        chunks.append({'label': 'Preamble', 'text': parts[0].strip()})
        
    # Loop through the rest: header, text, header, text...
    # Because regex capture group is used, parts list looks like: [preamble, Header1, Body1, Header2, Body2...]
    for i in range(1, len(parts), 2):
        header = parts[i].strip()
        body = parts[i+1].strip() if i+1 < len(parts) else ""
        
        # Clean up body text (remove excessive newlines if needed, though usually fine)
        body = re.sub(r'\n+', ' ', body)
        
        chunks.append({
            'label': header,
            'text': (header + " " + body).strip() # Include header in text for context
        })
        
    return chunks

In [10]:
all_chunks = []

print("Chunking orders...")
for order in orders:
    text = order.get('full_text', '')
    extracted_chunks = chunk_order_text(text)
    
    for c in extracted_chunks:
        # Create a new record with same metadata + chunk info
        chunk_record = order.copy()
        chunk_record['chunk_text'] = c
        
        # Remove full_text to save space if desired, but user asked to "keep all metadata"
        # The prompt said "have each chunk have the same metadata as the og executive order"
        # Usually 'full_text' is metadata, but redundant here. I'll keep it to be safe or remove if too large.
        # Let's remove 'full_text' from the chunk object to avoid massive duplication, 
        # as 'chunk_text' contains the relevant part.
        if 'full_text' in chunk_record:
            del chunk_record['full_text']
            
        all_chunks.append(chunk_record)
        
print(f"Created {len(all_chunks)} chunks from {len(orders)} orders.")

Chunking orders...
Created 1471 chunks from 218 orders.


In [11]:
output_file = "order_chunks.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)
    
print(f"Saved to {output_file}")

Saved to order_chunks.json


In [12]:
if all_chunks:
    print("Sample chunk:")
    print(json.dumps(all_chunks[1], indent=2))
else:
    print("No chunks generated.")

Sample chunk:
{
  "citation": "90 FR 43895",
  "document_number": " 2025-17509",
  "start_page": "43897",
  "url": "https://www.federalregister.gov/documents/2025/09/10/2025-17509/strengthening-efforts-to-protect-us-nationals-from-wrongful-detention-abroad",
  "pdf_url": "https://www.govinfo.gov/content/pkg/FR-2025-09-10/pdf/2025-17509.pdf",
  "doc_type": "Presidential Document",
  "doc_subtype": "Executive Order",
  "publication_date": "09/10/2025",
  "signing_date": "2025-09-05",
  "fr_page": "43895",
  "title": "Strengthening Efforts To Protect U.S. Nationals From Wrongful Detention Abroad",
  "notes": "See: EO 11295, August 5, 1966",
  "order_number": "14348",
  "internal_id": NaN,
  "chunk_text": {
    "label": "Section 1\n\n.",
    "text": "Section 1\n\n. Purpose. The United States must strengthen efforts to protect U.S. nationals from wrongful detention abroad. The United States Government is committed to using every tool available to curb this coercive tactic used by foreign ad