In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time
import os

In [9]:
# Load the CSV file. Assuming no header based on inspection.
# Columns mapping based on observation:
# 0: Citation, 1: Doc Number, 2: Start Page, 3: URL, 4: PDF URL
# 5: Type, 6: Subtype, 7: Pub Date, 8: Sign Date, 9: FR Page
# 10: Title, 11: Notes, 12: EO Number, 13: Internal ID

csv_path = "orders.csv"

try:
    # Attempt to read with no header
    df = pd.read_csv(csv_path, header=None)
    print(f"Loaded {len(df)} rows.")
    
    # Rename useful columns for clarity
    column_mapping = {
        0: 'citation',
        1: 'document_number',
        2: 'start_page',
        3: 'url',
        4: 'pdf_url',
        5: 'doc_type',
        6: 'doc_subtype',
        7: 'publication_date',
        8: 'signing_date',
        9: 'fr_page',
        10: 'title',
        11: 'notes',
        12: 'order_number',
        13: 'internal_id'
    }
    df.rename(columns=column_mapping, inplace=True)
    
    # Display first few rows to verify
    display(df.head())

except Exception as e:
    print(f"Error loading CSV: {e}")

Loaded 219 rows.


Unnamed: 0,citation,document_number,start_page,url,pdf_url,doc_type,doc_subtype,publication_date,signing_date,fr_page,title,notes,order_number,internal_id
0,citation,document_number,end_page,html_url,pdf_url,type,subtype,publication_date,signing_date,start_page,title,disposition_notes,executive_order_number,not_received_for_publication
1,90 FR 43895,2025-17509,43897,https://www.federalregister.gov/documents/2025...,https://www.govinfo.gov/content/pkg/FR-2025-09...,Presidential Document,Executive Order,09/10/2025,2025-09-05,43895,Strengthening Efforts To Protect U.S. National...,"See: EO 11295, August 5, 1966",14348,
2,90 FR 19611,2025-08266,19614,https://www.federalregister.gov/documents/2025...,https://www.govinfo.gov/content/pkg/FR-2025-05...,Presidential Document,Executive Order,05/08/2025,2025-05-05,19611,Improving the Safety and Security of Biologica...,,14292,
3,90 FR 17519,2025-07368,17523,https://www.federalregister.gov/documents/2025...,https://www.govinfo.gov/content/pkg/FR-2025-04...,Presidential Document,Executive Order,04/28/2025,2025-04-23,17519,Advancing Artificial Intelligence Education fo...,,14277,
4,90 FR 17525,2025-07369,17527,https://www.federalregister.gov/documents/2025...,https://www.govinfo.gov/content/pkg/FR-2025-04...,Presidential Document,Executive Order,04/28/2025,2025-04-23,17525,Preparing Americans for High-Paying Skilled Tr...,"See: EO 14302, May 23, 2025",14278,


In [10]:
def scrape_federal_register(url):
    """
    Fetches the Federal Register page and extracts the full text of the executive order.
    """
    try:
        headers = {
            "User-Agent": "LegalAI-Scraper/1.0 (Educational Purpose)"
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Federal Register usually holds the document text in specific sections
        # Common identifiers for content:
        # - div class="full-text"
        # - div id="fulltext_content_area"
        # - div class="document-content"
        
        # Attempt to find the main content container
        content_div = soup.find('div', class_='full-text') or \
                      soup.find('div', id='fulltext_content_area') or \
                      soup.find('div', class_='document-content')
        
        if content_div:
            # Extract text, preserving some structure with newlines
            full_text = content_div.get_text(separator='\n\n', strip=True)
            return full_text
        else:
            # Fallback: Body text (messy but better than nothing)
            return soup.body.get_text(separator='\n\n', strip=True)
            
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

In [11]:
orders_data = []

# Rate limiting: Sleep between requests
SLEEP_SEC = 1

print("Starting scraping...")
for index, row in df.iterrows():
    url = row.get('url')
    title = row.get('title', f"Order {index}")
    
    if not url or not isinstance(url, str) or not url.startswith("http"):
        print(f"Skipping invalid URL at index {index}")
        continue
        
    print(f"Scraping [{index+1}/{len(df)}]: {title[:50]}...")
    
    full_text = scrape_federal_register(url)
    
    # construct record
    record = row.to_dict()
    record['full_text'] = full_text
    
    if not full_text:
        print("  -> No text found.")
        
    orders_data.append(record)
    
    time.sleep(SLEEP_SEC)

Starting scraping...
Skipping invalid URL at index 0
Scraping [2/219]: Strengthening Efforts To Protect U.S. Nationals Fr...
Scraping [3/219]: Improving the Safety and Security of Biological Re...
Scraping [4/219]: Advancing Artificial Intelligence Education for Am...
Scraping [5/219]: Preparing Americans for High-Paying Skilled Trade ...
Scraping [6/219]: Restoring Equality of Opportunity and Meritocracy...
Scraping [7/219]: Transparency Regarding Foreign Influence at Americ...
Scraping [8/219]: White House Initiative To Promote Excellence and I...
Scraping [9/219]: Establishing the United States Investment Accelera...
Scraping [10/219]: Ensuring Commercial, Cost-Effective Solutions in F...
Scraping [11/219]: Ensuring National Security and Economic Resilience...
Scraping [12/219]: Preventing Woke AI in the Federal Government...
Scraping [13/219]: Promoting the Export of the American AI Technology...
Scraping [14/219]: Ending Crime and Disorder on America's Streets...
Scraping [15/219]

In [12]:
output_file = "orders.json"

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(orders_data, f, ensure_ascii=False, indent=2)
    
print(f"Saved {len(orders_data)} orders to {output_file}")

Saved 218 orders to orders.json


In [13]:
# Verify the output
if orders_data:
    print("First order snippet:")
    print(orders_data[0].get('full_text', '')[:500])
else:
    print("No data to verify.")

First order snippet:
(

printed page 43895)

Executive Order 14348

of September 5, 2025

Strengthening Efforts To Protect U.S. Nationals From Wrongful Detention Abroad

By the authority vested in me as President by the Constitution and the laws of the United States of America, including the Robert Levinson Hostage Recovery and Hostage-Taking Accountability Act (

22 U.S.C. 1741

et seq.

) (Levinson Act), it is hereby ordered:

Section 1

.

Purpose.

The United States must strengthen efforts to protect U.S. nation
