In [None]:
import requests
from bs4 import BeautifulSoup
import time

# Site root; the hrefs collected from the list page are site-relative paths
# and are prefixed with this value to form absolute URLs.
base_url = "https://www.archives.gov"

# Index page that links to the individual milestone documents.
list_url = "https://www.archives.gov/milestone-documents/list"

# Fetch the index page. An explicit timeout keeps a stalled connection from
# hanging the cell (requests applies no timeout unless one is given).
response = requests.get(list_url, timeout=30)
if response.status_code != 200:
    print("Failed to retrieve the page.")
    # Raise rather than call exit(): exit() is an interactive-shell helper,
    # whereas an exception stops this cell with a clear error and leaves the
    # kernel (and any state already built) intact.
    raise RuntimeError(
        f"Unexpected HTTP status {response.status_code} for {list_url}"
    )

# Parse the index page using the standard-library HTML parser backend.
soup = BeautifulSoup(response.text, 'html.parser')


# Absolute URLs of the milestone-document pages linked from <h3> headings
# on the index page.
doc_links = []
for link in soup.select("h3 a"):
    href = link.get("href")
    # Tag.get returns None when the anchor has no href attribute; skip those
    # instead of failing on None.startswith(...).
    if href and href.startswith("/milestone-documents/"):
        doc_links.append(base_url + href)


# Maps each document URL to the longest non-empty paragraph found on its page.
document_texts = {}


for doc_url in doc_links:
    print(f"Scraping: {doc_url}")

    # A network-level failure (timeout, DNS error, dropped connection) is treated
    # like a bad status code so one unreachable page does not abort the whole crawl.
    try:
        doc_response = requests.get(doc_url, timeout=30)
        fetched = doc_response.status_code == 200
    except requests.RequestException:
        fetched = False

    if not fetched:
        print(f"Failed to retrieve {doc_url}")
    else:
        doc_soup = BeautifulSoup(doc_response.text, 'html.parser')

        # Text of every <p> element, with whitespace-only paragraphs dropped so an
        # empty string is never recorded as the extracted text.
        paragraphs = [p.get_text(strip=True) for p in doc_soup.find_all("p")]
        paragraphs = [text for text in paragraphs if text]

        if paragraphs:
            # Heuristic: keep the single longest paragraph on the page.
            longest_paragraph = max(paragraphs, key=len)
            document_texts[doc_url] = longest_paragraph
            print(f"Extracted text from {doc_url}: {longest_paragraph[:100]}...")  # Preview first 100 chars
        else:
            print(f"No transcription found for {doc_url}")

    # Pause between requests on every iteration, including failed ones, so a run
    # of errors does not turn into back-to-back requests against the server.
    time.sleep(2)

# Preview: for the first three scraped URLs (in insertion order), print the URL
# followed by the leading 500 characters of its extracted text.
for doc in list(document_texts)[:3]:
    text = document_texts[doc]
    print(f"\nDocument: {doc}\n{text[:500]}...\n")





Scraping: https://www.archives.gov/milestone-documents/lee-resolution
Extracted text from https://www.archives.gov/milestone-documents/lee-resolution: The colonists elected delegates to attend a Continental Congress that eventually became the governin...
Scraping: https://www.archives.gov/milestone-documents/declaration-of-independence
Extracted text from https://www.archives.gov/milestone-documents/declaration-of-independence: We hold these truths to be self-evident, that all men are created equal, that they are endowed by th...
Scraping: https://www.archives.gov/milestone-documents/articles-of-confederation
Extracted text from https://www.archives.gov/milestone-documents/articles-of-confederation: The united states, in congress assembled, shall also be the last resort on appeal, in all disputes a...
Scraping: https://www.archives.gov/milestone-documents/treaty-of-alliance-with-france
Extracted text from https://www.archives.gov/milestone-documents/treaty-of-alliance-with-france: The 

In [None]:
#/projectnb/sparkgrp/mass-sec-state-deeds-data/1720-1780/

In [None]:
# Show the URL -> extracted-text mapping filled in by the scraping loop
# (as the cell's last expression, its repr is rendered as the cell output).
document_texts

{'https://www.archives.gov/milestone-documents/lee-resolution': 'The colonists elected delegates to attend a Continental Congress that eventually became the governing body of the union during the Revolution. Its second meeting convened in Philadelphia in 1775. The delegates to Congress adopted strict rules of secrecy to protect the cause of American liberty and their own lives. In less than a year, most of the delegates abandoned hope of reconciliation with Britain.',
 'https://www.archives.gov/milestone-documents/declaration-of-independence': 'We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alt

In [5]:
def extract_title(url: str) -> str:
    """Turn the last path segment of a URL into a title-cased label.

    Dashes in the segment become spaces and each word is capitalized, e.g.
    ``".../milestone-documents/lee-resolution"`` -> ``"Lee Resolution"``.

    Args:
        url: Document URL (or path) whose final segment is a dash-separated slug.

    Returns:
        The slug with dashes replaced by spaces, in title case.
    """
    # Drop trailing slashes first; otherwise the last "/"-separated piece is
    # the empty string and the title comes back empty.
    slug = url.rstrip("/").split("/")[-1]
    return slug.replace("-", " ").title()

In [6]:
# Re-key the scraped texts by a readable title derived from each URL; if two
# URLs produce the same title, the later entry's text replaces the earlier one.
cleaned_dict = dict(
    zip(map(extract_title, document_texts.keys()), document_texts.values())
)

In [7]:
# Show the title-keyed mapping built from document_texts.
cleaned_dict

{'Lee Resolution': 'The colonists elected delegates to attend a Continental Congress that eventually became the governing body of the union during the Revolution. Its second meeting convened in Philadelphia in 1775. The delegates to Congress adopted strict rules of secrecy to protect the cause of American liberty and their own lives. In less than a year, most of the delegates abandoned hope of reconciliation with Britain.',
 'Declaration Of Independence': 'We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, laying its foundation on such princi

In [8]:
import numpy as np 

In [9]:
# cleaned_dict is a plain dict, so NumPy stores it as a pickled 0-d object array.
# Reading it back requires:
#     np.load("document_texts.npy", allow_pickle=True).item()
# NOTE(review): only load this file from trusted sources, since unpickling can
# execute arbitrary code.
np.save("document_texts.npy", cleaned_dict, allow_pickle=True)