In [1]:
!pip install selenium
!apt-get update
!apt install -y chromium-chromedriver
import sys
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')


Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownload

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Build the Chrome options: run without a visible window (--headless) and pass the
# --no-sandbox / --disable-dev-shm-usage switches to the browser.
# NOTE(review): the last two switches are presumably needed because the notebook runs
# inside a container (Colab) - confirm for other environments.
chrome_options = Options()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

# Start the browser session; later cells use this module-level `driver`
driver = webdriver.Chrome(options=chrome_options)


In [3]:
import os
import requests
import pandas as pd
from selenium.webdriver.common.by import By


def summarize_text(text, max_sentences=5):
    """Return the leading sentences of ``text`` as a crude summary.

    Sentences are delimited by the literal separator ``'. '``.

    Args:
        text: The text to shorten.
        max_sentences: Number of leading sentences to keep.

    Returns:
        ``text`` unchanged when it splits into at most ``max_sentences`` pieces;
        otherwise the first ``max_sentences`` pieces re-joined with ``'. '`` and
        terminated with a full stop.
    """
    pieces = text.split('. ')

    # Nothing to trim: hand the input back untouched.
    if len(pieces) <= max_sentences:
        return text

    leading = pieces[:max_sentences]
    # The split consumed the period of the last kept sentence, so put one back.
    return '. '.join(leading) + '.'


def generate_title_from_url(url, page_text):
    """Choose a title for a scraped policy page from its URL, else from its text.

    Rules are tried in a fixed order (DC, Florida, Iowa, Texas, Connecticut, Ohio,
    Pennsylvania). A rule matches when either

    * one of its short codes equals a dot-separated label of the URL's host name
      (e.g. ``ct`` in ``portal.ct.gov``, ``tx`` in ``texreg.sos.state.tx.us``), or
    * one of its long keywords (e.g. ``florida``) occurs anywhere in the URL.

    Short codes are compared against whole host labels rather than searched as
    substrings, because two-letter fragments such as ``ct`` or ``dc`` also occur
    inside unrelated words like "products" or "section".

    Args:
        url: Address of the scraped page.
        page_text: Visible text of the page, used only for the fallback title.

    Returns:
        The fixed title of the first matching rule, or the first ten
        whitespace-separated words of ``page_text`` followed by ``"..."``.
    """
    from urllib.parse import urlsplit

    lowered = url.lower()

    try:
        # Prefix '//' for scheme-less input so urlsplit still recognises the host.
        host = urlsplit(lowered if '://' in lowered else '//' + lowered).hostname or ''
    except ValueError:
        # Malformed address (e.g. unbalanced IPv6 brackets): rely on keywords only.
        host = ''
    host_labels = set(host.split('.'))

    # (host labels, URL keywords, title) - order mirrors the original if/elif chain.
    rules = (
        (('dc',), (), "Washington DC Policies"),
        (('fl',), ('florida',), "Florida Statutes - Medical Assistance"),
        ((), ('iowa',), "Iowa Chapter 249A Medical Assistance"),
        (('tx',), ('texas',), "Texas Administrative Code"),
        (('ct',), (), "Connecticut Uniform Policy Manual"),
        ((), ('ohio',), "Ohio Administrative Code"),
        (('pacodeandbulletin',), ('pennsylvania',), "Pennsylvania Code"),
    )
    for labels, keywords, title in rules:
        if host_labels.intersection(labels) or any(word in lowered for word in keywords):
            return title

    # No rule matched: fall back to the opening words of the page itself.
    return " ".join((page_text or "").split()[:10]) + "..."

# Function to scrape policy text, PDFs, and titles from a given URL
def scrape_policies_and_pdfs(url, state_name, output_folder, timeout=30):
    """Scrape one policy page: its text, a summary, a title and its linked PDFs.

    The page is loaded with the module-level Selenium ``driver``. Every anchor whose
    ``href`` contains ``.pdf`` is recorded, and its target is downloaded into
    ``output_folder`` when the request succeeds.

    Args:
        url: Address of the page to scrape.
        state_name: Value stored in the ``State`` column.
        output_folder: Directory for downloaded PDFs; created if missing.
        timeout: Seconds to wait for each PDF request before giving up.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``State``, ``Title``,
        ``Policy Text``, ``Summary`` and ``PDFs``. ``PDFs`` is a ``"; "``-joined
        list of ``name (url)`` entries, or ``"None"`` when no PDF link was found.
    """
    from urllib.parse import urlsplit

    # Navigate to the given URL using Selenium
    driver.get(url)

    # Text content of the page, plus the summary and title derived from it
    page_text = driver.find_element(By.TAG_NAME, "body").text
    policy_summary = summarize_text(page_text)
    title = generate_title_from_url(url, page_text)

    # Find all the PDF links
    pdf_links = driver.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(output_folder, exist_ok=True)

    pdf_data = []
    for link in pdf_links:
        pdf_url = link.get_attribute('href')
        if not pdf_url:
            # The attribute can come back empty; there is nothing to download.
            continue

        # Name the file after the URL path so a query string or fragment does not
        # end up in the file name; fall back to the raw last segment if needed.
        pdf_name = os.path.basename(urlsplit(pdf_url).path) or pdf_url.split("/")[-1]
        if not pdf_name:
            # Without a name the target path would be the folder itself.
            continue
        pdf_path = os.path.join(output_folder, pdf_name)

        try:
            # requests applies no timeout by default, so a stalled server would
            # otherwise block the whole run.
            response = requests.get(pdf_url, timeout=timeout)
            # Reject 4xx/5xx answers instead of saving an error page as a ".pdf".
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Could not download {pdf_url}: {exc}")
        else:
            with open(pdf_path, 'wb') as file:
                file.write(response.content)

        # The link is recorded even if the download failed, so the CSV still lists
        # every PDF referenced by the page.
        pdf_data.append({"PDF Name": pdf_name, "PDF URL": pdf_url})

    pdf_listing = "; ".join(f"{pdf['PDF Name']} ({pdf['PDF URL']})" for pdf in pdf_data)

    # One row per scraped page
    policy_data = [{
        "State": state_name,
        "Title": title,
        "Policy Text": page_text,
        "Summary": policy_summary,
        "PDFs": pdf_listing if pdf_data else "None"
    }]

    # Convert to DataFrame and return
    return pd.DataFrame(policy_data)

# Pages to scrape, as a list of {"url": ..., "state": ...} records.
# Written as (state, url) pairs so the state label is visible before the long address.
urls_states = [
    {"url": page_url, "state": state_label}
    for state_label, page_url in (
        ("DC", "https://www.dcregs.dc.gov/Common/DCMR/RuleList.aspx?ChapterNum=29-95&ChapterId=3476"),
        ("Florida", "http://www.leg.state.fl.us/statutes/index.cfm?App_mode=Display_Statute&Search_String=&URL=0400-0499/0409/0409PARTIIIContentsIndex.html"),
        ("Iowa", "https://www.legis.iowa.gov/publications/search?tab=true&rows=10&start=0&sort=lbl%20desc%2Csn%20asc%2Cname%20asc&q=&fq=-status%3A%22Reserved%22%20AND%20-status%3A%22Repealed%22%20AND%20-status%3A%22Rescinded%22&fq=(l5%3A%22law%3A1code%3A1476%3A06%3A0006%3A00249A-1388805%7CCHAPTER%20249A%20MEDICAL%20ASSISTANCE%22"),
        ("Texas", "https://texreg.sos.state.tx.us/public/readtac$ext.ViewTAC?tac_view=3&ti=1&pt=15"),
        # Consecutive listing pages of the Connecticut Uniform Policy Manual
        ("Connecticut", "https://portal.ct.gov/dss/lists/uniform-policy-manual?page=37"),
        ("Connecticut", "https://portal.ct.gov/dss/lists/uniform-policy-manual?page=38"),
        ("Connecticut", "https://portal.ct.gov/dss/lists/uniform-policy-manual?page=39"),
        ("Connecticut", "https://portal.ct.gov/dss/lists/uniform-policy-manual?page=40"),
        ("Connecticut", "https://portal.ct.gov/dss/lists/uniform-policy-manual?page=41"),
        ("Connecticut", "https://portal.ct.gov/dss/lists/uniform-policy-manual?page=42"),
        ("Ohio", "https://codes.ohio.gov/ohio-administrative-code/5160"),
        ("Pennsylvania", "https://www.pacodeandbulletin.gov/Display/pacode?titleNumber=055&file=/secure/pacode/data/055/055toc.html&searchunitkeywords=&operator=OR&title=null"),
    )
]

# Set the output folder for PDF downloads
output_folder = "/content/pdfs"

# Scrape each configured page; every call yields a one-row DataFrame.
# The frames are collected in a list and concatenated once afterwards, instead of
# re-copying a growing DataFrame on every iteration.
state_frames = []
for entry in urls_states:
    state_df = scrape_policies_and_pdfs(entry['url'], entry['state'], output_folder)
    state_frames.append(state_df)

# pd.concat rejects an empty list, so fall back to an empty frame when there is
# nothing to combine.
all_policies = pd.concat(state_frames, ignore_index=True) if state_frames else pd.DataFrame()

# Save the combined data to a single CSV file
all_policies.to_csv("/content/all_policies_with_titles.csv", index=False)
print("All data scraped and saved to 'all_policies_with_titles.csv'.")


All data scraped and saved to 'all_policies_with_titles.csv'.
