In [1]:
# file: vdoe_math_scraper.py

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pathlib import Path
from docx import Document
import fitz  # PyMuPDF

BASE_URL = "https://www.doe.virginia.gov"
TARGET_URL = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"
DOWNLOAD_DIR = Path("downloads")
OUTPUT_DIR = Path("extracted")
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

DOWNLOAD_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)

def get_exemplar_doc_links():
    resp = requests.get(TARGET_URL, headers=HEADERS)
    soup = BeautifulSoup(resp.text, "html.parser")
    print(soup)
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        text = a.text.strip()
        if any(keyword in text.lower() for keyword in [".pdf", ".doc", ".docx"]):
            continue  # skip links with extensions in text, not href
        if any(href.lower().endswith(ext) for ext in [".pdf", ".docx"]):
            links.append(urljoin(BASE_URL, href))
    return links

def download_documents(links):
    for url in links:
        filename = url.split("/")[-1]
        filepath = DOWNLOAD_DIR / filename
        if not filepath.exists():
            r = requests.get(url, headers=HEADERS)
            filepath.write_bytes(r.content)
            print(f"Downloaded: {filename}")
        else:
            print(f"Already exists: {filename}")

def extract_text_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)

def extract_text_docx(docx_path):
    doc = Document(docx_path)
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())

def extract_all_documents():
    for file in DOWNLOAD_DIR.iterdir():
        out_path = OUTPUT_DIR / f"{file.stem}.txt"
        if out_path.exists():
            continue
        if file.suffix.lower() == ".pdf":
            text = extract_text_pdf(file)
        elif file.suffix.lower() == ".docx":
            text = extract_text_docx(file)
        else:
            print(f"Unsupported file type: {file.name}")
            continue
        out_path.write_text(text, encoding="utf-8")
        print(f"Extracted: {out_path.name}")

def main():
    links = get_exemplar_doc_links()
    print(f"Found {len(links)} document links.")
    # download_documents(links)
    # extract_all_documents()

if __name__ == "__main__":
    main()


<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
 
You don't have permission to access "http://www.doe.virginia.gov/$(SERVE_403)/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources?" on this server.<p>
Reference #18.50a6d017.1743634814.2394f02
<p>https://errors.edgesuite.net/18.50a6d017.1743634814.2394f02</p>
</p></body>
</html>

Found 0 document links.


In [5]:
# file: playwright_scrape_vdoe.py

import asyncio
import os
from pathlib import Path
from playwright.async_api import async_playwright

BASE_URL = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"
DOWNLOAD_DIR = Path("downloads")
DOWNLOAD_DIR.mkdir(exist_ok=True)

async def scrape_vdoe_documents():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(accept_downloads=True)
        page = await context.new_page()
        await page.goto(BASE_URL, timeout=60000)
        await page.wait_for_timeout(5000)

        html = await page.content()
        print(html)


        links = await page.eval_on_selector_all(
            "a[href$='.pdf'], a[href$='.docx']",
            "elements => elements.map(e => ({ href: e.href, name: e.textContent.trim() }))"
        )

        print(f"Found {len(links)} documents.")

        for link in links:
            url = link["href"]
            filename = url.split("/")[-1]
            file_path = DOWNLOAD_DIR / filename
            if file_path.exists():
                print(f"Already exists: {filename}")
                continue
            try:
                download = await page.wait_for_event("download", timeout=10000)
                await page.goto(url)
                await download.save_as(file_path)
                print(f"Downloaded: {filename}")
            except Exception as e:
                print(f"Failed: {filename} - {e}")

        await browser.close()

import nest_asyncio
nest_asyncio.apply()

await scrape_vdoe_documents()


<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
 
You don't have permission to access "http://www.doe.virginia.gov/$(SERVE_403)/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources?" on this server.<p>
Reference #18.a8813217.1743635375.e15672b
</p><p>https://errors.edgesuite.net/18.a8813217.1743635375.e15672b</p>


</body></html>
Found 0 documents.


In [3]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
# import PyPDF2
from io import BytesIO
from docx import Document

# Step 1: Access the webpage
url = 'https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)
# Step 2: Locate the Exemplar Mathematics Instructional Plans section
# This will depend on the structure of the webpage; adjust the selector as needed
sections = soup.find_all('section')  # Example; adjust based on actual HTML structure

# Step 3: Extract document links
doc_links = []
for section in sections:
    if 'Exemplar Mathematics Instructional Plans' in section.text:
        links = section.find_all('a', href=True)
        for link in links:
            href = link['href']
            if href.endswith('.pdf') or href.endswith('.docx'):
                doc_links.append(urljoin(url, href))


# # Step 4: Download and parse documents
# for doc_url in doc_links:
#     doc_response = requests.get(doc_url)
#     if doc_url.endswith('.pdf'):
#         # Parse PDF
#         with BytesIO(doc_response.content) as open_pdf_file:
#             reader = PyPDF2.PdfReader(open_pdf_file)
#             text = ''
#             for page in reader.pages:
#                 text += page.extract_text()
#     elif doc_url.endswith('.docx'):
#         # Parse Word document
#         with BytesIO(doc_response.content) as open_docx_file:
#             document = Document(open_docx_file)
#             text = ''
#             for para in document.paragraphs:
#                 text += para.text

#     # Step 5: Store the extracted text
#     # For demonstration, printing the first 500 characters of each document
#     print(f'Contents of {doc_url}:\n{text[:500]}\n{"="*40}\n')


<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
 
You don't have permission to access "http://www.doe.virginia.gov/$(SERVE_403)/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources?" on this server.<p>
Reference #18.a8813217.1743635507.e23a26c
<p>https://errors.edgesuite.net/18.a8813217.1743635507.e23a26c</p>
</p></body>
</html>



In [6]:
import requests
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urljoin
# import PyPDF2
import docx
import io

def download_file(url, save_dir):
    """
    Download a file from URL and save it to the specified directory
    Returns the path to the downloaded file
    """
    # Create directory if it doesn't exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    # Get filename from URL
    filename = os.path.basename(url)
    file_path = os.path.join(save_dir, filename)
    
    # Download the file
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded: {filename}")
        return file_path
    else:
        print(f"Failed to download: {url}")
        return None

def extract_text_from_pdf(file_path):
    """Extract text content from a PDF file"""
    try:
        text = ""
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page_num in range(len(pdf_reader.pages)):
                text += pdf_reader.pages[page_num].extract_text() + "\n"
        return text
    except Exception as e:
        print(f"Error extracting text from PDF {file_path}: {e}")
        return None

def extract_text_from_docx(file_path):
    """Extract text content from a Word document"""
    try:
        doc = docx.Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        print(f"Error extracting text from DOCX {file_path}: {e}")
        return None

def extract_text_from_file(file_path):
    """Extract text from either PDF or DOCX based on file extension"""
    if file_path.lower().endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    elif file_path.lower().endswith(('.docx', '.doc')):
        return extract_text_from_docx(file_path)
    else:
        print(f"Unsupported file type: {file_path}")
        return None

def scrape_virginia_doe_math_resources():
    """
    Scrape the Virginia DOE Mathematics Instructional Resources page,
    download PDF and Word documents from the Exemplar Mathematics Instructional Plans section,
    and extract their text content.
    """
    # URL of the target page
    url = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"
    
    # Directory to save downloaded files
    download_dir = "virginia_doe_math_resources"
    
    # Send request to the webpage
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return
    
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the section "Exemplar Mathematics Instructional Plans"
    exemplar_section = None
    for heading in soup.find_all(['h2', 'h3', 'h4']):
        if "Exemplar Mathematics Instructional Plans" in heading.get_text():
            exemplar_section = heading
            break
    
    if not exemplar_section:
        print("Could not find the 'Exemplar Mathematics Instructional Plans' section.")
        return
    
    # Find all links in the section and following content until the next heading
    links = []
    current = exemplar_section.next_sibling
    
    while current and not current.name in ['h2', 'h3', 'h4']:
        if hasattr(current, 'find_all'):
            for link in current.find_all('a', href=True):
                href = link['href']
                # Check if it's a PDF or Word document link
                if href.lower().endswith(('.pdf', '.doc', '.docx')):
                    full_url = urljoin(url, href)
                    links.append((link.get_text().strip(), full_url))
        
        current = current.next_sibling
    
    if not links:
        print("No document links found in the exemplar section.")
        return
    
    # Download files and extract content
    documents_data = []
    
    for title, link_url in links:
        print(f"Processing: {title} - {link_url}")
        
        # Download the file
        file_path = download_file(link_url, download_dir)
        
        if file_path:
            # Extract text from the file
            content = extract_text_from_file(file_path)
            
            if content:
                # Store the data
                documents_data.append({
                    'title': title,
                    'url': link_url,
                    'file_path': file_path,
                    'content': content
                })
    
    # Return the collected data
    return documents_data

def save_data_to_file(data, output_file="virginia_doe_math_data.txt"):
    """Save the extracted data to a text file"""
    with open(output_file, 'w', encoding='utf-8') as f:
        for doc in data:
            f.write(f"Title: {doc['title']}\n")
            f.write(f"URL: {doc['url']}\n")
            f.write(f"File: {doc['file_path']}\n")
            f.write("Content:\n")
            f.write("-" * 80 + "\n")
            f.write(doc['content'])
            f.write("\n" + "=" * 80 + "\n\n")
    
    print(f"Data saved to {output_file}")

def main():
    print("Starting to scrape Virginia DOE Mathematics Resources...")
    documents_data = scrape_virginia_doe_math_resources()
    
    # if documents_data:
    #     print(f"Successfully scraped {len(documents_data)} documents.")
    #     # Save data to file
    #     save_data_to_file(documents_data)
        
    #     # Example of how to work with the data
    #     print("\nExtracted document titles:")
    #     for doc in documents_data:
    #         print(f"- {doc['title']}")
    # else:
    #     print("No documents were successfully scraped.")

if __name__ == "__main__":
    main()

Starting to scrape Virginia DOE Mathematics Resources...
Failed to retrieve the webpage. Status code: 403


In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

url = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"

# Setup Chrome options (non-headless for bypassing detection)
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--start-maximized")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)

# Initiate browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Navigate to the webpage
driver.get(url)

# Wait longer to ensure the page fully loads
time.sleep(10)

# Get page source
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Check page content
print(soup.prettify()[:3000])

# Close the browser
driver.quit()


<html class="k-webkit k-webkit134" lang="en">
 <head id="Head1">
  <style>
   @charset "UTF-8";[ng\:cloak],[ng-cloak],[data-ng-cloak],[x-ng-cloak],.ng-cloak,.x-ng-cloak,.ng-hide:not(.ng-hide-animate){display:none !important;}ng\:form{display:block;}.ng-animate-shim{visibility:hidden;}.ng-anchor{position:absolute;}
  </style>
  <meta charset="utf-8"/>
  <link href="/DefaultContent/Default/bootstrap.v3.4.1.min.css" rel="stylesheet"/>
  <meta content="IE=9; IE=8; IE=7; IE=EDGE" http-equiv="X-UA-Compatible"/>
  <title>
   2023 Mathematics Instructional Resources | Virginia Department of Education
  </title>
  <script>
   (function() {
            window['angular_base'] = "/";
         })();
  </script>
  <meta content="width=device-width" name="viewport"/>
  <link href="/DefaultContent/Default/StyleBundleDesignTheme.cssbnd?v=VWOqjwtVEIsOlLOO6OsbfgU_dxrQE7HIack0i3iIJJk1" rel="stylesheet"/>
  <link href="/Project/Contents/Main/StyleBundleDesignTheme.cssbnd?v=daTfwoQmZydJ70TdW6MmzxbJWgdW8TS5Q

In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import os
import time
from urllib.parse import urljoin

options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--start-maximized")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"
driver.get(url)

time.sleep(10)

soup = BeautifulSoup(driver.page_source, 'html.parser')

# Find all h4 tags
h4_tags = soup.find_all('h4')

doc_links = []

for h4 in h4_tags:
    # Get the next <ul> after each h4 tag
    ul_tag = h4.find_next_sibling('ul')
    print(ul_tag)
    if ul_tag:
        links = ul_tag.find_all('a', href=True)
        for link in links:
            href = link['href']
            # if href.endswith('.pdf') or href.endswith('.docx'):
            full_url = urljoin(url, href)
            doc_links.append(full_url)

# Download files into a 'downloads' folder
os.makedirs('downloads', exist_ok=True)
for file_url in doc_links:
    file_name = file_url.split('/')[-1]
    print(f"Downloading {file_name}...")
    file_resp = requests.get(file_url)
    if file_resp.status_code == 200:
        with open(os.path.join('downloads', file_name), 'wb') as f:
            f.write(file_resp.content)
        print(f"{file_name} downloaded successfully.")
    else:
        print(f"Failed to download {file_name} (status code: {file_resp.status_code}).")

driver.quit()


<ul>
<li>K.NS.1 - Meaningful Rote Counting (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51701/638398744219930000" target="_self">Word</a> | <a href="https://www.doe.virginia.gov/home/showpublisheddocument/51703/638398744225730000" target="_self">PDF</a>) </li>
<li>K.NS.2 - Number Boards (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51709/638398744243370000" target="_self">Word</a> | <a href="https://www.doe.virginia.gov/home/showpublisheddocument/51711/638398744249770000" target="_self">PDF</a>)</li>
<li>K.NS.2 - Build and Compare (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51705/638398744231330000" target="_self">Word</a> | <a href="https://www.doe.virginia.gov/home/showpublisheddocument/51707/638398744236970000" target="_self">PDF</a>)</li>
<li>K.CE.1 - How Many More to Equal 5? (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51697/638398744208830000" target="_self">Word</a> | <a href="https://www.doe.vi

In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
import os

# Set up download directory
download_dir = os.path.abspath("downloads")
os.makedirs(download_dir, exist_ok=True)

# Configure Selenium Chrome Options for downloading files automatically
options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": download_dir,
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
options.add_experimental_option("prefs", prefs)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"
driver.get(url)
time.sleep(10)

# Extract links and click to download via Selenium directly
h4_tags = driver.find_elements(By.TAG_NAME, 'h4')

for h4 in h4_tags:
    ul = h4.find_element(By.XPATH, 'following-sibling::ul[1]')
    lis = ul.find_elements(By.TAG_NAME, 'li')
    for li in lis:
        lesson_text = li.text.split('(')[0].strip()
        links = li.find_elements(By.TAG_NAME, 'a')
        print(f"\nLesson: {lesson_text}")

        for link in links:
            doc_type = link.text.strip()
            print(f"  Downloading {doc_type} document...")
            # Open link in a new tab
            driver.execute_script("window.open(arguments[0]);", link.get_attribute('href'))
            driver.switch_to.window(driver.window_handles[1])
            time.sleep(5)  # Wait for download to trigger
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

driver.quit()
print(f"All files downloaded to {download_dir}")



Lesson: K.NS.1 - Meaningful Rote Counting
  Downloading Word document...


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=134.0.6998.166)
Stacktrace:
0   chromedriver                        0x00000001049376c8 cxxbridge1$str$ptr + 2791212
1   chromedriver                        0x000000010492fc9c cxxbridge1$str$ptr + 2759936
2   chromedriver                        0x0000000104481e30 cxxbridge1$string$len + 92928
3   chromedriver                        0x000000010445ba00 chromedriver + 129536
4   chromedriver                        0x00000001044f1418 cxxbridge1$string$len + 549096
5   chromedriver                        0x00000001044f8d1c cxxbridge1$string$len + 580076
6   chromedriver                        0x00000001044bd2fc cxxbridge1$string$len + 335820
7   chromedriver                        0x00000001048fc6c4 cxxbridge1$str$ptr + 2549544
8   chromedriver                        0x00000001048ff988 cxxbridge1$str$ptr + 2562540
9   chromedriver                        0x00000001048dc71c cxxbridge1$str$ptr + 2418560
10  chromedriver                        0x00000001049001e8 cxxbridge1$str$ptr + 2564684
11  chromedriver                        0x00000001048cd750 cxxbridge1$str$ptr + 2357172
12  chromedriver                        0x000000010491ff58 cxxbridge1$str$ptr + 2695100
13  chromedriver                        0x00000001049200e0 cxxbridge1$str$ptr + 2695492
14  chromedriver                        0x000000010492f910 cxxbridge1$str$ptr + 2759028
15  libsystem_pthread.dylib             0x000000019c159f94 _pthread_start + 136
16  libsystem_pthread.dylib             0x000000019c154d34 thread_start + 8


In [21]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
import os

# Set up download directory
download_dir = os.path.abspath("downloads")
os.makedirs(download_dir, exist_ok=True)

# Configure Selenium Chrome Options for auto-download
options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": download_dir,
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
options.add_experimental_option("prefs", prefs)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"
driver.get(url)
time.sleep(10)

# Extract links and click to download via Selenium directly
h4_tags = driver.find_elements(By.TAG_NAME, 'h4')

for h4 in h4_tags:
    ul = h4.find_element(By.XPATH, 'following-sibling::ul[1]')
    lis = ul.find_elements(By.TAG_NAME, 'li')
    for li in lis:
        lesson_text = li.text.split('(')[0].strip()
        links = li.find_elements(By.TAG_NAME, 'a')
        print(f"\nLesson: {lesson_text}")

        for link in links:
            doc_type = link.text.strip()
            print(f"  Downloading {doc_type} document...")
            
            initial_windows = driver.window_handles
            driver.execute_script("window.open(arguments[0]);", link.get_attribute('href'))
            time.sleep(5)  # Allow file download to trigger
            
            new_windows = driver.window_handles
            opened_windows = list(set(new_windows) - set(initial_windows))
            
            if opened_windows:
                driver.switch_to.window(opened_windows[0])
                driver.close()
            
            driver.switch_to.window(initial_windows[0])

driver.quit()
print(f"All files downloaded to {download_dir}")



Lesson: K.NS.1 - Meaningful Rote Counting
  Downloading Word document...
  Downloading PDF document...

Lesson: K.NS.2 - Number Boards
  Downloading Word document...
  Downloading PDF document...

Lesson: K.NS.2 - Build and Compare
  Downloading Word document...
  Downloading PDF document...

Lesson: K.CE.1 - How Many More to Equal 5?
  Downloading Word document...
  Downloading PDF document...

Lesson: 1.NS.2 - Counting and Writing Beyond 100
  Downloading Word document...
  Downloading PDF document...

Lesson: 1.NS.2 - Comparing Numbers with Cubes and 10 Frames
  Downloading Word document...
  Downloading PDF document...

Lesson: 1.NS.2 - Is it More or Less?
  Downloading Word document...
  Downloading PDF document...

Lesson: 1.NS.2 - Estimate the Number of Objects to 120
  Downloading Word document...
  Downloading PDF document...

Lesson: 2.NS.1 - Guess My Pattern
  Downloading Word document...
  Downloading PDF document...

Lesson: 2.NS.1 - Even or Odd
  Downloading Word documen

In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
import os
from PyPDF2 import PdfReader
from docx import Document

# --- Setup Download Directory ---
download_dir = os.path.abspath("downloads/mathlp")
os.makedirs(download_dir, exist_ok=True)

# --- Chrome Options for Selenium ---
options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": download_dir,
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
options.add_experimental_option("prefs", prefs)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# --- Access the webpage ---
url = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"
driver.get(url)
time.sleep(10)

# --- Get lesson plans ---
h4_tags = driver.find_elements(By.TAG_NAME, 'h4')
lessons = []

for h4 in h4_tags:
    ul = h4.find_element(By.XPATH, 'following-sibling::ul[1]')
    lis = ul.find_elements(By.TAG_NAME, 'li')
    for li in lis:
        lesson_text = li.text.split('(')[0].strip()
        links = li.find_elements(By.TAG_NAME, 'a')

        for link in links:
            doc_type = link.text.strip()
            if doc_type in ["PDF"]:
                href = link.get_attribute('href')
                lessons.append({"title": lesson_text, "url": href, "type": doc_type})


# --- Download files ---
for lesson in lessons:
    print(f"Downloading {lesson['title']} - {lesson['type']}...")
    initial_windows = driver.window_handles
    driver.execute_script("window.open(arguments[0]);", lesson['url'])
    time.sleep(5)
    new_windows = driver.window_handles
    opened_windows = list(set(new_windows) - set(initial_windows))
    if opened_windows:
        driver.switch_to.window(opened_windows[0])
        driver.close()
    driver.switch_to.window(initial_windows[0])

# Wait a bit to ensure downloads complete
time.sleep(10)
driver.quit()

# --- Extract and Concatenate Text from downloaded files ---
output_text_path = "all_lessons_combined.txt"
with open(output_text_path, 'w', encoding='utf-8') as outfile:
    for lesson in lessons:
        file_ext = ".pdf" if lesson['type'] == "PDF" else ".docx"
        matching_files = [f for f in os.listdir(download_dir) if f.endswith(file_ext)]
        file_path = max([os.path.join(download_dir, f) for f in matching_files], key=os.path.getctime)

        print(f"Processing {file_path}...")
        outfile.write(f"\n{'='*80}\n")
        outfile.write(f"Lesson Title: {lesson['title']} ({lesson['type']})\n")
        outfile.write(f"{'='*80}\n\n")

        try:
            if lesson['type'] == "PDF":
                with open(file_path, 'rb') as f:
                    reader = PdfReader(f)
                    text = "\n".join([page.extract_text() for page in reader.pages])
            else:
                doc = Document(file_path)
                text = "\n".join([para.text for para in doc.paragraphs])

            outfile.write(text.strip())
            outfile.write("\n\n")
        except Exception as e:
            outfile.write(f"Failed to read {lesson['title']} ({lesson['type']}): {e}\n\n")

print(f"\n✅ All lesson texts have been concatenated into '{output_text_path}'.")


Downloading K.NS.1 - Meaningful Rote Counting - PDF...
Downloading K.NS.2 - Number Boards - PDF...


KeyboardInterrupt: 

In [29]:
from PyPDF2 import PdfReader

# Path to your PDF file
pdf_path = "Smart_but_Scattered.pdf"
reader = PdfReader(pdf_path)

# Function to convert book page to PDF page
book_to_pdf_offset = 16  # Book page 11 => PDF page 27

# Define book page ranges you want
book_page_ranges = [(17, 33), (203, 238)]

# Extract the text from those ranges
extracted_text = ""

for start, end in book_page_ranges:
    for book_page in range(start, end + 1):
        # pdf_page = book_page + book_to_pdf_offset - 1
        pdf_page = book_page
        if 0 <= pdf_page < len(reader.pages):
            extracted_text += reader.pages[pdf_page].extract_text() + "\n\n"

# Save to a .txt file
output_path = "Smart_but_Scattered_Extracted.txt"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(extracted_text)

print(f"Extracted text saved to: {output_path}")


Extracted text saved to: Smart_but_Scattered_Extracted.txt


In [30]:
def combine_lesson_plans(pdf_dir, output_file):
    with open(output_file, "w", encoding="utf-8") as outfile:
        for filename in sorted(os.listdir(pdf_dir)):
            if filename.lower().endswith(".pdf"):
                filepath = os.path.join(pdf_dir, filename)
                print(f"Processing: {filename}")
                text = extract_text_from_pdf(filepath)

                outfile.write(f"\n--- Start of Lesson Plan: {filename} ---\n")
                outfile.write(text.strip())
                outfile.write(f"\n--- End of Lesson Plan: {filename} ---\n\n")


combine_lesson_plans("downloads/mathlp", "all_lessons_combined_new.txt")
print(f"All lesson plans combined into 'all_lessons_combined_new.txt'")

Processing: 1NS2amipcountwrite120.pdf
Processing: 1NS2bmipestimateobjects.pdf
Processing: 1NS2efmipmoreorless.pdf
Processing: 1NS2emipcomparewcubes.pdf
Processing: 2CE1dfmipfourinrowa.pdf
Processing: 2NS1admipguessmypattern.pdf
Processing: 2NS1hjmipeveno.pdf
Processing: 2NS2aemipplace valuematactivities.pdf
Processing: 2NS4bdmipracetodollar.pdf
Processing: 2PS1aemipall about the data Slow Reveal PPT (1).pdf
Processing: 2PS1aemipall about the data Slow Reveal PPT.pdf
Processing: 2PS1aemipallaboutthedata.pdf
Processing: 3MG3cmiphoppinelapsedtime.pdf
Processing: 3MG3cmipwheredidtimego.pdf
Processing: 3NS1abcmipPlace Value.pdf
Processing: 3NS1abcmipplacevaluegames.pdf
Processing: 3NS1mipplacevaluemat.pdf
Processing: 3NS2abmipcomparingandorderingnumbers.pdf
Processing: 3NS3dcomposinganddecompsingfractioncenters.pdf
Processing: 3NS3dmipcomposinganddecompsing fractions.pdf
Processing: 4CE1cmipaddsubwholes.pdf
Processing: 4CE4bmipthanksgivinglunch.pdf
Processing: 4MG3emipexploringRelationshipa

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import os
import time
from urllib.parse import urljoin

options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--start-maximized")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"
driver.get(url)

time.sleep(10)

soup = BeautifulSoup(driver.page_source, 'html.parser')

# Find all h4 tags
h4_tags = soup.find_all('h4')

doc_links = []

for h4 in h4_tags:
    # Get the next <ul> after each h4 tag
    ul_tag = h4.find_next_sibling('ul')
    print(ul_tag)
    if ul_tag:
        links = ul_tag.find_all('a', href=True)
        for link in links:
            href = link['href']
            # if href.endswith('.pdf') or href.endswith('.docx'):
            full_url = urljoin(url, href)
            doc_links.append(full_url)

# Download files into a 'downloads' folder
os.makedirs('downloads', exist_ok=True)
for file_url in doc_links:
    file_name = file_url.split('/')[-1]
    print(f"Downloading {file_name}...")
    file_resp = requests.get(file_url)
    if file_resp.status_code == 200:
        with open(os.path.join('downloads', file_name), 'wb') as f:
            f.write(file_resp.content)
        print(f"{file_name} downloaded successfully.")
    else:
        print(f"Failed to download {file_name} (status code: {file_resp.status_code}).")

driver.quit()


<ul>
<li>K.NS.1 - Meaningful Rote Counting (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51701/638398744219930000" target="_self">Word</a> | <a href="https://www.doe.virginia.gov/home/showpublisheddocument/51703/638398744225730000" target="_self">PDF</a>) </li>
<li>K.NS.2 - Number Boards (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51709/638398744243370000" target="_self">Word</a> | <a href="https://www.doe.virginia.gov/home/showpublisheddocument/51711/638398744249770000" target="_self">PDF</a>)</li>
<li>K.NS.2 - Build and Compare (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51705/638398744231330000" target="_self">Word</a> | <a href="https://www.doe.virginia.gov/home/showpublisheddocument/51707/638398744236970000" target="_self">PDF</a>)</li>
<li>K.CE.1 - How Many More to Equal 5? (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51697/638398744208830000" target="_self">Word</a> | <a href="https://www.doe.vi

In [32]:
import os
import fitz  # PyMuPDF

# Directory containing the PDF lesson plans
PDF_DIR = "downloads/matholdlp"  # Change this to your folder path
OUTPUT_FILE = "all_lesson_plans_new_1.txt"

# Stop marker text to halt extraction before appendix pages
STOP_MARKER_1 = "Note: The following pages are intended for classroom use for students as a visual aid to learning"
STOP_MARKER_2 = '''Note: The following pages are intended for classroom use for students as a visual aid to
learning.'''
STOP_MARKER_3 = '''Note: The following pages are intended for classroom use for students as a visual aid to
learning'''

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text_parts = []
    for page in doc:
        page_text = page.get_text()

        # Skip typical headers/footers by removing lines that are repetitive and likely not body content
        lines = page_text.splitlines()
        cleaned_lines = []
        for line in lines:
            # Adjust rules based on typical header/footer content (e.g., page numbers or copyrights)
            line_stripped = line.strip()
            if not line_stripped:
                continue
            if ("Virginia Department of Education" in line_stripped or
                line_stripped.isdigit() or
                line_stripped.lower().startswith("mathematics instructional plan")):
                continue
            cleaned_lines.append(line_stripped)
        
        cleaned_page = "\n".join(cleaned_lines)
        if STOP_MARKER_1 in cleaned_page:
            cleaned_page = cleaned_page.split(STOP_MARKER_1)[0]
            text_parts.append(cleaned_page.strip())
            break
        elif STOP_MARKER_2 in cleaned_page:
            cleaned_page = cleaned_page.split(STOP_MARKER_2)[0]
            text_parts.append(cleaned_page.strip())
            break
        elif STOP_MARKER_3 in cleaned_page:
            cleaned_page = cleaned_page.split(STOP_MARKER_3)[0]
            text_parts.append(cleaned_page.strip())
            break
        text_parts.append(cleaned_page.strip())
    doc.close()
    return "\n".join(text_parts)


def combine_lesson_plans(pdf_dir, output_file):
    with open(output_file, "w", encoding="utf-8") as outfile:
        for filename in sorted(os.listdir(pdf_dir)):
            if filename.lower().endswith(".pdf"):
                filepath = os.path.join(pdf_dir, filename)
                print(f"Processing: {filename}")
                text = extract_text_from_pdf(filepath)

                outfile.write(f"\n--- Start of Lesson Plan {filename} ---\n")
                outfile.write(text)
                outfile.write(f"\n--- End of Lesson Plan {filename} ---\n\n")


combine_lesson_plans(PDF_DIR, OUTPUT_FILE)
print(f"All lesson plans combined into '{OUTPUT_FILE}'")


Processing: Algebra I-Equations and Inequalities-A.4ae - Progressing Through Equations.pdf
Processing: Algebra I-Equations and Inequalities-A.4be - Solving Quadratic Equations Using Square Roots and the Quadratic Formula.pdf
Processing: Algebra I-Equations and Inequalities-A.4be - Solving Quadratic Equations by Factoring.pdf
Processing: Algebra I-Equations and Inequalities-A.4c - Literal Equations and Formulas.pdf
Processing: Algebra I-Equations and Inequalities-A.4de - Road Trip: Applying Systems of Linear Equations.pdf
Processing: Algebra I-Equations and Inequalities-A.4de - Spring Fling Carnival: Applying Systems of Linear Equations.pdf
Processing: Algebra I-Equations and Inequalities-A.5a - Solving Linear Inequalities in One Variable.pdf
Processing: Algebra I-Equations and Inequalities-A.5ac - Lemonade Stand: Solving Practical Problems Using Linear Inequalities in One Variable.pdf
Processing: Algebra I-Equations and Inequalities-A.5b - Represent the Solution of a Linear Inequality 

In [35]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path, output_path, page_ranges):
    reader = PdfReader(pdf_path)
    extracted_text = ""

    for page_num in page_ranges:
        if 0 <= page_num < len(reader.pages):
            page = reader.pages[page_num]
            extracted_text += page.extract_text() + "\n\n"

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)
    print(f"Extracted text saved to: {output_path}")

pdf_file = "Smart_but_Scattered.pdf"  # Update path if needed
output_file = "extracted_text.txt"
page_ranges = list(range(17, 33)) + list(range(203, 238))  # Pages 18–32 and 204–237
extract_text_from_pdf(pdf_file, output_file, page_ranges)

Extracted text saved to: extracted_text.txt


In [40]:
from PyPDF2 import PdfReader
import re

def clean_page_text(text):
    lines = text.splitlines()
    cleaned_lines = []

    for line in lines:
        line = line.strip()
        # Skip empty lines and lines that are likely page headers/footers/page numbers
        if not line or re.fullmatch(r"\d+", line) or len(line) < 5:
            continue
        # Remove running headers/footers if known patterns exist
        if re.search(r"Smart but Scattered", line, re.IGNORECASE):
            continue
        cleaned_lines.append(line)

    return " ".join(cleaned_lines)

def extract_and_clean(pdf_path, output_path, page_ranges):
    reader = PdfReader(pdf_path)
    extracted_text = ""

    for page_num in page_ranges:
        if 0 <= page_num < len(reader.pages):
            raw_text = reader.pages[page_num].extract_text()
            if raw_text:
                cleaned = clean_page_text(raw_text)
                extracted_text += cleaned + "\n\n"

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)
    print(f"Cleaned and extracted text saved to: {output_path}")

pdf_path = "Smart_but_Scattered.pdf"
output_path = "extracted_text_cleaned_final.txt"
page_ranges =  list(range(18, 32)) + list(range(197, 288))
extract_and_clean(pdf_path, output_path, page_ranges)

Cleaned and extracted text saved to: extracted_text_cleaned_final.txt


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import os
import time
from urllib.parse import urljoin

options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--start-maximized")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/2023-sol-instructional-resources"
driver.get(url)

time.sleep(10)

soup = BeautifulSoup(driver.page_source, 'html.parser')

# Find all h4 tags
h4_tags = soup.find_all('h4')

doc_links = []

for h4 in h4_tags:
    # Get the next <ul> after each h4 tag
    ul_tag = h4.find_next_sibling('ul')
    print(ul_tag)
    if ul_tag:
        links = ul_tag.find_all('a', href=True)
        for link in links:
            href = link['href']
            # if href.endswith('.pdf') or href.endswith('.docx'):
            full_url = urljoin(url, href)
            doc_links.append(full_url)

# Download files into a 'downloads' folder
os.makedirs('downloads', exist_ok=True)
for file_url in doc_links:
    file_name = file_url.split('/')[-1]
    print(f"Downloading {file_name}...")
    file_resp = requests.get(file_url)
    if file_resp.status_code == 200:
        with open(os.path.join('downloads', file_name), 'wb') as f:
            f.write(file_resp.content)
        print(f"{file_name} downloaded successfully.")
    else:
        print(f"Failed to download {file_name} (status code: {file_resp.status_code}).")

driver.quit()


<ul>
<li>K.NS.1 - Meaningful Rote Counting (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51701/638398744219930000" target="_self">Word</a> | <a href="https://www.doe.virginia.gov/home/showpublisheddocument/51703/638398744225730000" target="_self">PDF</a>) </li>
<li>K.NS.2 - Number Boards (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51709/638398744243370000" target="_self">Word</a> | <a href="https://www.doe.virginia.gov/home/showpublisheddocument/51711/638398744249770000" target="_self">PDF</a>)</li>
<li>K.NS.2 - Build and Compare (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51705/638398744231330000" target="_self">Word</a> | <a href="https://www.doe.virginia.gov/home/showpublisheddocument/51707/638398744236970000" target="_self">PDF</a>)</li>
<li>K.CE.1 - How Many More to Equal 5? (<a href="https://www.doe.virginia.gov/home/showpublisheddocument/51697/638398744208830000" target="_self">Word</a> | <a href="https://www.doe.vi

In [41]:
# file: extract_clean_pdf_text.py

import fitz  # PyMuPDF

def extract_and_clean_text(pdf_path: str, page_ranges: list[tuple[int, int]]) -> str:
    doc = fitz.open(pdf_path)
    pages_to_extract = []
    for start, end in page_ranges:
        pages_to_extract.extend(range(start - 1, end))  # zero-based

    content = []
    for page_num in pages_to_extract:
        page = doc[page_num]
        blocks = page.get_text("blocks")
        blocks = sorted(blocks, key=lambda b: b[1])  # sort top to bottom

        # Filter out headers/footers using y-coordinates
        body_blocks = [b[4].strip() for b in blocks if 50 < b[1] < page.rect.height - 50]
        content.append("\n".join(body_blocks))

    full_text = "\n\n".join(content)

    # Cleanup spacing and formatting
    full_text = re.sub(r'([,.!?])(?=\S)', r'\1 ', full_text)
    full_text = re.sub(r" {2,}", " ", full_text)
    full_text = re.sub(r"(?<!\n)\n(?!\n)", " ", full_text)
    full_text = re.sub(r"\n{2,}", "\n\n", full_text)

    return full_text.strip()

import re

input_pdf = "Smart_but_Scattered.pdf"
output_txt = "extracted_clean_text.txt"
page_ranges = [(18, 32), (197, 288)]

cleaned_text = extract_and_clean_text(input_pdf, page_ranges)

with open(output_txt, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print(f"Cleaned text extracted and saved to '{output_txt}'")


Cleaned text extracted and saved to 'extracted_clean_text.txt'


In [42]:
# file: extract_clean_pdf_text.py

import fitz  # PyMuPDF
import re
from collections import Counter
from typing import List, Tuple

def extract_and_clean_text(pdf_path: str, page_ranges: List[Tuple[int, int]], debug: bool = False) -> str:
    doc = fitz.open(pdf_path)
    pages_to_extract = []
    for start, end in page_ranges:
        pages_to_extract.extend(range(start - 1, end))  # zero-based

    header_footer_candidates = Counter()
    page_texts = []

    for page_num in pages_to_extract:
        page = doc[page_num]
        blocks = page.get_text("blocks")
        blocks = sorted(blocks, key=lambda b: b[1])  # sort by vertical position

        text_lines = []
        for b in blocks:
            y0 = b[1]
            y1 = b[3]
            text = b[4].strip()
            if not text or len(text) < 10:
                continue  # skip empty or tiny blocks
            if y0 < 100 or y1 > page.rect.height - 100:
                header_footer_candidates[text] += 1
            else:
                text_lines.append(text)

        page_texts.append(text_lines)

    # Identify repetitive headers/footers
    repeated_lines = {line for line, count in header_footer_candidates.items() if count > 3}

    content = []
    for text_lines in page_texts:
        filtered_lines = [line for line in text_lines if line not in repeated_lines]
        content.append("\n".join(filtered_lines))

    full_text = "\n\n".join(content)

    # Clean spacing and punctuation
    full_text = re.sub(r'([,.!?])(?=\S)', r'\1 ', full_text)
    full_text = re.sub(r" {2,}", " ", full_text)
    full_text = re.sub(r"(?<!\n)\n(?!\n)", " ", full_text)
    full_text = re.sub(r"\n{2,}", "\n\n", full_text)

    if debug:
        with open("debug_raw_pages.txt", "w", encoding="utf-8") as dbg:
            for i, lines in enumerate(page_texts):
                dbg.write(f"--- Page {pages_to_extract[i] + 1} ---\n")
                dbg.write("\n".join(lines) + "\n\n")

    return full_text.strip()

input_pdf = "Smart_but_Scattered.pdf"
output_txt = "extracted_clean_text_1.txt"
page_ranges = [(18, 32), (197, 288)]

cleaned_text = extract_and_clean_text(input_pdf, page_ranges, debug=True)

with open(output_txt, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print(f"Cleaned text extracted and saved to '{output_txt}'")


Cleaned text extracted and saved to 'extracted_clean_text_1.txt'


In [43]:
# file: extract_clean_pdf_text.py

import fitz  # PyMuPDF
import re
from collections import Counter
from typing import List, Tuple

def extract_and_clean_text(pdf_path: str, page_ranges: List[Tuple[int, int]], debug: bool = False) -> str:
    doc = fitz.open(pdf_path)
    pages_to_extract = []
    for start, end in page_ranges:
        pages_to_extract.extend(range(start - 1, end))  # zero-based

    header_footer_candidates = Counter()
    page_texts = []
    chapter_titles = {}

    for page_num in pages_to_extract:
        page = doc[page_num]
        blocks = page.get_text("blocks")
        blocks = sorted(blocks, key=lambda b: b[1])  # sort by vertical position

        text_lines = []
        top_block = None

        for b in blocks:
            y0 = b[1]
            y1 = b[3]
            text = b[4].strip()
            if not text or len(text) < 10:
                continue
            if y0 < 100 or y1 > page.rect.height - 100:
                header_footer_candidates[text] += 1
            else:
                if top_block is None or y0 < top_block[1]:
                    top_block = (text, y0)
                text_lines.append(text)

        if top_block:
            chapter_titles[page_num] = top_block[0]
        page_texts.append(text_lines)

    # Identify repetitive headers/footers
    repeated_lines = {line for line, count in header_footer_candidates.items() if count > 3}

    content = []
    for idx, text_lines in enumerate(page_texts):
        page_num = pages_to_extract[idx]
        filtered_lines = [line for line in text_lines if line not in repeated_lines]
        if filtered_lines:
            if idx > 0:
                chapter_heading = chapter_titles.get(page_num, "Unknown Chapter Title")
                content.append(f"\n\n--- CHAPTER: {chapter_heading} (Page {page_num+1}) ---\n\n")
            content.append("\n".join(filtered_lines))

    full_text = "\n\n".join(content)

    # Clean spacing and punctuation
    full_text = re.sub(r'([,.!?])(?=\S)', r'\1 ', full_text)
    full_text = re.sub(r" {2,}", " ", full_text)
    full_text = re.sub(r"(?<!\n)\n(?!\n)", " ", full_text)
    full_text = re.sub(r"\n{2,}", "\n\n", full_text)

    if debug:
        with open("debug_raw_pages.txt", "w", encoding="utf-8") as dbg:
            for i, lines in enumerate(page_texts):
                dbg.write(f"--- Page {pages_to_extract[i] + 1} ---\n")
                dbg.write("\n".join(lines) + "\n\n")

    return full_text.strip()

input_pdf = "Smart_but_Scattered.pdf"
output_txt = "extracted_clean_text_3.txt"
page_ranges = [(18, 32), (197, 288)]

cleaned_text = extract_and_clean_text(input_pdf, page_ranges, debug=True)

with open(output_txt, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print(f"Cleaned text extracted and saved to '{output_txt}'")


Cleaned text extracted and saved to 'extracted_clean_text_3.txt'


In [46]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Constants
BASE_URL = "https://www.doe.virginia.gov/teaching-learning-assessment/k-12-standards-instruction/mathematics/instructional-resources/mathematics-instructional-plans"
DOWNLOAD_DIR = "downloads/matholdlp"

# Create download directory if needed
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Configure Selenium
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
driver.get(BASE_URL)
wait = WebDriverWait(driver, 15)

# PDF downloader session
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
})

# Locate all sidebar section links (Grades, Geometry, etc.)
section_links = wait.until(EC.presence_of_all_elements_located(
    (By.CSS_SELECTOR, "nav[aria-label='Page Contents Menu'] a")
))

visited = set()
for link in section_links:
    href = link.get_attribute("href")
    section_name = link.text.strip()
    if not href or href in visited:
        continue
    visited.add(href)

    try:
        driver.execute_script("arguments[0].scrollIntoView(true);", link)
        link.click()
        time.sleep(2)
    except Exception as e:
        print(f"Error clicking {section_name}: {e}")
        continue

    pdf_links = driver.find_elements(By.XPATH, "//a[contains(text(), 'PDF Version')]")
    for pdf in pdf_links:
        pdf_url = pdf.get_attribute("href")
        if not pdf_url or not pdf_url.endswith(".pdf"):
            continue

        try:
            label = pdf.find_element(By.XPATH, "./preceding-sibling::a[1]").text.strip()
            filename = f"{label.replace(' ', '_').replace('/', '_')}.pdf"
        except:
            filename = os.path.basename(pdf_url.split("?")[0])

        save_path = os.path.join(DOWNLOAD_DIR, filename)
        if os.path.exists(save_path):
            continue

        try:
            response = session.get(pdf_url, timeout=20)
            if response.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(response.content)
                print(f"Downloaded: {filename}")
            else:
                print(f"Failed ({response.status_code}): {pdf_url}")
        except Exception as e:
            print(f"Download error: {pdf_url} — {e}")

# Cleanup
driver.quit()


TimeoutException: Message: 


In [50]:
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Constants
HTML_FILE = "downloads/Mathematics Instructional Plans (MIPs) _ Virginia Department of Education.html"
BASE_URL = "https://www.doe.virginia.gov"
DOWNLOAD_DIR = "downloads/matholdlp"

# Create download directory if needed
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Read HTML file
with open(HTML_FILE, "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# PDF downloader session
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Referer": BASE_URL
})

# Locate all section headers
sections = soup.select("h2, h3")

# Map of section headers to their contained PDF links
pdf_tasks = []
current_section = "General"

for tag in soup.find_all():
    if tag.name in ["h2", "h3"]:
        current_section = tag.get_text(strip=True)
    if tag.name == "a" and tag.text and "PDF Version" in tag.text:
        raw_url = tag.get("href")
        pdf_url = urljoin(BASE_URL, raw_url)

        try:
            label_tag = tag.find_previous_sibling("a")
            label = label_tag.get_text(strip=True) if label_tag else "document"
        except:
            label = "document"

        topic = current_section.replace(' ', '_').replace('/', '_')
        label_clean = label.replace(' ', '_').replace('/', '_')
        filename = f"{topic}_{label_clean}.pdf"
        pdf_tasks.append((pdf_url, filename))

# Download PDFs
for pdf_url, filename in pdf_tasks:
    save_path = os.path.join(DOWNLOAD_DIR, filename)
    if os.path.exists(save_path):
        continue

    try:
        response = session.get(pdf_url, timeout=20)
        if response.status_code == 200:
            with open(save_path, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {filename}")
        else:
            print(f"Failed ({response.status_code}): {pdf_url}")
    except Exception as e:
        print(f"Download error: {pdf_url} — {e}")


Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16306/638036744570400000
Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16318/638036744599000000
Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16310/638036744579170000
Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16314/638036744589770000
Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16322/638036744608370000
Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16330/638036744625870000
Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16332/638036744630570000
Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16326/638036744616500000
Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16338/638036744643870000
Failed (403): https://www.doe.virginia.gov/home/showpublisheddocument/16342/638036744653230000
Failed (403): https://www.doe.virginia.gov/home/sh

In [52]:
import os
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Constants
HTML_FILE = os.path.abspath("downloads/Mathematics Instructional Plans (MIPs) _ Virginia Department of Education.html")
DOWNLOAD_DIR = os.path.abspath("downloads/matholdlp")

# Ensure download folder exists
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Setup Selenium Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": DOWNLOAD_DIR,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True
})

# Launch driver
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
driver = webdriver.Chrome(service=Service(ChromeDriverManager(architecture='arm64').install()), options=options)

file_url = f"file://{HTML_FILE}"
driver.get(file_url)
time.sleep(2)

# Use BeautifulSoup to determine section context
with open(HTML_FILE, "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file.read(), "html.parser")

section_map = {}
current_section = "General"
for tag in soup.find_all():
    if tag.name in ["h2", "h3"]:
        current_section = tag.get_text(strip=True)
    if tag.name == "a" and tag.text and "PDF Version" in tag.text:
        section_map[tag.text.strip()] = current_section

# Find and click all PDF links
links = driver.find_elements(By.XPATH, "//a[contains(text(), 'PDF Version')]")
for i, link in enumerate(links):
    try:
        label = link.text.strip()
        topic = section_map.get(label, "General").replace(" ", "_").replace("/", "_")
        filename = f"{topic}_document_{i+1}.pdf"

        driver.execute_script("arguments[0].scrollIntoView(true);", link)
        link.click()
        print(f"Triggered download for: {filename}")
        time.sleep(1.5)  # Give time for download to start
    except Exception as e:
        print(f"Error clicking link {i+1}: {e}")

# Done
driver.quit()
print("All downloads triggered.")


TypeError: ChromeDriverManager.__init__() got an unexpected keyword argument 'architecture'

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
import os
from PyPDF2 import PdfReader
from docx import Document

download_dir = os.path.abspath("downloads/matholdlp")
os.makedirs(download_dir, exist_ok=True)

HTML_FILE = os.path.abspath("downloads/Mathematics Instructional Plans (MIPs) _ Virginia Department of Education.html")
DOWNLOAD_DIR = os.path.abspath("downloads/matholdlp")

# --- Chrome Options for Selenium ---
options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": download_dir,
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
options.add_experimental_option("prefs", prefs)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

file_url = f"file://{HTML_FILE}"
driver.get(file_url)
time.sleep(2)

WebDriverException: Message: Can not connect to the Service /Users/kshitijnarvekar/.wdm/drivers/chromedriver/mac64/134.0.6998.165/chromedriver-mac-arm64/chromedriver


In [17]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# --- Setup Download Directory ---
download_dir = os.path.abspath("downloads/matholdlp")
os.makedirs(download_dir, exist_ok=True)

# --- Chrome Options for Selenium ---
options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": download_dir,
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
options.add_experimental_option("prefs", prefs)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# --- Open local HTML file ---
html_path = os.path.abspath("downloads/Mathematics Instructional Plans (MIPs) _ Virginia Department of Education.html")
driver.get(f"file://{html_path}")
time.sleep(3)

# --- Extract Section -> PDF Links ---
h2_headers = driver.find_elements(By.TAG_NAME, "h2") + driver.find_elements(By.TAG_NAME, "h3")
section = "General"
links_data = []

for el in driver.find_elements(By.TAG_NAME, "*"):
    if el.tag_name in ["h2", "h3"]:
        section = el.text.strip()
    if el.tag_name == "a" and "PDF Version" in el.text:
        title_el = el.find_element(By.XPATH, "./preceding-sibling::a[1]") if len(el.find_elements(By.XPATH, "./preceding-sibling::a")) > 0 else None
        title = title_el.text.strip() if title_el else "Document"
        href = el.get_attribute("href")
        links_data.append({"section": section, "title": title, "url": href})

# --- Click to trigger download ---
# print(links_data)
for lesson in links_data:
    print(f"Downloading {lesson['title']} - {lesson['section']}...")
    initial_windows = driver.window_handles
    driver.execute_script("window.open(arguments[0]);", lesson['url'])
    time.sleep(5)
    new_windows = driver.window_handles
    opened_windows = list(set(new_windows) - set(initial_windows))
    if opened_windows:
        driver.switch_to.window(opened_windows[0])
        driver.close()
    driver.switch_to.window(initial_windows[0])

# --- Wait for all downloads ---
time.sleep(10)
driver.quit()

print("All downloads triggered.")


Downloading Word - Number and Number Sense...
Downloading Word - Number and Number Sense...
Downloading Word - Number and Number Sense...
Downloading Word - Number and Number Sense...
Downloading Word - Number and Number Sense...
Downloading Word - Number and Number Sense...
Downloading Word - Number and Number Sense...


KeyboardInterrupt: 

In [18]:
# filepath: scripts/download_math_mips.py

import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.webdriver import Options
from webdriver_manager.chrome import ChromeDriverManager

# --- Setup Download Directory ---
download_dir = os.path.abspath("downloads/matholdlp")
os.makedirs(download_dir, exist_ok=True)

# --- Chrome Options for Selenium ---
options = Options()
prefs = {
    "download.default_directory": download_dir,
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
options.add_experimental_option("prefs", prefs)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# --- Open local HTML file ---
html_path = os.path.abspath("downloads/Mathematics Instructional Plans (MIPs) _ Virginia Department of Education.html")
driver.get(f"file://{html_path}")
time.sleep(2)

# --- Track headers and extract PDF links with full context ---
grade = ""
section = ""
links_data = []

elements = driver.find_elements(By.XPATH, "//*")

for el in elements:
    tag = el.tag_name.lower()

    if tag == "h2":
        grade = el.text.strip()
    elif tag == "h3":
        section = el.text.strip()
    elif tag == "a" and "PDF Version" in el.text:
        try:
            # Get preceding sibling text node or parent li
            parent_li = el.find_element(By.XPATH, "./ancestor::li[1]")
            full_text = parent_li.text.strip()
            title = full_text.split(" (")[0].split(" /")[0]
        except:
            title = "Unknown"

        links_data.append({
            "grade": grade,
            "section": section,
            "title": title,
            "url": el.get_attribute("href")
        })

# --- Trigger Downloads ---
for lesson in links_data:
    filename = f"{lesson['grade']}-{lesson['section']}-{lesson['title']}.pdf"
    print(f"Downloading: {filename}")
    initial_windows = driver.window_handles
    driver.execute_script("window.open(arguments[0]);", lesson['url'])
    time.sleep(3)
    new_windows = driver.window_handles
    new_tab = list(set(new_windows) - set(initial_windows))
    if new_tab:
        driver.switch_to.window(new_tab[0])
        driver.close()
    driver.switch_to.window(initial_windows[0])

# --- Wait for Downloads ---
time.sleep(10)
driver.quit()

print("✅ All downloads triggered.")


Downloading: Kindergarten-Number and Number Sense-K.1ab - How Many? Counting Centers.pdf
Downloading: Kindergarten-Number and Number Sense-K.1ab - Number Designs – Counting Centers.pdf
Downloading: Kindergarten-Number and Number Sense-K.1ab - Lily Pad Hop.pdf
Downloading: Kindergarten-Number and Number Sense-K.1ab - Number Boards.pdf
Downloading: Kindergarten-Number and Number Sense-K.2a - Build and Compare.pdf
Downloading: Kindergarten-Number and Number Sense-K.2a - How Many Snails?.pdf
Downloading: Kindergarten-Number and Number Sense-K.2a - Splash!.pdf
Downloading: Kindergarten-Number and Number Sense-K.2a - More, Fewer, or the Same?.pdf
Downloading: Kindergarten-Number and Number Sense-K.2b - Ordering Sets.pdf
Downloading: Kindergarten-Number and Number Sense-K.3abc - Garbage.pdf
Downloading: Kindergarten-Number and Number Sense-K.3abd - Meaningful Rote Counting.pdf
Downloading: Kindergarten-Number and Number Sense-K.4ab - Bears in Caves.pdf
Downloading: Kindergarten-Number and Num

KeyboardInterrupt: 

In [21]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.webdriver import Options
from webdriver_manager.chrome import ChromeDriverManager
import glob
import shutil

download_dir = os.path.abspath("downloads/matholdlp")
os.makedirs(download_dir, exist_ok=True)

options = Options()
prefs = {
    "download.default_directory": download_dir,
    "plugins.always_open_pdf_externally": True,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True
}
options.add_experimental_option("prefs", prefs)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

html_path = os.path.abspath("downloads/Mathematics Instructional Plans (MIPs) _ Virginia Department of Education.html")
driver.get(f"file://{html_path}")
time.sleep(2)

grade = ""
section = ""
links_data = []

elements = driver.find_elements(By.XPATH, "//*")

for el in elements:
    tag = el.tag_name.lower()

    if tag == "h2":
        grade = el.text.strip()
    elif tag == "h3":
        section = el.text.strip()
    elif tag == "a" and "PDF Version" in el.text:
        try:
            # Get preceding sibling text node or parent li
            parent_li = el.find_element(By.XPATH, "./ancestor::li[1]")
            full_text = parent_li.text.strip()
            title = full_text.split(" (")[0].split(" /")[0]
        except:
            title = "Unknown"

        links_data.append({
            "grade": grade,
            "section": section,
            "title": title,
            "url": el.get_attribute("href")
        })
existing_files = set(glob.glob(os.path.join(download_dir, "*.pdf")))

for lesson in links_data:
    filename = f"{lesson['grade']}-{lesson['section']}-{lesson['title']}.pdf"
    print(f"Downloading: {filename}")
    initial_windows = driver.window_handles
    driver.execute_script("window.open(arguments[0]);", lesson['url'])
    time.sleep(3)
    new_windows = driver.window_handles
    new_tab = list(set(new_windows) - set(initial_windows))
    if new_tab:
        driver.switch_to.window(new_tab[0])
        driver.close()
    driver.switch_to.window(initial_windows[0])

time.sleep(50)
driver.quit()

new_files = list(set(glob.glob(os.path.join(download_dir, "*.pdf"))) - existing_files)
new_files.sort(key=os.path.getctime)

for downloaded, lesson in zip(new_files, links_data):
    desired_name = f"{lesson['grade']}-{lesson['section']}-{lesson['title']}.pdf"
    desired_name = desired_name.replace("?", "").replace("/", "-")  # sanitize
    target_path = os.path.join(download_dir, desired_name)
    shutil.move(downloaded, target_path)



Downloading: Kindergarten-Number and Number Sense-K.1ab - How Many? Counting Centers.pdf
Downloading: Kindergarten-Number and Number Sense-K.1ab - Number Designs – Counting Centers.pdf
Downloading: Kindergarten-Number and Number Sense-K.1ab - Lily Pad Hop.pdf
Downloading: Kindergarten-Number and Number Sense-K.1ab - Number Boards.pdf
Downloading: Kindergarten-Number and Number Sense-K.2a - Build and Compare.pdf
Downloading: Kindergarten-Number and Number Sense-K.2a - How Many Snails?.pdf
Downloading: Kindergarten-Number and Number Sense-K.2a - Splash!.pdf
Downloading: Kindergarten-Number and Number Sense-K.2a - More, Fewer, or the Same?.pdf
Downloading: Kindergarten-Number and Number Sense-K.2b - Ordering Sets.pdf
Downloading: Kindergarten-Number and Number Sense-K.3abc - Garbage.pdf
Downloading: Kindergarten-Number and Number Sense-K.3abd - Meaningful Rote Counting.pdf
Downloading: Kindergarten-Number and Number Sense-K.4ab - Bears in Caves.pdf
Downloading: Kindergarten-Number and Num

In [40]:
from pathlib import Path
import fitz  # PyMuPDF
import re

# Load PDF
pdf_path = Path("downloads/Team-Building-Activities.pdf")
doc = fitz.open(pdf_path)

# Extract all text from the PDF
raw_text = "\n".join([page.get_text() for page in doc])

# Identify and extract each activity using known pattern "This is:" followed by standard sections
activity_splits = re.split(r"(?=\n[A-Z][A-Za-z\s&]+\nThis is:)\s*", raw_text)

formatted_activities = []

for block in activity_splits:
    title_match = re.search(r"^([A-Z][A-Za-z\s&]+)\nThis is:", block)
    if not title_match:
        continue

    title = title_match.group(1).strip()
    objective = re.search(r"The Purpose is:(.*?)\n\s*Materials", block, re.DOTALL)
    materials = re.search(r"Materials:(.*?)\n\s*How To", block, re.DOTALL)
    instructions = re.search(r"How To:(.*?)(\n\s*(Ask these Questions|Tips for Success|Variations|Examples of use))", block, re.DOTALL)
    debrief = re.search(r"Ask these Questions:(.*?)(\n\s*(Tips for Success|Variations|\Z))", block, re.DOTALL)


    def clean(text):
        return re.sub(r"\s+", " ", text.strip()) if text else "N/A"

    formatted = f"""
Title: {title}
Objective: {clean(objective.group(1))}
Materials Needed: {clean(materials.group(1))}
Instructions: {clean(instructions.group(1))}
Debrief / Discussion Points: {clean(debrief.group(1))}
"""
    formatted_activities.append(formatted.strip())

# Write all activities to a text file
output_path = Path("Icebreaker_Activities_All.txt")
with open(output_path, "w") as f:
    f.write("\n\n".join(formatted_activities))

output_path

PosixPath('Icebreaker_Activities_All.txt')

In [42]:
from pathlib import Path
import fitz  # PyMuPDF
import re

# Load PDF
pdf_path = Path("downloads/Team-Building-Activities.pdf")
doc = fitz.open(pdf_path)

# Extract all text from the PDF
raw_text = "\n".join([page.get_text() for page in doc])

# Identify and extract each activity using known pattern "This is:" followed by standard sections
activity_splits = re.split(r"(?=\n[A-Z][A-Za-z\s&]+\nThis is:)\s*", raw_text)

formatted_activities = []

for block in activity_splits:
    title_match = re.search(r"^([A-Z][A-Za-z\s&]+)\nThis is:", block)
    if not title_match:
        continue

    title = title_match.group(1).strip()
    objective = re.search(r"The Purpose is:(.*?)\n\s*Materials", block, re.DOTALL)
    materials = re.search(r"Materials:(.*?)\n\s*How To", block, re.DOTALL)
    instructions = re.search(r"How To:(.*?)(\n\s*(Ask these Questions|Tips for Success|Variations|Examples of use))", block, re.DOTALL)
    debrief = re.search(r"Ask these Questions:(.*?)(\n\s*(Tips for Success|Variations|\Z))", block, re.DOTALL)
    tips = re.search(r"Tips for Success:(.*?)(\n\s*(Variations|\Z))", block, re.DOTALL)
    variations = re.search(r"Variations:(.*?)(\n\s*(\Z))", block, re.DOTALL)

    def clean(text):
        if not text:
            return "N/A"
        text = re.sub(r"[^\w\s,.!?-]", "", text)  # Remove special characters except basic punctuation
        return re.sub(r"\s+", " ", text.strip())

    additional_details = []
    if tips:
        additional_details.append(f"Tips for Success: {clean(tips.group(1))}")
    if variations:
        additional_details.append(f"Variations: {clean(variations.group(1))}")

    formatted = f"""
Title: {clean(title)}
Objective: {clean(objective.group(1))}
Materials Needed: {clean(materials.group(1))}
Instructions: {clean(instructions.group(1))}
Debrief / Discussion Points: {clean(debrief.group(1))}
Additional Details: {" | ".join(additional_details) if additional_details else "N/A"}
"""
    formatted_activities.append(formatted.strip())

# Write all activities to a text file
output_path = Path("Icebreaker_Activities_All.txt")
with open(output_path, "w") as f:
    f.write("\n\n".join(formatted_activities))

output_path


PosixPath('Icebreaker_Activities_All.txt')