In [1]:
import json
import os
import time
import webbrowser
from html import escape
from pathlib import Path
from urllib.parse import quote

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Tip: running Edge in headless mode may also reduce the total run time.

# ✅ Read the topic list (name + URL per entry); the JSON file is produced by
# the code in the previous notebook.
with open("topics_fresh.json", "r", encoding="utf-8") as topics_file:
    topics = json.loads(topics_file.read())

# ✅ Remember the working directory so the index page is saved next to the
# per-topic HTML files.
current_directory = os.getcwd()

# ✅ Replace this with the path to your own Edge driver binary.
EDGE_DRIVER_PATH = r"msedgedriver.exe"

# ✅ Edge WebDriver configuration (Chrome is more convenient for downloading
# PDFs without a prompt).
edge_options = Options()
# "--kiosk-printing" allows printing without pop-ups.
edge_options.add_argument("--kiosk-printing")

# ✅ Launch the Edge WebDriver with the driver binary and options above.
service = Service(EDGE_DRIVER_PATH)
driver = webdriver.Edge(options=edge_options, service=service)


# Inline <style> block for the generated index page; it is interpolated into
# that page's <head> when the index HTML is assembled.
index_css_style = """
<style>
    body { font-family: Arial, sans-serif; margin: 40px; padding: 20px; background: #f4f4f4; color: #333; }
    h1 { color: #0073e6; text-align: center; }
    .index-container { max-width: 800px; margin: auto; background: white; padding: 20px; border-radius: 8px; box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1); }
    ul { list-style: none; padding: 0; }
    li { padding: 10px; border-bottom: 1px solid #ddd; transition: background 0.3s; }
    li:hover { background: #f0f0f0; }
    a { text-decoration: none; font-size: 18px; color: #0073e6; font-weight: bold; }
    a:hover { text-decoration: underline; }
</style>
"""

# One "<li><a href=...>" entry is appended here for every topic page that is
# saved; the entries are later joined into the index page's <ul>.
index_links = []

# Scrape every topic: load its URL, wait for the content container, save that
# container as a standalone HTML page with a "Download as PDF" button, and
# record a link to the saved page for the index.

# Characters that Windows rejects in file names or that act as path
# separators. Each is replaced with "_" so open() cannot fail on a topic name
# and cannot write outside the working directory. Names that were already
# valid are left unchanged.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})

try:
    for topic in topics:
        # Safe filename: newlines/spaces become "_", forbidden characters too.
        name = (
            topic["name"]
            .replace("\n", " ")
            .replace(" ", "_")
            .translate(_UNSAFE_FILENAME_CHARS)
        )
        url = topic["url"]
        html_filename = f"{name}_styled.html"

        print(f"📌 Processing: {name}")

        # ✅ Load webpage
        driver.get(url)
        try:
            # ✅ Wait (up to 10 s) until the content container is present. An
            # explicit wait is faster than a fixed delay.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".content-section .content-container")
                )
            )
        except TimeoutException:
            # Only a real timeout is reported as one; any other WebDriver
            # failure propagates instead of being mislabelled as slow loading.
            print(f"⚠️ Timeout: Skipping {name} due to slow loading")
            continue

        # Extract content. The selector was chosen by inspecting the page;
        # more containers can be added to improve the saved page.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        content_div = soup.select_one(".content-section .content-container")

        if content_div is None:
            print(f"⚠️ No content found for {name}")
            continue

        # Standalone page: extracted content plus a button that hides itself
        # and opens the print dialog. The charset declaration matters because
        # the file is written as UTF-8 and opened from disk, where the browser
        # would otherwise guess the encoding.
        html_content = f"""
        <html>
        <head>
            <meta charset="utf-8">
            <script>
                function hideButton() {{ document.getElementById('download-btn').style.display = 'none'; window.print(); }}
            </script>
        </head>
        <body>
            {content_div.prettify()}
            <button id="download-btn" onclick="hideButton();" style="display: block; margin: 20px auto; padding: 10px 15px; font-size: 18px; background: #0073e6; color: white; border: none; border-radius: 5px; cursor: pointer;">Download as PDF</button>
        </body>
        </html>
        """

        with open(html_filename, "w", encoding="utf-8") as file:
            file.write(html_content)

        print(f"✅ HTML saved: {html_filename}")

        # Add a link to the index page. The href is percent-encoded and the
        # label HTML-escaped so characters such as "#", "%", "&" or "<" in a
        # topic name cannot break the link or the markup.
        index_links.append(
            f'<li><a href="{quote(html_filename)}">{escape(topic["name"])}</a></li>'
        )
except BaseException:
    # Do not leave an orphaned browser/driver process behind when the run
    # fails or is interrupted; the original error is re-raised unchanged.
    driver.quit()
    raise

# Build the index page that links to every saved topic page, open it in the
# default browser, and shut the WebDriver down.
try:
    # The charset declaration keeps non-ASCII topic names readable when the
    # UTF-8 file is opened from disk.
    table_of_contents = f"""
<html>
<head>
    <meta charset="utf-8">
    {index_css_style}
    <title>Index Page</title>
</head>
<body>
    <div class="index-container">
        <h1>Extracted Topics</h1>
        <ul>
            {''.join(index_links)}
        </ul>
    </div>
</body>
</html>
"""

    index_path = os.path.join(current_directory, "index.html")
    with open(index_path, "w", encoding="utf-8") as file:
        file.write(table_of_contents)

    print("✅ Index page created: index.html")

    # Since this runs in Jupyter, open the result automatically on the host.
    # as_uri() produces a well-formed file URI (forward slashes, escaped
    # spaces, "file:///C:/..." on Windows), which plain "file://" + path
    # does not.
    webbrowser.open(Path(index_path).as_uri())
finally:
    # ✅ Quit WebDriver even if writing or opening the index failed.
    driver.quit()

print("🎉 All pages have been successfully saved!")


📌 Processing: Physics_HL
✅ HTML saved: Physics_HL_styled.html
📌 Processing: 1.1.2_Specific_heat_capacity_and_latent_heat
✅ HTML saved: 1.1.2_Specific_heat_capacity_and_latent_heat_styled.html
📌 Processing: 1.1.3_Methods_of_heat_transfer_(conduction,_convection,_radiation)
✅ HTML saved: 1.1.3_Methods_of_heat_transfer_(conduction,_convection,_radiation)_styled.html
📌 Processing: 1.1.4_Thermal_expansion
✅ HTML saved: 1.1.4_Thermal_expansion_styled.html
📌 Processing: 1.2.1_Earth's_energy_balance
✅ HTML saved: 1.2.1_Earth's_energy_balance_styled.html
📌 Processing: 1.2.2_Greenhouse_gases_and_their_role
✅ HTML saved: 1.2.2_Greenhouse_gases_and_their_role_styled.html
📌 Processing: 1.2.3_Impact_of_human_activity_on_the_greenhouse_effect
✅ HTML saved: 1.2.3_Impact_of_human_activity_on_the_greenhouse_effect_styled.html
📌 Processing: 1.3.1_Ideal_gas_law_(PV_=_nRT)
✅ HTML saved: 1.3.1_Ideal_gas_law_(PV_=_nRT)_styled.html
📌 Processing: 1.3.2_Boyle’s_law,_Charles’s_law,_Avogadro’s_law
✅ HTML saved: 1