In [7]:
import requests
from bs4 import BeautifulSoup
import json


BASE_URL = "https://www.sparkl.me"


START_URL = "https://www.sparkl.me/learn/ib/physics-hl/types-of-waves-transverse-and-longitudinal/revision-notes/1280"

def extract_topic_links(url):
    response = requests.get(url)
    if response.status_code != 200:
        print("❌ Failed to fetch page")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Find all <a> elements with links
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        
        # Only collect internal topic links
        if "/learn/ib/physics-hl/" in href and "revision-notes" in href:
            full_url = BASE_URL + href if not href.startswith("http") else href
            links.append({"name": a.text.strip(), "url": full_url})

    return links

# Extract links
topics = extract_topic_links(START_URL)

# Remove duplicates using a dictionary (keyed by URL)
unique_topics = {topic["url"]: topic for topic in topics}.values()

# Save to JSON
with open("topics_fresh.json", "w", encoding="utf-8") as f:
    json.dump(list(unique_topics), f, indent=4, ensure_ascii=False)

print(f"✅ Extracted {len(unique_topics)} unique topics. Saved to topics_fresh.json")


✅ Extracted 84 unique topics. Saved to topics_fresh.json


In [8]:
import json
import requests
from bs4 import BeautifulSoup


with open("topics_fresh.json", "r", encoding="utf-8") as file:
    topics = json.load(file)


scraped_data = {}

# Loop through each topic and scrape content
for topic in topics:
    url = topic["url"]
    print(f"🔍 Scraping: {topic['name']}")

    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            
            # Find the main content section (adjust the class if needed)
            content = soup.find("div", class_="content-section")  # Change class if needed
            
            # Extract text if content is found
            if content:
                scraped_data[topic["name"]] = content.get_text(separator="\n", strip=True)
            else:
                scraped_data[topic["name"]] = "⚠ No content found"
        else:
            scraped_data[topic["name"]] = "❌ Failed to retrieve page"
    
    except Exception as e:
        scraped_data[topic["name"]] = f"🚨 Error: {e}"

# Save the scraped data to a JSON file
with open("scraped_data_fresh.json", "w", encoding="utf-8") as file:
    json.dump(scraped_data, file, indent=4, ensure_ascii=False)

print("✅ Scraping complete! Data saved to scraped_data_fresh.json")


🔍 Scraping: Physics HL
🔍 Scraping: 1.1.2
Specific heat capacity and latent heat
🔍 Scraping: 1.1.3
Methods of heat transfer (conduction, convection, radiation)
🔍 Scraping: 1.1.4
Thermal expansion
🔍 Scraping: 1.2.1
Earth's energy balance
🔍 Scraping: 1.2.2
Greenhouse gases and their role
🔍 Scraping: 1.2.3
Impact of human activity on the greenhouse effect
🔍 Scraping: 1.3.1
Ideal gas law (PV = nRT)
🔍 Scraping: 1.3.2
Boyle’s law, Charles’s law, Avogadro’s law
🔍 Scraping: 1.3.3
Real gases and deviations from ideal gas behaviour
🔍 Scraping: 1.4.1
Laws of thermodynamics
🔍 Scraping: 1.4.2
Heat engines and efficiency
🔍 Scraping: 1.4.3
Entropy and spontaneous processes
🔍 Scraping: 1.5.1
Electric charge and current
🔍 Scraping: 1.5.2
Ohm’s law and resistivity
🔍 Scraping: 1.5.3
Kirchhoff’s laws
🔍 Scraping: 1.5.4
Power dissipation in resistors
🔍 Scraping: 2.1.1
Types of waves: Transverse and longitudinal
🔍 Scraping: 2.1.2
Properties of waves (amplitude, frequency, wavelength, speed)
🔍 Scraping: 2.1.3


In [9]:
import json


with open("scraped_data_fresh.json", "r", encoding="utf-8") as file:
    scraped_data = json.load(file)


html_content = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Formatted Scraped Data</title>
    <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
    <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
    <style>
        body {
            font-family: 'Arial', sans-serif;
            margin: 20px;
            padding: 20px;
            background-color: #f8f9fa;
            color: #333;
            transition: background 0.5s, color 0.5s;
        }
        h1 {
            text-align: center;
            color: #007bff;
            font-size: 32px;
            font-weight: bold;
            text-transform: uppercase;
        }
        .search-bar {
            text-align: center;
            margin-bottom: 20px;
        }
        input {
            padding: 10px;
            width: 80%;
            font-size: 16px;
            border-radius: 5px;
            border: 1px solid #ccc;
        }
        .topic {
            background-color: #ffffff;
            border-left: 5px solid #007bff;
            margin: 20px 0;
            padding: 15px;
            border-radius: 8px;
            box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1);
            line-height: 1.6;
            transition: transform 0.3s ease-in-out;
        }
        .topic:hover {
            transform: scale(1.02);
        }
        .topic h2 {
            color: #0056b3;
            font-size: 24px;
            font-weight: bold;
            text-transform: uppercase;
            cursor: pointer;
            transition: color 0.3s;
        }
        .topic h2:hover {
            color: #ff5733;
        }
        .content {
            display: none;
            margin-top: 10px;
        }
        .formula {
            background-color: #eef2ff;
            font-family: monospace;
            padding: 10px;
            display: block;
            border-radius: 5px;
            margin: 10px 0;
            text-align: center;
            font-size: 18px;
        }
        .highlight {
            color: #ff5733;
            font-weight: bold;
        }
        .dark-mode {
            background-color: #1a1a1a;
            color: #f5f5f5;
        }
        .dark-mode .topic {
            background-color: #333;
            border-left-color: #ff5733;
        }
        .dark-mode .formula {
            background-color: #555;
        }
        .toggle-container {
            text-align: center;
            margin-bottom: 20px;
        }
        .toggle-btn {
            padding: 10px 15px;
            font-size: 16px;
            background: #007bff;
            color: white;
            border: none;
            border-radius: 5px;
            cursor: pointer;
        }
    </style>
    <script>
        function toggleDarkMode() {
            document.body.classList.toggle("dark-mode");
        }

        function toggleContent(topicId) {
            let content = document.getElementById(topicId);
            content.style.display = (content.style.display === "none") ? "block" : "none";
        }

        function searchTopics() {
            let input = document.getElementById('searchInput').value.toLowerCase();
            let topics = document.getElementsByClassName('topic');

            for (let i = 0; i < topics.length; i++) {
                let topicText = topics[i].innerText.toLowerCase();
                topics[i].style.display = topicText.includes(input) ? "block" : "none";
            }
        }
    </script>
</head>
<body>
    <h1>Scraped Topics & Data</h1>

    <div class="toggle-container">
        <button class="toggle-btn" onclick="toggleDarkMode()">🌙 Toggle Dark Mode</button>
    </div>

    <div class="search-bar">
        <input type="text" id="searchInput" placeholder="🔍 Search topics..." onkeyup="searchTopics()">
    </div>
"""


for index, (topic, content) in enumerate(scraped_data.items()):
    content = content.replace("$", "")
    formatted_content = content.replace("**", "<span class='highlight'>").replace("**", "</span>")
    formatted_content = formatted_content.replace("E=mc^2", "\\( E = mc^2 \\)")

    formatted_lines = []
    for line in formatted_content.split("\n"):
        if "=" in line and len(line) < 100:  
            formatted_lines.append(f"<div class='formula'>\\( {line} \\)</div>")
        else:
            formatted_lines.append(f"<p>{line}</p>")

    formatted_content = "\n".join(formatted_lines)

    html_content += f"""
    <div class="topic">
        <h2 onclick="toggleContent('content{index}')">{topic} 🔽</h2>
        <div class="content" id="content{index}">
            {formatted_content}
        </div>
    </div>
    """

html_content += """
</body>
</html>
"""


with open("display_fresh.html", "w", encoding="utf-8") as file:
    file.write(html_content)

print("✅ Beautifully formatted display_fresh.html generated successfully!")


✅ Beautifully formatted display_fresh.html generated successfully!


In [12]:
import os
import http.server
import socketserver
import webbrowser

PORT = 8001  # or give 8001 sometimes wont work

# Get the current directory
web_dir = os.getcwd()
os.chdir(web_dir)

# Start HTTP server
Handler = http.server.SimpleHTTPRequestHandler
httpd = socketserver.TCPServer(("", PORT), Handler)

# Open the file in browser
server_url = f"http://localhost:{PORT}/display_fresh.html"
print(f"✅ Server running at: {server_url}")
print("📢 Opening display_fresh.html in your browser...")

webbrowser.open(server_url)

#infinite on
httpd.serve_forever()


OSError: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted