# Modern Web Scraping in Jupyter Notebook
This notebook scrapes quotes from quotes.toscrape.com, handling both static and JavaScript-rendered content.

In [1]:
# Install required packages (run once).
# %pip installs into the environment of the running kernel; a bare !pip runs
# whichever pip is first on PATH, which may belong to another interpreter.
%pip install requests beautifulsoup4 playwright pandas openpyxl

# Download the browser binaries that Playwright drives, using the kernel's
# own interpreter instead of whatever "python3" resolves to on PATH.
import sys
!"{sys.executable}" -m playwright install



# Import needed Libraries

In [2]:
import asyncio
import csv
import time
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict

import pandas as pd
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

# Static Scraping Function
Uses requests + BeautifulSoup for faster scraping of static content

In [3]:
def scrape_static_quote(
    base_url: str = "https://quotes.toscrape.com",
    delay: float = 1.0,
    timeout: float = 10.0,
) -> List[Dict[str, str]]:
    """Scrape every quote from a statically rendered, paginated site.

    Pages are fetched as ``{base_url}/page/{n}/`` starting at ``n = 1`` and
    parsed with BeautifulSoup. Scraping stops at the first response whose
    status code is not 200, or after a page that has no ``li.next`` element.

    Args:
        base_url: Root URL of the site; a trailing slash is ignored.
        delay: Seconds to pause between consecutive page requests.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        One dict per quote with the keys ``"text"``, ``"author"`` and
        ``"tags"`` (the tag names joined with commas).

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
        AttributeError: If a quote block lacks its text or author element.
    """
    root = base_url.rstrip("/")
    quotes: List[Dict[str, str]] = []
    page = 1
    while True:
        print(f"Scaping static page {page}...")
        # Without an explicit timeout a stalled server blocks this call forever.
        response = requests.get(f"{root}/page/{page}/", timeout=timeout)
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, "html.parser")
        for quote in soup.find_all("div", class_="quote"):
            tags = [tag.text for tag in quote.find_all("a", class_="tag")]
            quotes.append({
                "text": quote.find("span", class_="text").text,
                "author": quote.find("small", class_="author").text,
                "tags": ",".join(tags),
            })

        # No "next" link means this was the last page.
        if not soup.find("li", class_="next"):
            break

        page += 1
        # Pause between requests so the server is not hammered.
        time.sleep(delay)

    return quotes

# Dynamic Scraping Function
Uses Playwright for JavaScript-rendered content

In [4]:
def _scrape_dynamic_quote_sync(base_url: str, delay: float) -> List[Dict[str, str]]:
    """Drive a headless Chromium through every page and collect its quotes."""
    root = base_url.rstrip("/")
    quotes: List[Dict[str, str]] = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page_num = 1
            while True:
                print(f"Scraping dynamic page {page_num}...")
                page.goto(f"{root}/page/{page_num}")
                # Wait until at least one quote element is present in the DOM.
                page.wait_for_selector(".quote")

                for quote in page.query_selector_all(".quote"):
                    tags = [tag.inner_text()
                            for tag in quote.query_selector_all(".tag")]
                    quotes.append({
                        "text": quote.query_selector(".text").inner_text(),
                        "author": quote.query_selector(".author").inner_text(),
                        "tags": ",".join(tags),
                    })

                # No "next" link means this was the last page.
                if not page.query_selector("li.next"):
                    break

                page_num += 1
                time.sleep(delay)
        finally:
            # Close the browser even when navigation or parsing raised.
            browser.close()

    return quotes


def scrape_dynamic_quote(
    base_url: str = "https://quotes.toscrape.com",
    delay: float = 1.0,
) -> List[Dict[str, str]]:
    """Scrape quotes from JavaScript-rendered pages with Playwright.

    Pages are opened as ``{base_url}/page/{n}`` starting at ``n = 1`` until a
    page has no ``li.next`` element.

    Args:
        base_url: Root URL of the site; a trailing slash is ignored.
        delay: Seconds to pause between consecutive page loads.

    Returns:
        One dict per quote with the keys ``"text"``, ``"author"`` and
        ``"tags"`` (the tag names joined with commas).

    Raises:
        Exception: Whatever Playwright raises while launching, navigating or
            waiting for the quotes; the browser is closed before it propagates.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread (e.g. a plain script), so
        # the sync API can be used directly.
        return _scrape_dynamic_quote_sync(base_url, delay)

    # A Jupyter kernel executes cells inside a running asyncio loop, where the
    # Playwright sync API refuses to start. A worker thread has no running
    # loop, so do the scraping there and block until it is done.
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(_scrape_dynamic_quote_sync, base_url, delay).result()

# Save to CSV Function

In [18]:
def save_to_csv(quotes: List[Dict[str, str]], filename: str) -> None:
    """Write the scraped quotes to ``filename`` as a UTF-8 CSV file.

    The file starts with a ``text,author,tags`` header row, followed by one
    row per quote in the order given. An existing file is overwritten.
    """
    columns = ["text", "author", "tags"]
    # newline="" leaves line endings entirely to the csv module.
    with open(filename, mode="w", encoding="utf-8", newline="") as out:
        sink = csv.DictWriter(out, fieldnames=columns)
        sink.writeheader()
        for record in quotes:
            sink.writerow(record)
    total = len(quotes)
    print(f"Successfully saved {total} quotes to {filename}")

# Save to Excel Function

In [6]:
def save_to_excel(quotes: List[Dict[str, str]], filename: str) -> None:
    """Write the scraped quotes to ``filename`` as an Excel (.xlsx) workbook.

    The quotes are loaded into a pandas DataFrame and written without the
    index column.
    """
    pd.DataFrame(data=quotes).to_excel(filename, index=False)
    count = len(quotes)
    print(f"Successfully saved {count} quotes to {filename}")

# Main Execution
Tries static scraping first, falls back to dynamic if needed

In [20]:
# Scrape with the fast static scraper first; use the browser-based scraper
# when the static one raises OR finds nothing.
print("Starting quote scraping...")

quotes = []
try:
    print("Attemping static scraping...")
    quotes = scrape_static_quote()
    print(f"Found {len(quotes)} quotes via static scraping")
except Exception as e:
    # Best effort: any static failure is reported and triggers the fallback.
    print(f"Static Scraping failed: {e}")

if not quotes:
    # A JavaScript-rendered page yields an empty static result without any
    # exception, so an empty list must trigger the fallback as well.
    print("Falling back to dynamic scraping...")
    quotes = scrape_dynamic_quote()
    print(f"Found {len(quotes)} quotes via dynamic scraping")

if quotes:
    save_to_csv(quotes, "quotes.csv")
    save_to_excel(quotes, "quotes.xlsx")
else:
    print("No quotes were scraped")

Starting quote scraping...
Attemping static scraping...
Scaping static page 1...
Scaping static page 2...
Scaping static page 3...
Scaping static page 4...
Scaping static page 5...
Scaping static page 6...
Scaping static page 7...
Scaping static page 8...
Scaping static page 9...
Scaping static page 10...
Found 100 quotes via static scraping
Successfully saved 100 quotes to quotes.csv
Successfully saved 100 quotes to quotes.xlsx


# Display Sample Results
Show the first 5 quotes we scraped

In [22]:
# Preview the first five scraped quotes with their authors and tags.
if quotes:
    print("\nSimmple quotes: ")
    for i, quote in enumerate(quotes[:5], 1):
        # Keys use single quotes: reusing the f-string's own double quote
        # inside the braces is a SyntaxError before Python 3.12.
        print(f"{i}. {quote['text']} - {quote['author']}")
        print(f"    Tags: {quote['tags']}\n")
else:
    print("No quotes to display")


Simmple quotes: 
1. “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” - Albert Einstein
    Tags: change,deep-thoughts,thinking,world

2. “It is our choices, Harry, that show what we truly are, far more than our abilities.” - J.K. Rowling
    Tags: abilities,choices

3. “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” - Albert Einstein
    Tags: inspirational,life,live,miracle,miracles

4. “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.” - Jane Austen
    Tags: aliteracy,books,classic,humor

5. “Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.” - Marilyn Monroe
    Tags: be-yourself,inspirational

