In [2]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import random
from urllib.parse import urljoin
import os

# ========== CONFIG ==========
# Site to scrape; the crawl starts at the site root.
BASE_URL = "http://books.toscrape.com/"
START_URL = BASE_URL

# Where the resulting CSV is written.
OUTPUT_DIR = "task1_outputs"
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "books.csv")

# Browser-like User-Agent sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
REQUEST_HEADERS = {"User-Agent": USER_AGENT}

# Bounds (in seconds) of the random pause between requests.
DELAY_MIN, DELAY_MAX = 1.0, 2.5

# Request policy: attempts per URL and per-request timeout in seconds.
MAX_RETRIES = 3
TIMEOUT = 10

# Make sure the output directory exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ========== UTILITIES ==========
def safe_get(url, session=None, retries=MAX_RETRIES):
    """Fetch *url* with a GET request, retrying when the request fails.

    Each attempt sends ``REQUEST_HEADERS`` and uses ``TIMEOUT``. An HTTP
    error status counts as a failure. After a failed attempt the function
    sleeps ``1.5 * attempt`` seconds before trying again.

    Args:
        url: Address to fetch.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            temporary session is created and closed before returning.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The ``requests.Response`` of the first successful attempt.

    Raises:
        ValueError: If ``retries`` is less than 1.
        requests.RequestException: If the final attempt fails as well.
    """
    if retries < 1:
        # Previously this silently returned None without making a request,
        # which only surfaced later as an AttributeError in the caller.
        raise ValueError(f"retries must be >= 1, got {retries}")

    owns_session = session is None
    s = requests.Session() if owns_session else session
    try:
        for attempt in range(1, retries + 1):
            try:
                resp = s.get(url, headers=REQUEST_HEADERS, timeout=TIMEOUT)
                resp.raise_for_status()
                return resp
            except requests.RequestException as e:
                # Only network/HTTP failures are retried; programming
                # errors propagate immediately instead of being masked.
                print(f"[warn] request failed ({attempt}/{retries}) for {url}: {e}")
                if attempt == retries:
                    raise
                time.sleep(1.5 * attempt)
    finally:
        # Close only the session created here; a caller-supplied session
        # stays open for reuse.
        if owns_session:
            s.close()

def random_delay():
    """Sleep for a random time drawn uniformly from DELAY_MIN..DELAY_MAX seconds."""
    pause_seconds = random.uniform(DELAY_MIN, DELAY_MAX)
    time.sleep(pause_seconds)

# ========== SCRAPING LOGIC ==========
def parse_book_card(card, page_url=BASE_URL):
    """Extract the listing fields from one product card.

    Args:
        card: Parsed tag of an ``article.product_pod`` element.
        page_url: URL of the page the card was found on. The card's link is
            relative to that page, so it is resolved against this value.
            Defaults to ``BASE_URL``.

    Returns:
        A dict with the keys ``title``, ``product_url``, ``price``,
        ``availability`` and ``rating``. A field whose element is missing
        from the card is returned as an empty string.
    """
    # Title and relative product link both live on the <h3><a> element.
    link = card.select_one("h3 a")
    if link is None:
        title = ""
        rel_url = ""
    else:
        # The "title" attribute is preferred over the link text.
        if link.has_attr("title"):
            title = link["title"].strip()
        else:
            title = link.get_text(strip=True)
        rel_url = link.get("href", "")
    product_url = urljoin(page_url, rel_url) if rel_url else ""

    # Price
    price_tag = card.select_one(".price_color")
    price = price_tag.get_text(strip=True) if price_tag is not None else ""

    # Availability
    availability_tag = card.select_one(".availability")
    availability = (
        availability_tag.get_text(strip=True) if availability_tag is not None else ""
    )

    # Rating is encoded as an extra CSS class, e.g. ['star-rating', 'Three'].
    rating = ""
    rating_tag = card.select_one("p.star-rating")
    if rating_tag is not None:
        rating = next(
            (c for c in rating_tag.get("class", []) if c != "star-rating"), ""
        )

    return {
        "title": title,
        "product_url": product_url,
        "price": price,
        "availability": availability,
        "rating": rating,
    }


def scrape_books(start_url=START_URL, max_pages=None):
    """Walk the paginated listing and collect every book card.

    Starting at ``start_url``, each page is fetched with ``safe_get``, its
    ``article.product_pod`` cards are parsed with ``parse_book_card``, and
    the ``li.next a`` link is followed until there is none.

    Args:
        start_url: URL of the first listing page.
        max_pages: Stop after this many pages. ``None`` (or 0) means no
            limit.

    Returns:
        A list of dicts as produced by ``parse_book_card``, in page order.

    Raises:
        Whatever ``safe_get`` raises when a page cannot be fetched.
    """
    results = []
    next_page_url = start_url
    page_count = 0

    # One session for the whole crawl, closed when the crawl ends.
    with requests.Session() as session:
        while next_page_url:
            page_url = next_page_url
            page_count += 1
            print(f"[info] Fetching page {page_count}: {page_url}")
            resp = safe_get(page_url, session=session)
            soup = BeautifulSoup(resp.text, "html.parser")

            # Product links are relative to the page they appear on, so the
            # current page URL is passed along for resolution.
            results.extend(
                parse_book_card(card, page_url=page_url)
                for card in soup.select("article.product_pod")
            )

            # Pagination: look for the 'next' button.
            next_btn = soup.select_one("li.next a")
            if next_btn is not None and next_btn.has_attr("href"):
                next_page_url = urljoin(page_url, next_btn["href"])
            else:
                next_page_url = None

            if max_pages and page_count >= max_pages:
                print("[info] Reached max_pages limit.")
                break

            # Pause only when another request is actually going to follow.
            if next_page_url:
                random_delay()

    return results

# ========== RUN ==========
if __name__ == "__main__":
    # Script entry point: scrape every listing page, then dump the rows to
    # OUTPUT_CSV. Any failure is reported on stdout instead of propagating.
    print("[start] scraping started")
    try:
        # Pass an int as max_pages to limit how many pages are fetched.
        data = scrape_books(max_pages=None)

        # Column order of the CSV; matches the keys of each scraped row.
        fieldnames = ["title", "product_url", "price", "availability", "rating"]
        with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)

        print(f"[done] saved {len(data)} rows to {OUTPUT_CSV}")
    except Exception as e:
        print(f"[error] scraping aborted: {e}")

[start] scraping started
[info] Fetching page 1: http://books.toscrape.com/
[info] Fetching page 2: http://books.toscrape.com/catalogue/page-2.html
[info] Fetching page 3: http://books.toscrape.com/catalogue/page-3.html
[info] Fetching page 4: http://books.toscrape.com/catalogue/page-4.html
[info] Fetching page 5: http://books.toscrape.com/catalogue/page-5.html
[info] Fetching page 6: http://books.toscrape.com/catalogue/page-6.html
[info] Fetching page 7: http://books.toscrape.com/catalogue/page-7.html
[info] Fetching page 8: http://books.toscrape.com/catalogue/page-8.html
[info] Fetching page 9: http://books.toscrape.com/catalogue/page-9.html
[info] Fetching page 10: http://books.toscrape.com/catalogue/page-10.html
[info] Fetching page 11: http://books.toscrape.com/catalogue/page-11.html
[info] Fetching page 12: http://books.toscrape.com/catalogue/page-12.html
[info] Fetching page 13: http://books.toscrape.com/catalogue/page-13.html
[info] Fetching page 14: http://books.toscrape.com/c