In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd
import re
from urllib.parse import urljoin

In [3]:
BASE = "https://books.toscrape.com/"

In [4]:
opts = webdriver.ChromeOptions()

In [5]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
wait = WebDriverWait(driver, 15)

In [7]:
def parse_rating_int(star_tag):
    if not star_tag:
        return None
    classes = star_tag.get_attribute("class").split()
    m = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
    for c in classes:
        if c in m:
            return m[c]
    return None
    

In [8]:
def parse_availability(av_text):
    if not av_text:
        return 0
    m = re.search(r"(\d+)", av_text)
    return int(m.group(1)) if m else 0

In [9]:
def get_table_kv():
    rows = driver.find_elements(By.CSS_SELECTOR, "table.table.table-stripped tr")
    d = {}
    for r in rows:
        th = r.find_element(By.TAG_NAME, "th").text.strip()
        tr = r.find_element(By.TAG_NAME, "tr").text.strip()
        d[th] = td
    return d

In [11]:
def scrape_book_detail(book_url):
    driver.get(book_url)
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.product_main h1")))

    title = driver.find_element(By.CSS_SELECTOR, "div.product_main h1").text.strip()

    rating = parse_rating_int(driver.find_element(By.CSS_SELECTOR, "p.star-rating"))

    info = get_table_kv()
    price_excl = info.get("Price (excl. tax)")
    price_incl = info.get("Price (incl. tax)")
    tax = info.get("Tax")
    upc = info.get("UPC")
    num_reviews = info.get("Number of reviews")
    availability_text = driver.find_element(By.CSS_SELECTOR, "div.product_main p.availability").text.strip()
    availability_n = parse_availability(availability_text)

    category = None
    try:
        category = driver.find_element(By.CLASS_NAME, "ul.breadcrumb li:nth-of-type(3) a").text.strip()
    except:
        pass

    description = None
    try:
        driver.find_element(By.ID, "product_description")
        description = driver.find_element(By.CSS_SELECTOR, "#product_description ~ p").text.strip()
    except:
        description = None
    
    img_src = driver.find_element(By.CSS_SELECTOR, "div.item.active img").get_attribute("src")
    img_url = urljoin(driver.current_url, img_src)

    return {
        "title": title,
        "rating": rating,
        "availability_text": availability_text,
        "availability_n": availability_n,
        "category": category,
        "description": description,
        "upc": upc,
        "price_excl_tax": price_excl,
        "price_incl_tax": price_incl,
        "tax": tax,
        "num_reviews": num_reviews,
        "image_url": img_url,
        "product_page_url": book_url,
    }

In [12]:
def scrape_all_books():
    data = []
    current_url = BASE
    driver.get(current_url)

    while True:
        wait.until(EC.presence_of_all_elements_located(By.CSS_SELECTOR, "article.product_pod"))
        listing_url = driver.current_url

        a_tags = driver.find_element(By.CSS_SELECTOR, "article.product_pod h3 a")
        book_links = [urljoin(listing_url, a.get_attribute("href")) for a in a_tags]

        try:
            next_href = driver.find_element(By.CSS_SELECTOR, "li.next a").get_attribute("href")
            next_url = urljoin(listing_url, next_href)
        except:
            next_url = None

        for link in book_links:
            rec = scrape_book_detail(link)
            data.append(rec)

        if not next_url:
            break

        current_url = next_url
        driver.get(current_url)

    return pd.DataFrame(data)

In [13]:
df = scrape_all_books()

TypeError: presence_of_all_elements_located() takes 1 positional argument but 2 were given