# Setup

Install some packages to start

In [None]:
# ONLY RUN ON GOOGLE COLAB
# Install the Python packages imported later in this notebook. `--system`
# tells uv to install into the system Python rather than a virtualenv.
!uv pip install curl_cffi pandas selectolax selenium rich --system
# Refresh the apt package index, then install the chromium-chromedriver
# package that Selenium drives.
!apt-get update
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin (presumably so it is on PATH
# for Selenium -- confirm the source path exists on the current Colab image).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin


[2mUsing Python 3.10.12 environment at /usr[0m
[2mAudited [1m5 packages[0m [2min 93ms[0m[0m
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,672 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,452 k

## Exercise: Get titles from http://books.toscrape.com/

## Scraper type 1: Selenium

In [4]:
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def setup_driver(headless: bool = False):
    """Create a Chrome WebDriver with this notebook's standard options.

    Args:
        headless: When True, Chrome is started with ``--headless`` so no
            browser window is shown. Defaults to False.

    Returns:
        The object returned by ``webdriver.Chrome(options=...)``. The caller
        is responsible for calling ``quit()`` on it.
    """
    # Flags are added in a fixed order: the optional headless flag first,
    # then the two flags that are always passed.
    chrome_flags = ["--headless"] if headless else []
    chrome_flags += ["--no-sandbox", "--disable-dev-shm-usage"]

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

In [None]:
headless = True
driver = setup_driver(headless=headless)
try:
    # Load the demo bookstore's landing page.
    driver.get("https://books.toscrape.com/")
    if not headless:
        # With a visible browser window, pause for 10 seconds
        # (presumably so the page can be watched -- confirm intent).
        time.sleep(10)
    # Implicit wait: element lookups keep retrying for up to 10 seconds
    # before giving up. This does not pause execution by itself.
    driver.implicitly_wait(10)

    # Print the text of the page's <h1> element.
    heading = driver.find_element(By.TAG_NAME, "h1")
    print(heading.text)

    # Print the link text of every <a> that is a direct child of an <h3>.
    for link in driver.find_elements(By.CSS_SELECTOR, "h3 > a"):
        print(link.text)
except Exception as err:
    print(f"An error occurred: {err}")
finally:
    # Always shut the browser down, even if something above failed.
    driver.quit()

All products
A Light in the ...
Tipping the Velvet
Soumission
Sharp Objects
Sapiens: A Brief History ...
The Requiem Red
The Dirty Little Secrets ...
The Coming Woman: A ...
The Boys in the ...
The Black Maria
Starving Hearts (Triangular Trade ...
Shakespeare's Sonnets
Set Me Free
Scott Pilgrim's Precious Little ...
Rip it Up and ...
Our Band Could Be ...
Olio
Mesaerion: The Best Science ...
Libertarianism for Beginners
It's Only the Himalayas


## Exercise: Get all titles, price, and whether in stock from all pages

In [None]:
driver = setup_driver(headless=headless)
try:
    # Open the first catalogue page.
    driver.get("https://books.toscrape.com/")

    # Implicit wait: element lookups keep retrying for up to 10 seconds
    # before raising. This also applies to the "next" lookup on the last page.
    driver.implicitly_wait(10)  # seconds

    # Print the page's <h1> text as a quick sanity check.
    element = driver.find_element(By.TAG_NAME, "h1")
    print(element.text)

    results = []
    while True:
        # Each book is one <li> directly under the <ol class="row"> list.
        books = driver.find_elements(By.CSS_SELECTOR, "ol.row > li")
        # XPATH
        # books = driver.find_elements(By.XPATH, "//ol[@class='row']/li")
        for book in books:
            # CSS (selectors match anywhere below the <li>)
            title = book.find_element(By.CSS_SELECTOR, "h3 > a")
            in_stock = book.find_element(By.CSS_SELECTOR, "p.instock")
            button = book.find_element(By.CSS_SELECTOR, "button.btn-primary")
            price = book.find_element(By.CSS_SELECTOR, "p.price_color")

            # XPATH equivalents. ".//" searches all descendants of `book`,
            # matching the behaviour of the CSS selectors above.
            # title = book.find_element(By.XPATH, ".//h3/a")
            # in_stock = book.find_element(By.XPATH, ".//p[contains(@class, 'instock')]")
            # button = book.find_element(By.XPATH, ".//button[contains(@class, 'btn-primary')]")
            # price = book.find_element(By.XPATH, ".//p[@class='price_color']")

            # NOTE(review): the link text is truncated with "..." (see the
            # printed output). The full title may be available in another
            # attribute of the <a> -- confirm against the page markup.
            results.append(
                {
                    "title": title.text,
                    "in_stock": in_stock.text,
                    "button": button.text,
                    "price": price.text,
                }
            )
            print(f"{title.text}, {in_stock.text}, {price.text}")

        # Stop only when there is genuinely no "next" link. Any other failure
        # (for example a click error) propagates to the outer handler instead
        # of silently ending the crawl with partial results.
        try:
            next_page = driver.find_element(By.CSS_SELECTOR, "li.next > a")
        except NoSuchElementException:
            break
        next_page.click()
    print(results)
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the browser
    driver.quit()

All products
A Light in the ..., In stock, £51.77
Tipping the Velvet, In stock, £53.74
Soumission, In stock, £50.10
Sharp Objects, In stock, £47.82
Sapiens: A Brief History ..., In stock, £54.23
The Requiem Red, In stock, £22.65
The Dirty Little Secrets ..., In stock, £33.34
The Coming Woman: A ..., In stock, £17.93
The Boys in the ..., In stock, £22.60
The Black Maria, In stock, £52.15
Starving Hearts (Triangular Trade ..., In stock, £13.99
Shakespeare's Sonnets, In stock, £20.66
Set Me Free, In stock, £17.46
Scott Pilgrim's Precious Little ..., In stock, £52.29
Rip it Up and ..., In stock, £35.02
Our Band Could Be ..., In stock, £57.25
Olio, In stock, £23.88
Mesaerion: The Best Science ..., In stock, £37.59
Libertarianism for Beginners, In stock, £51.33
It's Only the Himalayas, In stock, £45.17
In Her Wake, In stock, £12.84
How Music Works, In stock, £37.32
Foolproof Preserving: A Guide ..., In stock, £30.52
Chase Me (Paris Nights ..., In stock, £25.27
Black Dust, In stock, £34.53
Bi

# Interlude: Save CSV to Google Drive

In [None]:
# For Colab only
from pathlib import Path

import pandas as pd
from google.colab import drive

# Mount Google Drive and derive the output path from the same mount point,
# so the write does not depend on the notebook's current working directory.
DRIVE_MOUNT = Path("/content/drive")
drive.mount(str(DRIVE_MOUNT))

csv_path = DRIVE_MOUNT / "My Drive" / "Colab Notebooks" / "books.csv"

# `results` is whatever list of dicts the most recently run scraping cell
# produced; one row is written per dict.
df = pd.DataFrame(results)
df.to_csv(csv_path, index=False)

# Flush pending writes and unmount so the file shows up in Drive.
drive.flush_and_unmount()
print("All changes made in this colab session should now be visible in Drive.")

# Scraper type 2: http2
This approach does not use a browser. It is much faster than Selenium, but it cannot execute JavaScript, and the parser used here (selectolax) supports CSS selectors only, not XPath.

In [7]:
from curl_cffi import requests
from selectolax.parser import HTMLParser

# The loop below requests catalogue pages page-1.html through page-50.html.
LAST_PAGE = 50

results = []
for n in range(1, LAST_PAGE + 1):
    r = requests.get(
        f"https://books.toscrape.com/catalogue/page-{n}.html", impersonate="chrome"
    )
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently collecting zero rows (same check as the API cells below).
    if not r.ok:
        raise Exception(f"Request failed with status code {r.status_code}")

    tree = HTMLParser(r.content)
    # Each book is one <li> directly under the <ol class="row"> list.
    for book in tree.css("ol.row > li"):
        # Extract each text once and reuse it for both the record and the log line.
        record = {
            "title": book.css_first("h3 > a").text(strip=True),
            "in_stock": book.css_first("p.instock").text(strip=True),
            "button": book.css_first("button.btn-primary").text(strip=True),
            "price": book.css_first("p.price_color").text(strip=True),
        }
        results.append(record)

        print(f"{record['title']}, {record['in_stock']}, {record['price']}")

print(results)


A Light in the ..., In stock, £51.77
Tipping the Velvet, In stock, £53.74
Soumission, In stock, £50.10
Sharp Objects, In stock, £47.82
Sapiens: A Brief History ..., In stock, £54.23
The Requiem Red, In stock, £22.65
The Dirty Little Secrets ..., In stock, £33.34
The Coming Woman: A ..., In stock, £17.93
The Boys in the ..., In stock, £22.60
The Black Maria, In stock, £52.15
Starving Hearts (Triangular Trade ..., In stock, £13.99
Shakespeare's Sonnets, In stock, £20.66
Set Me Free, In stock, £17.46
Scott Pilgrim's Precious Little ..., In stock, £52.29
Rip it Up and ..., In stock, £35.02
Our Band Could Be ..., In stock, £57.25
Olio, In stock, £23.88
Mesaerion: The Best Science ..., In stock, £37.59
Libertarianism for Beginners, In stock, £51.33
It's Only the Himalayas, In stock, £45.17
In Her Wake, In stock, £12.84
How Music Works, In stock, £37.32
Foolproof Preserving: A Guide ..., In stock, £30.52
Chase Me (Paris Nights ..., In stock, £25.27
Black Dust, In stock, £34.53
Birdsong: A Sto

## Scraper type 3: API


### Suggested Workflow

Here's a suggested workflow for reverse-engineering an API to understand how it works and potentially use it directly for data collection:

  1. Reconnaissance: Open the website you want to reverse engineer in your web browser. Open the developer tools (usually by pressing F12) and navigate to the "Network" tab. Refresh the page to capture all network requests.

  2. Identify the API Call: Look for requests with a "Type" of "xhr," "fetch," or "json." These are typically API calls. Examine the "Preview" or "Response" tab of these requests to see the data being returned. Focus on requests that return the data you're interested in.

  3. Copy as cURL and Convert to Python: Right-click on the API request you identified and select "Copy" -> "Copy as cURL." This copies the request as a cURL command, which can be used to replicate the request. Use a tool like [curlconverter](https://curlconverter.com/) to convert this cURL command into Python code using the requests library. This will provide you with a Python script that can make the same API request.

  4. Use curl_cffi instead of requests: It's recommended to use the curl_cffi library instead of requests because it has better performance and has impersonation/fingerprint generation built-in. The latter means it can get through a lot of sites that block scraping. Replace the `import requests` statement in the converted code with `from curl_cffi import requests`.

  5. Beware of cookies and other authentication measures. This API example is relatively uncomplicated because there is no authentication or blocking.

Example:
  https://adc.hidoe.us/#/chronic-absenteeism



In [None]:
from curl_cffi import requests
from rich import print

# Request headers sent with the entity-list call.
# NOTE(review): the User-Agent identifies as Firefox while the request below
# uses impersonate="chrome" -- confirm this combination is intended.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:132.0) Gecko/20100101 Firefox/132.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.5",
    # 'Accept-Encoding': 'gzip, deflate, br, zstd',
    "Connection": "keep-alive",
    "Referer": "https://adc.hidoe.us/",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "Priority": "u=0",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    # Requests doesn't support trailers
    # 'TE': 'trailers',
}

# Fetch the list of entities for the 2024Final data load.
entities = requests.get(
    "https://adc.hidoe.us/api/entity/list/2024Final",
    impersonate="chrome",
    headers=headers,
)
if not entities.ok:
    raise Exception(f"Request failed with status code {entities.status_code}")

# Example of one element of the response's "data" list:
#   {
#       "type": "CA",
#       "value": 4,
#       "description": "Aiea-Moanalua-Radford",
#       "name": "Aiea-Moanalua-Radford (4)",
#       "schoolType": null,
#       "striveHIType": null,
#       "gradRetCategory": null,
#       "island": "All Islands",
#       "parentEntityCode": null
#   }

# Reduce every entity to just its name and value.
school_ids = [
    {"name": entity["name"], "value": entity["value"]}
    for entity in entities.json()["data"]
]
print(school_ids)

In [11]:
# Loop through entities to get results.
# Only the first 15 entries of `school_ids` are queried.
results = []
for school_id in school_ids[:15]:
    # Query parameters for the chronic-absenteeism endpoint; only "entity"
    # varies between requests.
    params = {
        "dataLoadTag": "2024Final",
        "dataType": "all",
        "denNum": "Denominator",
        "endYear": "2024",
        "entity": school_id["value"],
        "entityType": "School",
        "gradRetCategory": "1",
        "listType": "ReadingChronicAbsenteeism",
        "schoolTitle": "all",
        "schoolType": "all",
        "series": "grade",
        "seriesType": "grade",
        "startYear": "2017",
        "subject": "R",
    }

    response = requests.get(
        "https://adc.hidoe.us/api/chronic-absenteeism/grade",
        params=params,
        # headers=headers,
        impersonate="chrome",
    )
    if not response.ok:
        raise Exception(f"Request failed with status code {response.status_code}")

    # Parse the body once and reuse it for both the log line and the record.
    payload = response.json()
    print(f"Data for {school_id=}:", payload, "\n")

    # Build a new record instead of adding a key to the dict held in
    # `school_ids`, so re-running this cell sees the same inputs and prints
    # the same `school_id=` text every time.
    results.append({**school_id, "data": payload})

print(results)