In [None]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import json
import re

# Site origin; handed to extract_simple_data, which prepends it to the
# site-relative page and image URLs found in the scraped JSON.
domain = "https://nespresso.com"

# Shared HTTP session with retries: requests to https:// URLs go through an
# adapter configured to retry (total=5, backoff_factor=1) when the server
# answers 500, 502, 503 or 504. Only the "https://" prefix is mounted here, so
# http:// URLs keep the session's default adapter.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))


def getJsonFromPageWithQuery(URL, selector):
    """Fetch a page and decode the JSON object embedded in its first matching element.

    The page is downloaded through the module-level ``session``, parsed with
    BeautifulSoup, and the first element matching ``selector`` is serialised
    back to markup. Everything from the first ``{`` to the last ``}`` of that
    markup is decoded as JSON.

    Args:
        URL: Address of the page to download.
        selector: CSS selector locating the element that wraps the JSON.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If the response status is not 200 or nothing matches
            ``selector``.
        json.JSONDecodeError: (a ``ValueError`` subclass) If the matched
            element contains no ``{...}`` span, or the span is not valid JSON.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    page = session.get(URL, headers=headers, timeout=10)
    if page.status_code != 200:
        raise ValueError(f"Failed to fetch URL: {URL}, Status Code: {page.status_code}")

    soup = BeautifulSoup(page.content, "html.parser")
    tags = soup.select(selector)
    if not tags:
        raise ValueError(f"Selector '{selector}' not found on the page.")

    markup = str(tags[0])
    start = markup.find("{")
    end = markup.rfind("}")
    # Without this check a missing or reversed brace pair produces an empty
    # slice and json.loads fails with an unhelpful "Expecting value" message.
    # The exception type is kept so existing handlers still match.
    if start == -1 or end < start:
        raise json.JSONDecodeError(
            f"No JSON object found in element matching '{selector}'", markup, 0
        )
    return json.loads(markup[start : end + 1])

In [None]:
def get_main_list(start_url):
    """Load the listing page's embedded JSON and pick out products and capsule ranges.

    Args:
        start_url: URL of the capsule listing page.

    Returns:
        A 4-tuple ``(main_list, products, categories, capsule_ranges)``:
        the full decoded JSON, its ``products`` list, its ``categories`` list,
        and the categories whose ``superCategories`` contain the capsule-range
        parent id, minus the two excluded range ids.

    Raises:
        ValueError: If the JSON contains no products.
    """
    listing_selector = "div[id^=respProductListPLPCapsule]+script"
    main_list = getJsonFromPageWithQuery(start_url, listing_selector)

    # Both lists live under configuration -> eCommerceData.
    ecommerce_data = main_list.get("configuration", {}).get("eCommerceData", {})

    products = ecommerce_data.get("products", [])
    if not products:
        raise ValueError("No products found in the JSON data.")

    categories = ecommerce_data.get("categories", [])

    parent_range_id = "nesclub2.tw.b2c/cat/capsule-range"
    excluded_range_ids = (
        "nesclub2.tw.b2c/cat/capsule-range-limited-edition-b2c",
        "nesclub2.tw.b2c/cat/capsule-range-assortment",
    )

    capsule_ranges = []
    for category in categories:
        if parent_range_id not in category["superCategories"]:
            continue
        if category["id"] in excluded_range_ids:
            continue
        capsule_ranges.append(category)

    return main_list, products, categories, capsule_ranges

In [None]:
def extract_simple_data(domain, products, capsule_ranges):
    """Build a ``{range name: {capsule name: details}}`` mapping from the listing data.

    For every capsule range, the single-unit capsule products belonging to it
    are selected, each product's own page is fetched, and a reduced record is
    kept for it.

    Args:
        domain: Site origin prepended to each item's ``url`` and image URL.
        products: Product dicts from the listing JSON; each is read for
            ``ranges``, ``type``, ``unitQuantity``, ``name`` and ``url``.
        capsule_ranges: Category dicts; each is read for ``id`` and ``name``.

    Returns:
        dict mapping each range's ``name`` to a dict that maps capsule name to
        ``{"caffeine_mg": int or None, "image_url": str}``.

    Raises:
        ValueError: Propagated from ``getJsonFromPageWithQuery`` when a
            product page cannot be fetched or parsed.
    """
    simple_data = {}
    # Named ``capsule_range`` so the builtin ``range`` is not shadowed.
    for capsule_range in capsule_ranges:
        print("Current range: %s" % capsule_range["name"])

        # Items collected for this range
        range_items = {}

        # Single-unit capsules that belong to this range
        items = [
            x
            for x in products
            if capsule_range["id"] in x["ranges"]
            and x["type"] == "capsule"
            and x["unitQuantity"] == 1
        ]

        # Grab the detailed JSON from the item's own page
        for item in items:
            print("Current capsule: %s" % item["name"])

            URL = domain + item["url"]
            item_info = getJsonFromPageWithQuery(
                URL, "div[id^=respProductDetailPDPCapsule]+script"
            )
            product = (
                item_info.get("configuration", {})
                .get("eCommerceData", {})
                .get("product", {})
            )

            # The first "<digits> mg" found in the ingredient texts is taken
            # as the caffeine content; None when no such figure appears.
            description = [x["text"] for x in product.get("ingredients", [])]
            caffeine_matches = re.findall(r"(\d+)\s?mg", str(description))
            caffeine_mg = int(caffeine_matches[0]) if caffeine_matches else None
            # NOTE: when the product has no image URL this is just ``domain``.
            image_url = domain + product.get("image", {}).get("url", "")

            # Fall back to the listing name when the detail JSON has no name.
            # A shared "Unknown" key would let such products overwrite each
            # other.
            item_name = product.get("name", item["name"])
            range_items[item_name] = {
                "caffeine_mg": caffeine_mg,
                "image_url": image_url,
            }

        # Write all data for this range
        simple_data[capsule_range["name"]] = range_items

    return simple_data

In [None]:
def save_data(simple_data, output_path):
    """Write ``simple_data`` to ``output_path`` as JSON indented by two spaces.

    The file is opened in write mode, so existing content is replaced. A
    confirmation line naming the path is printed once the file is closed.
    """
    with open(output_path, mode="w") as json_file:
        json.dump(simple_data, json_file, indent=2)
    print(f"Saved data to {output_path}")

In [None]:
if __name__ == "__main__":
    import argparse
    import sys

    # Entry point: parse the listing URL and output path, then scrape and save.
    parser = argparse.ArgumentParser(description="Scrape Nespresso capsules and save simplified JSON.")
    parser.add_argument(
        "--url",
        "-u",
        default="https://nespresso.com/tw/en/order/capsules/vertuo",
        help="Listing page URL to start scraping (default the Vertuo listing)",
    )
    parser.add_argument(
        "--output",
        "-o",
        default="data.json",
        help="Output JSON file path (default: data.json)",
    )
    # This guard is also true inside a Jupyter kernel, where sys.argv carries
    # the kernel's own flags (e.g. "-f <connection file>"). argparse would
    # reject those and exit, so the defaults are used when ipykernel is loaded.
    # On the command line (argv=None) sys.argv is parsed strictly as before.
    argv = [] if "ipykernel" in sys.modules else None
    args = parser.parse_args(argv)

    # Run pipeline
    main_list, products, categories, capsule_ranges = get_main_list(args.url)
    simple_data = extract_simple_data(domain, products, capsule_ranges)
    save_data(simple_data, args.output)