In [14]:
import requests
import csv
import re
from bs4 import BeautifulSoup

def extract_process(body_text):
    """
    Return the value that follows the 'Process:' label in body_text.

    The value ends at the next known field label ('Varietal:' or
    'Tasting Notes:') or at the end of the line, whichever comes first, so a
    missing or re-ordered 'Varietal:' label no longer causes the following
    fields to be swallowed into the result. Label matching is
    case-insensitive.

    Example: 'Process: Washed Varietal: Pink Bourbon' -> 'Washed'

    Returns an empty string when no 'Process:' label is found.
    """
    match = re.search(
        # MULTILINE lets '$' match at the end of any line, so a value that is
        # followed by a newline and unrelated text is still captured.
        r"Process:\s*(.*?)\s*(?=Varietal:|Tasting Notes:|$)",
        body_text,
        re.IGNORECASE | re.MULTILINE,
    )
    if match:
        return match.group(1).strip()
    return ""

def extract_varietal(body_text):
    """
    Return the value that follows the 'Varietal:' label in body_text.

    The value ends at the next known field label ('Tasting Notes:' or
    'Process:') or at the end of the line, whichever comes first, so a
    missing or re-ordered 'Tasting Notes:' label no longer causes the
    following fields to be swallowed into the result. Label matching is
    case-insensitive.

    Example: 'Varietal: Pink Bourbon Tasting Notes: Peach' -> 'Pink Bourbon'

    Returns an empty string when no 'Varietal:' label is found.
    """
    match = re.search(
        # MULTILINE lets '$' match at the end of any line, so a value that is
        # followed by a newline and unrelated text is still captured.
        r"Varietal:\s*(.*?)\s*(?=Tasting Notes:|Process:|$)",
        body_text,
        re.IGNORECASE | re.MULTILINE,
    )
    if match:
        return match.group(1).strip()
    return ""

def extract_tasting_notes(body_text):
    """
    Return the tasting notes that follow the 'Tasting Notes:' label.

    Everything from the label to the end of that line is taken, then any
    trailing roast blurb is cut off: the text is truncated at the first
    occurrence of "Omni Roast" or "Filter Roast" (case-insensitive), e.g.
    "Peach, Plum Omni Roast, available as..." becomes "Peach, Plum".

    Returns an empty string when the label is not present.
    """
    found = re.search(r"Tasting Notes:\s*(.*)", body_text, re.IGNORECASE)
    if found is None:
        return ""

    raw_notes = found.group(1).strip()

    # Drop the roast-style marker and everything after it.
    trimmed = re.sub(r"(?i)\b(omni roast|filter roast).*", "", raw_notes)
    return trimmed.strip()

def parse_variant_title(variant_title):
    """
    Break a variant title such as "200g / Whole Beans" into (weight, detail).

    The title is divided at the first " / " separator only; both halves are
    whitespace-stripped. When the separator is absent, the whole stripped
    title is returned as the weight and the detail is an empty string.
    """
    weight, separator, detail = variant_title.partition(" / ")
    if not separator:
        # No " / " present: nothing to split off as the detail.
        return variant_title.strip(), ""
    return weight.strip(), detail.strip()

def scrape_prodigal_beans(url, csv_filename):
    """
    Download a products JSON document and write its "Whole Beans" variants
    to a CSV file.

    Args:
        url: Address of a JSON document with a top-level "products" list.
            Each product is read for "title", "body_html" and "variants";
            each variant is read for "title" and "price".
        csv_filename: Path of the CSV file to create (overwritten if it
            already exists).

    Returns:
        None. On a non-200 response, "Failed to retrieve data" is printed and
        no file is written.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # 1. Fetch JSON data. Without a timeout, requests waits indefinitely on a
    #    stalled connection.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print("Failed to retrieve data")
        return
    data = response.json()
    # `or []` also covers a key that is present but null.
    products = data.get("products") or []

    # 2. Prepare CSV
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
        fieldnames = [
            "Product Title", "Variant Weight",
            "Variant Detail", "Price", "Process", "Varietal", "Tasting Notes"
        ]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        # 3. Loop through products
        for product in products:
            product_title = product.get("title")
            # A null "body_html" would make dict.get return None despite the
            # default, and BeautifulSoup cannot parse None.
            body_html = product.get("body_html") or ""

            # Convert HTML to plain text for easier regex
            soup = BeautifulSoup(body_html, 'html.parser')
            body_text = soup.get_text(separator=" ", strip=True)

            # Extract fields once per product; they are shared by all variants
            process_info = extract_process(body_text)
            varietal_info = extract_varietal(body_text)
            tasting_notes = extract_tasting_notes(body_text)

            # 4. Loop through variants
            for variant in product.get("variants") or []:
                variant_title = variant.get("title") or ""
                weight, detail = parse_variant_title(variant_title)

                # We only keep "Whole Beans"
                if detail.lower() != "whole beans":
                    continue

                writer.writerow({
                    "Product Title": product_title,
                    "Variant Weight": weight,
                    "Variant Detail": detail,
                    "Price": variant.get("price", ""),
                    "Process": process_info,
                    "Varietal": varietal_info,
                    "Tasting Notes": tasting_notes,
                })

    print(f"Data scraped and saved to {csv_filename}")

if __name__ == "__main__":
    # Output file for the whole-bean rows, and the products feed to read from.
    csv_filename = "prodigal_roasters_wholebeans.csv"
    url = "https://prodigalroasters.com/collections/coffee-beans-singapore/products.json"
    scrape_prodigal_beans(url=url, csv_filename=csv_filename)

Data scraped and saved to prodigal_roasters_wholebeans.csv
