In [7]:
import requests
import csv
import re
import json
from bs4 import BeautifulSoup

def extract_line_value(text, label):
    """
    Searches for `label` ignoring whether there's a colon or not,
    returning everything after `label` up to the next newline.
    """
    # Example pattern:
    #   "We Taste" + optional anything + optional colon + optional space + (capture rest of line)
    pattern = rf"{label}.*?:?\s*(.*)"
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).split('\n', 1)[0].strip()
    return ""

def extract_country_region(body_html):
    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n", strip=True)
    return extract_line_value(text, "Country/Region")

def extract_varietal(body_html):
    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n", strip=True)
    return extract_line_value(text, "Varietal")

def extract_processing(body_html):
    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n", strip=True)
    return extract_line_value(text, "Processing")

def extract_flavour_notes(body_html):
    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n", strip=True)
    return extract_line_value(text, "We Taste")

def parse_variant_title(variant_title):
    parts = variant_title.split(" / ", maxsplit=1)
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return variant_title.strip(), ""

def scrape_ppp_beans(url, csv_filename):
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to retrieve data")
        return

    data = response.json()
    products = data.get("products", [])

    with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
        fieldnames = [
            "Product ID", "Product Title", "Variant ID", "Variant Weight", "Variant Detail",
            "Price", "SKU", "Image URL",
            "Country", "Varietal", "Processing", "Flavour Notes",
            # Optional extra fields:
            # "Elevation", "Harvest", ...
        ]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        for product in products:
            product_id = product.get("id")
            product_title = product.get("title")
            body_html = product.get("body_html", "")

            # Extract info
            country_region = extract_country_region(body_html)
            varietal = extract_varietal(body_html)
            processing = extract_processing(body_html)
            flavour_notes = extract_flavour_notes(body_html)
            
            # Optional extra:
            # elevation = extract_line_value(soup_text, "Elevation")
            # harvest = extract_line_value(soup_text, "Harvest")

            images = product.get("images", [])
            image_url = images[0].get("src", "N/A") if images else "N/A"

            variants = product.get("variants", [])
            for variant in variants:
                variant_id = variant.get("id")
                vtitle = variant.get("title", "")
                weight, detail = parse_variant_title(vtitle)
                # Only keep "Whole Beans" if that’s your requirement:
                if detail != "Whole Beans":
                    continue

                row = {
                    "Product ID": product_id,
                    "Product Title": product_title,
                    "Variant ID": variant_id,
                    "Variant Weight": weight,
                    "Variant Detail": detail,
                    "Price": variant.get("price"),
                    "SKU": variant.get("sku"),
                    "Image URL": image_url,
                    "Country": country_region,
                    "Varietal": varietal,
                    "Processing": processing,
                    "Flavour Notes": flavour_notes
                }
                writer.writerow(row)

    print(f"Data scraped and saved to {csv_filename}")

if __name__ == "__main__":
    url = "https://pppcoffee.com/collections/beans/products.json"
    csv_filename = "ppp_beans.csv"
    scrape_ppp_beans(url, csv_filename)
    print(f"Data scraped and saved to {csv_filename}")

Data scraped and saved to ppp_beans.csv
Data scraped and saved to ppp_beans.csv
