In [16]:
import requests
import csv
import re
import json
from bs4 import BeautifulSoup

def extract_flavour_notes(body_html):
    """
    Return the flavour notes that follow a "We Taste:" label in the product HTML.

    The HTML is flattened to text fragments joined by newlines and the label is
    matched case-insensitively. The whitespace run after the label may span a
    line break, so a value that lands on the line after the label (for example
    because the label sits in its own element) is still captured.

    Args:
        body_html: Product description HTML. ``None`` or an empty string is
            treated as "no description".

    Returns:
        The remainder of the matched line with surrounding whitespace and any
        trailing periods removed, or "" when there is no description or no
        "We Taste:" label.
    """
    # A missing description (None) is not valid markup for the parser, and an
    # empty one cannot contain the label, so answer without parsing.
    if not body_html:
        return ""
    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n", strip=True)
    match = re.search(r"We Taste:\s*(.*)", text, re.IGNORECASE)
    if match is None:
        return ""
    # "." never matches a newline, so the capture is already a single line.
    return match.group(1).strip().rstrip(".")

def extract_country_region(body_html):
    """
    Return the text that follows a "Country/Region:" label in the product HTML.

    The HTML is flattened to text fragments joined by newlines and the label is
    matched case-insensitively. The whitespace run after the label may span a
    line break, so a value on the line after the label is still captured.

    Args:
        body_html: Product description HTML. ``None`` or an empty string is
            treated as "no description".

    Returns:
        The remainder of the matched line, stripped of surrounding whitespace,
        or "" when there is no description or no "Country/Region:" label.
    """
    # A missing description (None) is not valid markup for the parser, and an
    # empty one cannot contain the label, so answer without parsing.
    if not body_html:
        return ""
    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n", strip=True)
    match = re.search(r"Country/Region:\s*(.*)", text, re.IGNORECASE)
    if match is None:
        return ""
    # "." never matches a newline, so the capture is already a single line.
    return match.group(1).strip()

def extract_varietal(body_html):
    """
    Return the text that follows a "Varietal:" label in the product HTML.

    The HTML is flattened to text fragments joined by newlines and the label is
    matched case-insensitively. The whitespace run after the label may span a
    line break, so a value on the line after the label is still captured.

    Args:
        body_html: Product description HTML. ``None`` or an empty string is
            treated as "no description".

    Returns:
        The remainder of the matched line, stripped of surrounding whitespace,
        or "" when there is no description or no "Varietal:" label.
    """
    # A missing description (None) is not valid markup for the parser, and an
    # empty one cannot contain the label, so answer without parsing.
    if not body_html:
        return ""
    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n", strip=True)
    match = re.search(r"Varietal:\s*(.*)", text, re.IGNORECASE)
    if match is None:
        return ""
    # "." never matches a newline, so the capture is already a single line.
    return match.group(1).strip()

def extract_processing(body_html):
    """
    Return the text that follows a "Processing:" label in the product HTML.

    The HTML is flattened to text fragments joined by newlines and the label is
    matched case-insensitively. The whitespace run after the label may span a
    line break, so a value on the line after the label is still captured.

    Args:
        body_html: Product description HTML. ``None`` or an empty string is
            treated as "no description".

    Returns:
        The remainder of the matched line, stripped of surrounding whitespace,
        or "" when there is no description or no "Processing:" label.
    """
    # A missing description (None) is not valid markup for the parser, and an
    # empty one cannot contain the label, so answer without parsing.
    if not body_html:
        return ""
    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n", strip=True)
    match = re.search(r"Processing:\s*(.*)", text, re.IGNORECASE)
    if match is None:
        return ""
    # "." never matches a newline, so the capture is already a single line.
    return match.group(1).strip()

def parse_variant_title(variant_title):
    """
    Break a variant title into its weight and grind/brew parts.

    The title is cut at the first " / " only; anything after that first
    separator (including further separators) stays in the second part. Both
    parts are stripped of surrounding whitespace. A title without " / " is
    returned stripped, paired with an empty string.
    """
    weight, separator, detail = variant_title.partition(" / ")
    if not separator:
        # No separator present: the whole title is the first component.
        return variant_title.strip(), ""
    return weight.strip(), detail.strip()

def _whole_bean_rows(product):
    """Yield one CSV row dict for each "Whole Beans" variant of a product."""
    # A key that is present with a null value bypasses .get()'s default, so
    # normalise with "or" before handing the value to the extractors.
    body_html = product.get("body_html") or ""

    product_id = product.get("id")
    product_title = product.get("title")

    # Description-derived fields are the same for every variant of a product,
    # so they are extracted once rather than per variant.
    country_region = extract_country_region(body_html)
    varietal = extract_varietal(body_html)
    processing = extract_processing(body_html)
    flavour_notes = extract_flavour_notes(body_html)

    for variant in product.get("variants") or []:
        weight, detail = parse_variant_title(variant.get("title") or "")
        # Only include variants where the detail is exactly "Whole Beans"
        if detail != "Whole Beans":
            continue
        yield {
            "Product ID": product_id,
            "Product Title": product_title,
            "Variant ID": variant.get("id"),
            "Variant Weight": weight,
            "Variant Detail": detail,
            "Price": variant.get("price"),
            "SKU": variant.get("sku"),
            "Country": country_region,
            "Varietal": varietal,
            "Processing": processing,
            "Flavour Notes": flavour_notes,
        }


def scrape_ppp_beans(url, csv_filename, timeout=30):
    """
    Fetch a products JSON document and write its "Whole Beans" variants to CSV.

    The response is expected to be a JSON object with a "products" list; each
    product may carry "id", "title", "body_html" and a "variants" list whose
    items may carry "id", "title", "price" and "sku". Missing or null values
    are tolerated and produce empty cells / no rows.

    Args:
        url: Address of the products JSON document.
        csv_filename: Path of the CSV file to create (overwritten if present).
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        True when the CSV was written, False when the server answered with a
        non-200 status (in which case a message is printed and no file is
        touched).

    Raises:
        requests.RequestException: Propagated unchanged for connection errors
            and timeouts.
    """
    # Fetch the JSON data from the provided URL
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to retrieve data")
        return False

    data = response.json()
    products = data.get("products") or []

    fieldnames = [
        "Product ID", "Product Title", "Variant ID", "Variant Weight", "Variant Detail",
        "Price", "SKU", "Country", "Varietal", "Processing", "Flavour Notes"
    ]

    # newline='' lets the csv module control line endings itself.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for product in products:
            writer.writerows(_whole_bean_rows(product))

    return True

if __name__ == "__main__":
    # Script entry point: scrape the beans collection into a local CSV file.
    url = "https://pppcoffee.com/collections/beans/products.json"
    csv_filename = "ppp_beans.csv"
    outcome = scrape_ppp_beans(url, csv_filename)
    # Skip the success message only when the scraper explicitly returns False;
    # any other result (including None) is treated as a completed run.
    if outcome is not False:
        print(f"Data scraped and saved to {csv_filename}")

Data scraped and saved to ppp_beans.csv
