In [4]:
import requests
import csv
import re
from bs4 import BeautifulSoup

def extract_ul_info(body_html):
    """
    Extract "Key: Value" details from the first <ul> in a product description.

    Only <li> items whose text contains a colon are considered; the part
    before the first colon is matched case-insensitively against the
    supported keys.

    Args:
        body_html: HTML fragment to parse. ``None`` or an empty string is
            accepted and yields an all-empty result.

    Returns:
        A dict with the keys:
          - "Elevation" (used for Altitude)
          - "Origin" (used for Region/Origins)
          - "Roast" (used for Roast Type)
        Keys that are not found map to "".
    """
    result = {"Elevation": "", "Origin": "", "Roast": ""}

    # BeautifulSoup cannot parse None, so bail out before touching the
    # parser when there is nothing to parse.
    if not body_html:
        return result

    soup = BeautifulSoup(body_html, 'html.parser')
    ul = soup.find('ul')
    if ul is None:
        return result

    # Lower-cased label as it appears in the HTML -> key in the result dict.
    label_to_field = {
        "elevation": "Elevation",
        "origin": "Origin",
        "roast": "Roast",
    }

    for li in ul.find_all('li'):
        text = li.get_text(" ", strip=True)
        # Split on the first colon only, so values may themselves contain ':'.
        label, sep, value = text.partition(":")
        if not sep:
            continue
        field = label_to_field.get(label.strip().lower())
        if field is not None:
            result[field] = value.strip()
    return result

def extract_notes(body_html):
    """
    Pull the descriptive and tasting notes out of the <p> tags of a description.

    Args:
        body_html: HTML fragment to parse. ``None`` or an empty string is
            accepted and yields ``(None, None)``.

    Returns:
        A tuple (main_note, taste_note) where:
          - main_note: text of the first paragraph (without a "Taste Notes:"
            label) that contains one of the keywords "sip", "finish",
            "floral" or "fruit" (case-insensitive substring match), else None.
          - taste_note: the text following the last "Taste Notes:" label in
            the last paragraph that carries one, else None.
    """
    # BeautifulSoup cannot parse None, so bail out before touching the
    # parser when there is nothing to parse.
    if not body_html:
        return None, None

    soup = BeautifulSoup(body_html, 'html.parser')
    descriptive_keywords = ("sip", "finish", "floral", "fruit")

    main_note = None
    taste_note = None

    for p in soup.find_all('p'):
        text = p.get_text(" ", strip=True)
        if "Taste Notes:" in text:
            # A labelled paragraph is never considered for the main note.
            taste_note = text.split("Taste Notes:")[-1].strip()
        elif main_note is None:
            # Keep only the first paragraph that looks descriptive.
            lowered = text.lower()
            if any(keyword in lowered for keyword in descriptive_keywords):
                main_note = text

    return main_note, taste_note

def extract_flavour_notes(body_html):
    """
    Extract a brief description from the first <h4> element.

    Args:
        body_html: HTML fragment to parse. ``None`` or an empty string is
            accepted and yields "".

    Returns:
        The whitespace-normalised text of the first <h4>, or "" when the
        fragment has no <h4>.
    """
    # BeautifulSoup cannot parse None, so bail out before touching the
    # parser when there is nothing to parse.
    if not body_html:
        return ""

    h4 = BeautifulSoup(body_html, 'html.parser').find('h4')
    if h4 is None:
        return ""
    return h4.get_text(" ", strip=True)

def _first_non_blank(*candidates):
    """Return the first candidate containing non-whitespace text, stripped; "" if none."""
    for candidate in candidates:
        if candidate and candidate.strip():
            return candidate.strip()
    return ""


def _split_variant_title(variant_title):
    """Split an "A / B / C" variant title into exactly three parts, padding with ""."""
    parts = [part.strip() for part in (variant_title or "").split(" / ")]
    parts += [""] * (3 - len(parts))
    return parts[:3]


def process_products(url, csv_filename, timeout=30):
    """
    Fetch a products JSON document and write one CSV row per product variant.

    Args:
        url: Address of the JSON endpoint; the response is expected to hold a
            "products" list.
        csv_filename: Path of the CSV file to create (overwritten if present).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. On a non-200 response an error is printed and no file is
        written; otherwise a summary line is printed after the CSV is closed.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    # Without a timeout a stalled connection would block this call forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Error fetching products:", response.status_code)
        return
    # NOTE: only the single response returned for `url` is processed.
    products = response.json().get("products", [])

    # Define CSV columns. The note fields are combined into one column: "Flavour Note".
    fieldnames = [
        "Title",
        "Primary Image URL",
        "Availability",
        "Roast Type",
        "Flavour Note",  # Coalesced note field.
        "Region",
        "Varietal",
        "Process",
        "Altitude",
        "Vendor",
        "Product Type",
        "Tags",
        "Cup Characteristics",
        "Variant Part 1",
        "Variant Part 2",
        "Variant Part 3",
        "Variant Price",
        "Variant Availability",
        "Grams",
    ]

    with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for product in products:
            # `or` also covers keys that are present but null in the JSON.
            variants = product.get("variants") or []
            images = product.get("images") or []
            body_html = product.get("body_html") or ""

            # Tags are joined when given as a list; a plain string is kept
            # as-is rather than being joined character by character.
            raw_tags = product.get("tags") or []
            tags = raw_tags if isinstance(raw_tags, str) else ", ".join(raw_tags)

            # Overall product availability (if any variant is available).
            any_available = any(v.get("available", False) for v in variants)

            # Details from the <ul> block, the <h4> heading and the <p> notes.
            ul_info = extract_ul_info(body_html)
            h4_note = extract_flavour_notes(body_html)
            main_note, taste_note = extract_notes(body_html)

            # Columns shared by every variant row of this product.
            product_columns = {
                "Title": product.get("title", ""),
                "Primary Image URL": images[0].get("src", "") if images else "",
                "Availability": "Available" if any_available else "Not Available",
                "Roast Type": ul_info.get("Roast", ""),
                # Preference order: descriptive paragraph, <h4>, "Taste Notes:".
                "Flavour Note": _first_non_blank(main_note, h4_note, taste_note),
                "Region": ul_info.get("Origin", ""),
                # For these products, we don't have extra meta details.
                "Varietal": "",
                "Process": "",
                "Altitude": ul_info.get("Elevation", ""),
                "Vendor": product.get("vendor", ""),
                "Product Type": product.get("product_type", ""),
                "Tags": tags,
                "Cup Characteristics": "",
            }

            for variant in variants:
                part1, part2, part3 = _split_variant_title(variant.get("title", ""))
                is_available = variant.get("available", False)
                writer.writerow({
                    **product_columns,
                    "Variant Part 1": part1,
                    "Variant Part 2": part2,
                    "Variant Part 3": part3,
                    "Variant Price": variant.get("price", ""),
                    "Variant Availability": "Available" if is_available else "Not Available",
                    "Grams": variant.get("grams", ""),
                })

    print(f"CSV file '{csv_filename}' created with {len(products)} products processed.")

if __name__ == "__main__":
    # Source feed and destination file for this scrape.
    url = "https://www.splendourcoffee.com/collections/beans/products.json"
    csv_filename = "splendourcoffee_products.csv"

    # Download the feed and write it out as CSV.
    process_products(url=url, csv_filename=csv_filename)

CSV file 'splendourcoffee_products.csv' created with 17 products processed.
