# In [2]:
import requests
import csv
import re
from bs4 import BeautifulSoup

def extract_tasting_notes_from_blend(body_html):
    """
    Fallback extraction for flavour notes (used when product_type contains "blend").

    Looks first for an <em> tag and returns its stripped text. If there is no <em>,
    checks the first <p> tag: when its text contains a vertical bar ("|"), the
    stripped segment that follows the first bar (up to the next bar, if any) is
    returned.

    Args:
        body_html: Product description HTML. May be None or empty.

    Returns:
        The extracted notes, or "" when there is no HTML or nothing matches.
    """
    # body_html can arrive as None (e.g. a JSON null); there is nothing to parse,
    # and handing None to the HTML parser would fail instead of returning "".
    if not body_html:
        return ""

    soup = BeautifulSoup(body_html, 'html.parser')

    em_tag = soup.find('em')
    if em_tag is not None:
        return em_tag.get_text().strip()

    first_p = soup.find('p')
    if first_p is None:
        return ""

    text = first_p.get_text(" ", strip=True)
    if "|" not in text:
        return ""

    # A bar is guaranteed to be present here, so index 1 always exists.
    return text.split("|")[1].strip()

def extract_cupping_notes(body_html):
    """
    Extracts cupping notes for single origin products.

    Returns the stripped text of the first <em> or <i> tag found in the HTML,
    whichever occurs first.

    Args:
        body_html: Product description HTML. May be None or empty.

    Returns:
        The text of the first <em>/<i> tag, or "" when there is no HTML or no
        such tag.
    """
    # body_html can arrive as None (e.g. a JSON null); there is nothing to parse,
    # and handing None to the HTML parser would fail instead of returning "".
    if not body_html:
        return ""

    note_tag = BeautifulSoup(body_html, 'html.parser').find(['em', 'i'])
    if note_tag is None:
        return ""
    return note_tag.get_text().strip()

def extract_table_info(body_html):
    """
    Attempts to extract table-based info.

    Searches for the first <table> element, takes its second <td> cell, and
    splits that cell's text nodes into fragments. Fragments are mapped
    positionally onto the result keys.
    (Not all products include a table.)

    Args:
        body_html: Product description HTML. May be None or empty.

    Returns:
        A dict with keys Region, Varietal, Process, Altitude. Every value is ""
        when there is no HTML, no table, fewer than two cells, or fewer than
        four fragments in the second cell.
    """
    result = {"Region": "", "Varietal": "", "Process": "", "Altitude": ""}

    # body_html can arrive as None (e.g. a JSON null); there is nothing to parse,
    # and handing None to the HTML parser would fail instead of returning blanks.
    if not body_html:
        return result

    table = BeautifulSoup(body_html, 'html.parser').find('table')
    if table is None:
        return result

    tds = table.find_all('td')
    if len(tds) < 2:
        return result

    # Join the cell's text nodes with "|" so each node becomes one fragment.
    cell_text = tds[1].get_text(separator="|", strip=True)
    parts = [part.strip() for part in cell_text.split("|") if part.strip()]
    # Remove leading colons if present.
    parts = [part.lstrip(":").strip() for part in parts]

    if len(parts) == 5:
        # NOTE(review): the second fragment is skipped in the five-fragment
        # layout -- presumably an extra line such as a sub-region; confirm
        # against real product pages.
        values = (parts[0], parts[2], parts[3], parts[4])
    elif len(parts) >= 4:
        values = tuple(parts[:4])
    else:
        return result

    result["Region"], result["Varietal"], result["Process"], result["Altitude"] = values
    return result

def extract_roast_type(body_html):
    """
    Extracts roast type from the first <p> tag.

    Looks for a vertical bar ("|") in the paragraph text and inspects the
    segment that follows the first bar (case-insensitively). The result is
    normalized to one of "espresso", "filter", or "omni", checked in that order.

    Args:
        body_html: Product description HTML. May be None or empty.

    Returns:
        "espresso", "filter", "omni", or "" when there is no HTML, no <p> tag,
        no bar, or none of the keywords appears after the bar.
    """
    # body_html can arrive as None (e.g. a JSON null); there is nothing to parse,
    # and handing None to the HTML parser would fail instead of returning "".
    if not body_html:
        return ""

    first_p = BeautifulSoup(body_html, 'html.parser').find('p')
    if first_p is None:
        return ""

    text = first_p.get_text(" ", strip=True)
    if "|" not in text:
        return ""

    roast = text.split("|")[1].strip().lower()
    # Order matters: the first keyword found wins.
    for roast_kind in ("espresso", "filter", "omni"):
        if roast_kind in roast:
            return roast_kind
    return ""

def extract_meta_info(body_html):
    """
    Extracts meta information from the product's body_html.

    This function searches the entire plain text (obtained via BeautifulSoup)
    for the following labelled patterns, matched case-insensitively:
      - Cup Characteristics: <value>
      - Origins: <value> (if present)
      - Varietal: <value> (if present)
      - Processing: <value> (if present)

    Each value runs from its label to the next occurrence of any of the other
    labels, or to the end of the text, so the labels may appear in any order.
    A trailing "Tea" is removed from the cup characteristics.

    Args:
        body_html: Product description HTML. May be None or empty.

    Returns:
        A tuple (cup_chars, origins, varietal, processing). A field is "" when
        its label is absent; all four are "" when there is no HTML.
    """
    # body_html can arrive as None (e.g. a JSON null); there is nothing to parse,
    # and handing None to the HTML parser would fail instead of returning blanks.
    if not body_html:
        return "", "", "", ""

    text = BeautifulSoup(body_html, 'html.parser').get_text(" ", strip=True)

    labels = ("Cup Characteristics", "Origins", "Varietal", "Processing")
    flags = re.IGNORECASE | re.DOTALL
    found = {}
    for label in labels:
        # Stop at ANY other label, not only the ones expected to come later;
        # otherwise a value swallows whatever labelled fields follow it when
        # the page lists them in a different order.
        stop_labels = "|".join(re.escape(other) for other in labels if other != label)
        # Non-greedy match up to either the next label or end of string.
        pattern = rf'{re.escape(label)}\s*:\s*(.*?)(?=\s*(?:{stop_labels})\s*:|$)'
        match = re.search(pattern, text, flags)
        found[label] = match.group(1).strip() if match else ""

    # Remove any trailing "Tea" (if present) from the cup characteristics.
    cup_chars = re.sub(r'\s*Tea\s*$', '', found["Cup Characteristics"], flags=re.IGNORECASE)

    return cup_chars, found["Origins"], found["Varietal"], found["Processing"]

def process_products(url, csv_filename, timeout=30):
    """
    Fetch a products JSON feed and write its whole-bean variants to a CSV file.

    The JSON is expected to hold a "products" list. For every product the
    description HTML (body_html) is mined for roast type, flavour notes,
    table-based details and labelled meta info; one CSV row is then written
    per variant whose title contains "whole bean" and does not contain
    "(wholesale)" (both checks are case-insensitive).

    Args:
        url: Address of the products JSON endpoint.
        csv_filename: Path of the CSV file to create (overwritten if present).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. On a non-200 response an error is printed and no file is written.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Fetch the JSON from Jewel Coffee's products endpoint. Without a timeout
    # an unresponsive server would block this call indefinitely.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Error fetching products:", response.status_code)
        return
    data = response.json()
    products = data.get("products") or []

    # Define CSV columns.
    fieldnames = [
        "Title",
        "Primary Image URL",
        "Availability",
        "Roast Type",
        "Flavour Notes",
        "Region",
        "Varietal",
        "Process",
        "Altitude",
        "Vendor",
        "Product Type",
        "Tags",
        "Cup Characteristics",
        "Origins",
        "Variant Part 1",
        "Variant Part 2",
        "Variant Part 3",
        "Variant Price",
        "Variant Availability",
        "Grams"
    ]

    with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for product in products:
            title = product.get("title", "")
            vendor = product.get("vendor", "")
            # "or" fallbacks: a key that is present with a null value bypasses
            # the dict.get default and would otherwise surface as None.
            product_type = product.get("product_type") or ""
            body_html = product.get("body_html") or ""
            variants = product.get("variants") or []
            images = product.get("images") or []

            # Tags may be a list of strings or an already-joined string;
            # joining a string would interleave ", " between its characters.
            raw_tags = product.get("tags") or []
            tags = raw_tags if isinstance(raw_tags, str) else ", ".join(raw_tags)

            primary_image_url = images[0].get("src", "") if images else ""

            # Determine overall product availability from variants.
            product_availability = any(variant.get("available", False) for variant in variants)
            availability = "Available" if product_availability else "Not Available"

            # Extract roast type.
            roast_type = extract_roast_type(body_html)

            # Determine flavour notes.
            product_type_lower = product_type.lower()
            if "blend" in product_type_lower:
                notes = extract_tasting_notes_from_blend(body_html)
            elif "single origin" in product_type_lower:
                notes = extract_cupping_notes(body_html)
            else:
                notes = ""

            # Attempt to extract table info (if a table exists).
            table_info = extract_table_info(body_html)

            # Extract meta info (the cup characteristics, origins, varietal, processing)
            cup_chars, origins, meta_var, meta_proc = extract_meta_info(body_html)

            # For varietal and process, if table info is empty, use meta info.
            varietal = table_info.get("Varietal", "") or meta_var
            process_field = table_info.get("Process", "") or meta_proc

            # Loop through each variant. (Only include variants that have
            # "Whole Bean" and skip ones with "(Wholesale)")
            for variant in variants:
                variant_name = variant.get("title") or ""
                variant_name_lower = variant_name.lower()
                if "whole bean" not in variant_name_lower:
                    continue
                if "(wholesale)" in variant_name_lower:
                    continue

                # Split the variant title into up to three " / "-separated
                # parts, padding with "" so the unpacking always succeeds.
                parts = [p.strip() for p in variant_name.split(" / ")]
                parts += [""] * (3 - len(parts))
                variant_part1, variant_part2, variant_part3 = parts[:3]

                writer.writerow({
                    "Title": title,
                    "Primary Image URL": primary_image_url,
                    "Availability": availability,
                    "Roast Type": roast_type,
                    "Flavour Notes": notes,
                    "Region": table_info.get("Region", ""),
                    "Varietal": varietal,
                    "Process": process_field,
                    "Altitude": table_info.get("Altitude", ""),
                    "Vendor": vendor,
                    "Product Type": product_type,
                    "Tags": tags,
                    "Cup Characteristics": cup_chars,
                    "Origins": origins,
                    "Variant Part 1": variant_part1,
                    "Variant Part 2": variant_part2,
                    "Variant Part 3": variant_part3,
                    "Variant Price": variant.get("price", ""),
                    "Variant Availability": "Available" if variant.get("available", False) else "Not Available",
                    "Grams": variant.get("grams", ""),
                })

    # The count reported is the number of products fetched, not rows written.
    print(f"CSV file '{csv_filename}' created with {len(products)} products processed.")

if __name__ == "__main__":
    # Script entry point: scrape the Jewel Coffee coffee-beans collection
    # and dump the result into a local CSV file.
    csv_filename = "jewel_coffee_products_test.csv"
    url = "https://jewelcoffee.com/collections/coffee-beans/products.json"
    process_products(url=url, csv_filename=csv_filename)

# Output: CSV file 'jewel_coffee_products_test.csv' created with 30 products processed.
