In [10]:
import requests
import csv
import re
from bs4 import BeautifulSoup

def extract_field(body_html, field_name):
    """
    Extract the value that follows a ``<field_name>:`` label in an HTML fragment.

    The HTML is flattened to plain text with one text node per line, then
    searched for the first (case-sensitive) occurrence of ``field_name``
    immediately followed by a colon.

    Args:
        body_html: HTML description to search. ``None`` or an empty string is
            treated as "no description".
        field_name: Label text without the trailing colon, e.g. ``"Region"``.

    Returns:
        The stripped text after the label up to the end of that line, or an
        empty string if the label is not found or there is no HTML to search.
    """
    # A null/empty description has nothing to parse; bail out before handing
    # it to BeautifulSoup, which rejects None.
    if not body_html:
        return ""

    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n")
    # `\s*` may consume a line break, so a label and its value that live in
    # separate text nodes (and therefore on separate lines) still pair up.
    # `.` does not match a newline, so the capture ends at the end of the line.
    pattern = rf"{re.escape(field_name)}:\s*(.*)"
    match = re.search(pattern, text)
    if match is None:
        return ""
    return match.group(1).strip()

def extract_harvest(body_html):
    """
    Extract the harvest window from a product's HTML description.

    Looks for a phrase like 'Harvest begins in November ... until February'
    (case-insensitive) in the flattened text of the HTML.

    Args:
        body_html: HTML description to search. ``None`` or an empty string is
            treated as "no description".

    Returns:
        A string like 'November - February' built from the two captured
        words, or an empty string if the phrase is not found or there is no
        HTML to search.
    """
    # A null/empty description has nothing to parse; bail out before handing
    # it to BeautifulSoup, which rejects None.
    if not body_html:
        return ""

    soup = BeautifulSoup(body_html, 'html.parser')
    text = soup.get_text(separator="\n")
    # NOTE: `.*?` does not cross newlines (no DOTALL), so the start word and
    # "until <word>" must end up on the same line of the flattened text.
    match = re.search(
        r"Harvest\s+begins\s+in\s+([A-Za-z]+).*?until\s+([A-Za-z]+)",
        text,
        re.IGNORECASE,
    )
    if match is None:
        return ""
    start, end = match.group(1), match.group(2)
    return f"{start} - {end}"

def process_products(url, csv_filename):
    """
    Fetch a products JSON feed and export its "Whole Beans" variants to a CSV file.

    The response is expected to be a JSON object with a "products" list. Each
    product whose ``product_type`` is not "Brewing Products" contributes one
    CSV row per variant whose title contains "Whole Beans" (case-insensitive).
    Product-level details (tasting notes, producer, region, ...) are parsed
    from the product's HTML description and repeated on every variant row.

    Args:
        url: URL of the products JSON endpoint.
        csv_filename: Path of the CSV file to create (overwritten if present).

    Returns:
        None. On a non-200 response an error is printed and no file is
        written; on success a confirmation message is printed.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # A timeout keeps the script from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print("Error fetching products:", response.status_code)
        return

    data = response.json()
    # `or []` also covers a key that is present but null.
    products = data.get("products") or []

    # Define CSV columns
    fieldnames = [
        "Handle",
        "Primary Image URL",
        "Availability",
        "Tasting Notes",
        "Variant Part 1",
        "Variant Part 2",
        "Variant Part 3",
        "Variant Price",
        "Variant Availability",
        "Producer",
        "Region",
        "Variety",
        "Altitude",
        "Harvest",
        "Processing",
        "Brewing"
    ]

    with open(csv_filename, "w", newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for product in products:
            # Skip products with product_type "Brewing Products"
            product_type = product.get("product_type") or ""
            if product_type.lower() == "brewing products":
                continue

            handle = product.get("handle", "")
            images = product.get("images") or []
            # Tolerate an image entry without a "src" key instead of raising.
            primary_image_url = images[0].get("src", "") if images else ""
            variants = product.get("variants") or []
            # The product counts as available if at least one variant is.
            product_availability = any(variant.get("available", False) for variant in variants)
            availability = "Available" if product_availability else "Not Available"
            # JSON null descriptions arrive as None; normalise to "" so the
            # extract_* helpers always receive a string.
            body_html = product.get("body_html") or ""

            # Extract additional product-level details from the HTML description
            tasting_notes = extract_field(body_html, "Tasting Notes")
            producer = extract_field(body_html, "Producer")
            region = extract_field(body_html, "Region")
            variety = extract_field(body_html, "Variety")
            altitude = extract_field(body_html, "Altitude")
            # The description labels this field "Process:"; the CSV column is "Processing".
            processing = extract_field(body_html, "Process")
            harvest = extract_harvest(body_html)
            brewing = ""  # Brewing column remains empty since "Brewing Products" are skipped

            # Loop through each variant; each row corresponds to one variant.
            for variant in variants:
                variant_name = variant.get("title") or ""
                # Only keep rows if the variant title contains "Whole Beans" (case-insensitive)
                if "whole beans" not in variant_name.lower():
                    continue

                # Split the variant title by " / " into at most three parts.
                parts = [p.strip() for p in variant_name.split(" / ")]
                # Ensure we have three parts by padding if needed; extras are dropped.
                if len(parts) < 3:
                    parts += [""] * (3 - len(parts))
                variant_part1, variant_part2, variant_part3 = parts[:3]

                variant_price = variant.get("price", "")
                variant_availability = "Available" if variant.get("available", False) else "Not Available"

                row = {
                    "Handle": handle,
                    "Primary Image URL": primary_image_url,
                    "Availability": availability,
                    "Tasting Notes": tasting_notes,
                    "Variant Part 1": variant_part1,
                    "Variant Part 2": variant_part2,
                    "Variant Part 3": variant_part3,
                    "Variant Price": variant_price,
                    "Variant Availability": variant_availability,
                    "Producer": producer,
                    "Region": region,
                    "Variety": variety,
                    "Altitude": altitude,
                    "Harvest": harvest,
                    "Processing": processing,
                    "Brewing": brewing
                }
                writer.writerow(row)

    print(f"CSV file '{csv_filename}' created with the filtered and split variants.")

if __name__ == "__main__":
    # Script entry point: export the products feed below to a CSV file in the
    # current working directory.
    csv_filename = "products.csv"
    url = "https://commonmancoffeeroasters.com/collections/all-coffee-blends/products.json"
    process_products(url=url, csv_filename=csv_filename)

CSV file 'products.csv' created with the filtered and split variants.
