In [10]:
import requests
import csv
from bs4 import BeautifulSoup

def extract_flavour_notes(body_html):
    """
    Extracts the pipe-separated flavour notes from the product's HTML.

    Strategy (the first match wins):
      1. Scan all header tags (h1-h6) in document order and split their text
         by newlines. Return the first line that contains a pipe ("|").
      2. If not found, return the text of the first <strong> tag that
         contains a pipe.
      3. If still not found, return the text of the first <span> with class
         "OYPEnA" that contains a pipe.
      4. Otherwise, return an empty string.

    Args:
        body_html: The product description as an HTML string. None or an
            empty string is treated as "no description".

    Returns:
        The flavour-notes text with surrounding whitespace removed, or ""
        when no candidate element contains a pipe.
    """
    # A missing description cannot contain notes; bail out before parsing so
    # a null body_html does not blow up inside the HTML parser.
    if not body_html:
        return ""

    soup = BeautifulSoup(body_html, 'html.parser')

    # 1. Check header tags (h1-h6). A header may hold several lines (e.g.
    #    separated by <br>), so each line is tested on its own.
    for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = header.get_text(separator="\n", strip=True)
        for line in text.split("\n"):
            if "|" in line:
                return line.strip()

    # 2. Fallback: check every <strong> tag, not just the first one, because
    #    an earlier <strong> may be unrelated emphasis without a pipe.
    for strong_tag in soup.find_all('strong'):
        strong_text = strong_tag.get_text(separator=" ", strip=True)
        if "|" in strong_text:
            return strong_text

    # 3. Fallback: check every <span> with class "OYPEnA" that might hold the notes.
    for span_tag in soup.find_all('span', class_="OYPEnA"):
        span_text = span_tag.get_text(separator=" ", strip=True)
        if "|" in span_text:
            return span_text

    return ""

def extract_roast_type(body_html):
    """
    Determines the roast type by scanning the product text for key terms.

    The HTML is reduced to lower-cased plain text and searched for the
    keywords "espresso", "filter", "cold brew", "moka" and "french press",
    in that order. Matching is by substring, so the first keyword in that
    priority order that occurs anywhere in the text is returned.

    Args:
        body_html: The product description as an HTML string. None or an
            empty string is treated as "no description".

    Returns:
        The matched keyword, or "" when none of the keywords occur.
    """
    # Nothing to scan; also avoids handing None to the HTML parser.
    if not body_html:
        return ""

    text = BeautifulSoup(body_html, 'html.parser').get_text(separator=" ", strip=True).lower()

    # Tuple order is the priority order: earlier keywords win when several occur.
    for keyword in ("espresso", "filter", "cold brew", "moka", "french press"):
        if keyword in text:
            return keyword
    return ""

def parse_variant(variant_title):
    """
    Breaks a variant title into a (part 1, part 2) pair.

       - Part 1 is usually the weight or packaging (e.g. '150g', 'Box of 5 x 12g drip bag')
       - Part 2 is usually the grind or brew method (e.g. 'Espresso', 'Filter', 'Whole Beans')

    Only the first " / " separator is significant; anything after it,
    including further separators, stays in part 2. A title without the
    separator is returned whole as part 1 with an empty part 2. Both parts
    are stripped of surrounding whitespace.
    """
    head, separator, tail = variant_title.partition(" / ")
    if not separator:
        # No " / " present: the whole title is the first part.
        return variant_title.strip(), ""
    return head.strip(), tail.strip()

def process_products(url, csv_filename, timeout=30):
    """
    Fetches a products JSON feed and writes its "Whole Beans" variants to a CSV file.

    The response is expected to be a JSON object with a top-level "products"
    list. One CSV row is written per variant whose title contains
    "whole beans" (case-insensitive); all other variants are skipped.
    Product-level fields (title, image, roast type, flavour notes, ...) are
    repeated on every row of that product.

    Args:
        url: Address of the products JSON feed.
        csv_filename: Path of the CSV file to create (overwritten if present).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely.

    Returns:
        None. On a non-200 response an error is printed and no file is written.

    Raises:
        Exceptions from requests.get (including timeouts), response.json()
        and the file write are not caught here and propagate to the caller.
    """
    # Fetch the JSON data.
    # NOTE(review): only the single response at `url` is processed; if the
    # endpoint paginates, later pages are not fetched - confirm.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Error fetching products:", response.status_code)
        return

    data = response.json()
    products = data.get("products") or []

    # Define CSV columns.
    fieldnames = [
        "Title",
        "Primary Image URL",
        "Availability",
        "Roast Type",
        "Flavour Notes",
        "Vendor",
        "Product Type",
        "Variant Part 1",
        "Variant Part 2",
        "Variant Price",
        "Variant Availability"
    ]

    with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for product in products:
            title = product.get("title", "")
            vendor = product.get("vendor", "")
            product_type = product.get("product_type", "")

            # `or` (rather than a .get default) also covers keys that are
            # present but null in the JSON.
            images = product.get("images") or []
            variants = product.get("variants") or []
            body_html = product.get("body_html") or ""

            primary_image_url = images[0].get("src", "") if images else ""

            # Product-level availability (if any variant is available)
            any_available = any(variant.get("available", False) for variant in variants)
            availability = "Available" if any_available else "Not Available"

            # Extract flavour notes and roast type once per product.
            flavour_notes = extract_flavour_notes(body_html)
            roast_type = extract_roast_type(body_html)

            # Process each variant but only include "Whole Beans".
            for variant in variants:
                variant_title = variant.get("title") or ""
                if "whole beans" not in variant_title.lower():
                    # Skip non-Whole-Beans variants
                    continue

                variant_part1, variant_part2 = parse_variant(variant_title)
                variant_price = variant.get("price", "")
                variant_availability = "Available" if variant.get("available", False) else "Not Available"

                writer.writerow({
                    "Title": title,
                    "Primary Image URL": primary_image_url,
                    "Availability": availability,
                    "Roast Type": roast_type,
                    "Flavour Notes": flavour_notes,
                    "Vendor": vendor,
                    "Product Type": product_type,
                    "Variant Part 1": variant_part1,
                    "Variant Part 2": variant_part2,
                    "Variant Price": variant_price,
                    "Variant Availability": variant_availability
                })

    print(f"CSV file '{csv_filename}' created with only Whole Beans variants included.")

if __name__ == "__main__":
    # Script entry point: download the roasted-coffee-beans collection feed
    # and export its Whole Beans variants to a CSV file in the working directory.
    csv_filename = "parchmen.csv"
    url = "https://www.parchmen.co/collections/roasted-coffee-beans/products.json"
    process_products(url=url, csv_filename=csv_filename)

CSV file 'parchmen.csv' created with only Whole Beans variants included.
