# In [4]:
import re
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def clean_timestamp(timestamp):
    """Reduce a full timestamp to its calendar date, 'YYYY-MM-DD'.

    The input is expected in the form 'YYYY-MM-DDTHH:MM:SS' followed by a
    UTC offset. Anything that does not parse that way, including ``None``
    and other non-string values, is handed back unchanged.
    """
    source_format = "%Y-%m-%dT%H:%M:%S%z"
    target_format = "%Y-%m-%d"
    try:
        return datetime.strptime(timestamp, source_format).strftime(target_format)
    except (ValueError, TypeError):
        # Unparseable or non-string input: keep whatever the caller supplied.
        return timestamp

def extract_coffee_details(body_text):
    """
    Extract labelled coffee details from a product's HTML description.

    The HTML is flattened to plain text and searched, case-insensitively,
    for each known ``Label: value`` pair.

    Args:
        body_text: HTML markup of the product description. ``None`` or an
            empty string is treated as "no description".

    Returns:
        A dict with one entry per known label, in a fixed order. The value
        is the captured text with surrounding whitespace stripped, or
        ``None`` when the label does not occur in the description.
    """
    labels = (
        "Region",
        "Variety",
        "Elevation",
        "Processing",
        "Roast profile",
        "Flavour notes",
        "Acidity",
        "Body",
        "Tasting Experience",
        "Farm Information",
        "Moisture content of Green Coffee",
        "Packaging",
    )
    fields = dict.fromkeys(labels)

    # Nothing to search in a missing or empty description; returning early
    # also avoids handing None to the HTML parser.
    if not body_text:
        return fields

    plain_text = BeautifulSoup(body_text, "html.parser").get_text()

    for label in labels:
        # The value character class includes whitespace, so a capture keeps
        # going across line breaks until it reaches a character outside the
        # class (for example the ':' that follows the next label).
        pattern = rf"{re.escape(label)}:\s*([\w\s.,&-]+)"
        match = re.search(pattern, plain_text, re.IGNORECASE)
        if match:
            fields[label] = match.group(1).strip()
    return fields

def fetch_products(page, timeout=10):
    """Fetch one page of products from the store's paginated JSON endpoint.

    Args:
        page: Page number placed in the ``page`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the scrape indefinitely.

    Returns:
        The list stored under the ``"products"`` key of the JSON response
        when the server answers with HTTP 200; an empty list for any other
        status code or when that key is missing or null.

    Exceptions raised by ``requests`` itself (connection failure, timeout)
    are not caught here and propagate to the caller.
    """
    url = f"https://cowpressocoffee.sg/collections/frontpage/products.json?page={page}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return []
    # `or []` also covers a response whose "products" value is null.
    return response.json().get("products") or []

def parse_product(product):
    """
    Turn one product record into a list of row dicts, one per variant.

    Only variants whose title contains 'Whole Beans' (case-insensitive) are
    kept. Every row carries the product-level columns, the variant-level
    columns, the first image URL, and the details extracted from the
    product description.

    Args:
        product: dict for a single product as returned by the products
            endpoint. Keys that are missing or hold a null value are
            tolerated.

    Returns:
        A list of row dicts; empty when no variant qualifies.
    """
    # `.get(key, default)` only covers a missing key; `or` also covers a key
    # that is present with a null value, which would otherwise crash below.
    description_html = product.get("body_html") or ""
    extracted_details = extract_coffee_details(description_html)

    # Use first image URL (if available) for all variants
    images = product.get("images") or []
    image_url = images[0].get("src", "") if images else ""

    # Product-level values are identical for every variant; compute them once.
    created_at = clean_timestamp(product.get("created_at", ""))
    updated_at = clean_timestamp(product.get("updated_at", ""))

    rows = []
    for variant in product.get("variants") or []:
        variant_title = variant.get("title") or ""
        if "whole beans" not in variant_title.lower():  # Filter for Whole Beans
            continue
        row = {
            "Product ID": product.get("id"),
            "Title": product.get("title"),
            "Vendor": product.get("vendor"),
            "Created At": created_at,
            "Updated At": updated_at,
            "Variant ID": variant.get("id"),
            "Variant Title": variant.get("title"),
            "Variant Price": variant.get("price"),
            "Image URL": image_url,
        }
        # Merge extracted details
        row.update(extracted_details)
        rows.append(row)
    return rows

def clean_row(row):
    """
    Clean a scraped row in place and return it.

    For every non-empty string cell:
      * header words that leaked into the value are removed (skipped for
        the columns copied verbatim from the product/variant record);
      * the 'Elevation' cell is reduced to digits, whitespace, commas and
        hyphens;
      * runs of whitespace are collapsed and the value is stripped.

    Finally the columns that are not part of the output are dropped.

    Args:
        row: dict of column name to value. It is modified in place.

    Returns:
        The same ``row`` object.
    """
    misplaced_headers = [
        "Elevation", "Variety", "Processing", "Roast profile",
        "Flavour notes", "Acidity", "Body", "Tasting notes",
        "District", "Farm", "Packaging", "Type of Soil", "Soil Type",
        "MASL", "Varietal", "Flavour", "Average Annual Rainfall",
        "notes", "this crop", "Roast Level", "Process"
    ]
    # Columns copied straight from the product/variant record. They are not
    # parsed out of the description, so a header word inside them (such as
    # "Process" in a title or "farm" in an image URL) is real content and
    # must not be stripped.
    protected_columns = {
        "Product ID", "Title", "Vendor", "Created At", "Updated At",
        "Variant ID", "Variant Title", "Variant Price", "Image URL",
    }

    # Only existing keys are reassigned below, so iterating the live dict is safe.
    for key, value in row.items():
        if not (value and isinstance(value, str)):
            continue
        if key not in protected_columns:
            for header in misplaced_headers:
                escaped = re.escape(header)
                # Insert a space before a header glued to the preceding text
                # so the word-boundary removal below can match it.
                value = re.sub(rf"(?<!\s){escaped}", rf" {header}", value, flags=re.IGNORECASE)
                value = re.sub(rf"\b{escaped}\b", "", value, flags=re.IGNORECASE)
        if key == "Elevation":
            value = re.sub(r"[^\d\s,-]", "", value).strip()
        row[key] = re.sub(r"\s{2,}", " ", value).strip()

    # Drop unwanted columns
    columns_to_drop = ["Tasting Experience", "Farm Information",
                       "Moisture content of Green Coffee", "Packaging", "Body"]
    for col in columns_to_drop:
        row.pop(col, None)
    return row

def scrape_and_clean():
    """Download every product page, tidy the rows, and save 'cowpresso.csv'.

    Pages are requested starting at 1 until a page comes back empty. Each
    product is expanded into rows, every row is cleaned, and only the
    columns listed in ``final_headers`` are written (in that order), with
    an empty string for any column a row lacks.
    """
    output_file = "cowpresso.csv"

    # Column order of the exported file.
    final_headers = [
        "Product ID", "Title", "Vendor", "Created At", "Updated At",
        "Variant ID", "Variant Title", "Variant Price", "Image URL",
        "Region", "Variety", "Elevation", "Processing",
        "Roast profile", "Flavour notes", "Acidity"
    ]

    # Walk the paginated listing until a page yields no products.
    scraped_rows = []
    page = 1
    products = fetch_products(page)
    while products:
        for product in products:
            scraped_rows.extend(parse_product(product))
        page += 1
        products = fetch_products(page)

    # Clean each row, then project it onto the output columns.
    cleaned_products = [
        {col: cleaned.get(col, "") for col in final_headers}
        for cleaned in map(clean_row, scraped_rows)
    ]

    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=final_headers, quoting=csv.QUOTE_MINIMAL, escapechar="\\")
        writer.writeheader()
        writer.writerows(cleaned_products)

    print(f"Cleaned data exported to '{output_file}'.")

# Run the complete scraping and cleaning process. The guard keeps the network
# requests and file write from firing when this module is merely imported;
# it still runs when executed as a script or as a notebook cell.
if __name__ == "__main__":
    scrape_and_clean()

# Output: Cleaned data exported to 'cowpresso.csv'.
