# In [38]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import requests
from bs4 import BeautifulSoup
import csv
import json
import re
from difflib import SequenceMatcher
from datetime import datetime
import io

# Collection page that both scrapers in this file download and parse
COLLECTION_URL = "https://alchemist.global/collections/coffee-beans"

# Request headers passed to requests.get(); the User-Agent string identifies
# the client as desktop Chrome so the request looks like a browser visit
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.82 Safari/537.36"
    )
}

# Path of the CSV file written when this file is run as a script
FINAL_OUTPUT_FILE = "final_output.csv"

# ----- UTILITY FUNCTIONS -----

# Matches a trailing "<brewing type> / <weight>" suffix such as "Filter / 150g".
# Whitespace after the weight is tolerated so stray trailing spaces/newlines in
# scraped titles do not defeat the end-of-string anchor.
_VARIANT_SUFFIX_RE = re.compile(
    r'(Filter|Espresso)\s*/\s*(\d+\s?[gG])\s*$', re.IGNORECASE
)


def extract_weight_and_brewing(title):
    """
    Extract weight (e.g., '150g', '200g') and brewing type (e.g., 'Filter', 'Espresso')
    from the end of a variant title.

    Expected format (at the end of the string, matched case-insensitively,
    optional trailing whitespace allowed):
        "Filter / 150g"
        "Espresso / 200g"

    Args:
        title: The variant title. Non-string values (e.g. None from a JSON
            null) are treated as "no match" instead of raising.

    Returns a tuple: (weight, brewing_type) or ("N/A", "N/A") if not found.
    The weight is lower-cased and the brewing type is title-cased.
    """
    if not isinstance(title, str):
        return "N/A", "N/A"

    match = _VARIANT_SUFFIX_RE.search(title)
    if match is None:
        return "N/A", "N/A"

    brewing_type = match.group(1).title()  # Normalize to title case
    weight = match.group(2).lower()        # "150G" -> "150g"
    return weight, brewing_type

def similar(a, b):
    """
    Score how alike two strings are.

    Returns the difflib SequenceMatcher ratio for ``a`` versus ``b``:
    1.0 for identical inputs, 0.0 when nothing matches.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# ----- SCRAPE CARD DATA -----

def scrape_card_data():
    """
    Scrape card data from the collection page.

    Downloads COLLECTION_URL and reads one record per
    ``div.product-card-wrapper`` element found in the HTML.

    Returns:
        A list of dictionaries with keys:
        Card Title, Card Product URL, Blend Origin, Description,
        Tasting Notes, Image URL.
        A field whose element/attribute is missing is "N/A"; Tasting Notes is
        an empty string when no notes are found.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server responds with an HTTP error status.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(COLLECTION_URL, headers=HEADERS, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero cards.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    card_data_list = []
    for card in soup.find_all("div", class_="product-card-wrapper"):
        # Card Title
        title_element = card.find("a", class_="h5")
        card_title = title_element.text.strip() if title_element else "N/A"

        # Card Product URL: .get() tolerates an anchor without href, and
        # urljoin resolves both site-relative and already-absolute links.
        href = title_element.get("href") if title_element else None
        product_url = urljoin(COLLECTION_URL, href) if href else "N/A"

        # Blend Origin
        # NOTE: a multi-word class_ value is compared against the full class
        # attribute string, so this only matches this exact class order.
        blend_origin_elem = card.find("p", class_="p4 c-blue opacity-4 mb-10")
        blend_origin = blend_origin_elem.text.strip() if blend_origin_elem else "N/A"

        # Description
        description_elem = card.find("div", class_="short-description")
        description = description_elem.text.strip() if description_elem else "N/A"

        # Tasting Notes: one <span class="p3"> per note wrapper, comma-joined.
        tasting_notes = []
        for note in card.find_all("div", class_="flex ai-center mr-10"):
            note_span = note.find("span", class_="p3")
            if note_span:
                tasting_notes.append(note_span.text.strip())
        tasting_notes_str = ", ".join(tasting_notes)

        # Image URL: data-src is typically protocol-relative ("//host/path");
        # urljoin supplies the https scheme and leaves absolute URLs intact.
        primary_image = card.find("img", class_="image")
        data_src = primary_image.get("data-src") if primary_image else None
        image_url = urljoin(COLLECTION_URL, data_src) if data_src else "N/A"

        card_data_list.append({
            "Card Title": card_title,
            "Card Product URL": product_url,
            "Blend Origin": blend_origin,
            "Description": description,
            "Tasting Notes": tasting_notes_str,
            "Image URL": image_url
        })
    return card_data_list

# ----- SCRAPE VARIANT DATA -----

def scrape_variant_data():
    """
    Scrape variant data from the meta JSON embedded on the page.

    Downloads COLLECTION_URL, locates the <script> containing ``var meta =``
    and decodes the JSON object that starts at ``{"products":``.

    Returns:
        A list of dictionaries with keys:
        Variant Title, Weight, Brewing Type, Variant Product URL, Variant Price.
        The list is empty when the script or the JSON object is not found,
        or when the JSON cannot be decoded (an error line is printed).

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server responds with an HTTP error status.
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(COLLECTION_URL, headers=HEADERS, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently returning no variants.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    variant_data_list = []
    meta_script = soup.find("script", string=lambda t: t and "var meta =" in t)
    if not meta_script:
        return variant_data_list

    meta_text = meta_script.string
    start_index = meta_text.find('{"products":')
    if start_index == -1:
        print("Error decoding JSON: meta object not found in script")
        return variant_data_list

    try:
        # raw_decode parses exactly one JSON value and ignores whatever
        # JavaScript follows it, so no end-of-object guessing is needed.
        meta, _ = json.JSONDecoder().raw_decode(meta_text[start_index:])
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return variant_data_list

    for product in meta.get("products") or []:
        product_id = product.get("id", "N/A")
        vendor = product.get("vendor", "N/A")
        # NOTE(review): this URL is built from the numeric product id;
        # confirm the storefront resolves /products/<id> (it may expect a handle).
        variant_url = f"https://alchemist.global/products/{product_id}"

        for variant in product.get("variants") or []:
            variant_name = variant.get("name", "N/A")
            # Price is divided by 100 as in the original (cents -> dollars);
            # a missing or null price is treated as 0.
            variant_price_cents = variant.get("price") or 0
            variant_price_sgd = variant_price_cents / 100.0

            # Extract weight and brewing type from the variant title
            weight, brewing_type = extract_weight_and_brewing(variant_name)

            # Combine vendor with variant name for fuzzy matching
            full_variant_title = f"{vendor} - {variant_name}"

            variant_data_list.append({
                "Variant Title": full_variant_title,
                "Weight": weight,
                "Brewing Type": brewing_type,
                "Variant Product URL": variant_url,
                "Variant Price": f"S${variant_price_sgd:.2f}"
            })
    return variant_data_list

# ----- FUZZY MATCH & FINAL OUTPUT -----

def create_final_output(min_score=0.2):
    """
    Fuzzy match each card's title with the best variant's title.

    Every card from scrape_card_data() is compared (case-insensitively) with
    every variant title from scrape_variant_data(); the highest-scoring
    variant supplies the weight, brewing type and price for that card.

    Args:
        min_score: Similarity a best match must exceed to be accepted.
            Defaults to 0.2, the threshold previously hard-coded here.

    Returns:
        A list of dictionaries, one per card, with keys:
        Card Title, Weight, Brewing Type, Variant Price,
        Card Product URL, Blend Origin, Description, Tasting Notes, Image URL.
        Weight, Brewing Type and Variant Price are "N/A" when no variant
        scores above ``min_score``.
    """
    card_data = scrape_card_data()
    variant_data = scrape_variant_data()

    # Lower-case each variant title once rather than once per card comparison.
    keyed_variants = [
        (variant["Variant Title"].lower(), variant) for variant in variant_data
    ]

    final_rows = []
    for card in card_data:
        card_title = card["Card Title"]
        best_match = None
        highest_score = 0.0

        # "N/A" is the placeholder for a card without a title element; fuzzy
        # matching it would attach an arbitrary variant to the row.
        if card_title != "N/A":
            card_key = card_title.lower()
            for variant_key, variant in keyed_variants:
                score = similar(card_key, variant_key)
                # Strict ">" keeps the first variant among equal scores.
                if score > highest_score:
                    highest_score = score
                    best_match = variant

        if best_match is not None and highest_score > min_score:
            weight = best_match["Weight"]
            brewing_type = best_match["Brewing Type"]
            variant_price = best_match["Variant Price"]
        else:
            weight = "N/A"
            brewing_type = "N/A"
            variant_price = "N/A"

        final_rows.append({
            "Card Title": card_title,
            "Weight": weight,
            "Brewing Type": brewing_type,
            "Variant Price": variant_price,
            "Card Product URL": card["Card Product URL"],
            "Blend Origin": card["Blend Origin"],
            "Description": card["Description"],
            "Tasting Notes": card["Tasting Notes"],
            "Image URL": card["Image URL"]
        })
    return final_rows

def generate_final_csv():
    """
    Render the combined card/variant rows as CSV text.

    Calls create_final_output() for the rows and serialises them, header
    first, using a fixed column order.

    Returns:
        The CSV document as a single string.
    """
    rows = create_final_output()
    columns = [
        "Card Title",
        "Weight",
        "Brewing Type",
        "Variant Price",
        "Card Product URL",
        "Blend Origin",
        "Description",
        "Tasting Notes",
        "Image URL",
    ]

    # The in-memory buffer is closed automatically when the with-block exits.
    with io.StringIO() as buffer:
        csv_writer = csv.DictWriter(buffer, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(rows)
        return buffer.getvalue()

# ----- WRITE FINAL CSV TO FILE (LOCAL TESTING) -----

if __name__ == "__main__":
    # Script entry point: build the CSV text, then save it to FINAL_OUTPUT_FILE.
    # The CSV is generated before the file is opened, so an exception during
    # scraping leaves any existing output file untouched.
    csv_output = generate_final_csv()
    # newline="" disables newline translation, so the line endings already
    # present in csv_output are written unchanged.
    with open(FINAL_OUTPUT_FILE, "w", newline="", encoding="utf-8") as f:
        f.write(csv_output)
    print(f"Final combined data written to {FINAL_OUTPUT_FILE}")

# Output: Final combined data written to final_output.csv
