In [170]:
import requests
from bs4 import BeautifulSoup
import csv
import json

# Collection page that lists every coffee-bean product card
collection_url = "https://alchemist.global/collections/coffee-beans"

# Headers to mimic a browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
}

# Output CSV file names
card_data_file = "card_data.csv"
variant_data_file = "variant_data.csv"

# Seconds to wait on the server before giving up. Without an explicit timeout
# requests.get() may block the kernel indefinitely on a stalled connection.
request_timeout_seconds = 30

# Request the collection page content
response = requests.get(collection_url, headers=headers, timeout=request_timeout_seconds)
# Fail loudly on 4xx/5xx responses so an error page is never parsed as if it
# were the product listing (which would silently yield empty CSV data).
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find all product cards
product_cards = soup.find_all("div", class_="product-card-wrapper")


def _text_or_na(element):
    """Return the stripped text of a parsed element, or "N/A" when it was not found."""
    return element.text.strip() if element else "N/A"


# Prepare card data: one dict per product card, keyed by the CSV column names
card_data = []

for card in product_cards:
    # The title link supplies both the product name and the site-relative path
    title_element = card.find("a", class_="h5")
    title = _text_or_na(title_element)

    # Build the absolute URL only when the link actually carries an href;
    # otherwise report "N/A" instead of gluing "N/A" onto the site root.
    href = title_element.get("href") if title_element else None
    product_url = f"https://alchemist.global{href}" if href else "N/A"

    blend_origin_text = _text_or_na(card.find("p", class_="p4 c-blue opacity-4 mb-10"))
    description = _text_or_na(card.find("div", class_="short-description"))

    # Tasting notes: each note container is expected to hold a <span class="p3">;
    # containers without that span are skipped rather than raising AttributeError.
    tasting_notes = []
    for note in card.find_all("div", class_="flex ai-center mr-10"):
        note_span = note.find("span", class_="p3")
        if note_span:
            tasting_notes.append(note_span.text.strip())

    price = _text_or_na(card.find("span", class_="applied-price h7"))

    # Primary image: the address is read from the `data-src` attribute and
    # prefixed with "https:" (presumably a protocol-relative URL - confirm).
    # A missing attribute yields "N/A" instead of a KeyError.
    primary_image = card.find("img", class_="image")
    image_src = primary_image.get("data-src") if primary_image else None
    primary_image_url = f"https:{image_src}" if image_src else "N/A"

    # Add to card data
    card_data.append({
        "Title": title,
        "Product URL": product_url,
        "Blend Origin": blend_origin_text,
        "Description": description,
        "Tasting Notes": ", ".join(tasting_notes),
        "Price": price,
        "Image URL": primary_image_url,
    })

# Persist the scraped cards: one CSV row per product card.
# Nothing is written (and any existing file is left untouched) when no cards were found.
if not card_data:
    print("No card data to write to CSV")
else:
    card_columns = [
        "Title",
        "Product URL",
        "Blend Origin",
        "Description",
        "Tasting Notes",
        "Price",
        "Image URL",
    ]
    # newline="" lets the csv module control line endings itself
    with open(card_data_file, mode="w", newline="", encoding="utf-8") as csv_handle:
        card_writer = csv.DictWriter(csv_handle, fieldnames=card_columns)
        card_writer.writeheader()
        for card_row in card_data:
            card_writer.writerow(card_row)

    print(f"Card data successfully written to {card_data_file}")

# Step 2: Extract and process the meta JSON for variants.
# The page embeds a script containing `var meta = {"products": [...] ...}`;
# each product entry lists its variants, which become one output row apiece.
variant_data = []
meta_data_script = soup.find("script", string=lambda t: t and "var meta =" in t)
if meta_data_script:
    meta_data_text = meta_data_script.string

    # Locate the start of the object literal assigned to `var meta`
    start_index = meta_data_text.find('{"products":')
    if start_index == -1:
        # find() returned -1: slicing from there would feed garbage to the parser
        print("Error decoding JSON: products object not found in meta script")
    else:
        try:
            # raw_decode parses exactly one JSON value from the start of the
            # string and ignores whatever JavaScript follows it, so there is
            # no need to guess where the object ends.
            meta, _ = json.JSONDecoder().raw_decode(meta_data_text[start_index:])
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
        else:
            # `or []` also covers keys that are present but null
            for product in meta.get("products") or []:
                # NOTE(review): the title prefix is taken from the `vendor`
                # field, not a product name - confirm this is intended.
                product_title = product.get("vendor", "N/A")
                product_type = product.get("type", "N/A")
                product_id = product.get("id", "N/A")

                for variant in product.get("variants") or []:
                    # Price is divided by 100 (presumably stored in cents - confirm);
                    # a missing or null price is reported as 0.
                    variant_price = (variant.get("price") or 0) / 100
                    variant_name = variant.get("name", "N/A")
                    # NOTE(review): the URL is built from the numeric product id;
                    # verify the storefront resolves /products/<id>.
                    variant_url = f"https://alchemist.global/products/{product_id}"

                    # Add variant data
                    variant_data.append({
                        "Title": f"{product_title} - {variant_name}",
                        "Product URL": variant_url,
                        "Product Type": product_type,
                        "Price (SGD)": f"S${variant_price:.2f}"
                    })

# Persist the variant rows: one CSV row per product variant.
# Nothing is written (and any existing file is left untouched) when no variants were parsed.
if not variant_data:
    print("No variant data to write to CSV")
else:
    variant_columns = ["Title", "Product URL", "Product Type", "Price (SGD)"]
    # newline="" lets the csv module control line endings itself
    with open(variant_data_file, mode="w", newline="", encoding="utf-8") as csv_handle:
        variant_writer = csv.DictWriter(csv_handle, fieldnames=variant_columns)
        variant_writer.writeheader()
        for variant_row in variant_data:
            variant_writer.writerow(variant_row)

    print(f"Variant data successfully written to {variant_data_file}")


Card data successfully written to card_data.csv
Variant data successfully written to variant_data.csv


In [112]:
import pandas as pd

def _join_unique(values):
    """Join the distinct non-null values of a group into one comma-separated string.

    Values are cast to str first so the join also works when pandas inferred
    a numeric dtype for the column (a plain str.join would raise TypeError).
    An all-null group yields an empty string.
    """
    return ", ".join(values.dropna().astype(str).unique())


# Load the CSV file
# NOTE(review): no cell visible in this notebook writes this file (the scraping
# cell produces card_data.csv and variant_data.csv) - confirm where it comes
# from so the notebook survives Restart & Run All.
file_path = "alchemist_coffee_products.csv"
df = pd.read_csv(file_path)

# Collapse all rows sharing a 'Product URL' into a single row
compressed_df = df.groupby("Product URL").agg({
    "Title": "first",  # Use the first non-null Title
    "Blend Origin": "first",  # Use the first non-null Blend Origin
    "Description": "first",  # Use the first non-null Description
    "Tasting Notes": _join_unique,  # Combine unique Tasting Notes
    "Price": _join_unique,  # Combine unique Prices
    "Image URL": "first",  # Use the first non-null Image URL
    "Price (SGD)": _join_unique,  # Combine unique Prices in SGD
}).reset_index()

# Save the compressed data back to a CSV file
output_path = "compressed_alchemist_coffee_products.csv"
compressed_df.to_csv(output_path, index=False)

print(f"Compressed data saved to {output_path}")

Compressed data saved to compressed_alchemist_coffee_products.csv
