# In [14]:
import requests
from bs4 import BeautifulSoup
import csv
import json
import re
from difflib import SequenceMatcher
from datetime import datetime
import io

# Site root, used as the base when building absolute product URLs.
base_product_url = "https://tionghoe.com"

# Listing page of the roasted-beans collection that the scraper reads.
collection_url = "https://tionghoe.com/collections/roasted-beans"

# Path of the CSV file the combined data is written to.
output_file = "tionghoe.csv"

# Request headers sent with every fetch; the User-Agent mimics a desktop
# Chrome browser visit.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
}

# ---- PART 1: Scrape Variants (Title, Price, Weight) ----

def extract_weight(title):
    """
    Extract the weight token (e.g., '250g', '1kg', '2 KG') from a variant title.

    Args:
        title: Variant title text.

    Returns:
        The first weight-looking substring exactly as it appears in the
        title, or 'N/A' when the title contains none.
    """
    # A number (optionally decimal), at most one whitespace character, then a
    # g/kg unit in any letter case. The trailing (?![A-Za-z]) stops a digit
    # followed by a word starting with "g" (e.g. "No. 5 Guatemala") from being
    # mistaken for a weight.
    match = re.search(r'\d+(?:\.\d+)?\s?(?:kg|g)(?![A-Za-z])', title, re.IGNORECASE)
    return match.group(0) if match else 'N/A'

def clean_variant_title(title):
    """
    Remove weight tokens (e.g., '250g', '1kg', '2 KG') from a variant title.

    Args:
        title: Variant title text.

    Returns:
        The title with every weight token removed, surrounding whitespace
        stripped and internal whitespace runs collapsed to single spaces.
    """
    # Same weight pattern as used for extraction: number, optional whitespace,
    # g/kg unit. The trailing (?![A-Za-z]) prevents eating the start of a word
    # that follows a digit (e.g. the "5 G" in "No. 5 Guatemala").
    without_weight = re.sub(
        r'\d+(?:\.\d+)?\s?(?:kg|g)(?![A-Za-z])', '', title, flags=re.IGNORECASE
    )
    # Cutting a token out of the middle leaves two adjacent spaces behind;
    # split/join normalises those and trims both ends.
    return ' '.join(without_weight.split())

def scrape_variants():
    """
    Collect the "Whole Beans" variants from the collection page.

    The data is read from the JSON object assigned to ``var meta =`` in one
    of the page's <script> tags.

    Returns:
        A list of dicts with the keys "Variant Title", "Weight",
        "Product URL" and "Price (SGD)". The list is empty when the page
        responds with an HTTP error status or when the metadata script
        cannot be located or decoded.

    Exceptions raised by ``requests.get`` itself (connection failure,
    timeout) are not caught here and propagate to the caller.
    """
    variant_data = []

    # A timeout keeps a stalled connection from hanging the scraper forever.
    response = requests.get(collection_url, headers=HEADERS, timeout=30)
    if not response.ok:
        print(f"Error fetching {collection_url}: HTTP {response.status_code}")
        return variant_data
    soup = BeautifulSoup(response.text, "html.parser")

    meta_data_script = soup.find("script", string=lambda t: t and "var meta =" in t)
    if not meta_data_script:
        return variant_data

    meta_data_text = meta_data_script.string
    start_index = meta_data_text.find('{"products":')
    if start_index == -1:
        print("Error decoding JSON: product metadata not found in script")
        return variant_data

    try:
        # raw_decode parses exactly one JSON value starting at start_index and
        # ignores whatever JavaScript follows it, so no end marker is needed.
        meta, _ = json.JSONDecoder().raw_decode(meta_data_text, start_index)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return variant_data

    for product in meta.get("products", []):
        product_id = product.get("id", "N/A")
        # NOTE(review): the URL is built from the product "id" field; confirm
        # the storefront actually resolves /products/<id>.
        variant_url = f"{base_product_url}/products/{product_id}"

        for variant in product.get("variants", []):
            # `or` also covers an explicit JSON null, not just a missing key.
            variant_name = variant.get("name") or "N/A"

            # Only keep variants with "Whole Beans" in the title
            if "Whole Beans" not in variant_name:
                continue

            # Presumably the price is in cents (it is divided by 100 and
            # shown as S$) - TODO confirm.
            variant_price = (variant.get("price") or 0) / 100

            variant_data.append({
                "Variant Title": clean_variant_title(variant_name),
                "Weight": extract_weight(variant_name),
                "Product URL": variant_url,
                "Price (SGD)": f"S${variant_price:.2f}"
            })

    return variant_data

# ---- PART 2: Scrape Product Details (Flavour, Body, Region, etc.) ----

def _extract_labelled_value(text, label):
    """Return the text after ``label:`` up to the next ``word:`` label or the end of text, else None."""
    match = re.search(
        rf"{re.escape(label)}:\s*(.*?)(?=\s*\b\w+:|$)", text, flags=re.DOTALL
    )
    if not match:
        return None
    value = match.group(1).strip()
    return value or None


def scrape_product_details():
    """
    Scrape per-product details for every product card on the collection page.

    For each card the linked product page is fetched and the following are
    read: Flavor/Body/Aftertaste from the feature chart, Region/Varietal/
    Process from the first paragraph of the intro section, and the image URL
    from the Open Graph meta tags. Anything that cannot be found stays "N/A".

    Returns:
        A list of dicts with the keys "Product Title", "Product URL",
        "Flavor", "Body", "Aftertaste", "Region", "Varietal", "Process" and
        "Product Image URL". Cards without a product link are skipped.

    A failure fetching the collection page propagates from ``requests.get``;
    a failure fetching an individual product page is reported and that
    product is kept with its detail fields left as "N/A".
    """
    # Function-scope import: urljoin is not among the file's top-level imports.
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from hanging the scraper forever.
    response = requests.get(collection_url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    product_cards = soup.select("span.product-card__title")

    product_data = []
    for card in product_cards:
        title_element = card.find("a", class_="bold")
        href = title_element.get("href") if title_element else None
        if not href:
            # Without a link there is no product page to fetch; building a
            # URL from a placeholder would only request a bogus address.
            print("Skipping product card without a product link")
            continue

        title = title_element.text.strip()
        # urljoin handles both site-relative and already-absolute hrefs.
        full_product_url = urljoin(base_product_url, href)

        # Initialize relevant product fields
        product_details = {
            "Product Title": title,
            "Product URL": full_product_url,
            "Flavor": "N/A",
            "Body": "N/A",
            "Aftertaste": "N/A",
            "Region": "N/A",
            "Varietal": "N/A",
            "Process": "N/A",
            "Product Image URL": "N/A"
        }

        print(f"Scraping: {full_product_url}")
        try:
            product_response = requests.get(full_product_url, headers=HEADERS, timeout=30)
        except requests.RequestException as e:
            # One unreachable product page should not abort the whole run;
            # keep the product with its placeholder details.
            print(f"Error fetching {full_product_url}: {e}")
            product_data.append(product_details)
            continue
        product_soup = BeautifulSoup(product_response.text, "html.parser")

        # Extract flavor, body, and aftertaste from feature chart: each value
        # is the <div> sibling following the <div> holding the feature name.
        feature_chart = product_soup.find("div", class_="feature-chart__table")
        if feature_chart:
            for feature in ["Flavor", "Body", "Aftertaste"]:
                label_div = feature_chart.find("div", string=feature)
                value_div = label_div.find_next_sibling("div") if label_div is not None else None
                if value_div is not None:
                    product_details[feature] = value_div.text.strip()

        # Extract description details from the intro section's first <p>.
        description_div = product_soup.find("div", class_="section-stack__intro")
        paragraph = description_div.find("p") if description_div else None
        if paragraph is not None:
            description_text = paragraph.get_text(separator=" ")
            for field in ["Region", "Varietal", "Process"]:
                value = _extract_labelled_value(description_text, field)
                if value:
                    product_details[field] = value

        # Extract Product Image URL from meta tags, preferring the secure URL.
        image_meta = product_soup.find("meta", property="og:image:secure_url")
        if not image_meta:
            image_meta = product_soup.find("meta", property="og:image")
        if image_meta:
            product_details["Product Image URL"] = image_meta.get("content", "N/A")

        product_data.append(product_details)
        print(f"Scraped details for: {title}")

    return product_data

# ---- PART 3: Fuzzy Matching and Combining Data ----

def similar(a, b):
    """
    Return the difflib SequenceMatcher ratio for strings `a` and `b`.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def combine_and_export(similarity_threshold=0.6):
    """
    Scrape variants and product details, pair them by title similarity and
    write the combined rows to `output_file` as CSV.

    Each variant is paired with the product whose lower-cased title is most
    similar to the variant's lower-cased title (first product wins on ties).
    A variant is only exported when that similarity is strictly greater than
    `similarity_threshold`; the rest are reported on stdout and left out.

    Args:
        similarity_threshold: Minimum similarity (exclusive) a variant/product
            title pair must exceed to count as a match. Defaults to 0.6.
    """
    variants = scrape_variants()
    products = scrape_product_details()

    # Lower-case each product title once instead of once per comparison.
    lowered_products = [
        (product["Product Title"].lower(), product) for product in products
    ]

    combined_data = []
    unmatched_titles = []

    for variant in variants:
        variant_title = variant["Variant Title"]
        variant_key = variant_title.lower()

        # Find the best matching product based on title similarity
        best_match = None
        highest_similarity = 0
        for product_key, product in lowered_products:
            similarity_score = similar(variant_key, product_key)
            if similarity_score > highest_similarity:
                highest_similarity = similarity_score
                best_match = product

        if best_match is None or highest_similarity <= similarity_threshold:
            unmatched_titles.append(variant_title)
            continue

        combined_data.append({
            "Variant Title": variant_title,
            "Weight": variant["Weight"],
            "Product Title": best_match["Product Title"],
            # NOTE(review): this is the variant's URL, not the URL scraped
            # for the matched product - confirm which one the export needs.
            "Product URL": variant["Product URL"],
            "Price (SGD)": variant["Price (SGD)"],
            "Flavor": best_match.get("Flavor", "N/A"),
            "Body": best_match.get("Body", "N/A"),
            "Aftertaste": best_match.get("Aftertaste", "N/A"),
            "Region": best_match.get("Region", "N/A"),
            "Varietal": best_match.get("Varietal", "N/A"),
            "Process": best_match.get("Process", "N/A"),
            "Product Image URL": best_match.get("Product Image URL", "N/A")
        })

    # Surface dropped variants so missing CSV rows are not a silent surprise.
    if unmatched_titles:
        print(
            f"Skipped {len(unmatched_titles)} variant(s) without a product match "
            f"above {similarity_threshold}: {unmatched_titles}"
        )

    # Write combined data to CSV
    fieldnames = [
        "Variant Title", "Weight", "Product Title", "Product URL", "Price (SGD)", "Flavor", "Body",
        "Aftertaste", "Region", "Varietal", "Process", "Product Image URL"
    ]
    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(combined_data)

    print(f"Fuzzy matched data with weight successfully written to {output_file}")

# ---- RUN THE SCRAPER ----

# Run the scrape-and-export only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    combine_and_export()