In [42]:
import requests
from bs4 import BeautifulSoup
import csv
import json

# Collection page to scrape
collection_url = "https://tionghoe.com/collections/roasted-beans"

# Headers to mimic a browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
}

# Output CSV file names (card_data_file is not used by the code in this cell)
card_data_file = "tionghoe_card_data.csv"
variant_data_file = "tionghoe_variant_data.csv"

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block the kernel indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Request the collection page content
response = requests.get(collection_url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as if it
# were the collection page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract and process the `var meta = {...}` JSON embedded in the page.
# Result: variant_data, one dict per variant whose name contains "Whole Beans".
variant_data = []
meta_data_script = soup.find("script", string=lambda t: t and "var meta =" in t)
if meta_data_script:
    meta_data_text = meta_data_script.string

    # The object literal is located by its first key.
    start_index = meta_data_text.find('{"products":')

    meta = {}
    if start_index == -1:
        print('Error decoding JSON: no {"products": ...} object found in the meta script')
    else:
        try:
            # raw_decode() parses exactly one JSON value beginning at
            # start_index and ignores whatever JavaScript follows it, so the
            # result no longer depends on where the last "};" in the script is.
            meta = json.JSONDecoder().raw_decode(meta_data_text, start_index)[0]
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

    for product in meta.get("products", []):
        # NOTE(review): the row title is built from the "vendor" field, not
        # from a product-title field; confirm that is the intended label.
        product_title = product.get("vendor", "N/A")
        product_type = product.get("type", "N/A")
        product_id = product.get("id", "N/A")

        # Same for every variant of the product, so built once per product.
        # NOTE(review): the URL is built from the "id" value; confirm the
        # storefront actually resolves /products/<id> (the product pages
        # fetched elsewhere in this notebook use slug-style paths).
        variant_url = f"https://tionghoe.com/products/{product_id}"

        for variant in product.get("variants", []):
            # `or` also covers an explicit JSON null, which .get()'s default
            # argument does not.
            variant_name = variant.get("name") or "N/A"

            # Only keep variants with "Whole Beans" in the title
            if "Whole Beans" not in variant_name:
                continue

            # presumably the raw price is in cents (hence / 100); TODO confirm
            variant_price = (variant.get("price") or 0) / 100

            # Add variant data
            variant_data.append({
                "Title": f"{product_title} - {variant_name}",
                "Product URL": variant_url,
                "Product Type": product_type,
                "Price (SGD)": f"S${variant_price:.2f}"
            })

# Persist the collected variant rows, or report that there is nothing to save
if not variant_data:
    print("No variant data to write to CSV")
else:
    columns = ["Title", "Product URL", "Product Type", "Price (SGD)"]
    with open(variant_data_file, mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=columns)

        # Header first, then one line per collected variant
        csv_writer.writeheader()
        for row in variant_data:
            csv_writer.writerow(row)

    print(f"Variant data successfully written to {variant_data_file}")

Variant data successfully written to tionghoe_variant_data.csv


In [50]:
import requests
from bs4 import BeautifulSoup

# URL of the product page
url = "https://tionghoe.com/products/smoky-quartz-seasonal-espresso-blend"

# Headers to mimic a browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
}

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block the kernel indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Fetch the page content
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as if it
# were the product page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract blend, acidity level, and roast level from the description
def text_between(text, start_label, end_label):
    """Return the text that follows start_label, cut at end_label, stripped.

    Returns "N/A" when start_label does not occur in text. When end_label is
    absent, everything after start_label is returned.
    """
    _, found, remainder = text.partition(start_label)
    if not found:
        return "N/A"
    return remainder.partition(end_label)[0].strip()


description_div = soup.find("div", class_="section-stack__intro")
# find("p") returns None when the intro block holds no paragraph. Guard it so
# the cell reports "not found" instead of raising AttributeError on .get_text().
description_paragraph = description_div.find("p") if description_div is not None else None

if description_paragraph is not None:
    description_text = description_paragraph.get_text(separator=" ")

    blend = text_between(description_text, "Blend of:", "Acidity Level:")
    acidity_level = text_between(description_text, "Acidity Level:", "Roast Level:")
    # NOTE(review): this end marker looks specific to this product's
    # description text; other pages may need a different terminator.
    roast_level = text_between(description_text, "Roast Level:", "Following our gemstone series")
else:
    print("Description not found!")
    blend = "N/A"
    acidity_level = "N/A"
    roast_level = "N/A"

# Read flavor, body, and aftertaste out of the feature chart.
# Every entry starts as "N/A" and is overwritten only when its heading cell
# and the cell following it are both found.
feature_chart = soup.find("div", class_="feature-chart__table")
chart_values = {"Flavor": "N/A", "Body": "N/A", "Aftertaste": "N/A"}

if not feature_chart:
    print("Feature chart not found!")
else:
    for heading in ("Flavor", "Body", "Aftertaste"):
        try:
            heading_cell = feature_chart.find("div", string=heading)
            chart_values[heading] = heading_cell.find_next_sibling("div").text.strip()
        except AttributeError:
            # Heading cell or its sibling is missing: keep the "N/A" default
            continue

flavor = chart_values["Flavor"]
body = chart_values["Body"]
aftertaste = chart_values["Aftertaste"]

# Show every extracted attribute, one per line
print(
    f"Blend: {blend}",
    f"Acidity Level: {acidity_level}",
    f"Roast Level: {roast_level}",
    f"Flavor: {flavor}",
    f"Body: {body}",
    f"Aftertaste: {aftertaste}",
    sep="\n",
)

Blend: 50% India and 50% Brazil
Acidity Level: Low
Roast Level: Medium
Flavor: Hazelnuts, Caramel, Brown Sugar
Body: Heavy-bodied, Creamy
Aftertaste: Dark Chocolates, Roasted Peanuts


In [68]:
import requests
from bs4 import BeautifulSoup
import csv

# Base URLs
collection_url = "https://tionghoe.com/collections/roasted-beans"
base_product_url = "https://tionghoe.com"

# Headers to mimic a browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
}

# Output CSV file
output_file = "tionghoe_product_details.csv"

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block the kernel indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Step 1: Scrape product details from the collection page
response = requests.get(collection_url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as if it
# were the collection page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find all product cards
product_cards = soup.select("span.product-card__title")

# Extract product data: one dict per card with Title, Product URL and Price
product_data = []
for card in product_cards:
    # Extract product title and URL
    title_element = card.find("a", class_="bold")
    product_url = title_element.get("href") if title_element is not None else None
    if not product_url:
        # Without a link there is no product page to fetch later. Building a
        # URL from a placeholder would produce an address on a different,
        # non-existent host (base URL + "N/A"), so the card is skipped.
        print("Skipping a product card without a product link")
        continue

    title = title_element.text.strip()
    full_product_url = f"{base_product_url}{product_url}"

    # Extract price: first <sale-price> element that follows the card title
    price_element = card.find_next("sale-price")
    price = price_element.text.strip() if price_element else "N/A"

    # Append base data
    product_data.append({
        "Title": title,
        "Product URL": full_product_url,
        "Price": price
    })

# Step 2: Scrape additional details from each product page

# Seconds to wait for each product page before giving up on it
PRODUCT_PAGE_TIMEOUT = 30

# CSV column -> label that introduces its value in the description paragraph
DESCRIPTION_LABELS = {
    "Blend": "Blend of:",
    "Acidity Level": "Acidity Level:",
    "Roast Level": "Roast Level:",
    "Region": "Region:",
    "Varietal": "Varietal:",
    "Process": "Process:",
}

# Headings looked up in the feature chart; each doubles as its CSV column
FEATURE_CHART_LABELS = ("Flavor", "Body", "Aftertaste")


def extract_description_values(description_text, labels=DESCRIPTION_LABELS):
    """Pull the labelled values out of a description paragraph's text.

    A value starts right after its label and ends at the first line break or
    at the next known label, whichever comes first. Columns whose label does
    not occur in description_text are left out of the returned dict.
    """
    values = {}
    for column, label in labels.items():
        _, found, remainder = description_text.partition(label)
        if not found:
            continue
        candidates = [remainder.find("\n")]
        candidates.extend(remainder.find(other) for other in labels.values())
        end = min((pos for pos in candidates if pos != -1), default=len(remainder))
        values[column] = remainder[:end].strip()
    return values


def extract_feature_chart_values(feature_chart, labels=FEATURE_CHART_LABELS):
    """Return {heading: text of the div following that heading's div}.

    Headings that are missing, or that have no following div, are left out.
    """
    values = {}
    for label in labels:
        heading_cell = feature_chart.find("div", string=label)
        value_cell = heading_cell.find_next_sibling("div") if heading_cell is not None else None
        if value_cell is not None:
            values[label] = value_cell.text.strip()
    return values


detailed_product_data = []
for product in product_data:
    print(f"Scraping: {product['Product URL']}")

    # Initialize all columns with default "N/A"
    product_details = {
        "Title": product["Title"],
        "Product URL": product["Product URL"],
        "Price": product["Price"],
        "Blend": "N/A",
        "Acidity Level": "N/A",
        "Roast Level": "N/A",
        "Flavor": "N/A",
        "Body": "N/A",
        "Aftertaste": "N/A",
        "Region": "N/A",
        "Varietal": "N/A",
        "Process": "N/A"
    }
    # Appended up front so the product keeps its row (with "N/A" details) even
    # when its page cannot be fetched; the dict is filled in place below.
    detailed_product_data.append(product_details)

    # Fetch the product page; one failing page must not abort the whole run
    try:
        product_response = requests.get(
            product["Product URL"], headers=headers, timeout=PRODUCT_PAGE_TIMEOUT
        )
        product_response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch {product['Product URL']}: {exc}")
        continue
    product_soup = BeautifulSoup(product_response.text, "html.parser")

    # Extract description details
    description_div = product_soup.find("div", class_="section-stack__intro")
    description_paragraph = description_div.find("p") if description_div is not None else None
    if description_paragraph is not None:
        # get_text() drops tags, so a literal "<br><br>" can never be found in
        # its output. Turn each <br> into a real line break first so the end
        # of a value can be detected in plain text.
        # NOTE(review): assumes each value sits on a single line of the
        # paragraph (lines being delimited by <br>); confirm against the
        # live markup.
        for line_break in description_paragraph.find_all("br"):
            line_break.replace_with("\n")
        description_text = description_paragraph.get_text(separator=" ")
        product_details.update(extract_description_values(description_text))

    # Extract flavor, body, and aftertaste from the feature chart
    feature_chart = product_soup.find("div", class_="feature-chart__table")
    if feature_chart is not None:
        product_details.update(extract_feature_chart_values(feature_chart))

    print(f"Scraped: {product['Title']}")

# Step 3: Dump every collected product row into the output CSV
csv_columns = [
    "Title", "Product URL", "Price", "Blend", "Acidity Level",
    "Roast Level", "Flavor", "Body", "Aftertaste", "Region",
    "Varietal", "Process"
]
with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
    # Header line first, then one line per scraped product
    csv_writer.writeheader()
    for row in detailed_product_data:
        csv_writer.writerow(row)

print(f"Product details successfully written to {output_file}")


Scraping: https://tionghoe.com/products/smoky-quartz-seasonal-espresso-blend
Scraped: Smoky Quartz (Seasonal Espresso Blend)
Scraping: https://tionghoe.com/products/jet-candy-seasonal-espresso-blend
Scraped: Jet Candy (Seasonal Espresso Blend)
Scraping: https://tionghoe.com/products/gachala-house-espresso-blend
Scraped: Gachala (House Espresso Blend)
Scraping: https://tionghoe.com/products/brazil-mogiana
Scraped: Brazil Mogiana
Scraping: https://tionghoe.com/products/ethiopia-alo-coffee-natural-g1
Scraped: Ethiopia Alo Coffee Natural G1
Scraping: https://tionghoe.com/products/indonesia-sumatra-mandheling
Scraped: Indonesia Sumatra Mandheling
Scraping: https://tionghoe.com/products/purple-onyx-seasonal-espresso-blend
Scraped: Purple Onyx (Seasonal Espresso Blend)
Scraping: https://tionghoe.com/products/colombia-popayan-decaf
Scraped: Colombia Popayan Decaf
Scraping: https://tionghoe.com/products/india-monsooned-malabar
Scraped: India Monsooned Malabar
Scraping: https://tionghoe.com/prod