In [10]:
import requests
import re
import csv
import time
from bs4 import BeautifulSoup

# -------------------- PART 1: Extract Product Links --------------------
def get_product_links(shop_url, timeout=10):
    """
    Download the shop page and return absolute URLs of the products it links to.

    A link counts as a product link when its ``href`` starts with "/shop/p/".
    Each href is resolved against ``shop_url``, duplicates are dropped, and the
    links are returned in the order they first appear on the page.

    Args:
        shop_url: URL of the shop listing page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute product URLs; an empty list if the page could not
        be fetched (network error or non-200 status).
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    try:
        # Without a timeout, requests can block forever on a stalled server.
        response = requests.get(shop_url, timeout=timeout)
    except requests.RequestException as exc:
        print("Error fetching shop page:", exc)
        return []
    if response.status_code != 200:
        print("Error fetching shop page:", response.status_code)
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    # A dict de-duplicates like a set but remembers insertion order, so the
    # result is deterministic from run to run.
    product_links = {}
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/shop/p/"):
            product_links[urljoin(shop_url, href)] = None
    return list(product_links)

# -------------------- PART 2: Parse Description Text --------------------
# "<head> ($price) - <notes>": head is everything before the price in parentheses.
_ENTRY_RE = re.compile(
    r'^(?P<head>.*?)\s*\((?P<price>\$\d+(?:\.\d+)?)\)\s*-\s*(?P<notes>.+)$'
)
# Splits a head into title / country at the LAST comma or spaced hyphen
# (the greedy title group pushes the separator as far right as possible).
_TITLE_COUNTRY_RE = re.compile(
    r'^(?P<title>.+)(?:\s*,\s*|\s+-\s+)(?P<country>.+)$'
)
# Weight in parentheses, e.g. (250g) or (200).
_WEIGHT_RE = re.compile(r'\((\d+(?:g)?)\)')


def parse_description(desc_text):
    """
    Parse meta-description text into a list of product entries.

    The text is processed line by line:

    - An all-uppercase line that is not itself a product entry (e.g.
      ``ESPRESSO`` or ``SINGLE ORIGINS``) sets the category for the entries
      that follow it.
    - A line shaped like ``Title, Country ($12.50) - notes`` or
      ``Title - Country ($12.50) - notes`` becomes one product entry. When
      there is no separator before the price, the whole head is the title and
      the country is "Unknown".
    - Any other line is ignored.

    Args:
        desc_text: Raw description text; ``None`` or empty yields ``[]``.

    Returns:
        A list of dicts with the keys "Category", "Title", "Country",
        "Weight", "Price" and "Flavour Notes". Weight is taken from a
        parenthesised number such as ``(200g)`` found before the price and
        defaults to "250g"; Category defaults to "Unknown".
    """
    if not desc_text:
        return []

    parsed_products = []
    category = None

    for raw_line in desc_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        match = _ENTRY_RE.match(line)
        if match is None:
            # Not a product line: an upper-case line is a category header
            # (multi-word headers allowed); anything else is skipped.
            if line.isupper():
                category = line
            continue

        head = match.group("head").strip()
        split = _TITLE_COUNTRY_RE.match(head)
        if split:
            title = split.group("title").strip()
            country = split.group("country").strip() or "Unknown"
        else:
            title, country = head, "Unknown"

        # Look for the weight anywhere before the price; the price itself
        # cannot match because of its leading "$".
        weight_match = _WEIGHT_RE.search(head)
        weight = weight_match.group(1) if weight_match else "250g"

        parsed_products.append({
            "Category": category if category else "Unknown",
            "Title": title,
            "Country": country,
            "Weight": weight,
            "Price": match.group("price").strip(),
            "Flavour Notes": match.group("notes").strip(),
        })

    return parsed_products

# -------------------- PART 3: Scrape Each Product Page --------------------
def _meta_content(soup, attrs, default=""):
    """Return the stripped ``content`` of the first <meta> tag matching *attrs*, else *default*."""
    # attrs= is required for the HTML "name" attribute: find()'s own ``name``
    # parameter is the tag name, so find("meta", name=...) raises TypeError.
    tag = soup.find("meta", attrs=attrs)
    content = tag.get("content") if tag else None
    content = content.strip() if content else ""
    return content or default


def scrape_product_page(url, timeout=10):
    """
    Scrape one product page and return one row per entry in its description.

    Page-level values are read from <meta> tags:
      - Product Name from ``og:title``
      - Currency from ``product:price:currency``
      - Image URL from the first of ``og:image``, ``itemprop="image"``,
        ``twitter:image`` that has content
      - Description from ``<meta name="description">``, which is split into
        individual entries by ``parse_description``

    Args:
        url: Product page URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys URL, Product Name, Category, Title,
        Country, Weight, Price, Flavour Notes, Image URL. Price is the page
        currency followed by the entry's own price. The list is empty when
        the page cannot be fetched or its description yields no entries.
    """
    try:
        # Without a timeout, requests can block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print("Error fetching product page:", url, exc)
        return []
    if response.status_code != 200:
        print("Error fetching product page:", url, response.status_code)
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    product_name = _meta_content(soup, {"property": "og:title"}, "Unknown Product")
    currency = _meta_content(
        soup, {"property": "product:price:currency"}, "Unknown Currency"
    )

    # Try the image sources in priority order and keep the first non-empty one.
    image_sources = (
        {"property": "og:image"},
        {"itemprop": "image"},
        {"name": "twitter:image"},
        {"property": "twitter:image"},
    )
    img_src = "No Image Available"
    for attrs in image_sources:
        candidate = _meta_content(soup, attrs)
        if candidate:
            img_src = candidate
            break

    desc_text = _meta_content(soup, {"name": "description"})
    parsed_entries = parse_description(desc_text)

    # Attach the page-level fields (URL, name, image) to every parsed entry.
    return [
        {
            "URL": url,
            "Product Name": product_name,
            "Category": entry.get("Category", ""),
            "Title": entry.get("Title", ""),
            "Country": entry.get("Country", ""),
            "Weight": entry.get("Weight", ""),
            "Price": f"{currency} {entry.get('Price', '')}",
            "Flavour Notes": entry.get("Flavour Notes", ""),
            "Image URL": img_src,
        }
        for entry in parsed_entries
    ]

# -------------------- PART 4: Main Function --------------------
def main():
    """
    Crawl the coffee listing, scrape each linked product page, and write all
    collected rows to ``percolate_products_parsed.csv``.
    """
    listing_url = "https://www.percolate.sg/shop/coffee"
    print("Scraping shop page:", listing_url)
    links = get_product_links(listing_url)
    print("Found {} product links.".format(len(links)))

    collected = []
    for product_url in links:
        print("Processing product:", product_url)
        # Extending with an empty list is a no-op, so no guard is needed.
        collected.extend(scrape_product_page(product_url))
        # Pause between requests to avoid hammering the server.
        time.sleep(1)

    # Column order of the output file.
    columns = [
        "URL",
        "Product Name",
        "Category",
        "Title",
        "Country",
        "Weight",
        "Price",
        "Flavour Notes",
        "Image URL",
    ]
    out_path = "percolate_products_parsed.csv"
    with open(out_path, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(collected)

    print("Scraping complete. Data saved to", out_path)

# Entry point: run the scraper only when this module is executed directly,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Scraping shop page: https://www.percolate.sg/shop/coffee
Found 5 product links.
Processing product: https://www.percolate.sg/shop/p/youmeca-drip-bags
Processing product: https://www.percolate.sg/shop/p/friedhats-coffee-beans
Processing product: https://www.percolate.sg/shop/p/onibus-coffee
Processing product: https://www.percolate.sg/shop/p/single-origin-coffee-espresso-250g
Processing product: https://www.percolate.sg/shop/p/dak-coffee-beans
Scraping complete. Data saved to percolate_products_parsed.csv
