In [48]:
import requests
import re
import csv
import time
from bs4 import BeautifulSoup

# --- Step 1: Extract product links from the shop page ---
def get_product_links(shop_url):
    """
    Downloads the shop page and extracts full product links.
    We assume that product links have hrefs starting with "/shop/p/".
    """
    response = requests.get(shop_url)
    if response.status_code != 200:
        print("Error fetching shop page:", response.status_code)
        return []
    
    soup = BeautifulSoup(response.text, "html.parser")
    product_links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/shop/p/"):
            full_url = "https://www.percolate.sg" + href
            product_links.add(full_url)
    return list(product_links)

# --- Step 2: Parsing the description text ---
def parse_description(desc_text):
    """
    Given a description text (as from a meta description tag) that follows a pattern
    similar to your example, parse out the weight from the heading and then extract each
    product entry.
    
    Expected format example:
    
    ... (narrative text)
    
    SINGLE ORIGINS (250g)
    
    Chelbesa, Ethiopia ($36) - Watermelon, Peach, Citrus
    
    Maritza Dota Mejorado, Ecuador ($45) - Orange Blossom, Ripe Peach, 
    Raspberry
    
    ...
    
    Returns a list of dictionaries with keys: "Title", "Country", "Weight", "Price", "Flavour Notes".
    """
    # Split the text into nonempty lines.
    lines = [line.strip() for line in desc_text.splitlines() if line.strip()]
    
    # Find the first line that contains the weight pattern, e.g. "(250g)"
    weight = None
    heading_index = None
    for i, line in enumerate(lines):
        match = re.search(r'\((\d+g)\)', line)
        if match:
            weight = match.group(1)  # e.g. "250g"
            heading_index = i
            break

    if heading_index is None:
        print("No weight heading found in description.")
        return []
    
    # Assume product entries start after the weight heading.
    product_entries = lines[heading_index+1:]
    
    # Regex to parse a product entry line:
    # Format: "Chelbesa, Ethiopia ($36) - Watermelon, Peach, Citrus"
    entry_pattern = re.compile(
        r'^(.*?),\s*(.*?)\s*\(\$(\d+(?:\.\d+)?)\)\s*-\s*(.+)$'
    )
    
    parsed_products = []
    for entry in product_entries:
        match = entry_pattern.match(entry)
        if match:
            prod_title = match.group(1).strip()
            country = match.group(2).strip()
            price = match.group(3).strip()
            flavour_notes = match.group(4).strip()
            parsed_products.append({
                "Title": prod_title,
                "Country": country,
                "Weight": weight,
                "Price": price,
                "Flavour Notes": flavour_notes
            })
        else:
            # If the entry doesn't match, skip it.
            print("Skipping unrecognized entry:", entry)
            continue

    return parsed_products

# --- Step 3: Scrape each product page ---
def scrape_product_page(url):
    """
    Downloads a product page and extracts:
      - Product Name (from og:title or <title>)
      - Description (from meta name="description")
    Then, it parses the description to get product entries.
    
    Returns a list of rows. Each row is a dictionary with:
        URL, Product Name, Title, Country, Weight, Price, Flavour Notes
    """
    response = requests.get(url)
    if response.status_code != 200:
        print("Error fetching product page:", url, response.status_code)
        return []
    
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Get product name
    meta_title = soup.find("meta", property="og:title")
    if meta_title and meta_title.get("content"):
        product_name = meta_title.get("content").strip()
    elif soup.title:
        product_name = soup.title.string.strip()
    else:
        product_name = ""
    
    # Get full description from <meta name="description">
    meta_desc = soup.find("meta", attrs={"name": "description"})
    desc_text = meta_desc.get("content", "").strip() if meta_desc else ""
    
    # Parse the description into product entries
    parsed_entries = parse_description(desc_text)
    
    # For each parsed entry, add the parent product URL and name.
    rows = []
    for entry in parsed_entries:
        row = {
            "URL": url,
            "Product Name": product_name,
            "Title": entry.get("Title", ""),
            "Country": entry.get("Country", ""),
            "Weight": entry.get("Weight", ""),
            "Price": entry.get("Price", ""),
            "Flavour Notes": entry.get("Flavour Notes", "")
        }
        rows.append(row)
    
    return rows

# --- Step 4: Main function to combine all steps and write to CSV ---
def main():
    shop_url = "https://www.percolate.sg/shop/coffee"
    print("Scraping shop page:", shop_url)
    product_links = get_product_links(shop_url)
    print("Found {} product links.".format(len(product_links)))
    
    all_rows = []
    for link in product_links:
        print("Processing product:", link)
        rows = scrape_product_page(link)
        if rows:
            all_rows.extend(rows)
        time.sleep(1)  # be polite

    # Define CSV columns.
    fieldnames = ["URL", "Product Name", "Title", "Country", "Weight", "Price", "Flavour Notes"]
    csv_filename = "percolate_products_parsed.csv"
    with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in all_rows:
            writer.writerow(row)
    
    print("Scraping complete. Data saved to", csv_filename)

if __name__ == "__main__":
    main()

Scraping shop page: https://www.percolate.sg/shop/coffee
Found 6 product links.
Processing product: https://www.percolate.sg/shop/p/sey-coffee
Skipping unrecognized entry: Raspberry
Skipping unrecognized entry: Hibiscus
Skipping unrecognized entry: Rose Hips
Processing product: https://www.percolate.sg/shop/p/fuglen-coffee-tokyo
No weight heading found in description.
Processing product: https://www.percolate.sg/shop/p/onibus-coffee
No weight heading found in description.
Processing product: https://www.percolate.sg/shop/p/youmeca-drip-bags
No weight heading found in description.
Processing product: https://www.percolate.sg/shop/p/single-origin-coffee-espresso-250g
No weight heading found in description.
Processing product: https://www.percolate.sg/shop/p/the-barn-coffee
Skipping unrecognized entry: FILTER ROAST (250g)
Skipping unrecognized entry: Elegant
Scraping complete. Data saved to percolate_products_parsed.csv
