In [None]:
"""
01_scrape_fashionphile.ipynb

Goal:
- Collect product URLs from a Fashionphile listing page (Selenium, because listing is JS-heavy)
- For each product URL, scrape structured fields (Requests + BeautifulSoup)
- Save results to CSV

Notes:
- Be polite: add delays and timeouts
- Keep functions small + testable
"""

import time
import re
import random
from dataclasses import dataclass
from typing import Optional, Dict, List

import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


In [None]:
# Headers sent with every Requests call; the User-Agent mimics desktop Chrome.
HEADERS = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/123.0 Safari/537.36",
        ]
    )
}

# Per-request timeout (seconds) and number of attempts used by get_soup().
REQUEST_TIMEOUT: int = 20
MAX_RETRIES: int = 3

# Politeness settings: base pause between requests plus a random extra
# of up to JITTER_SECONDS.
DEFAULT_DELAY_SECONDS: float = 1.0
JITTER_SECONDS: float = 0.25

# CSS selector matching product-card anchors on a listing page.
# Re-check this against the live markup if link collection stops working.
LISTING_CARD_SELECTOR: str = "a.full-unstyled-link.fp-card__link"


In [None]:
def get_soup(url: str, headers: dict = HEADERS, timeout: int = REQUEST_TIMEOUT) -> BeautifulSoup:
    """
    Fetch a URL with Requests and return the page parsed by BeautifulSoup.

    Makes up to MAX_RETRIES attempts (always at least one). After a failed
    attempt it sleeps 0.5 * attempt seconds before trying again; there is
    no sleep after the final attempt, so the caller sees the error promptly.

    Args:
        url: Page to download.
        headers: HTTP headers passed to requests.get (not modified here).
        timeout: Timeout in seconds passed to requests.get.

    Returns:
        BeautifulSoup built from the response text with "html.parser".

    Raises:
        requests.RequestException: The error from the last attempt
            (connection failure, timeout, or non-2xx status).
    """
    # Guard against a misconfigured MAX_RETRIES < 1, which would otherwise
    # skip the loop entirely and leave nothing to return or raise.
    attempts = max(1, MAX_RETRIES)

    for attempt in range(1, attempts + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
        except requests.RequestException:
            # Only network/HTTP failures are retried; programming errors
            # (TypeError, etc.) propagate immediately instead of being retried.
            if attempt == attempts:
                raise
            time.sleep(0.5 * attempt)  # linear backoff between attempts
            continue
        return BeautifulSoup(resp.text, "html.parser")


def get_text(soup: BeautifulSoup, selector: Optional[str]) -> Optional[str]:
    """Look up *selector* in *soup* and return the first match's stripped text.

    Returns None when the selector is empty/None or nothing matches.
    """
    if selector:
        node = soup.select_one(selector)
        if node:
            return node.get_text(strip=True)
    return None


def clean_price(text: Optional[str]) -> Optional[float]:
    """
    Parse the first price-like number in *text*, e.g. '$1,234.00' -> 1234.0.

    Only the first numeric token is used, so surrounding words, stray
    periods ("Est. Retail $1,000") and additional prices in the same
    string do not corrupt the value. Commas are treated as thousands
    separators and removed.

    Args:
        text: Raw price text, or None.

    Returns:
        The parsed price as a float, or None when *text* is empty/None or
        contains no number.
    """
    if not text:
        return None

    # Either digits (with optional commas) and an optional decimal part,
    # or a bare decimal such as ".99".
    match = re.search(r"\d[\d,]*(?:\.\d+)?|\.\d+", text)
    if match is None:
        return None

    try:
        return float(match.group(0).replace(",", ""))
    except ValueError:
        return None


In [None]:
# Output field name -> CSS selector on the product page.
# A None selector means get_text() returns None for that field;
# parse_product_page() later fills "sku" from the accordion text.
SELECTORS: Dict[str, Optional[str]] = dict(
    title="h1",
    model="p.fp-product-title__details",
    sell_price="span.price-item--regular",
    condition="span.h6.fp-font-weight--regular",
    sku=None,
    status=None,
    brand="a.fp-product-vendor__link",
)


def infer_leather(model: Optional[str], desc: Optional[str]) -> Optional[str]:
    """
    Guess the leather type from the model string and description.

    The two texts are joined, lower-cased, and searched for each known
    leather name as a whole word. Whole-word matching prevents false
    positives such as "boxes" or "boxy" being reported as Box leather.
    Names are tried in a fixed priority order, not in order of appearance.

    Args:
        model: Model/title text, or None.
        desc: Long description text, or None.

    Returns:
        The capitalized leather name (e.g. "Togo"), or None if no known
        name is present.
    """
    text = " ".join([model or "", desc or ""]).lower()
    for name in ("togo", "epsom", "clemence", "swift", "box", "chevre"):
        if re.search(rf"\b{name}\b", text):
            return name.capitalize()
    return None


def infer_hardware(desc: Optional[str]) -> Optional[str]:
    """
    Map a description to a hardware finish label.

    Looks for "<metal> hardware" phrases (case-insensitive) and returns the
    label of the first phrase in the priority order palladium, gold, silver.
    Returns None for an empty/None description or when no phrase is found.
    """
    lowered = (desc or "").lower()
    phrase_to_label = (
        ("palladium hardware", "Palladium"),
        ("gold hardware", "Gold"),
        ("silver hardware", "Silver"),
    )
    for phrase, label in phrase_to_label:
        if phrase in lowered:
            return label
    return None


def infer_color_from_model(model: Optional[str]) -> Optional[str]:
    """
    Naive color guess: the final whitespace-separated word of the model string.

    Example: "HERMES TOGO BIRKIN 30 TRENCH" -> "TRENCH"
    Returns None when the model is empty, None, or only whitespace.
    """
    if model:
        # Splitting once from the right is enough to isolate the last word.
        tail = model.rsplit(maxsplit=1)
        if tail:
            return tail[-1]
    return None


def infer_size_from_model(model: Optional[str]) -> Optional[int]:
    """
    Extract the first all-decimal token from the model string.

    Example: 'Birkin 30' -> 30.

    Args:
        model: Model/title text, or None.

    Returns:
        The first whitespace-separated token made only of decimal digits,
        as an int, or None if the model is empty/None or has no such token.
    """
    if not model:
        return None
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscripts ("²") that int() cannot parse, which would raise
    # ValueError instead of skipping the token.
    return next((int(tok) for tok in model.split() if tok.isdecimal()), None)


def extract_description_from_soup(soup: BeautifulSoup) -> Optional[str]:
    """
    Find the long product description among the accordion content blocks.

    The first block whose text starts with "this is" or contains
    "this is an authentic" (case-insensitive) wins; its text is returned
    from the first "this is" onward. Returns None if no block qualifies.
    Heuristic: relies on the page's current accordion markup.
    """
    for block in soup.select("div.accordion__content.rte.body-md"):
        raw = block.get_text(" ", strip=True)
        folded = raw.lower()

        looks_like_description = (
            folded.startswith("this is") or "this is an authentic" in folded
        )
        if not looks_like_description:
            continue

        start = folded.find("this is")
        if start == -1:
            return raw
        return raw[start:]

    return None


def extract_sku_from_soup(soup: BeautifulSoup) -> Optional[str]:
    """Return the digits following 'Item #:' (e.g. 'Item #: 1747041') from the
    first accordion block that contains them, or None if no block does."""
    item_number = re.compile(r"Item\s*#:\s*(\d+)", flags=re.IGNORECASE)

    for block in soup.select("div.accordion__content.rte.body-md"):
        # Collapse runs of whitespace before matching.
        normalized = " ".join(block.get_text(" ", strip=True).split())
        found = item_number.search(normalized)
        if found:
            return found.group(1)

    return None


def extract_brand(soup: BeautifulSoup) -> Optional[str]:
    """Return the stripped text of the product vendor link, or None if absent."""
    vendor_link = soup.select_one("a.fp-product-vendor__link")
    if not vendor_link:
        return None
    return vendor_link.get_text(strip=True)


In [None]:
def parse_product_page(url: str) -> Dict:
    """
    Download one product page and turn it into a flat row dictionary.

    The row holds one entry per SELECTORS field, plus the derived fields
    description, leather, hardware, color, size_cm, and the source url.
    Errors from get_soup() propagate to the caller.
    """
    page = get_soup(url)

    # Start with the raw text for every selector-backed field.
    row = {}
    for field, css in SELECTORS.items():
        row[field] = get_text(page, css)

    description = extract_description_from_soup(page)
    model_text = row.get("model")

    # Normalize and enrich; assignment order fixes the column order of new keys.
    row["sell_price"] = clean_price(row.get("sell_price"))
    row["description"] = description
    row["sku"] = extract_sku_from_soup(page)
    row["leather"] = infer_leather(model_text, description)
    row["hardware"] = infer_hardware(description)
    row["color"] = infer_color_from_model(model_text)
    row["size_cm"] = infer_size_from_model(model_text)

    # Replace the selector-derived brand with the dedicated extractor's result.
    row["brand"] = extract_brand(page)

    row["url"] = url
    return row


In [None]:
def make_driver(headless: bool = True) -> webdriver.Chrome:
    """Build a Chrome webdriver, headless unless *headless* is False.

    The chromedriver binary is obtained through ChromeDriverManager.
    """
    flags = ["--no-sandbox", "--disable-dev-shm-usage"]
    if headless:
        flags.insert(0, "--headless=new")

    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)


def make_page_url(base_url: str, page: int) -> str:
    """
    Return the listing URL for a given page number.

    Page 1 is the base URL unchanged. For later pages a 'page=N' query
    parameter is appended, using '&' when the base URL already has a
    query string and '?' when it does not, so the result is a valid URL
    either way.

    Args:
        base_url: Listing URL, with or without an existing query string.
        page: 1-based page number.

    Returns:
        The URL to load for that page.
    """
    if page == 1:
        return base_url

    if base_url.endswith(("?", "&")):
        # Already ends in a separator; avoid producing "?&" or "&&".
        separator = ""
    elif "?" in base_url:
        separator = "&"
    else:
        separator = "?"
    return f"{base_url}{separator}page={page}"


def get_product_links_from_listing_selenium(
    listing_url: str,
    driver: webdriver.Chrome,
    max_products: Optional[int] = None,
    wait_seconds: int = 30
) -> List[str]:
    """
    Open one listing page in *driver* and collect its product links.

    Waits up to *wait_seconds* for product cards to appear; if the wait
    fails, the problem is printed and whatever cards are present (possibly
    none) are still used. Only hrefs containing "/products/" are kept,
    de-duplicated in page order and optionally capped at *max_products*.
    """
    driver.get(listing_url)

    card_locator = (By.CSS_SELECTOR, LISTING_CARD_SELECTOR)
    try:
        WebDriverWait(driver, wait_seconds).until(
            EC.presence_of_all_elements_located(card_locator)
        )
    except Exception as e:
        print("Timed out waiting for product cards:", e)

    cards = driver.find_elements(*card_locator)

    # Dict keys act as an ordered set: first occurrence wins.
    unique_links = {}
    for card in cards:
        href = card.get_attribute("href")
        if href and "/products/" in href:
            unique_links.setdefault(href, None)

    links = list(unique_links)
    if max_products is not None:
        links = links[:max_products]

    print(f"Found {len(links)} product links on this page.")
    return links


def get_all_product_links_across_pages(
    base_url: str,
    driver: webdriver.Chrome,
    max_pages: int = 10,
    max_total: Optional[int] = None
) -> List[str]:
    """
    Paginate the listing and collect unique product links in first-seen order.

    Pages 1..max_pages are loaded one after another with a polite pause
    between page loads. Pagination stops early when a page returns zero
    links or when *max_total* links have been collected.

    Args:
        base_url: Listing URL for page 1.
        driver: Selenium Chrome driver used to load every page.
        max_pages: Maximum number of listing pages to load.
        max_total: Optional cap on the number of links returned.

    Returns:
        De-duplicated product URLs, truncated to *max_total* if given.
    """
    all_links = []
    seen = set()

    for page in range(1, max_pages + 1):
        if page > 1:
            # Same delay + jitter policy as the product-page scraper, so the
            # listing is not requested back-to-back.
            time.sleep(DEFAULT_DELAY_SECONDS + random.random() * JITTER_SECONDS)

        page_url = make_page_url(base_url, page)
        # Plain ASCII arrow: the previous arrow character was mojibake.
        print(f"\n=== Page {page} -> {page_url} ===")

        links = get_product_links_from_listing_selenium(page_url, driver)

        if not links:
            print("No products found on this page. Stopping pagination.")
            break

        for u in links:
            if u not in seen:
                seen.add(u)
                all_links.append(u)

        print(f"Total collected so far: {len(all_links)}")

        if max_total is not None and len(all_links) >= max_total:
            all_links = all_links[:max_total]
            print(f"Reached max_total={max_total}. Stopping.")
            break

    return all_links


In [None]:
def scrape_products_to_df(
    product_links: List[str],
    delay: float = DEFAULT_DELAY_SECONDS
) -> pd.DataFrame:
    """
    Parse every product URL and assemble the rows into a DataFrame.

    A URL whose parsing raises is reported on stdout and skipped. After
    each URL (success or failure) the loop sleeps for *delay* seconds plus
    a random extra of up to JITTER_SECONDS to lower the risk of rate-limits.
    """
    collected = []
    n_links = len(product_links)

    for position, link in enumerate(product_links, start=1):
        print(f"{position}/{n_links} {link}")
        try:
            parsed = parse_product_page(link)
        except Exception as e:
            print("   Error parsing:", repr(e))
        else:
            collected.append(parsed)

        # polite pause before the next request
        pause = delay + random.random() * JITTER_SECONDS
        time.sleep(pause)

    return pd.DataFrame(collected)


In [None]:
# Listing page to paginate: "all bags" filtered to vendor Hermes via the
# refinementList query parameter (presumably newest first, judging by the
# sortBy value).
BASE_LISTING_URL = (
    "https://www.fashionphile.com/collections/all-bags?"
    "refinementList%5Bvendor%5D%5B0%5D=Hermes&sortBy=shopify_products_published_at_desc"
)

# One headless Chrome session is reused for every listing page.
driver = make_driver(headless=True)

try:
    # Collect up to 1000 unique product links from at most 10 listing pages.
    all_links = get_all_product_links_across_pages(
        BASE_LISTING_URL,
        driver,
        max_pages=10,
        max_total=1000
    )
finally:
    # Always shut the browser down, even if link collection raised.
    driver.quit()

print("Total links:", len(all_links))
# Bare trailing expression: presumably shown as the notebook cell's output.
all_links[:5]


In [None]:
# Scrape every collected product page (one request per link, 1.0 s base delay
# between requests) into a DataFrame with one row per successfully parsed page.
df = scrape_products_to_df(all_links, delay=1.0)
# Quick sanity check: (rows, columns) and the first few rows.
df.shape, df.head()


In [None]:
# Save outputs: the scraped rows, and the raw list of product links
# (both written without the pandas index column).
df.to_csv("fashionphile_hermes_bags.csv", index=False)
pd.Series(all_links).to_csv("fashionphile_hermes_links.csv", index=False)

# Fraction of missing values per column, worst 15 columns first.
df.isna().mean().sort_values(ascending=False).head(15)
