In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup
import time
import requests
import pandas as pd
import re


In [None]:
# Single product page used to develop and sanity-check the parsing helpers below.
product_url = "https://www.fashionphile.com/products/hermes-togo-birkin-30-trench-1747041"

In [None]:
# HTTP headers sent with every `requests` call made by get_soup().
# The User-Agent string identifies the client as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0 Safari/537.36"
    )
}

def get_soup(url, timeout=30):
    """Download ``url`` and return the page parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without a timeout a stalled connection would block
        the whole scraping loop forever. Defaults to 30 seconds.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (via ``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")

# Smoke test: fetch the sample product page and show its <title> element
# to confirm the request and the HTML parse both worked.
soup = get_soup(product_url)
soup.title


In [None]:
def get_text(soup, selector):
    """Return the stripped text of the first node matching ``selector``.

    Yields ``None`` when ``selector`` is empty/``None`` or when nothing in
    ``soup`` matches it.
    """
    if selector:
        node = soup.select_one(selector)
        if node:
            return node.get_text(strip=True)
    return None


def clean_price(text):
    """Extract a price from free-form text and return it as a float.

    The first number that directly follows a currency symbol (``$``, ``€`` or
    ``£``) is preferred; if there is none, the first number anywhere in the
    text is used. Commas are treated as thousands separators.

    Only ONE numeric token is parsed. Gluing together every digit found in
    the text would silently merge unrelated numbers (e.g. a crossed-out price
    followed by a sale price, or a sentence-ending period followed by a
    price).

    Parameters
    ----------
    text : str or None
        Raw price text such as ``"$12,500.00"``.

    Returns
    -------
    float or None
        The parsed amount, or ``None`` when ``text`` is empty or contains no
        number.
    """
    if not text:
        return None

    # ASCII digits only, so float() below can never be handed something it rejects.
    number = r"(?:[0-9][0-9,]*(?:\.[0-9]+)?|\.[0-9]+)"
    match = re.search(r"[$€£]\s*(" + number + ")", text) or re.search(
        "(" + number + ")", text
    )
    if match is None:
        return None
    return float(match.group(1).replace(",", ""))


# Maps each output field to the CSS selector whose first match supplies its text.
# A value of None means "no selector": get_text() returns None for such fields.
SELECTORS = {
    "title": "h1",
    "model": "p.fp-product-title__details",
    "sell_price": "span.price-item--regular",  # raw price text; converted to a number with clean_price()
    "condition": "span.h6.fp-font-weight--regular",
    "sku": None,
    "status": None,
    "brand": "a.fp-product-vendor__link",
}


In [None]:
def infer_leather(model, desc):
    """Guess the leather type from the model line and/or the description.

    The model line is searched first and the description is only a fallback,
    so an incidental word in the description (e.g. "comes with box") cannot
    override the leather named in the model. Names are matched as whole
    words, case-insensitively, so "boxy" or "inbox" do not count as "box".

    Parameters
    ----------
    model : str or None
        Short model line of the product.
    desc : str or None
        Long-form product description.

    Returns
    -------
    str or None
        The capitalised leather name (e.g. ``"Togo"``), or ``None`` when no
        known leather is mentioned.
    """
    leather_names = ["togo", "epsom", "clemence", "swift", "box", "chevre"]
    for source in (model, desc):
        if not source:
            continue
        lowered = source.lower()
        for name in leather_names:
            if re.search(r"\b" + re.escape(name) + r"\b", lowered):
                return name.capitalize()
    return None


def infer_hardware(desc):
    """Guess the hardware finish mentioned in a product description.

    The description is lower-cased and checked, in order, for the phrases
    "palladium hardware", "gold hardware" and "silver hardware"; the label of
    the first phrase found is returned. ``None`` is returned for an empty
    description or when none of the phrases occurs.
    """
    if not desc:
        return None

    lowered = desc.lower()
    phrase_to_label = (
        ("palladium hardware", "Palladium"),
        ("gold hardware", "Gold"),
        ("silver hardware", "Silver"),
    )
    return next(
        (label for phrase, label in phrase_to_label if phrase in lowered),
        None,
    )

def infer_color_from_model(model):
    """Return the last whitespace-separated word of ``model`` as the colour.

    Only the final word is taken, so a multi-word colour name yields just its
    last word.

    Parameters
    ----------
    model : str or None
        Short model line of the product.

    Returns
    -------
    str or None
        The last word, or ``None`` when ``model`` is empty, ``None`` or
        contains only whitespace.
    """
    if not model:
        return None
    parts = model.split()
    # A whitespace-only string is truthy but splits into an empty list.
    return parts[-1] if parts else None

def infer_size_from_model(model):
    """Return the first all-digit word of ``model`` as an integer size.

    Parameters
    ----------
    model : str or None
        Short model line of the product, e.g. ``"Togo Birkin 30 Trench"``.

    Returns
    -------
    int or None
        The first purely numeric token, or ``None`` when ``model`` is empty
        or has no such token.
    """
    if not model:
        return None
    for token in model.split():
        # isdecimal() (not isdigit()) so that characters like "²" -- which
        # count as digits but cannot be parsed by int() -- are skipped
        # instead of raising ValueError.
        if token.isdecimal():
            return int(token)
    return None

def extract_description_from_soup(soup):
    """Return the product description found in the page's accordion blocks.

    Each ``div.accordion__content.rte.body-md`` block is inspected in document
    order. The first one whose text contains "this is a" (case-insensitive;
    this also covers "this is an authentic ...") is used, and the text is
    returned starting at the first "this is" in that block.

    Positions are computed on the original text, not on a lower-cased copy:
    lower-casing can change the length of a string for some characters, which
    would make an index taken from the copy point at the wrong place.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed product page.

    Returns
    -------
    str or None
        The description text, or ``None`` when no block qualifies.
    """
    for div in soup.select("div.accordion__content.rte.body-md"):
        text = div.get_text(" ", strip=True)
        if re.search(r"this is a", text, flags=re.IGNORECASE):
            # Guaranteed to match: "this is" is a prefix of "this is a".
            start = re.search(r"this is", text, flags=re.IGNORECASE).start()
            return text[start:]
    return None

def extract_sku_from_soup(soup):
    """Return the numeric item number shown as "Item #: <digits>" on the page.

    Every ``div.accordion__content.rte.body-md`` block is scanned in document
    order; runs of whitespace are collapsed before matching. The digits of
    the first match are returned as a string, or ``None`` if no block
    contains an item number.
    """
    sku_pattern = re.compile(r"Item\s*#:\s*(\d+)", flags=re.IGNORECASE)
    for block in soup.select("div.accordion__content.rte.body-md"):
        normalized = " ".join(block.get_text(" ", strip=True).split())
        found = sku_pattern.search(normalized)
        if found is not None:
            return found.group(1)
    return None

def extract_brand(soup):
    """Return the brand/vendor name shown on the product page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed product page.

    Returns
    -------
    str or None
        Stripped text of the first ``a.fp-product-vendor__link`` element, or
        ``None`` when the page has no such link.
    """
    el = soup.select_one("a.fp-product-vendor__link")
    return el.get_text(strip=True) if el else None


In [None]:
def parse_product_page(url):
    """Fetch one product page and return its fields as a flat dict.

    The dict starts with one entry per key of ``SELECTORS`` (text of the
    first matching element, or ``None``). The price text is then converted
    to a number, and the description, SKU, leather, hardware, colour, size
    and brand are filled in by the dedicated helper functions. The source
    ``url`` is stored under ``"url"``.
    """
    page = get_soup(url)

    # Raw text for every configured selector, in SELECTORS order.
    record = {}
    for field, selector in SELECTORS.items():
        record[field] = get_text(page, selector)

    record["sell_price"] = clean_price(record.get("sell_price"))

    description = extract_description_from_soup(page)
    model_line = record.get("model")

    # Keys already present keep their position; new keys are appended in the
    # order listed here.
    record.update(
        {
            "description": description,
            "sku": extract_sku_from_soup(page),
            "leather": infer_leather(model_line, description),
            "hardware": infer_hardware(description),
            "color": infer_color_from_model(model_line),
            "size_cm": infer_size_from_model(model_line),
            "brand": extract_brand(page),
            "url": url,
        }
    )
    return record


In [None]:
# Parse the sample product page end-to-end and show the resulting dict.
data = parse_product_page(product_url)
data


In [None]:
def make_driver(headless=True):
    """Create and return a Chrome WebDriver.

    The driver binary is obtained through ``ChromeDriverManager().install()``.
    Chrome is started with ``--no-sandbox`` and ``--disable-dev-shm-usage``,
    plus ``--headless=new`` (added first) when ``headless`` is true.
    """
    chrome_options = Options()

    flags = ["--no-sandbox", "--disable-dev-shm-usage"]
    if headless:
        flags.insert(0, "--headless=new")
    for flag in flags:
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

# One shared browser session, reused by the listing-page cells below.
driver = make_driver()


In [None]:
# Site root. NOTE(review): not referenced by any of the cells visible in this notebook.
BASE_URL = "https://www.fashionphile.com"

def get_product_links_from_listing_selenium(listing_url, driver, max_products=None):
    """Open a listing page in ``driver`` and return its product URLs.

    Waits up to 15 seconds for the product-card anchors to be present (a
    failure of that wait is only reported, not raised), then collects the
    ``href`` of every card anchor that has one containing ``"/products/"``.
    Duplicates are dropped while keeping first-seen order, and the result is
    cut to ``max_products`` entries when that argument is truthy.
    """
    driver.get(listing_url)
    card_locator = (By.CSS_SELECTOR, "a.full-unstyled-link.fp-card__link")

    try:
        cards_present = EC.presence_of_all_elements_located(card_locator)
        WebDriverWait(driver, 15).until(cards_present)
    except Exception as e:
        # Best effort: fall through and work with whatever is on the page.
        print("Timed out waiting for product cards:", e)

    hrefs = (card.get_attribute("href") for card in driver.find_elements(*card_locator))
    # dict.fromkeys de-duplicates while preserving order.
    links = list(dict.fromkeys(h for h in hrefs if h and "/products/" in h))

    if max_products:
        links = links[:max_products]

    print(f"Found {len(links)} product links on this page.")
    return links


In [None]:
def make_page_url(base_url, page):
    """Return the listing URL for a given 1-based page number.

    Page 1 is ``base_url`` itself. For later pages a ``page=<n>`` query
    parameter is appended, joined with ``&`` when ``base_url`` already has a
    query string and with ``?`` when it does not (appending ``&page=`` to a
    URL without ``?`` would produce an invalid address).

    Parameters
    ----------
    base_url : str
        URL of the first listing page.
    page : int
        Page number, starting at 1.

    Returns
    -------
    str
        URL of the requested page.
    """
    if page == 1:
        return base_url
    separator = "&" if "?" in base_url else "?"
    return f"{base_url}{separator}page={page}"

def get_all_product_links_across_pages(base_url, driver, max_pages=10, max_total=None):
    """Collect unique product links from consecutive listing pages.

    Pages 1..``max_pages`` are visited in order. Pagination stops early when

    * a page yields no product links at all,
    * a page yields only links that were already collected (the listing is
      repeating itself, so further pages would just cost browser loads), or
    * ``max_total`` links have been gathered (the result is cut to exactly
      ``max_total``).

    Parameters
    ----------
    base_url : str
        URL of the first listing page.
    driver : selenium WebDriver
        Browser session used to load each page.
    max_pages : int, optional
        Highest page number to visit.
    max_total : int or None, optional
        Upper bound on the number of links returned; falsy means no bound.

    Returns
    -------
    list of str
        Unique product URLs in first-seen order.
    """
    all_links = []
    seen = set()

    for page in range(1, max_pages + 1):
        page_url = make_page_url(base_url, page)
        print(f"\n=== Page {page} → {page_url} ===")

        links = get_product_links_from_listing_selenium(page_url, driver)

        if not links:
            print("No products found on this page. Stopping pagination.")
            break

        # Keep page order, drop repeats within the page and across pages.
        new_links = [u for u in dict.fromkeys(links) if u not in seen]
        if not new_links:
            print("No new products on this page. Stopping pagination.")
            break

        seen.update(new_links)
        all_links.extend(new_links)

        print(f"Total collected so far: {len(all_links)}")

        if max_total and len(all_links) >= max_total:
            all_links = all_links[:max_total]
            print(f"Reached max_total={max_total}. Stopping.")
            break

    return all_links

In [None]:
# Listing page to crawl: the "all bags" collection with a vendor=Hermes refinement,
# sorted by publish date (descending) according to its query string.
base_url = "https://www.fashionphile.com/collections/all-bags?refinementList%5Bvendor%5D%5B0%5D=Hermes&sortBy=shopify_products_published_at_desc"

# Walk up to 10 listing pages and keep at most 1000 unique product links.
all_links = get_all_product_links_across_pages(
    base_url,
    driver,
    max_pages=10,
    max_total=1000
)

# Show how many links were collected plus the first five as a sanity check.
len(all_links), all_links[:5]


In [None]:
def scrape_listing_to_df(listing_url, max_products=1000, delay=1.0):
    """Scrape a collection of product pages into a DataFrame.

    Despite its name, ``listing_url`` is expected to be an iterable of
    product-page URLs (for example the output of
    ``get_all_product_links_across_pages``). A single string is treated as
    one product URL rather than being iterated character by character.

    Parameters
    ----------
    listing_url : iterable of str, or str
        Product-page URLs to scrape.
    max_products : int or None, optional
        Maximum number of URLs to scrape; only the first ``max_products`` are
        used. A falsy value disables the cap.
    delay : float, optional
        Seconds to pause between consecutive requests.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed product. Pages that raise during
        parsing are reported on stdout and skipped.
    """
    if isinstance(listing_url, str):
        links = [listing_url]
    else:
        links = list(listing_url)

    if max_products:
        links = links[:max_products]

    rows = []
    total = len(links)

    for i, url in enumerate(links, start=1):
        print(f"{i}/{total}")
        try:
            rows.append(parse_product_page(url))
        except Exception as e:
            # Best effort: one bad page must not abort the whole run, but say
            # which page failed so it can be retried.
            print(f"   Error parsing {url}:", e)
        # Throttle between requests only; no need to wait after the last one.
        if delay and i < total:
            time.sleep(delay)

    return pd.DataFrame(rows)

# Scrape every collected product link into a DataFrame (one row per parsed page),
# then show the first rows together with the frame's shape.
df = scrape_listing_to_df(all_links, max_products=500)
df.head(), df.shape


In [None]:
# Close the Selenium browser session; the product pages themselves are
# fetched with `requests`, so the driver is no longer needed from here on.
driver.quit()

In [None]:
# Persist the scraped rows to a CSV file in the working directory (without the index column).
df.to_csv("fashionphile_sample.csv", index=False)

In [None]:
# Quick data-quality overview of the scraped frame.
# A notebook cell only renders its LAST bare expression, so every intermediate
# result is passed to display() (provided by the IPython kernel) explicitly;
# otherwise those values would be computed and silently thrown away.
display(df.head())
df.info()  # prints its report directly
display(df.describe())
display(df["sell_price"].describe())
# Share of missing values per column, worst first.
df.isna().mean().sort_values(ascending=False)