In [1]:
import re
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import requests, time, random

In [2]:
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
session = requests.Session()

In [3]:
def parse_headphone_title(title: str):
    features = {}
    features["Product_Name"] = title

    # Brand
    brand_match = re.match(r"([A-Za-z]+)", title)
    features["Brand"] = brand_match.group(1) if brand_match else None

    # Model
    model_match = re.search(rf"{features['Brand']}\s+([A-Za-z0-9\s/-]+?)(?:,|\()", title)
    features["Model"] = model_match.group(1).strip() if model_match else None

    # Battery Life
    battery_match = re.search(r"(\d+\s*(?:Hrs|Hours|Hr|Hour|Playback))", title, re.IGNORECASE)
    features["Battery_Life"] = battery_match.group(1) if battery_match else None

    # Driver Size
    driver_match = re.search(r"(\d+\s*mm)\s*Drivers?", title, re.IGNORECASE)
    features["Driver_Size"] = driver_match.group(1) if driver_match else None

    # Bluetooth Version
    bt_match = re.search(r"(Bluetooth\s*v?\d+(\.\d+)?)", title, re.IGNORECASE)
    features["Bluetooth_Version"] = bt_match.group(1) if bt_match else None

    # Connectivity
    if "Wireless" in title:
        features["Connectivity"] = "Wireless"
    elif "Wired" in title or "USB" in title:
        features["Connectivity"] = "Wired"
    else:
        features["Connectivity"] = None

    # Type
    if "Over Ear" in title:
        features["Type"] = "Over Ear"
    elif "On Ear" in title:
        features["Type"] = "On Ear"
    else:
        features["Type"] = None

    # Mic
    features["Mic"] = "Yes" if "Mic" in title else "No"

    # Color
    color_match = re.search(r"\(([^)]+)\)$", title.strip())
    features["Color"] = color_match.group(1) if color_match else None

    return features


# --- Backup: scrape missing values from product page ---
def fill_from_url(url, features):
    try:
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            return features
        soup = BeautifulSoup(r.text, "html.parser")

        specs = soup.find_all("tr")
        for tr in specs:
            th = tr.find("th")
            td = tr.find("td")
            if not th or not td:
                continue
            label = th.get_text(strip=True)
            value = td.get_text(strip=True)

            # check and fill missing ones
            if not features.get("Battery_Life") and "Battery" in label:
                features["Battery_Life"] = value
            if not features.get("Driver_Size") and "Driver" in label:
                features["Driver_Size"] = value
            if not features.get("Bluetooth_Version") and "Bluetooth" in label:
                features["Bluetooth_Version"] = value
            if not features.get("Item_Weight") and "Weight" in label:
                features["Item_Weight"] = value
            if not features.get("Water_Resistance") and "Water" in label:
                features["Water_Resistance"] = value

    except Exception as e:
        print("⚠️ URL scrape failed:", e)

    return features


In [4]:
titles, links = [], []
url = "https://www.amazon.in/s?k=headphones&page=1"
req = Request(url, headers=headers)
html = urlopen(req).read()
soup = BeautifulSoup(html, "html.parser")

containers = soup.find_all("a", class_="a-link-normal s-no-outline")
for c in containers:
    title = c.find("span", class_="a-size-medium a-color-base a-text-normal")
    if title:
        titles.append(title.get_text(strip=True))
        links.append("https://www.amazon.in" + c["href"])

HTTPError: HTTP Error 503: Service Unavailable

In [None]:
parsed = []
for title, link in zip(titles, links):
    features = parse_headphone_title(title)
    # Fill missing values from product URL
    if any(v is None for v in features.values()):
        features = fill_from_url(link, features)
    parsed.append(features)
    time.sleep(random.uniform(2, 4))  # avoid rate limiting

df = pd.DataFrame(parsed)
df.to_csv("headphones_final.csv", index=False, encoding="utf-8-sig")
print("✅ Done! Rows:", len(df))