In [1]:
import pandas as pd
import numpy as np
import time, random, requests
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import warnings
warnings.filterwarnings("ignore")

In [4]:
data_cols = [
    "Product_Name",
    "Selling_Price",
    "MRP",
    "Discount",
    "Brand",
    "Connectivity",
    "FormFactor",
    "Item_Weight",
    "Water_Resistance",
    "Battery_Life",
    "Battery_Included",
    "Frequency_Range",
    "Detachable_Cable",
    "Manufacturer",
    "Model",
]


In [5]:
data = {col: [] for col in data_cols}

In [6]:
feature_map = {
    # Connectivity
    "Connectivity Technology": "Connectivity",
    "Wireless Communication Technology": "Connectivity",
    "Headphones Jack": "Connectivity",
    "Cable Feature": "Detachable_Cable",

    # Form Factor
    "Earpiece Shape": "FormFactor",
    "Headphone Type": "FormFactor",
    "Style": "FormFactor",

    # Brand / Manufacturer
    "Brand": "Brand",
    "Manufacturer": "Manufacturer",

    # Weight
    "Item Weight": "Item_Weight",

    # Water resistance
    "Water Resistance Level": "Water_Resistance",
    "IP Rating": "Water_Resistance",

    # Battery
    "Battery Life": "Battery_Life",
    "Batteries Included": "Battery_Included",
    "Batteries Required": "Battery_Included",

    # Frequency
    "Frequency Response": "Frequency_Range",
    "Frequency Range": "Frequency_Range",

    # Cable
    "Detachable Cable": "Detachable_Cable"
}


In [7]:
def normalize_feature(key, value):
    if not value:
        return None

    val = value.strip().lower()

    if key == "Connectivity":
        if "bluetooth" in val or "wireless" in val:
            return "Wireless"
        if "3.5" in val or "wired" in val:
            return "Wired"
    
    if key == "FormFactor":
        if "over" in val:
            return "OverEar"
        if "on" in val:
            return "OnEar"
        if "in" in val:
            return "InEar"
    
    if key == "Water_Resistance":
        if "not" in val:
            return "No"
        return "Yes"
    
    if key in ["Battery_Included", "Detachable_Cable"]:
        if "yes" in val or "included" in val:
            return "Yes"
        return "No"
    
    if key == "Item_Weight":
        return val.replace("grams", "g").replace("gram", "g")

    return value.strip()


In [8]:
def safe_append(key, value):
    if key not in data:
        data[key] = []
    data[key].append(value)

# ✅ Extract feature from product page
def get_feature(soup, label):
    row = soup.find("th", string=lambda t: t and label in t)
    if row:
        val = row.find_next("td")
        return val.get_text(strip=True) if val else None
    return None

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# ✅ Scraper function
def scrap(c, i):
    a_tag = c.find("a", class_="a-link-normal")
    if not a_tag:
        return

    link = "https://www.amazon.in" + a_tag["href"]
    title = a_tag.get_text(strip=True)

    try:
        r = requests.get(link, headers=headers, timeout=10)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, "html.parser")
            print(f"✅ Page {i} scraped: {title[:50]}...")
        else:
            print(f"⚠️ Failed page {i}, status:", r.status_code)
            return
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching page {i}:", e)
        return

    time.sleep(random.uniform(2, 6))

    # main values
    price = soup.find("span", class_="a-price-whole")
    mrp = soup.find("span", class_="a-price a-text-price")
    discount = soup.find("span", class_="savingsPercentage")

    safe_append("Product_Name", title)
    safe_append("Selling_Price", price.get_text(strip=True) if price else None)
    safe_append("MRP", mrp.get_text(strip=True) if mrp else None)
    safe_append("Discount", discount.get_text(strip=True) if discount else None)

    # features loop
    for site_label, schema_key in feature_map.items():
        val = get_feature(soup, site_label)
        val = normalize_feature(schema_key, val)
        safe_append(schema_key, val)


In [9]:
for i in range(1, 3):  # reduce to 2 pages for testing
    url = f"https://www.amazon.in/s?k=headphones&i=computers&page={i}"
    uClient = urlopen(url)
    page_html = uClient.read()
    uClient.close()
    soup = BeautifulSoup(page_html, features="html.parser")
    containers = soup.find_all(
        "div",
        {
            "class": "a-section a-spacing-none puis-padding-right-small s-title-instructions-style puis-desktop-list-title-instructions-style"
        },
    )

    for c in containers:
        scrap(c, i)
        time.sleep(random.uniform(2, 6))

✅ Page 1 scraped: boAt Rockerz 411 (2025 Launch), 40Ms Low Latency, ...
✅ Page 1 scraped: boAt Rockerz 550/Rockerz 558 Over Ear Bluetooth He...
✅ Page 1 scraped: JBL Tune 510BT, On Ear Wireless Headphones with Mi...
✅ Page 1 scraped: boAt Rockerz 450, 15 HRS Battery, 40mm Drivers, Pa...
✅ Page 1 scraped: ZEBRONICS THUNDER Bluetooth 5.3 Wireless Headphone...
✅ Page 1 scraped: boAt Rockerz 421 (2025 Launch), 40Hrs Battery, Low...
✅ Page 1 scraped: boAt BassHeads 225 in-Ear Super Extra Bass Wired H...
✅ Page 1 scraped: Portronics Muffs M2 Bluetooth Headphones Over Ear ...
✅ Page 1 scraped: JBL Tune 770NC Wireless Over Ear ANC Headphones wi...
✅ Page 1 scraped: boAt Rockerz 412 (2025 Launch), 60Hrs Battery, 40M...
✅ Page 1 scraped: boAt Bassheads 900 Pro Wired Headphones with 40Mm ...
✅ Page 1 scraped: Sony WH-1000XM4 Industry Leading Wireless Noise Ca...
✅ Page 1 scraped: Sony WH-CH720N Active Noise Cancellation Wireless ...
✅ Page 1 scraped: JBL C100SI Wired In Ear Headphones With Mic, P

HTTPError: HTTP Error 503: Service Unavailable

In [10]:
max_len = max(len(v) for v in data.values())
for k, v in data.items():
    if len(v) < max_len:
        v.extend([None] * (max_len - len(v)))

# ✅ Convert to dataframe
df = pd.DataFrame(data)

# ✅ Save to CSV
df.to_csv("headphones_clean.csv", index=False, encoding="utf-8-sig")
print("✅ CSV saved with", len(df), "rows and", len(df.columns), "columns")

✅ CSV saved with 72 rows and 15 columns
