In [1]:
import re
import time, random
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

In [3]:
data_cols = [
    "Product_Name","Brand","Model","Connectivity","Type",
    "Battery_Life","Driver_Size","Mic","Colour","Features",
    "Selling_Price","MRP","Discount","Rating","Reviews","URL"
]

In [4]:
features = {col: [] for col in data_cols}

In [5]:
def safe_append(key, value):
    features[key].append(value if value else None)

def parse_headphone_title(title: str):
    safe_append("Product_Name", title)

    # Brand = first word
    brand_match = re.match(r"([A-Za-z]+)", title)
    brand = brand_match.group(1) if brand_match else None
    safe_append("Brand", brand)

    # Model
    model_match = re.search(rf"{brand}\s+([A-Za-z0-9\s/-]+?)(?:,|-|\|)", title) if brand else None
    model = model_match.group(1).strip() if model_match else None
    safe_append("Model", model)

    # Connectivity
    if re.search(r"Wireless|Bluetooth|BT\d+(\.\d+)?", title, re.IGNORECASE):
        safe_append("Connectivity", "Wireless")
    elif re.search(r"Wired|3\.5mm|USB", title, re.IGNORECASE):
        safe_append("Connectivity", "Wired")
    else:
        safe_append("Connectivity", None)

    # Type
    if re.search(r"Over[- ]Ear", title, re.IGNORECASE):
        safe_append("Type", "Over-Ear")
    elif re.search(r"In[- ]Ear", title, re.IGNORECASE):
        safe_append("Type", "In-Ear")
    elif re.search(r"On[- ]Ear", title, re.IGNORECASE):
        safe_append("Type", "On-Ear")
    else:
        safe_append("Type", None)

    # Battery Life
    battery_match = re.search(r"(\d+\s*Hrs?|\d+\s*Hours?)", title, re.IGNORECASE)
    safe_append("Battery_Life", battery_match.group(1) if battery_match else None)

    # Driver size
    driver_match = re.search(r"(\d+\s*mm)", title, re.IGNORECASE)
    safe_append("Driver_Size", driver_match.group(1) if driver_match else None)

    # Mic
    if re.search(r"Mic|Microphone", title, re.IGNORECASE):
        safe_append("Mic", "Yes")
    else:
        safe_append("Mic", "No")

    # Colour
    color_match = re.search(r"(Black|White|Grey|Gray|Blue|Red|Green)", title, re.IGNORECASE)
    safe_append("Colour", color_match.group(1) if color_match else None)

    # Features
    features_text = []
    for word in ["Noise Cancelling", "ANC", "Low Latency", "Gaming", "Hi-Res", "Fast Charging"]:
        if word.lower() in title.lower():
            features_text.append(word)
    safe_append("Features", ", ".join(features_text) if features_text else None)


def scrap(c):
    # Title & Link
    a_tag = c.find("a", class_="a-link-normal")
    if not a_tag:
        return

    link = "https://www.amazon.in" + a_tag["href"]
    title = a_tag.get_text(strip=True)

    parse_headphone_title(title)

    # Selling price
    price_whole = c.find("span", class_="a-price-whole")
    price_fraction = c.find("span", class_="a-price-fraction")
    if price_whole:
        selling_price = price_whole.get_text(strip=True)
        if price_fraction:
            selling_price += "." + price_fraction.get_text(strip=True)
    else:
        selling_price = None

    # MRP
    mrp_tag = c.find("span", class_="a-price a-text-price")
    mrp = mrp_tag.get_text(strip=True) if mrp_tag else None

    # Discount
    discount_tag = c.find("span", class_="savingsPercentage")
    discount = discount_tag.get_text(strip=True) if discount_tag else None

    # Rating
    rating_tag = c.find("span", class_="a-icon-alt")
    rating = rating_tag.get_text(strip=True) if rating_tag else None

    # Reviews
    reviews_tag = c.find("a", attrs={"aria-label": re.compile(r"\d+\s+ratings")})
    reviews = reviews_tag.get_text(strip=True) if reviews_tag else None

    safe_append("Selling_Price", selling_price)
    safe_append("MRP", mrp)
    safe_append("Discount", discount)
    safe_append("Rating", rating)
    safe_append("Reviews", reviews)
    safe_append("URL", link)


# 🔹 main scraping loop with requests
product_class_name = (
    "puisg-col puisg-col-4-of-4 puisg-col-4-of-8 "
    "puisg-col-8-of-12 puisg-col-8-of-16 "
    "puisg-col-12-of-20 puisg-col-12-of-24 puis-list-col-right"
)

In [6]:
for i in range(1, 200):  # scrape first 4 pages (can increase to 30)
    url = f"https://www.amazon.in/s?k=headphones&page={i}"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch page {i}, status {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    containers = soup.find_all("div", {"class": product_class_name})

    for c in containers:
        scrap(c)

    time.sleep(random.uniform(2, 5))  # polite delay

# Convert to DataFrame
df = pd.DataFrame(features)
print(df.head())

                                        Product_Name Brand       Model  \
0                                        Let us know   Let        None   
1                                        Let us know   Let        None   
2  boAt Rockerz 411 (2025 Launch), 40Ms Low Laten...  boAt        None   
3  JBL Tune 510BT, On Ear Wireless Headphones wit...   JBL  Tune 510BT   
4  boAt Rockerz 421 (2025 Launch), 40Hrs Battery,...  boAt        None   

  Connectivity      Type Battery_Life Driver_Size  Mic Colour     Features  \
0         None      None         None        None   No   None         None   
1         None      None         None        None   No   None         None   
2     Wireless  Over-Ear        40Hrs        40Mm  Yes   Blue  Low Latency   
3     Wireless    On-Ear     40 Hours        None  Yes   Blue         None   
4     Wireless  Over-Ear        40Hrs        40Mm  Yes   Blue  Low Latency   

  Selling_Price             MRP Discount              Rating Reviews  \
0        24,98

In [8]:
# Save to CSV
df.to_csv("amazon_headphones1.csv", index=False, encoding="utf-8")
print("✅ Data saved to amazon_keyboards.csv")

✅ Data saved to amazon_keyboards.csv
