In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time,random

In [2]:
import warnings
warnings.filterwarnings("ignore");

In [3]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import requests

In [4]:
data_cols = [
    "Product_Name",
    "Selling_Price",
    "MRP",
    "Discount",

    # Core Info
    "Brand",
    "Model",
    "Model_Name",
    "Item_model_number",
    "Series_number",
    "UPC",

    # Design / Build
    "Colour",
    "Form_Factor",               # Over-ear / In-ear / On-ear
    "Style",
    "Material",
    "Earpiece_Shape",
    "Control_Type",
    "Control_Method",
    "Controller_Type",
    "Cable_Feature",
    "Cable_Length",
    "Carrying_Case_Material",
    "Carrying_Case_Color",

    # Connectivity
    "Connectivity_Technology",   # Wired / Wireless
    "Wireless_Communication_Technology",
    "Headphones_Jack",
    "Bluetooth_Version",
    "Bluetooth_Range",

    # Audio Specs
    "Audio_Driver_Type",
    "Audio_Driver_Size",
    "Frequency_Range",
    "Sensitivity",
    "Impedance",
    "Special_Feature",
    "Special_Features",

    # Battery / Power
    "Battery_Life",
    "Charging_Time",
    "Battery_Type",
    "Battery_cell_composition",
    "Are_Batteries_Included",
    "Batteries_Required",
    "Includes_Rechargeable_Battery",
    "Lithium_Battery_Energy_Content",
    "Number_of_Lithium_Ion_Cells",

    # Usage
    "Specific_Uses_For_Product",
    "Recommended_Uses_For_Product",
    "Compatible_Devices",
    "Age_Range_Description",
    "Water_Resistance_Level",

    # Misc
    "Included_Components",
    "Net_Quantity",
    "Number_of_Items",
    "Manufacturer",
    "Country_of_Origin",
    "Product_Dimensions",
    "Package_Dimensions",
    "Item_Weight",
    "Does_it_contain_liquid"
]


In [5]:
data = {col: [] for col in data_cols}

In [6]:
def get_feature(soup, name):
    row = soup.find("th", string=lambda t: t and name in t)
    if row:
        return row.find_next("td").get_text(strip=True)
    return None


In [9]:
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
def scrap(c, i):
    a_tag = c.find("a", class_="a-link-normal")
    if not a_tag:
        return

    link = "https://www.amazon.in" + a_tag["href"]
    title = a_tag.get_text(strip=True)

    # ✅ requests with headers
    try:
        r = requests.get(link, headers=headers, timeout=10)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, "html.parser")
            print(f"✅ Page {i} scraped")
        else:
            print(f"⚠️ Failed page {i}, status:", r.status_code)
            return
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching page {i}:", e)
        return
    
    time.sleep(random.uniform(2, 6))

    # main values
    price = soup.find("span", class_="a-price-whole")
    mrp = soup.find("span", class_="a-price a-text-price")
    discount = soup.find("span", class_="savingsPercentage")

    # push into dictionary
    data["Product_Name"].append(title)
    data["Selling_Price"].append(price.get_text(strip=True) if price else None)
    data["MRP"].append(mrp.get_text(strip=True) if mrp else None)
    data["Discount"].append(discount.get_text(strip=True) if discount else None)

    # feature extraction (headphone-specific + generic)
    data["Brand"].append(get_feature(soup, "Brand"))
    data["Manufacturer"].append(get_feature(soup, "Manufacturer"))
    data["Model_Name"].append(get_feature(soup, "Model Name"))
    data["Model"].append(get_feature(soup, "Model"))
    data["Model_Number"].append(get_feature(soup, "Item model number"))
    data["Series_Number"].append(get_feature(soup, "Series number"))
    data["Colour"].append(get_feature(soup, "Colour"))
    data["Form_Factor"].append(get_feature(soup, "Form Factor"))
    data["Style"].append(get_feature(soup, "Style"))
    data["Material"].append(get_feature(soup, "Material"))
    data["Earpiece_Shape"].append(get_feature(soup, "Earpiece Shape"))
    data["Special_Feature"].append(get_feature(soup, "Special Feature"))
    data["Special_Features"].append(get_feature(soup, "Special Features"))
    data["Connectivity_Technology"].append(get_feature(soup, "Connectivity Technology"))
    data["Wireless_Communication_Technology"].append(get_feature(soup, "Wireless Communication Technology"))
    data["Compatible_Devices"].append(get_feature(soup, "Compatible Devices"))
    data["Specific_Uses_For_Product"].append(get_feature(soup, "Specific Uses For Product"))
    data["Recommended_Uses_For_Product"].append(get_feature(soup, "Recommended Uses For Product"))
    data["Age_Range"].append(get_feature(soup, "Age Range (Description)"))
    data["Control_Type"].append(get_feature(soup, "Control Type"))
    data["Control_Method"].append(get_feature(soup, "Control Method"))
    data["Controller_Type"].append(get_feature(soup, "Controller Type"))
    data["Included_Components"].append(get_feature(soup, "Included Components"))
    data["Cable_Feature"].append(get_feature(soup, "Cable Feature"))
    data["Cable_Length"].append(get_feature(soup, "Cable Length"))
    data["Headphones_Jack"].append(get_feature(soup, "Headphones Jack"))
    data["Noise_Control"].append(get_feature(soup, "Noise Control"))
    data["Water_Resistance_Level"].append(get_feature(soup, "Water Resistance Level"))
    data["Frequency_Range"].append(get_feature(soup, "Frequency Range"))
    data["Audio_Driver_Type"].append(get_feature(soup, "Audio Driver Type"))
    data["Audio_Driver_Size"].append(get_feature(soup, "Audio Driver Size"))
    data["Bluetooth_Range"].append(get_feature(soup, "Bluetooth Range"))
    data["Bluetooth_Version"].append(get_feature(soup, "Bluetooth Version"))
    data["Battery_Life"].append(get_feature(soup, "Battery Life"))
    data["Charging_Time"].append(get_feature(soup, "Charging Time"))
    data["Batteries"].append(get_feature(soup, "Batteries"))
    data["Batteries_Included"].append(get_feature(soup, "Batteries Included"))
    data["Batteries_Required"].append(get_feature(soup, "Batteries Required"))
    data["Battery_Cell_Composition"].append(get_feature(soup, "Battery cell composition"))
    data["Includes_Rechargeable_Battery"].append(get_feature(soup, "Includes Rechargeable Battery"))
    data["Lithium_Battery_Energy_Content"].append(get_feature(soup, "Lithium Battery Energy Content"))
    data["Number_of_Lithium_Ion_Cells"].append(get_feature(soup, "Number of Lithium Ion Cells"))
    data["Does_it_contain_liquid"].append(get_feature(soup, "Does it contain liquid?"))
    data["Carrying_Case_Material"].append(get_feature(soup, "Carrying Case Material"))
    data["Carrying_Case_Color"].append(get_feature(soup, "Carrying Case Color"))
    data["UPC"].append(get_feature(soup, "UPC"))
    data["Net_Quantity"].append(get_feature(soup, "Net Quantity"))
    data["Number_of_Items"].append(get_feature(soup, "Number of Items"))
    data["Product_Dimensions"].append(get_feature(soup, "Product Dimensions"))
    data["Package_Dimensions"].append(get_feature(soup, "Package Dimensions"))
    data["Country_of_Origin"].append(get_feature(soup, "Country of Origin"))
    data["Item_Weight"].append(get_feature(soup, "Item Weight"))

In [10]:
for i in range(1,30):
    url=f"https://www.amazon.in/s?k=laptop&i=computers&page={i}&crid=1YDNORBH8ERL2&qid=1758091753&sprefix=laptop%2Ccomputers%2C372&xpid=SjrP3069I27qU&ref=sr_pg_{i}"
    uClient = urlopen(url)
    page_html = uClient.read()
    uClient.close()
    soup = BeautifulSoup(page_html, features='html')
    containers = soup.find_all(
        "div",
        {"class": "a-section a-spacing-none puis-padding-right-small s-title-instructions-style puis-desktop-list-title-instructions-style"}
    )

    for c in containers:
        scrap(c)
        time.sleep(random.uniform(2, 6))

✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 1 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 2 scraped
✅ Page 3 scraped
✅ Page 3 scraped
✅ Page 3 scraped
✅ Page 3 scraped
✅ Page 3 scraped
✅ Page 3 scraped
✅ Page 3 scraped
✅ Page 3 scraped
✅ Page 3 scraped
✅ Page 3 scraped
✅ Page 3 scrap

UnboundLocalError: cannot access local variable 'soup' where it is not associated with a value

In [12]:
df = pd.DataFrame(data)
df.to_csv("laptops.csv", index=False, encoding="utf-8-sig")

✅ CSV saved with 222 rows
