In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time,random

In [None]:
import warnings
warnings.filterwarnings("ignore");

In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import requests

In [None]:
# Names of the columns collected for every product. The order here is the
# order in which the `data` dict (and anything built from it) lists them.
data_cols = [
    # Listing title and pricing
    "Product_Name", "Selling_Price", "MRP", "Discount",
    # Product attributes
    "Noise_Control", "Sensitivity", "Headphones_Jack", "Model_Name",
    "Connectivity_Technology", "Wireless_Communication_Technology",
    "Included_Components", "Material", "Specific_Uses_For_Product",
    "Charging_Time", "Compatible_Devices", "Item_Weight",
    "Water_Resistance_Level", "Net_Quantity", "Style", "Control_Method",
    "Number_of_Items", "Battery_Life", "Audio_Driver_Type",
    "Bluetooth_Version", "Audio_Driver_Size", "Earpiece_Shape",
    "Special_Features",
    # Manufacturer and technical details
    "Manufacturer", "Model", "Product_Dimensions", "Item_Model_Number",
    "Batteries", "Batteries_Required", "Country_of_Origin",
]


In [None]:
# Column store: each name in `data_cols` maps to its own, independent empty
# list (one list object per column, never a shared one).
data = dict((column, []) for column in data_cols)

In [None]:
def get_feature(soup, name):
    """Look up a value in a header-cell / data-cell (``<th>`` / ``<td>``) layout.

    NOTE: a later cell in this notebook defines another ``get_feature`` that
    searches ``<span>`` elements instead. When the notebook is run top to
    bottom, that later definition replaces this one.

    Args:
        soup: Parsed document (or node) exposing a BeautifulSoup-style ``find``.
        name: Text to look for inside a ``<th>`` element (substring match).

    Returns:
        The whitespace-stripped text of the first ``<td>`` that follows the
        matching ``<th>``, or ``None`` when either element is missing.
    """
    header = soup.find("th", string=lambda t: t and name in t)
    if header is None:
        return None

    # find_next() yields None when no <td> follows the header; guard against
    # it so a malformed table produces None instead of an AttributeError.
    cell = header.find_next("td")
    if cell is None:
        return None
    return cell.get_text(strip=True)


In [None]:
# HTTP headers passed to `requests.get` when fetching product pages:
# a desktop-browser User-Agent string and an English Accept-Language.
headers = {
    "User-Agent": " ".join([
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    ]),
    "Accept-Language": "en-US,en;q=0.9",
}

def safe_append(key, value):
    """Append ``value`` to the list stored under ``key`` in the global ``data``.

    If ``key`` is not present yet, it is first created with an empty list.
    """
    data.setdefault(key, []).append(value)

# ✅ Helper to extract features
def get_feature(soup, label):
    """Return the text of the ``<span>`` that follows the span labelled ``label``.

    Lookup order:
      1. a span whose text equals ``label`` once surrounding whitespace is
         stripped;
      2. otherwise the first span whose text contains ``label`` as a
         substring (kept as a fallback for labels carrying extra characters).

    Trying the exact match first stops a short label such as "Model" from
    returning the value that belongs to a longer label such as "Model Name".

    Args:
        soup: Parsed document (or node) exposing a BeautifulSoup-style ``find``.
        label: Label text to search for.

    Returns:
        The whitespace-stripped text of the next ``<span>`` after the label
        span, or ``None`` if no label span or no following span is found.
    """
    wanted = label.strip()

    def is_exact(text):
        return bool(text) and text.strip() == wanted

    def contains(text):
        return bool(text) and label in text

    row = soup.find("span", string=is_exact)
    if row is None:
        # Fallback: substring match on the label text.
        row = soup.find("span", string=contains)
    if row is None:
        return None

    val = row.find_next("span")
    return val.get_text(strip=True) if val else None

# ✅ Scraper function
def scrap(c, i):
    """Fetch one product page and append one value per column to ``data``.

    The product link and title come from the first ``a.a-link-normal`` anchor
    inside ``c``. The product page is downloaded with ``requests`` (using the
    module-level ``headers``), parsed with ``html.parser``, and the price
    fields plus every entry of ``feature_map`` are stored via ``safe_append``.

    Nothing is appended when the anchor or its ``href`` is missing, when the
    request raises ``RequestException``, or when the status code is not 200,
    so the columns stay the same length.

    Args:
        c: Search-result container node exposing a BeautifulSoup-style ``find``.
        i: Search page number; used only in the progress messages.

    Returns:
        None. Results are written to the global ``data`` through ``safe_append``.
    """
    a_tag = c.find("a", class_="a-link-normal")
    # .get() instead of ["href"]: an anchor without href is skipped, not a KeyError.
    href = a_tag.get("href") if a_tag is not None else None
    if not href:
        return

    link = "https://www.amazon.in" + href
    title = a_tag.get_text(strip=True)

    # request with headers
    try:
        r = requests.get(link, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching page {i}:", e)
        return
    if r.status_code != 200:
        print(f"⚠️ Failed page {i}, status:", r.status_code)
        return
    soup = BeautifulSoup(r.text, "html.parser")
    print(f"✅ Page {i} scraped")

    # Random pause between product-page requests.
    time.sleep(random.uniform(2, 6))

    # main values
    price = soup.find("span", class_="a-price-whole")
    mrp = soup.find("span", class_="a-price a-text-price")
    discount = soup.find("span", class_="savingsPercentage")

    safe_append("Product_Name", title)
    safe_append("Selling_Price", price.get_text(strip=True) if price else None)
    safe_append("MRP", mrp.get_text(strip=True) if mrp else None)
    safe_append("Discount", discount.get_text(strip=True) if discount else None)

    # Page label -> column name. Two labels may feed the same column
    # ("Special Feature" / "Special Features").
    feature_map = {
        "Noise Control": "Noise_Control",
        "Sensitivity": "Sensitivity",
        "Headphones Jack": "Headphones_Jack",
        "Model Name": "Model_Name",
        "Connectivity Technology": "Connectivity_Technology",
        "Wireless Communication Technology": "Wireless_Communication_Technology",
        "Included Components": "Included_Components",
        "Material": "Material",
        "Specific Uses For Product": "Specific_Uses_For_Product",
        "Charging Time": "Charging_Time",
        "Compatible Devices": "Compatible_Devices",
        "Item Weight": "Item_Weight",
        "Water Resistance Level": "Water_Resistance_Level",
        "Net Quantity": "Net_Quantity",
        "Style": "Style",
        "Control Method": "Control_Method",
        "Number of Items": "Number_of_Items",
        "Battery Life": "Battery_Life",
        "Audio Driver Type": "Audio_Driver_Type",
        "Bluetooth Version": "Bluetooth_Version",
        "Audio Driver Size": "Audio_Driver_Size",
        "Earpiece Shape": "Earpiece_Shape",
        "Special Feature": "Special_Features",
        "Special Features": "Special_Features",
        "Manufacturer": "Manufacturer",
        "Model": "Model",
        "Product Dimensions": "Product_Dimensions",
        "Item model number": "Item_Model_Number",
        "Batteries": "Batteries",
        "Batteries Required": "Batteries_Required",
        "Country of Origin": "Country_of_Origin"
    }

    # Resolve every column first (search by page label, store under the
    # column name; the first non-None value wins), then append exactly one
    # value per column so all lists grow by one entry per product.
    features = {}
    for label, column in feature_map.items():
        value = get_feature(soup, label)
        if features.get(column) is None:
            features[column] = value

    for column, value in features.items():
        safe_append(column, value)

In [None]:
# Walk search-result pages 1..29 and scrape every product container found.
for i in range(1, 30):
    url=f"https://www.amazon.in/s?k=headphones&i=computers&page={i}&xpid=TMfoc7lZx3xLb&crid=1FMYGN6S9G4QP&qid=1758171013&sprefix=headphones%2Ccomputers%2C417&ref=sr_pg_{i}"

    # Fetch with the same headers and timeout used for product pages. A
    # failing page is reported and skipped instead of aborting the whole crawl.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching search page {i}:", e)
        time.sleep(random.uniform(2, 6))  # back off before trying the next page
        continue

    # Name the parser explicitly (same one scrap() uses) so the parse tree
    # does not depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    containers = soup.find_all(
        "div",
        {"class": "a-section a-spacing-none puis-padding-right-small s-title-instructions-style puis-desktop-list-title-instructions-style"}
    )

    for c in containers:
        scrap(c, i)
        time.sleep(random.uniform(2, 6))

In [None]:
!pip install pymongo
from pymongo import MongoClient

In [None]:
# Bring every column up to the length of the longest one by appending None
# at its end, so the dict can be converted into a rectangular DataFrame.
max_len = max(map(len, data.values()))
for column_values in data.values():
    missing = max_len - len(column_values)
    if missing > 0:
        column_values.extend([None] * missing)

# All columns now have equal length.
df = pd.DataFrame(data)

# Write the table to disk (UTF-8 with BOM, no index column).
df.to_csv("headphones_scrape.csv", index=False, encoding="utf-8-sig")
print("✅ CSV saved with", len(df), "rows")

In [None]:
# Client for the MongoDB instance at localhost, port 27017.
client = MongoClient(host="mongodb://localhost:27017/")

In [None]:
# NOTE(review): the database is called "flipkart_database" although the pages
# scraped in this notebook come from amazon.in — confirm the name is intended.
db = client["flipkart_database"]
collection = db["headphones"]

# `df` is the DataFrame built from the scraped columns; no `final_df` is
# defined anywhere in the notebook, so referencing it raised NameError.
# One dict per row, ready for insert_many().
data_dict = df.to_dict("records")

In [None]:
# insert_many() rejects an empty document list, so only write when the
# scrape actually produced rows.
if data_dict:
    collection.insert_many(data_dict)
    print("Dataframe inserted")
else:
    print("No records to insert")