In [1]:
import re
import time, random
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# HTTP headers attached to every search-page request: a desktop-Chrome
# User-Agent string plus an English Accept-Language preference.
headers = {}
headers["User-Agent"] = "".join(
    [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
        "AppleWebKit/537.36 (KHTML, like Gecko) ",
        "Chrome/120.0.0.0 Safari/537.36",
    ]
)
headers["Accept-Language"] = "en-US,en;q=0.9"

In [3]:
# Column names of the scraped dataset, in output order.
data_cols = [
    "Product_Name",
    "Brand",
    "Model",
    "Connectivity",
    "Type",
    "Keys_Count",
    "Special_Keys",
    "Colour",
    "Features",
    "Selling_Price",
    "MRP",
    "Discount",
    "Rating",
    "Reviews",
    "URL",
]

# Column-oriented accumulator: one independent, initially empty list per column.
features = dict((name, []) for name in data_cols)

In [4]:
def safe_append(key, value):
    """Append ``value`` to the ``features[key]`` list, storing None for any falsy value.

    Falsy inputs (None, empty string, 0, ...) are all recorded as None.
    Raises KeyError if ``key`` is not a key of ``features``.
    """
    column = features[key]
    if not value:
        value = None
    column.append(value)

def parse_keyboard_title(title: str) -> None:
    """Derive structured fields from a product title and record them.

    Exactly one value is appended (through ``safe_append``) to each of the
    columns Product_Name, Brand, Model, Connectivity, Type, Keys_Count,
    Special_Keys, Colour and Features, in that order. Every field is a
    regex/substring heuristic over ``title``; a field that cannot be
    determined is recorded as None (Special_Keys is recorded as "No").

    Args:
        title: Product title text.
    """
    safe_append("Product_Name", title)

    # Brand = leading run of ASCII letters (None if the title starts with anything else).
    brand_match = re.match(r"([A-Za-z]+)", title)
    brand = brand_match.group(1) if brand_match else None
    safe_append("Brand", brand)

    # Model = text following the brand, up to the first comma, hyphen or pipe.
    # `brand` consists of ASCII letters only, so it is safe to interpolate unescaped.
    # NOTE: a bare hyphen ends the capture, so a hyphenated model such as
    # "ZEB-KM2100" is recorded as "ZEB"; titles without any delimiter give None.
    model_match = re.search(rf"{brand}\s+([A-Za-z0-9\s/-]+?)(?:,|-|\|)", title) if brand else None
    model = model_match.group(1).strip() if model_match else None
    safe_append("Model", model)

    # Connectivity: wireless keywords win over wired ones, so a wireless
    # keyboard that mentions a "USB receiver" is still recorded as Wireless.
    if re.search(r"Wireless|2\.4\s*Ghz|Bluetooth", title, re.IGNORECASE):
        connectivity = "Wireless"
    elif re.search(r"Wired|USB", title, re.IGNORECASE):
        connectivity = "Wired"
    else:
        connectivity = None
    safe_append("Connectivity", connectivity)

    # Type: first keyword found, in priority order Mechanical > Gaming > Multimedia.
    keyboard_type = None
    for candidate in ("Mechanical", "Gaming", "Multimedia"):
        if re.search(candidate, title, re.IGNORECASE):
            keyboard_type = candidate
            break
    safe_append("Type", keyboard_type)

    # Keys count, kept as written in the title (e.g. "104 Keys").
    keys_match = re.search(r"(\d+\s*Keys?)", title, re.IGNORECASE)
    safe_append("Keys_Count", keys_match.group(1) if keys_match else None)

    # Special keys
    has_special_keys = re.search(r"Hot Keys|Multimedia Keys|Rupee Key", title, re.IGNORECASE)
    safe_append("Special_Keys", "Yes" if has_special_keys else "No")

    # Colour: match whole words only. Without the \b anchors "Wired" matched
    # as "red", "Bluetooth" as "Blue" and "Redragon" as "Red".
    color_match = re.search(r"\b(Black|White|Grey|Gray|Blue|Red)\b", title, re.IGNORECASE)
    safe_append("Colour", color_match.group(1) if color_match else None)

    # Features: case-insensitive substring hits, joined in this fixed order.
    lowered_title = title.lower()
    feature_words = ["Spill", "Ergonomic", "Chiclet", "Programmable", "Slim", "Wrist Rest", "Foam"]
    features_text = [word for word in feature_words if word.lower() in lowered_title]
    safe_append("Features", ", ".join(features_text) if features_text else None)


def _tag_text(tag):
    """Return the whitespace-stripped text of a tag, or None when the tag is missing."""
    return tag.get_text(strip=True) if tag else None


def scrap(c):
    """Extract one search-result container into the ``features`` columns.

    Title-derived columns are appended by ``parse_keyboard_title``; this
    function then appends Selling_Price, MRP, Discount, Rating, Reviews and
    URL through ``safe_append``. A container is skipped (nothing is appended
    to any column) when it has no ``a.a-link-normal`` anchor or that anchor
    has no ``href``, so all columns keep the same length.

    Args:
        c: A BeautifulSoup element for one result container.
    """
    # Title & Link
    a_tag = c.find("a", class_="a-link-normal")
    href = a_tag.get("href") if a_tag else None
    if not href:
        # Previously an anchor without href raised KeyError and aborted the crawl.
        return

    # urljoin leaves an already-absolute href untouched instead of prefixing the host twice.
    link = urljoin("https://www.amazon.in", href)
    title = a_tag.get_text(strip=True)

    # NOTE(review): the first a.a-link-normal in a container is not always the
    # product title link (the recorded output contains rows titled
    # "Let us know"); a more specific selector may be needed - verify against
    # the live page markup.
    parse_keyboard_title(title)

    # Selling price: whole part, plus ".<fraction>" when a fraction span exists.
    price_whole = c.find("span", class_="a-price-whole")
    price_fraction = c.find("span", class_="a-price-fraction")
    if price_whole:
        selling_price = price_whole.get_text(strip=True)
        if price_fraction:
            selling_price += "." + price_fraction.get_text(strip=True)
    else:
        selling_price = None

    # MRP
    mrp = _tag_text(c.find("span", class_="a-price a-text-price"))

    # Discount
    discount = _tag_text(c.find("span", class_="savingsPercentage"))

    # Rating
    rating = _tag_text(c.find("span", class_="a-icon-alt"))

    # Reviews: anchor whose aria-label contains "<digits> ratings".
    reviews = _tag_text(c.find("a", attrs={"aria-label": re.compile(r"\d+\s+ratings")}))

    safe_append("Selling_Price", selling_price)
    safe_append("MRP", mrp)
    safe_append("Discount", discount)
    safe_append("Rating", rating)
    safe_append("Reviews", reviews)
    safe_append("URL", link)

In [7]:
# Full class attribute of the <div> wrapping one search result; find_all below
# matches this exact string, so any change in the page's class list yields
# zero containers.
product_class_name = (
    "puisg-col puisg-col-4-of-4 puisg-col-4-of-8 "
    "puisg-col-8-of-12 puisg-col-8-of-16 "
    "puisg-col-12-of-20 puisg-col-12-of-24 puis-list-col-right"
)

# Search-result pages to request: 150 through 299 inclusive.
# NOTE(review): the previous comment here said "4 pages (change to 30 if
# needed)", which does not match this range - confirm the intended pages.
PAGE_NUMBERS = range(150, 300)

# Upper bound (seconds) on each request; without a timeout requests.get can
# block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

for i in PAGE_NUMBERS:
    url = f"https://www.amazon.in/s?k=keyboards&page={i}"

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # A single network error should not abort the whole crawl.
        print(f"Failed to fetch page {i}: {exc}")
        response = None

    if response is not None:
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            containers = soup.find_all("div", {"class": product_class_name})

            for c in containers:
                scrap(c)
        else:
            print(f"Failed to fetch page {i}, status {response.status_code}")

    # Pause after every attempt, including failed ones, so an error response
    # is not followed by an immediate burst of further requests.
    time.sleep(random.uniform(2, 5))

In [8]:
# Build the table from the column lists collected in `features`.
df = pd.DataFrame(features)
print(df.head())

# Save to CSV. The path is held in one variable so the confirmation message
# always names the file actually written (it previously reported
# "amazon_keyboards.csv" while writing "amazon_keyboards2.csv").
output_path = "amazon_keyboards2.csv"
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"✅ Data saved to {output_path}")

                                        Product_Name      Brand  \
0                                        Let us know        Let   
1                                        Let us know        Let   
2  ZEBRONICS ZEB-KM2100 Multimedia USB Keyboard C...  ZEBRONICS   
3  Dell KB216 Wired Multimedia Keyboard - Full-Si...       Dell   
4  ZEBRONICS K36 Wired USB Keyboard with 106 Keys...  ZEBRONICS   

                                  Model Connectivity        Type Keys_Count  \
0                                  None         None        None       None   
1                                  None         None        None       None   
2                                   ZEB        Wired  Multimedia   114 Keys   
3       KB216 Wired Multimedia Keyboard        Wired  Multimedia       None   
4  K36 Wired USB Keyboard with 106 Keys        Wired        None   106 Keys   

  Special_Keys Colour        Features Selling_Price           MRP Discount  \
0           No   None            None       