In [12]:
import random
import time
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Search URL template: the first slot takes the (already URL-encoded) keyword,
# the second the numeric `start` offset of the result page.
base_url = "https://www.snapdeal.com/search?keyword={}&sort=rlvncy&start={}"

# Search phrases to scrape, processed in this order.
keywords = ["first copy watches", "replica shoes", "imported headphones"]
keywords += ["analog watch men", "running shoes"]

# Headers attached to every request; the User-Agent identifies as desktop Chrome.
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers["Accept-Language"] = "en-US,en;q=0.5"

# --- Scrape settings ---------------------------------------------------------
TARGET_PER_KEYWORD = 100     # rows to collect for each search phrase
PAGE_SIZE = 20               # amount the `start` offset advances per request
MAX_PAGES_PER_KEYWORD = 50   # hard stop so one keyword can never paginate forever


def _parse_price(tag, default):
    """Return the price shown in *tag* as a float, or *default* if unusable.

    *tag* is the element returned by ``item.find(...)`` (or ``None`` when the
    element is missing). The text has ``Rs.`` and thousands separators removed
    and must then be a plain decimal: digits with at most one ``.``.
    """
    if tag is None:
        return default
    text = tag.get_text(strip=True).replace('Rs.', '').replace(',', '').strip()
    # Strip at most ONE dot before the digit check so "1.2.3" is rejected here
    # instead of blowing up inside float().
    if not text.replace('.', '', 1).isdigit():
        return default
    try:
        return float(text)
    except ValueError:
        # str.isdigit() accepts some characters float() rejects (e.g. "²").
        return default


def _extract_product(item, query):
    """Build one output row (a dict) from a single product tile.

    Raises ``KeyError`` when the product anchor exists but has no ``href``;
    the caller reports and skips such tiles.
    """
    title_tag = item.find('p', class_='product-title')
    title = title_tag.get_text(strip=True) if title_tag else "N/A"

    price = _parse_price(item.find('span', class_='lfloat product-price'), 0.0)
    # With no usable MRP, fall back to the selling price (i.e. 0% discount).
    mrp = _parse_price(
        item.find('span', class_='lfloat product-desc-price strike'), price
    )

    rating_tag = item.find('p', class_='product-rating-count')
    reviews = rating_tag.get_text(strip=True).replace('(', '').replace(')', '') if rating_tag else "0"

    link_tag = item.find('a', class_='dp-widget-link')
    link = link_tag['href'] if link_tag else ""

    return {
        "Search_Term": query,
        "Product_Title": title,
        "Selling_Price": price,
        "MRP": mrp,
        # Guard against division by zero when no price could be parsed.
        "Discount_Pct": round((1 - (price / mrp)) * 100, 1) if mrp > 0 else 0,
        "Review_Count": reviews,
        "Product_Link": link,
    }


# Accumulates one dict per unique product across all keywords.
all_data = []
# Product_Link values already stored in all_data, kept as a set so the
# duplicate check is O(1) instead of a scan of every collected row.
# NOTE: tiles without a link share the "" key, so only the first is kept
# (same outcome as the original linear scan).
seen_links = set()

print("--- üïµÔ∏è STARTING MASS SCRAPE (Target: 100 rows per product) ---")

for query in keywords:
    print(f"\nProcessing: {query}...")

    items_collected = 0
    start_offset = 0
    pages_fetched = 0

    while items_collected < TARGET_PER_KEYWORD:
        if pages_fetched >= MAX_PAGES_PER_KEYWORD:
            print(f"  - Page limit ({MAX_PAGES_PER_KEYWORD}) reached. Moving to next keyword.")
            break
        pages_fetched += 1

        # Percent-encode the whole phrase, not just spaces, so characters such
        # as '&' or '#' cannot corrupt the query string.
        url = base_url.format(quote(query, safe=""), start_offset)

        # Random pause between requests to avoid hammering the server.
        time.sleep(random.uniform(2, 4))

        # Deliberately broad boundary: any failure while fetching or parsing a
        # page ends this keyword but keeps the rows gathered so far.
        try:
            response = requests.get(url, headers=headers, timeout=15)

            if response.status_code != 200:
                print(f"  - Blocked or Error at offset {start_offset}. Stopping this keyword.")
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            products = soup.find_all('div', class_='product-tuple-listing')

            if not products:
                print("  - No more items found. Moving to next keyword.")
                break

            batch_count = 0
            for item in products:
                if items_collected >= TARGET_PER_KEYWORD:
                    break

                try:
                    record = _extract_product(item, query)
                except (AttributeError, KeyError, TypeError, ValueError) as e:
                    # Best-effort: skip the bad tile, but say so instead of
                    # hiding it.
                    print(f"  - Skipped a malformed product tile: {e!r}")
                    continue

                link = record["Product_Link"]
                if link in seen_links:
                    continue

                seen_links.add(link)
                all_data.append(record)
                items_collected += 1
                batch_count += 1

            print(f"  - Offset {start_offset}: Grabbed {batch_count} items. Total: {items_collected}/{TARGET_PER_KEYWORD}")

            start_offset += PAGE_SIZE

        except Exception as e:
            print(f"  - Critical Error: {e}")
            break

# Load every scraped row into a DataFrame; it is empty only when nothing was collected.
df = pd.DataFrame(all_data)

if df.empty:
    print("‚ùå Failed to collect data. Check your connection.")
else:
    print(f"\n‚úÖ SCRAPING COMPLETE! Total Rows: {len(df)}")

    def check_suspicious(row):
        """Label one row: 1 when its heuristic score reaches 30, otherwise 0.

        Scoring, read from the row's Product_Title, Selling_Price and
        Discount_Pct fields:
          * +50 if the lower-cased title contains any listed marker substring
          * +20 if the title contains "watch" and the price is below 400
          * +20 if the discount is above 80 percent
        """
        title = row['Product_Title'].lower()
        markers = ('copy', 'replica', 'compatible with', '7a', 'import')

        points = 50 if any(marker in title for marker in markers) else 0
        if "watch" in title and row['Selling_Price'] < 400:
            points += 20
        if row['Discount_Pct'] > 80:
            points += 20

        return 1 if points >= 30 else 0

    # Apply the heuristic row by row and store the 0/1 label as a new column.
    df['Is_Grey_Market'] = df.apply(check_suspicious, axis=1)

    filename = "snapdeal_mass_dataset.csv"
    df.to_csv(filename, index=False)
    print(f"üìÅ Data saved to: {filename}")
    print("\nClass Distribution:")
    print(df['Is_Grey_Market'].value_counts())

--- üïµÔ∏è STARTING MASS SCRAPE (Target: 100 rows per product) ---

Processing: first copy watches...
  - Offset 0: Grabbed 20 items. Total: 20/100
  - Offset 20: Grabbed 16 items. Total: 36/100
  - Offset 40: Grabbed 19 items. Total: 55/100
  - Offset 60: Grabbed 17 items. Total: 72/100
  - Offset 80: Grabbed 15 items. Total: 87/100
  - Offset 100: Grabbed 13 items. Total: 100/100

Processing: replica shoes...
  - Offset 0: Grabbed 20 items. Total: 20/100
  - Offset 20: Grabbed 17 items. Total: 37/100
  - Offset 40: Grabbed 11 items. Total: 48/100
  - Offset 60: Grabbed 17 items. Total: 65/100
  - Offset 80: Grabbed 19 items. Total: 84/100
  - Offset 100: Grabbed 16 items. Total: 100/100

Processing: imported headphones...
  - Offset 0: Grabbed 20 items. Total: 20/100
  - Offset 20: Grabbed 18 items. Total: 38/100
  - Offset 40: Grabbed 13 items. Total: 51/100
  - Offset 60: Grabbed 17 items. Total: 68/100
  - Offset 80: Grabbed 15 items. Total: 83/100
  - Offset 100: Grabbed 10 item