In [1]:
import os
import random
import time
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

# 1. SETUP & LOAD EXISTING DATA
# The CSV is both the input (rows gathered by earlier runs) and the output
# of this cell, so later steps append to whatever is loaded here.
filename = "snapdeal_mass_dataset.csv"

if not os.path.exists(filename):
    # First run: nothing on disk yet, so start from an empty frame and an
    # empty set of already-seen product links.
    print("--- ‚ö†Ô∏è No existing dataset found. Starting fresh. ---")
    df_existing = pd.DataFrame()
    existing_links = set()
else:
    df_existing = pd.read_csv(filename)
    print(f"--- üìÇ Loaded existing dataset with {len(df_existing)} rows ---")
    # Links already present in the file; used later to skip duplicates.
    existing_links = set(df_existing['Product_Link'].tolist())

# 2. DEFINE "HIGH RISK" KEYWORDS
# Search phrases picked to surface grey-market / counterfeit listings.
grey_market_keywords = [
    "first copy watches men",
    "7a quality watch",
    "replica shoes for men",
    "master copy watch",
    "imported wireless headphones",
    "clone iphone",
    "first copy airpods",
    "replica sunglasses",
    "first copy shoes",
    "imported smart watch",
]

# Accumulators for this run: scraped rows, and how many of them were
# labelled grey-market versus the number we are aiming for.
new_data = []
collected_grey_count = 0
target_grey_count = 300

# Search URL template: first placeholder is the URL-encoded keyword, second
# is the result offset used for paging.
base_url = "https://www.snapdeal.com/search?keyword={}&sort=rlvncy&start={}"

# Browser-like request headers sent with every search request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.5",
}

print(f"--- üïµÔ∏è STARTING GREY MARKET BOOST (Target: {target_grey_count} new rows) ---")

def _parse_rupees(tag, default=0.0):
    """Convert the text of a price tag (e.g. ``Rs. 1,299``) to a float.

    Returns *default* when *tag* is missing or when its text, after the
    ``Rs.`` prefix and thousands separators are removed, is not a plain
    non-negative decimal number.
    """
    if tag is None:
        return default
    cleaned = tag.get_text(strip=True).replace('Rs.', '').replace(',', '').strip()
    # Allow at most one decimal point so inputs like "1.2.3" are rejected
    # here instead of blowing up inside float().
    if not cleaned.replace('.', '', 1).isdigit():
        return default
    try:
        return float(cleaned)
    except ValueError:
        return default


def _grey_market_flag(title, price, discount):
    """Return 1 when the heuristics label a listing grey-market, else 0.

    *price* is the selling price (0 means "unknown") and *discount* is the
    fractional discount off MRP (0.8 == 80% off). A listing is flagged when
    its score reaches 30: a suspicious keyword alone is enough (+50), or a
    cheap watch (+20) combined with a steep discount (+20).
    """
    text = title.lower()
    score = 0
    if any(word in text for word in ('copy', 'replica', '7a', 'clone', 'master', 'import')):
        score += 50
    # price == 0 means the price could not be read; it must not count as
    # "cheap", otherwise every unparsed watch would look suspicious.
    if "watch" in text and 0 < price < 500:
        score += 20
    if discount > 0.75:
        score += 20
    return 1 if score >= 30 else 0


# Walk each high-risk keyword, paging through search results 20 at a time,
# until enough grey-market rows have been collected.
for query in grey_market_keywords:
    if collected_grey_count >= target_grey_count:
        break

    print(f"\nProcessing High-Risk Query: {query}...")

    # Percent-encode the whole keyword (not just spaces) so characters such
    # as '&' or '#' cannot corrupt the query string.
    encoded_query = quote(query, safe='')

    # Search deeper (up to 10 pages) for these specific fake items
    for start_offset in range(0, 200, 20):
        # Stop paging as soon as the target is met; otherwise every
        # remaining page would still be requested for nothing.
        if collected_grey_count >= target_grey_count:
            break

        url = base_url.format(encoded_query, start_offset)

        # Random pause between requests to avoid hammering the site.
        time.sleep(random.uniform(2, 4))
        try:
            response = requests.get(url, headers=headers, timeout=15)
        except requests.RequestException as e:
            print(f"Error: {e}")
            break

        if response.status_code != 200:
            print(f"  - Offset {start_offset}: HTTP {response.status_code}, moving to next query")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        products = soup.find_all('div', class_='product-tuple-listing')

        # An empty page means the results for this keyword are exhausted.
        if not products:
            break

        batch_count = 0
        for item in products:
            if collected_grey_count >= target_grey_count:
                break

            try:
                # A. Scrape Details
                link_tag = item.find('a', class_='dp-widget-link')
                link = link_tag.get('href', '') if link_tag else ''

                # The link is the de-duplication key. A tile without one
                # cannot be de-duplicated, so skip it rather than letting
                # an empty string poison existing_links. Also skip links
                # already in the existing file or the current batch.
                if not link or link in existing_links:
                    continue

                title_tag = item.find('p', class_='product-title')
                title = title_tag.get_text(strip=True) if title_tag else "N/A"

                price = _parse_rupees(item.find('span', class_='lfloat product-price'))
                # With no (readable) MRP, fall back to the selling price,
                # i.e. treat the item as not discounted.
                mrp = _parse_rupees(
                    item.find('span', class_='lfloat product-desc-price strike'),
                    default=price,
                )

                rating_tag = item.find('p', class_='product-rating-count')
                reviews = rating_tag.get_text(strip=True).replace('(', '').replace(')', '') if rating_tag else "0"
            except (AttributeError, KeyError, TypeError, ValueError) as err:
                # Best effort: one malformed tile must not abort the page,
                # but report it instead of hiding it.
                print(f"    ! Skipping malformed product tile: {err!r}")
                continue

            # A discount is only meaningful when both prices are known;
            # an unparsed price (0) would otherwise look like 100% off.
            discount = 1 - (price / mrp) if price > 0 and mrp > 0 else 0.0

            # B. Check "Is_Grey_Market" Logic IMMEDIATELY
            is_grey = _grey_market_flag(title, price, discount)

            # C. Save (Append)
            # We accept ALL items from these keywords, but we expect mostly Grey ones.
            new_data.append({
                "Search_Term": query,
                "Product_Title": title,
                "Selling_Price": price,
                "MRP": mrp,
                "Discount_Pct": round(discount * 100, 1),
                "Review_Count": reviews,
                "Product_Link": link,
                "Is_Grey_Market": is_grey
            })

            existing_links.add(link)
            if is_grey == 1:
                collected_grey_count += 1
            batch_count += 1

        print(f"  - Offset {start_offset}: Found {batch_count} items ({collected_grey_count} confirmed Grey Market so far)")

# 3. MERGE AND SAVE
if not new_data:
    # Nothing was scraped this run, so the file on disk is left untouched.
    print("‚ùå No new data found. Try adding more keywords.")
else:
    # Turn this run's rows into a frame and append them below the rows
    # that were loaded from disk, renumbering the index from zero.
    df_new = pd.DataFrame(new_data)
    df_final = pd.concat([df_existing, df_new], ignore_index=True)

    # Overwrite the original CSV with the combined dataset.
    df_final.to_csv(filename, index=False)

    # Report what was added and the resulting label balance.
    print(f"\n‚úÖ SUCCESS! Added {len(df_new)} new rows.")
    print(f"üìä Total Dataset Size: {len(df_final)}")
    print("New Class Distribution:")
    print(df_final['Is_Grey_Market'].value_counts())

--- üìÇ Loaded existing dataset with 500 rows ---
--- üïµÔ∏è STARTING GREY MARKET BOOST (Target: 300 new rows) ---

Processing High-Risk Query: first copy watches men...
  - Offset 0: Found 1 items (1 confirmed Grey Market so far)
  - Offset 20: Found 7 items (2 confirmed Grey Market so far)
  - Offset 40: Found 4 items (5 confirmed Grey Market so far)
  - Offset 60: Found 7 items (9 confirmed Grey Market so far)
  - Offset 80: Found 3 items (11 confirmed Grey Market so far)
  - Offset 100: Found 5 items (14 confirmed Grey Market so far)
  - Offset 120: Found 4 items (16 confirmed Grey Market so far)
  - Offset 140: Found 13 items (25 confirmed Grey Market so far)
  - Offset 160: Found 7 items (31 confirmed Grey Market so far)
  - Offset 180: Found 9 items (35 confirmed Grey Market so far)

Processing High-Risk Query: 7a quality watch...
  - Offset 0: Found 10 items (35 confirmed Grey Market so far)

Processing High-Risk Query: replica shoes for men...
  - Offset 0: Found 16 items (3