In [23]:
# STEP 1: Import libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import matplotlib.pyplot as plt
import seaborn 
import os

#STEP 2: Set up Selenium with better options
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# STEP 3: Improved scraping function
def scrape_amazon_sponsored(keyword):
    url = f"https://www.amazon.in/s?k={keyword.replace(' ', '+')}"
    print(f"Scraping URL: {url}")
    
    driver.get(url)
    time.sleep(5)  
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    sponsored_products = []
    for product in soup.find_all('div', {'data-component-type': 's-search-result'}):
        sponsored_tag = product.find('span', string=re.compile(r'Sponsored|Sponsored\s*$'))
        if sponsored_tag:
            sponsored_products.append(product)
    if not sponsored_products:
        print("Trying alternative sponsored product detection...")
        sponsored_products = soup.select('div.sg-col-20-of-24.s-result-item.s-asin.sg-col-0-of-12.sg-col-16-of-20')
    
    print(f"Found {len(sponsored_products)} sponsored products")
    return sponsored_products

# STEP 4: Extract product data with better error handling
def extract_product_data(products):
    data = []
    for product in products:
        try:
            title_elem = product.find('h2') or product.find('span', class_='a-size-medium')
            title = title_elem.text.strip() if title_elem else "No title"

            product_url = None
            link = product.find('a', {'class': ['a-link-normal', 's-no-outline']})
            if link and 'href' in link.attrs:
                product_url = "https://www.amazon.in" + link['href'].split('ref=')[0]
            
            brand = None
            brand_span = product.find('span', class_=re.compile(r'a-size-base-plus a-color-base|a-text-bold'))
            if brand_span:
                brand = brand_span.text.strip()
                # Clean common artifacts
                brand = re.sub(r'[^\w\s-]', '', brand).strip()
            
            rating = None
            rating_tag = product.find('span', class_=re.compile(r'a-icon-alt|a-icon-star'))
            if rating_tag:
                rating_text = rating_tag.text
                match = re.search(r'(\d\.\d)', rating_text)
                if match:
                    rating = float(match.group(1))
            
            reviews = None
            reviews_tag = product.find('span', class_=re.compile(r'a-size-base s-underline-text|a-size-base'))
            if reviews_tag:
                reviews_text = re.sub(r'[^\d]', '', reviews_tag.text)
                if reviews_text.isdigit():
                    reviews = int(reviews_text)
            
            price = None
            price_span = product.find('span', class_='a-price-whole')
            if not price_span:
                price_span = product.find('span', class_='a-offscreen')
            if price_span:
                price_text = re.sub(r'[^\d]', '', price_span.text)
                if price_text.isdigit():
                    price = int(price_text)
            
            image_url = None
            img_tag = product.find('img', class_=re.compile(r's-image|product-image'))
            if img_tag and 'src' in img_tag.attrs:
                image_url = img_tag['src']
            
            data.append({
                'Title': title,
                'Brand': brand if brand else 'Unknown',
                'Rating': rating,
                'Reviews': reviews,
                'Price': price,
                'Image URL': image_url,
                'Product URL': product_url
            })
        except Exception as e:
            print(f"Error processing product: {e}")
            continue
    
    return data

if not df.empty:
    # Less aggressive cleaning - keep rows with at least 3 valid fields
    df = df.dropna(thresh=4, subset=['Title', 'Brand', 'Price', 'Rating', 'Reviews'])
    df.drop_duplicates(subset='Product URL', inplace=True, keep='first')

# 📌 STEP 6: Enhanced Visualization with Beautiful Charts

if not df.empty:
    try:
        # Set modern style
        plt.style.use('seaborn-v0_8')
        
        # Create analysis directory
        os.makedirs('amazon_analysis/visualizations', exist_ok=True)
        
        # Custom color palette
        custom_palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
        
        # 1. Top 5 Products by Reviews
        plt.figure(figsize=(12, 6))
        top_reviews = df.nlargest(5, 'Reviews')
        
        # Fixed: Added hue parameter to address FutureWarning
        bar1 = sns.barplot(x='Reviews', y='Title', data=top_reviews, 
                          hue='Title', palette=custom_palette, 
                          edgecolor='black', linewidth=0.5, legend=False)
        
        # Add formatted review counts
        max_reviews = top_reviews['Reviews'].max()
        for p in bar1.patches:
            width = p.get_width()
            plt.text(width + max_reviews*0.05,  # Fixed: Removed 'ha' parameter
                    p.get_y() + p.get_height()/2., 
                    f'{width:,.0f}', 
                    ha='left', va='center')
        
        plt.title('Top 5 Soft Toys by Customer Reviews', fontsize=14, pad=15)
        plt.xlabel('Number of Reviews', fontsize=12)
        plt.ylabel('')
        plt.grid(axis='x', alpha=0.2)
        sns.despine(left=True, bottom=True)
        plt.tight_layout()
        plt.savefig('amazon_analysis/visualizations/top5_by_reviews.png', 
                   dpi=300, bbox_inches='tight')
        plt.close()
        
        # 2. Top 5 Products by Rating (with minimum 10 reviews)
        plt.figure(figsize=(12, 6))
        min_reviews = 10  # Minimum threshold
        top_rated = df[df['Reviews'] >= min_reviews].nlargest(5, 'Rating')
        
        if not top_rated.empty:
            # Fixed: Added hue parameter
            bar2 = sns.barplot(x='Rating', y='Title', data=top_rated, 
                              hue='Title', palette=custom_palette[::-1], 
                              edgecolor='black', linewidth=0.5, legend=False)
            
            # Add rating values
            for p in bar2.patches:
                width = p.get_width()
                plt.text(width + 0.1, 
                        p.get_y() + p.get_height()/2., 
                        f'{width:.1f}', 
                        ha='left', va='center')
            
            plt.title(f'Top 5 Soft Toys by Rating (Min {min_reviews} Reviews)', 
                     fontsize=14, pad=15)
            plt.xlabel('Average Rating (out of 5)', fontsize=12)
            plt.ylabel('')
            plt.xlim(0, 5.5)
            plt.grid(axis='x', alpha=0.2)
            sns.despine(left=True, bottom=True)
            plt.tight_layout()
            plt.savefig('amazon_analysis/visualizations/top5_by_rating.png', 
                       dpi=300, bbox_inches='tight')
            plt.close()
        else:
            print("\n⚠️ No products with minimum review threshold for rating chart")
        
        print("\n✅ Visualizations saved:")
        print("   - top5_by_reviews.png")
        print("   - top5_by_rating.png")
        
    except Exception as e:
        print(f"\n⚠️ Visualization error: {str(e)}")
else:
    print("\n⚠️ No data available for visualizations")
# Main execution
keyword = "soft toys"
sponsored_products = scrape_amazon_sponsored(keyword)

if not sponsored_products:
    print("\n⚠️ No sponsored products found! Possible reasons:")
    print("- Amazon has changed its page structure")
    print("- No sponsored products for this keyword")
    print("- Your IP might be blocked (try without headless mode)")
    print("- Try increasing the wait time")
else:
    data = extract_product_data(sponsored_products)
    df = pd.DataFrame(data)
    
    print("\nRaw Data:")
    print(df.head())
    
    if not df.empty:
        os.makedirs('amazon_analysis', exist_ok=True)
        df.to_csv("amazon_analysis/soft_toys_sponsored_raw.csv", index=False)
        df = df.dropna(subset=['Price', 'Rating', 'Reviews', 'Brand'], how='all')
        df.drop_duplicates(subset='Product URL', inplace=True, keep='first')
        
        print("\nCleaned Data:")
        print(df.head())
        
        if not df.empty:
            print("\n✅ Analysis completed! Check the 'amazon_analysis' folder.")
        else:
            print("\n⚠️ No valid data remaining after cleaning!")
    else:
        print("\n⚠️ No product data could be extracted!")

driver.quit()

  bar2 = sns.barplot(x='Rating', y='Title', data=top_rated,



✅ Visualizations saved:
   - top5_by_reviews.png
   - top5_by_rating.png
Scraping URL: https://www.amazon.in/s?k=soft+toys
Found 12 sponsored products

Raw Data:
                                               Title       Brand  Rating  \
0  Wembley Stacking Toys for Kids Baby Bath Toys ...  Tue 13 May     4.7   
1  HappyBuddy Talking Plush Toy Lenny The Lion | ...  Thu 15 May     NaN   
2  pikipo Super Saver Soft Polyester Toy for Infa...  Thu 15 May     4.5   
3  GOLDENHUB TOYS 15cm Panda Plush Soft Toy, Hugg...  Sat 17 May     NaN   
4  VGRASSP Classic Dial Receiver Simulation Telep...  Wed 14 May     4.0   

   Reviews  Price                                          Image URL  \
0      5.0    799  https://m.media-amazon.com/images/I/612SDw1eEQ...   
1      NaN    999  https://m.media-amazon.com/images/I/612RyncKUI...   
2    142.0    399  https://m.media-amazon.com/images/I/819-ZTM82t...   
3      NaN     90  https://m.media-amazon.com/images/I/31Jzz8457p...   
4    171.0    639  h