In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import os
from datetime import datetime
import random
import re

def get_timestamp():
    """Return the current local time as a filename-safe string.

    The format is ``YYYY_MM_DD_HH_MM_SS`` (underscore-separated), as produced
    by ``datetime.now()``.
    """
    now = datetime.now()
    return now.strftime('%Y_%m_%d_%H_%M_%S')

# Pool of browser User-Agent strings; one is picked at random per request.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
]

# CSS selector fallbacks, tried in order until one yields a usable element.
_CONTAINER_SELECTORS = (
    'div[data-component-type="s-search-result"]',
    '.s-result-item',
    '.sg-col-inner .a-section',
)
_TITLE_SELECTORS = ('h2 a span', '.a-text-normal', '.a-link-normal .a-text-normal')
_IMAGE_SELECTORS = ('img.s-image', '.s-product-image-container img', '.a-section img')
_RATING_SELECTORS = ('i.a-icon-star-small span', '.a-icon-alt', '.a-icon-star .a-icon-alt')
_PRICE_SELECTORS = ('span.a-price-whole', '.a-price .a-offscreen', '.a-price')

# A title must contain one of these (case-insensitively) to count as a laptop.
_LAPTOP_KEYWORDS = ('laptop', 'notebook', 'chromebook', 'macbook', 'ultrabook', 'thinkpad')

# First integer or decimal number in the rating text.
_RATING_RE = re.compile(r'(\d+(\.\d+)?)')
# Accepts any number of comma-separated digit groups so that Indian-style
# grouping such as "1,29,990" is captured whole instead of being cut to "1,29".
_PRICE_RE = re.compile(r'(\d+(?:,\d+)*(?:\.\d+)?)')
# Whole-word match only: a plain substring test for "Ad" also fires on
# "Add", "Adapter", etc. and mislabels organic results as ads.
_SPONSORED_RE = re.compile(r'\b(?:[Ss]ponsored|Ad|advertisement)\b')


def _select_first(node, selectors):
    """Return the first element matched by any selector in order, else None."""
    for selector in selectors:
        element = node.select_one(selector)
        if element is not None:
            return element
    return None


def _first_number(node, selectors, pattern, default):
    """Return group 1 of the first selector text that matches pattern, else default."""
    for selector in selectors:
        element = node.select_one(selector)
        if element is None:
            continue
        match = pattern.search(element.text.strip())
        if match:
            return match.group(1)
    return default


def _parse_product(product):
    """Build the record dict for one result element, or None if it is skipped."""
    title_element = _select_first(product, _TITLE_SELECTORS)
    if title_element is None:
        return None

    title = title_element.text.strip()
    print(f"Found product: {title[:50]}...")

    # Skip if not a laptop (simple keyword check on the title)
    lower_title = title.lower()
    if not any(keyword in lower_title for keyword in _LAPTOP_KEYWORDS):
        print(f"Skipping non-laptop product: {title[:30]}...")
        return None

    img_element = _select_first(product, _IMAGE_SELECTORS)
    if img_element is not None and 'src' in img_element.attrs:
        image_url = img_element['src']
    else:
        image_url = "No image available"

    is_ad = "Ad" if _SPONSORED_RE.search(product.text) else "Organic"

    return {
        'Title': title,
        'Image': image_url,
        'Rating': _first_number(product, _RATING_SELECTORS, _RATING_RE, "No rating"),
        'Price': _first_number(product, _PRICE_SELECTORS, _PRICE_RE, "Price not available"),
        'Ad/Organic': is_ad,
    }


def scrape_amazon_laptops(num_pages=2):
    """Scrape laptop listings from Amazon.in search result pages.

    Pages 1..num_pages of the "laptops" search are fetched one at a time,
    with a random 2-5 second pause before each request and a randomly chosen
    User-Agent header. A page that cannot be fetched or that returns a
    non-200 status is reported and skipped; an error while parsing a single
    product is reported and only that product is skipped.

    Args:
        num_pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of dicts with the keys 'Title', 'Image', 'Rating', 'Price'
        and 'Ad/Organic'. All values are strings; fields that could not be
        extracted hold a placeholder such as "No rating".
    """
    laptops_data = []
    base_url = "https://www.amazon.in/s?k=laptops&page="

    for page in range(1, num_pages + 1):
        # Add delay to prevent being blocked
        time.sleep(random.uniform(2, 5))

        # Rotate User-Agent
        headers = {
            'User-Agent': random.choice(user_agents),
            'Accept-Language': 'en-US,en;q=0.9',
            # The wildcard media range must be spelled "*/*"; ",/" is not valid.
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'https://www.amazon.in/'
        }

        url = base_url + str(page)
        print(f"Scraping page {page}: {url}")

        # Only the network call is guarded here, so that parsing bugs are not
        # reported as page-level scraping failures.
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(f"Error scraping page {page}: {str(e)}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Try multiple selectors for product containers
        products = []
        for selector in _CONTAINER_SELECTORS:
            products = soup.select(selector)
            if products:
                break

        print(f"Found {len(products)} potential products on page {page}")

        for product in products:
            # Best-effort boundary: one malformed result must not abort the page.
            try:
                record = _parse_product(product)
            except Exception as e:
                print(f"Error processing a product: {str(e)}")
                continue

            if record is None:
                continue

            laptops_data.append(record)
            print(f"Successfully scraped: {record['Title'][:50]}...")

    return laptops_data

def save_to_csv(data, filename=None):
    """Write scraped records to a UTF-8 CSV file.

    Args:
        data: Iterable of dicts keyed by 'Title', 'Image', 'Rating', 'Price'
            and 'Ad/Organic'.
        filename: Destination path. When falsy, a name of the form
            ``amazon_laptops_<timestamp>.csv`` is generated.

    Returns:
        The path written to, or None if any error occurred (the error is
        printed rather than raised).
    """
    if not filename:
        timestamp = get_timestamp()
        filename = f"amazon_laptops_{timestamp}.csv"

    columns = ['Title', 'Image', 'Rating', 'Price', 'Ad/Organic']

    try:
        with open(filename, 'w', newline='', encoding='utf-8') as handle:
            out = csv.DictWriter(handle, fieldnames=columns)
            out.writeheader()
            out.writerows(data)

        print(f"Data successfully saved to {filename}")
        return filename
    except Exception as err:
        print(f"Error saving to CSV: {str(err)}")
        return None

def main(num_pages=3):
    """Run the scraper and save the results to a timestamped CSV file.

    Args:
        num_pages: Number of search result pages to scrape. Defaults to 3.
    """
    print("Starting Amazon.in laptop scraper...")

    # Scrape data
    laptops_data = scrape_amazon_laptops(num_pages)

    if not laptops_data:
        print("No data was scraped. Please check the script or try again later.")
        return

    print(f"Successfully scraped {len(laptops_data)} laptops")

    # save_to_csv generates the "amazon_laptops_<timestamp>.csv" name itself
    # when no filename is given, so it is not rebuilt here.
    save_to_csv(laptops_data)

# Call main() only when this module is the program being run, not when it is imported.
if __name__ == "__main__":
    main()

Starting Amazon.in laptop scraper...
Scraping page 1: https://www.amazon.in/s?k=laptops&page=1
Found 22 potential products on page 1
Found product: HP 15, 13th Gen Intel Core i5-1334U, 16GB DDR4, 51...
Successfully scraped: HP 15, 13th Gen Intel Core i5-1334U, 16GB DDR4, 51...
Found product: HP 15s, 12th Gen Intel Core i5-1235U, 8GB DDR4, 51...
Successfully scraped: HP 15s, 12th Gen Intel Core i5-1235U, 8GB DDR4, 51...
Found product: Acer Aspire Lite, AMD Ryzen 5-5625U, 16GB RAM, 512...
Successfully scraped: Acer Aspire Lite, AMD Ryzen 5-5625U, 16GB RAM, 512...
Found product: Lenovo V15 G4 AMD Athlon Silver 7120U Laptop 8GB L...
Successfully scraped: Lenovo V15 G4 AMD Athlon Silver 7120U Laptop 8GB L...
Found product: (Refurbished) HP 250 G7 7th Gen Intel Core i3 Thin...
Successfully scraped: (Refurbished) HP 250 G7 7th Gen Intel Core i3 Thin...
Found product: HP 255 G9 Ryzen 3 Dual Core AMD Ryzen™ 3 Processor...
Successfully scraped: HP 255 G9 Ryzen 3 Dual Core AMD Ryzen™ 3 Processor.