In [10]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Build the Chrome options: every flag below is passed straight to the browser.
options = webdriver.ChromeOptions()
for flag in (
    '--headless',               # run the browser without a visible window
    '--no-sandbox',             # recommended alongside headless mode
    '--disable-dev-shm-usage',  # avoid shared-memory usage for better stability
):
    options.add_argument(flag)

# Scrape state shared with the extraction code further down.
product_data = []   # one dict per scraped product
max_products = 300  # stop once this many products have been collected

# Launch Chrome using a driver binary resolved by webdriver_manager.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Open the first page of search results.
base_url = 'https://www.snapdeal.com/search?keyword=sunscreen&santizedKeyword=mobiles&catId=0&categoryId=0&suggested=true&vertical=p&noOfResults=20&searchState=&clickSrc=suggested&lastKeyword=&prodCatId=&changeBackToAll=false&foundInAll=false&categoryIdSearched=&cityPageUrl=&categoryUrl=ALL&url=&utmContent=&dealDetail=&sort=rlvncy'
driver.get(base_url)

# Fields read from a product detail page: result key -> (CSS selector,
# placeholder stored when the element is absent or unreadable).
_PRODUCT_FIELDS = {
    'title': ('h1.pdp-e-i-head', 'No title'),
    'price': ('span.payBlkBig', 'No price'),
    'rating': ('span.avrg-rating', 'No rating'),
    'reviews': ('span.numbr-review', 'No reviews'),
}


def _first_text(selector, default):
    """Return the text of the first element matching *selector*, else *default*."""
    try:
        matches = driver.find_elements(By.CSS_SELECTOR, selector)
        return matches[0].text if matches else default
    except WebDriverException:
        # A single unreadable field should not discard the whole product.
        return default


def _collect_new_links():
    """Return product-page URLs on the results page that are not yet scraped."""
    seen = {item.get('link') for item in product_data}
    links = []
    for tile in driver.find_elements(By.CSS_SELECTOR, 'div.product-tuple-listing'):
        try:
            # NOTE(review): the original link-extraction code is missing from
            # this cell; this selector is inferred from the '/product/' URLs
            # shown in the cell output -- confirm against the live markup.
            anchors = tile.find_elements(By.CSS_SELECTOR, 'a[href*="/product/"]')
            link = anchors[0].get_attribute('href') if anchors else None
        except WebDriverException:
            continue  # tile went away while reading it; skip it
        if link and link not in seen:
            seen.add(link)
            links.append(link)
    return links


def _return_to_window(main_window):
    """Close every window except *main_window* and give it focus again."""
    for handle in driver.window_handles:
        if handle != main_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_window)


def extract_product_details():
    """Scrape every not-yet-seen product currently listed on the results page.

    Each product link is opened in a new tab, its title, price, rating and
    review count are read, and the resulting dict (keys ``link``, ``title``,
    ``price``, ``rating``, ``reviews``) is appended to the module-level
    ``product_data`` list. Links already present in ``product_data`` are
    skipped, so calling this again after scrolling only visits new products.

    Returns:
        bool: ``False`` when scraping should stop -- either ``max_products``
        has been reached or this pass found no new product -- and ``True``
        when the caller should scroll and call again.
    """
    main_window = driver.current_window_handle

    # Read all links before opening any tab so the loop below never touches
    # result-page elements while another window has focus.
    links = _collect_new_links()

    added = 0
    for link in links:
        if len(product_data) >= max_products:  # Stop if we have enough data
            return False

        product_info = {'link': link}
        try:
            # Pass the URL as a script argument instead of splicing it into
            # the JavaScript source, so quotes in the link cannot break it.
            driver.execute_script("window.open(arguments[0], '_blank');", link)
            driver.switch_to.window(driver.window_handles[-1])

            # Wait for the product page to load
            time.sleep(2)

            for key, (selector, default) in _PRODUCT_FIELDS.items():
                product_info[key] = _first_text(selector, default)

            print(f"Scraped details from product page: {product_info}")
        except WebDriverException as e:
            print(f"Error while scraping product page: {e}")
            # Keep whatever was read and fill the remaining fields.
            for key, (_, default) in _PRODUCT_FIELDS.items():
                product_info.setdefault(key, default)
        finally:
            # Always close the product tab, even after a failure, so tabs do
            # not pile up and the driver ends up back on the search results.
            _return_to_window(main_window)

        # Pause between product visits, as the original script did.
        time.sleep(2)

        # Append the product info to the list
        product_data.append(product_info)
        added += 1

        # Print product information to show the scraping progress
        print(f"Scraped product {len(product_data)}: {product_info}")

    # With nothing new on the page there is no point asking the caller to
    # keep going; otherwise continue scraping.
    return added > 0

# Scroll the page and extract products until we reach the max product limit
# or the results page stops producing new product tiles.
scroll_pause_time = 2    # Adjust if needed for smoother scrolling
max_scroll_attempts = 3  # Scrolls to try before deciding no more results load
tile_selector = 'div.product-tuple-listing'

try:
    while len(product_data) < max_products:
        if not extract_product_details():
            break

        tiles_before = len(driver.find_elements(By.CSS_SELECTOR, tile_selector))
        for _ in range(max_scroll_attempts):
            # Scroll down to load more products
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait for the page to load new products
            time.sleep(scroll_pause_time)

            if len(driver.find_elements(By.CSS_SELECTOR, tile_selector)) > tiles_before:
                break
        else:
            # No scroll attempt added tiles: stop rather than re-scrape the
            # same products over and over.
            print('No more products loaded; stopping.')
            break
finally:
    # Close the driver after scraping, even if scraping raised, so the
    # browser process is not left running.
    driver.quit()

# Convert the product data into a DataFrame and save it to an Excel file.
# Identical rows are dropped because the same tile can be scraped on more
# than one pass over the results page.
df = pd.DataFrame(product_data).drop_duplicates()
df.to_excel('snapdeal_sunscreen_with_reviews.xlsx', index=False)  # Use .to_excel() for Excel format
print('Data saved to snapdeal_sunscreen_with_reviews.xlsx.')


Scraped details from product page: {'link': 'https://www.snapdeal.com/product/smartdrops-spf-50-tan-removal/637038437012#bcrumbSearch:sunscreen', 'title': 'Smartdrops SPF 50 Tan Removal Cream For All Skin Type ( Pack of 1 )', 'price': '148', 'rating': '(4.9)', 'reviews': '4 Reviews'}
Scraped product 1: {'link': 'https://www.snapdeal.com/product/smartdrops-spf-50-tan-removal/637038437012#bcrumbSearch:sunscreen', 'title': 'Smartdrops SPF 50 Tan Removal Cream For All Skin Type ( Pack of 1 )', 'price': '148', 'rating': '(4.9)', 'reviews': '4 Reviews'}
Scraped details from product page: {'link': 'https://www.snapdeal.com/product/smartdrops-spf-50-tan-removal/647467097925#bcrumbSearch:sunscreen', 'title': 'Smartdrops SPF 50 Tan Removal Cream For All Skin Type ( Pack of 3 )', 'price': '154', 'rating': '(4.6)', 'reviews': '16 Reviews'}
Scraped product 2: {'link': 'https://www.snapdeal.com/product/smartdrops-spf-50-tan-removal/647467097925#bcrumbSearch:sunscreen', 'title': 'Smartdrops SPF 50 Ta