In [1]:
import csv
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException

# Setup WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.set_page_load_timeout(30)  # Set page load timeout to 30 seconds

# Flipkart search URL for iPhones
base_search_url = "https://www.flipkart.com/search?q=Apple%20Iphone&page={}"  # URL template for pagination

def scroll_page():
    """Scroll down to load all product links or reviews."""
    for _ in range(5):  # Adjust if necessary
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(random.uniform(2, 4))  # Random wait to avoid detection

def get_product_links():
    """Extract product links from the first 10 pages of search results."""
    product_links = set()
    for page in range(1, 11):  # Scrape 10 pages of product listings
        try:
            driver.get(base_search_url.format(page))
            time.sleep(random.uniform(3, 6))
            scroll_page()
            product_elements = driver.find_elements(By.XPATH, "//a[contains(@class, 'CGtC98')]")
            for elem in product_elements:
                product_links.add(elem.get_attribute('href'))
            print(f"Scraped {len(product_links)} product links from Page {page}")
        except TimeoutException:
            print(f"Timeout while loading page {page}, skipping...")
            continue
    return list(product_links)

def navigate_to_reviews(product_url):
    """Navigate to the full reviews page of a product with retries."""
    retries = 3
    for attempt in range(retries):
        try:
            driver.get(product_url)
            time.sleep(random.uniform(5, 8))  # Wait for product page to load
            review_button = driver.find_element(By.XPATH, "//div[contains(@class, '_23J90q RcXBOT')]/span")
            review_button.click()
            time.sleep(random.uniform(5, 8))
            return driver.current_url  # Return the new review page URL
        except TimeoutException:
            print(f"Timeout on attempt {attempt + 1} for {product_url}, retrying...")
    print("Could not find review button, skipping...")
    return None

def extract_reviews(product_url):
    """Scrape reviews from the first 10 pages of a product's review section."""
    review_url = navigate_to_reviews(product_url)
    if not review_url:
        return 0  # Skip if review page not found
    
    print(f"Product Review Url: {review_url}")
    total_reviews = 0
    
    for page in range(1, 11):  # Scrape first 10 pages of reviews
        try:
            driver.get(review_url + f"&page={page}")
            time.sleep(random.uniform(5, 8))  # Allow reviews to load
            scroll_page()
            review_blocks = driver.find_elements(By.CLASS_NAME, "EPCmJX")
            reviews_data = []
            
            for block in review_blocks:
                try:
                    rating_element = block.find_element(By.CLASS_NAME, "XQDdHH")
                    rating = rating_element.text.strip() if rating_element.text.strip() else None
                    if not rating:
                        continue  # Skip review if rating is missing
                    
                    review_title = block.find_element(By.CLASS_NAME, "z9E0IG").text.strip()
                    review_comment = block.find_element(By.CLASS_NAME, "ZmyHeo").text.strip()
                    reviews_data.append([review_url, rating, review_title, review_comment])
                except:
                    continue
            
            total_reviews += len(reviews_data)
            with open(csv_filename, mode="a", newline="", encoding="utf-8") as file:
                writer = csv.writer(file)
                writer.writerows(reviews_data)
            
            print(f"Extracted {len(reviews_data)} reviews from {review_url} Page {page}, Total so far: {total_reviews}")
        except TimeoutException:
            print(f"Timeout while loading reviews page {page} for {review_url}, skipping...")
            continue
    return total_reviews

# Prepare CSV file (Append Mode to Keep All Data)
csv_filename = "flipkart_all_reviews.csv"
with open(csv_filename, mode="a", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    if file.tell() == 0:
        writer.writerow(["Product Link", "Rating", "Review Title", "Review Comment"])

# Get product links from first 10 search pages
product_links = get_product_links()
print(f"Total Products Collected: {len(product_links)}")

total_reviews_collected = 0

# Iterate over each product link and extract reviews
for product_link in product_links:
    total_reviews_collected += extract_reviews(product_link)

print(f"Total Reviews Scraped: {total_reviews_collected}")
driver.quit()
print(f"All reviews saved in: {csv_filename}")


Scraped 24 product links from Page 1
Scraped 48 product links from Page 2
Scraped 72 product links from Page 3
Scraped 96 product links from Page 4
Scraped 120 product links from Page 5
Scraped 144 product links from Page 6
Scraped 168 product links from Page 7
Scraped 192 product links from Page 8
Scraped 216 product links from Page 9
Scraped 240 product links from Page 10
Total Products Collected: 240
Product Review Url: https://www.flipkart.com/apple-iphone-16-pro-natural-titanium-128-gb/product-reviews/itm05ad8e674782a?pid=MOBH4DQFX4FR2HYZ&lid=LSTMOBH4DQFX4FR2HYZAOPCDX&marketplace=FLIPKART
Extracted 10 reviews from https://www.flipkart.com/apple-iphone-16-pro-natural-titanium-128-gb/product-reviews/itm05ad8e674782a?pid=MOBH4DQFX4FR2HYZ&lid=LSTMOBH4DQFX4FR2HYZAOPCDX&marketplace=FLIPKART Page 1, Total so far: 10
Extracted 10 reviews from https://www.flipkart.com/apple-iphone-16-pro-natural-titanium-128-gb/product-reviews/itm05ad8e674782a?pid=MOBH4DQFX4FR2HYZ&lid=LSTMOBH4DQFX4FR2HYZAO

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//div[contains(@class, '_23J90q RcXBOT')]/span"}
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00B60B43+25139]
	(No symbol) [0x00AF13F4]
	(No symbol) [0x009D04E3]
	(No symbol) [0x00A183D7]
	(No symbol) [0x00A1872B]
	(No symbol) [0x00A61002]
	(No symbol) [0x00A3D014]
	(No symbol) [0x00A5E778]
	(No symbol) [0x00A3CDC6]
	(No symbol) [0x00A0BDE9]
	(No symbol) [0x00A0D124]
	GetHandleVerifier [0x00E64373+3185251]
	GetHandleVerifier [0x00E8291A+3309578]
	GetHandleVerifier [0x00E7CF42+3286578]
	GetHandleVerifier [0x00BF7AE0+643536]
	(No symbol) [0x00AFA20D]
	(No symbol) [0x00AF70B8]
	(No symbol) [0x00AF7257]
	(No symbol) [0x00AE9E00]
	BaseThreadInitThunk [0x751B5D49+25]
	RtlInitializeExceptionChain [0x7745CE3B+107]
	RtlGetAppContainerNamedObjectPath [0x7745CDC1+561]
