In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [2]:
custom_headers = {
    "Accept-language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}

In [3]:
def get_soup(url):
    response = requests.get(url, headers=custom_headers)

    if response.status_code != 200:
        print("Error in getting webpage")
        exit(-1)

    soup = BeautifulSoup(response.text, "lxml")
    return soup

In [4]:
def get_reviews(soup):
    review_elements = soup.select("div.review")

    scraped_reviews = []

    for review in review_elements:
        r_author_element = review.select_one("span.a-profile-name")
        r_author = r_author_element.text if r_author_element else None

        r_rating_element = review.select_one("i.review-rating")
        r_rating = r_rating_element.text.replace("out of 5 stars", "") if r_rating_element else None

        r_title_element = review.select_one("a.review-title")
        r_title_span_element = r_title_element.select_one("span:not([class])") if r_title_element else None
        r_title = r_title_span_element.text if r_title_span_element else None

        r_content_element = review.select_one("span.review-text")
        r_content = r_content_element.text if r_content_element else None

        r_date_element = review.select_one("span.review-date")
        r_date = r_date_element.text if r_date_element else None

        r_verified_element = review.select_one("span.a-size-mini")
        r_verified = r_verified_element.text if r_verified_element else None

        r_image_element = review.select_one("img.review-image-tile")
        r_image = r_image_element.attrs["src"] if r_image_element else None

        r = {
            "author": r_author,
            "rating": r_rating,
            "title": r_title,
            "content": r_content,
            "date": r_date,
            "verified": r_verified,
            "image_url": r_image
        }

        scraped_reviews.append(r)

    return scraped_reviews

In [5]:
def main():
    search_url = "https://www.amazon.in/Apple-iPhone-13-128GB-Blue/product-reviews/B09G9BL5CP/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
    soup = get_soup(search_url)
    data = get_reviews(soup)
    df = pd.DataFrame(data=data)

    df.to_csv("amz.csv")

if __name__ == '__main__':
    main()

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

custom_headers = {
    "Accept-language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}

def get_soup(url):
    try:
        response = requests.get(url, headers=custom_headers)
        if response.status_code != 200:
            print(f"Error in getting webpage: {url} with status code: {response.status_code}")
            return None
        soup = BeautifulSoup(response.text, "lxml")
        return soup
    except Exception as e:
        print(f"Exception occurred while fetching page: {e}")
        return None

def get_reviews(soup):
    review_elements = soup.select("div[data-hook='review']")

    scraped_reviews = []

    for review in review_elements:
        r_author_element = review.select_one("span.a-profile-name")
        r_author = r_author_element.text if r_author_element else None

        r_rating_element = review.select_one("i.review-rating span.a-icon-alt")
        r_rating = r_rating_element.text.replace("out of 5 stars", "").strip() if r_rating_element else None

        r_title_element = review.select_one("a.review-title span")
        r_title = r_title_element.text if r_title_element else None

        r_content_element = review.select_one("span.review-text-content span")
        r_content = r_content_element.text.strip() if r_content_element else None

        r_date_element = review.select_one("span.review-date")
        r_date = r_date_element.text if r_date_element else None

        r_verified_element = review.select_one("span[data-hook='avp-badge']")
        r_verified = r_verified_element.text if r_verified_element else None

        r_image_element = review.select_one("img.review-image-tile")
        r_image = r_image_element.attrs["src"] if r_image_element else None

        r = {
            "author": r_author,
            "rating": r_rating,
            "title": r_title,
            "content": r_content,
            "date": r_date,
            "verified": r_verified,
            "image_url": r_image
        }

        scraped_reviews.append(r)

    return scraped_reviews

def main():
    base_url = "https://www.amazon.in/Apple-iPhone-13-128GB-Blue/product-reviews/B09G9BL5CP/ref=cm_cr_getr_d_paging_btm_next_{}?ie=UTF8&reviewerType=all_reviews&pageNumber={}"
    all_reviews = []

    for page_number in range(1, 10):  # Loop through the first 100 pages
        url = base_url.format(page_number, page_number)
        print(f"Scraping page {page_number}: {url}")
        soup = get_soup(url)
        if soup is None:
            continue
        reviews = get_reviews(soup)
        if not reviews:
            print(f"No reviews found on page {page_number}")
            break
        all_reviews.extend(reviews)
        print(f"Scraped {len(reviews)} reviews from page {page_number}")
        time.sleep(2)  # Add delay to prevent being blocked

    if all_reviews:
        df = pd.DataFrame(data=all_reviews)
        df.to_csv("amzoooooon_reviews.csv", index=False)
        print("DataFrame saved to amz_reviews.csv")
        print(df)
    else:
        print("No reviews scraped.")

if __name__ == '__main__':
    main()


Scraping page 1: https://www.amazon.in/Apple-iPhone-13-128GB-Blue/product-reviews/B09G9BL5CP/ref=cm_cr_getr_d_paging_btm_next_1?ie=UTF8&reviewerType=all_reviews&pageNumber=1
Scraped 10 reviews from page 1
Scraping page 2: https://www.amazon.in/Apple-iPhone-13-128GB-Blue/product-reviews/B09G9BL5CP/ref=cm_cr_getr_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2
No reviews found on page 2
DataFrame saved to amz_reviews.csv
                    author rating               title  \
0             Pankaj Kumar    5.0  5.0 out of 5 stars   
1              Jnan Shetty    5.0  5.0 out of 5 stars   
2  Nagendra Singh Madnawat    4.0  4.0 out of 5 stars   
3                  vaibhav    5.0  5.0 out of 5 stars   
4             Ranjit Kumar    5.0  5.0 out of 5 stars   
5             Soumyojit S.    5.0  5.0 out of 5 stars   
6                     NEHA    5.0  5.0 out of 5 stars   
7               Sonu Kumar    5.0  5.0 out of 5 stars   
8        Keshav Maheshwari    4.0  4.0 out of 