In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

# Function to extract Product Title
def get_title(soup):
    try:
        title = soup.find("span", attrs={"id": 'productTitle'})
        return title.text.strip()
    except AttributeError:
        return ""

# Function to extract Product Price
def get_price(soup):
    try:
        # Amazon.in often uses this class
        price = soup.find("span", class_='a-offscreen')
        return price.get_text().strip() if price else ""
    except:
        return ""

# Function to extract Product Rating
def get_rating(soup):
    try:
        rating = soup.find("span", class_='a-icon-alt')
        return rating.get_text().strip() if rating else ""
    except:
        return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        return review_count.text.strip()
    except AttributeError:
        return ""

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id': 'availability'})
        return available.find("span").text.strip()
    except AttributeError:
        return "Not Available"

if __name__ == '__main__':
    # Add a valid User-Agent
    HEADERS = ({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Accept-Language': 'en-IN,en;q=0.9'
    })

    # Amazon India Laptops Search URL
    URL = "https://www.amazon.in/s?k=python+books+for+beginners+to+advanced&crid=2TJE2J45P0V2P&sprefix=python+book%2Caps%2C430&ref=nb_sb_ss_ts-doa-p_2_11"

    # HTTP Request
    webpage = requests.get(URL, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Extract product links from the search results
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})

    links_list = []
    for link in links:
        product_link = link.get('href')
        if product_link:
            links_list.append("https://www.amazon.in" + product_link)

    # Dictionary to hold data
    d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}

    # Loop through product links
    for link in links_list:
        try:
            new_webpage = requests.get(link, headers=HEADERS)
            new_soup = BeautifulSoup(new_webpage.content, "html.parser")

            # Get data
            d['title'].append(get_title(new_soup))
            d['price'].append(get_price(new_soup))
            d['rating'].append(get_rating(new_soup))
            d['reviews'].append(get_review_count(new_soup))
            d['availability'].append(get_availability(new_soup))

            # Add delay to avoid getting blocked
            time.sleep(1)

        except Exception as e:
            print(f"Error fetching data for {link}: {e}")
            continue

    # Save to CSV
    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['title'].replace('', np.nan, inplace=True)
    amazon_df.dropna(subset=['title'], inplace=True)
    amazon_df.to_csv("amazon_in_laptops.csv", header=True, index=False)
    print("Data exported to amazon_in_laptops.csv")
    print(amazon_df)


Error fetching data for https://www.amazon.inhttps://aax-eu.amazon.in/x/c/JOL9wVm5KF0Dw77l4bztFocAAAGWGyCyRwMAAAH2AQBvbm9fdHhuX2JpZDEgICBvbm9fdHhuX2ltcDEgICCJGKll/clv1_CEuOPUxokZA0iHrVBvl26R3aQ3dLa4IZqoo39SKtr9x1139LmB3pdPw18HHz9hvVjq0yzpNb3X7OfU31A3siGRNJCCuSEfrKK20Te3FxVXz5I4ZjxYgc6JJGZxqD8CswSvccpekaaQA7bb7PVfcln2sEh9OkkwEdxz17IuI8Q1TRlrnQ1hJgCBjla8avPLFsHH4HbZmFzptYR2t3PNIRZPsQh9Kumt1TSBDYL7UlzhESrMBvm7fmTUcsgxvigUBeAnGoOOGLoxR1yAlDdY1eRiSPc2OsnheLHsiEzPLCcM4TMlyty6ezwyq5PTDbH4Z5qYHSkpsaNfrXuov2LOe_IMRc5fBMBmzwwcR5s4kagX_wqB_T2kJ6jHbR15o/https://www.amazon.in/Financial-Statement-Analysis-Handbook-ZebraLearn/dp/8195895077/ref=sxbs_sbv_search_btf?content-id=amzn1.sym.3353ca56-2043-45f1-9c2e-99dccef31d9d%3Aamzn1.sym.3353ca56-2043-45f1-9c2e-99dccef31d9d&crid=2TJE2J45P0V2P&cv_ct_cx=python+books+for+beginners+to+advanced&keywords=python+books+for+beginners+to+advanced&pd_rd_i=8195895077&pd_rd_r=6ad43220-051b-4750-b3f9-3934dd6792b6&pd_rd_w=pFlu7&pd_rd_wg=t4P9b&pf_rd_p=3353ca56-2043-45f1-9

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  amazon_df['title'].replace('', np.nan, inplace=True)
