In [5]:
import os
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to extract Product Title
def get_title(soup):
    """Return the product title found in a parsed product page.

    Looks up the ``<span id="productTitle">`` element and returns its text
    with leading and trailing whitespace removed.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        str: The stripped title text, or ``""`` when the lookup raises
        ``AttributeError`` (for example because ``find`` returned ``None``).
    """
    try:
        title_tag = soup.find("span", attrs={"id": 'productTitle'})
        raw_title = title_tag.text
        return raw_title.strip()
    except AttributeError:
        # No title element (or nothing usable to search): report an empty title.
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the product price text found in a parsed product page.

    The regular price element (``id="priceblock_ourprice"``) is tried first and
    the deal price element (``id="priceblock_dealprice"``) second.

    The text is read with ``get_text()`` rather than ``.string``: ``.string``
    is ``None`` whenever the element has more than one child node, which made
    a price that is present on the page look missing.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        str: The stripped text of the first candidate element that has
        non-empty text, or ``""`` when neither candidate yields any text.
    """
    for price_id in ("priceblock_ourprice", "priceblock_dealprice"):
        try:
            price = soup.find("span", attrs={'id': price_id}).get_text().strip()
        except AttributeError:
            # find() returned None (element absent) - try the next candidate.
            continue
        if price:
            return price
    return ""

# Function to extract Product Rating
def get_rating(soup):
    """Return the product rating text found in a parsed product page.

    Two lookups are attempted in order: an ``<i>`` element whose class is
    ``a-icon a-icon-star a-star-4-5``, then a ``<span>`` element whose class is
    ``a-icon-alt``. The first lookup whose ``.string`` can be stripped wins.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        str: The stripped rating text, or ``""`` when both lookups raise
        ``AttributeError``.
    """
    lookups = (
        ("i", {'class': 'a-icon a-icon-star a-star-4-5'}),
        ("span", {'class': 'a-icon-alt'}),
    )
    for tag_name, tag_attrs in lookups:
        try:
            return soup.find(tag_name, attrs=tag_attrs).string.strip()
        except AttributeError:
            # Element missing or without a single string - try the next lookup.
            continue
    return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-count text found in a parsed product page.

    Reads the ``.string`` of the ``<span id="acrCustomerReviewText">`` element
    and strips surrounding whitespace.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        str: The stripped review-count text, or ``""`` when the lookup raises
        ``AttributeError``.
    """
    try:
        return soup.find("span", attrs={'id': 'acrCustomerReviewText'}).string.strip()
    except AttributeError:
        # Element missing, or it has no single string to read.
        return ""

# Function to extract Availability Status
def get_availability(soup):
    """Return the availability text found in a parsed product page.

    Reads the first ``<span>`` inside the ``<div id="availability">`` element.

    The text is read with ``get_text()`` rather than ``.string``: ``.string``
    is ``None`` whenever the span has more than one child node, which made a
    product whose status contains nested markup get reported as
    "Not Available".

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        str: The stripped availability text, or ``"Not Available"`` when the
        div or span is missing or the span holds no text.
    """
    try:
        status_span = soup.find("div", attrs={'id': 'availability'}).find("span")
        available = status_span.get_text().strip()
    except AttributeError:
        # The div or its span is absent (find() returned None).
        return "Not Available"
    # An empty status is treated like a missing one, so callers always see a
    # single, consistent fallback value.
    return available or "Not Available"



In [6]:
if __name__ == '__main__':
    # Scrape an Amazon laptop search: collect the product links from the
    # results page, fetch each product page, extract its fields, and save the
    # rows that have a title to amazon_laptops_data.csv in the working directory.
    HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
               'Accept-Language': 'en-US, en;q=0.5'}

    URL = "https://www.amazon.com/s?k=laptops&crid=9F705CH1BXKX&sprefix=laptops%2Caps%2C301&ref=nb_sb_noss_1"
    BASE_URL = "https://www.amazon.com"
    # requests has no default timeout; without one a stalled connection would
    # hang the cell indefinitely.
    REQUEST_TIMEOUT = 30  # seconds

    webpage = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently ending up with zero links.
    webpage.raise_for_status()
    webpage.encoding = 'utf-8'
    soup = BeautifulSoup(webpage.text, "html.parser")

    links = soup.find_all("a", attrs={'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})
    # link.get('href') is None for an anchor without an href; drop those so
    # building the product URL cannot fail.
    links_list = [href for href in (link.get('href') for link in links) if href]

    print("Number of links found:", len(links_list))
    print("First few links:", links_list[:5])

    d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}

    for link in links_list:
        # urljoin copes with both site-relative and already-absolute hrefs,
        # unlike plain string concatenation.
        product_url = urljoin(BASE_URL, link)
        try:
            new_webpage = requests.get(product_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            new_webpage.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable product page should not abort the whole run.
            print("Skipping", product_url, "-", exc)
            continue
        new_webpage.encoding = 'utf-8'
        new_soup = BeautifulSoup(new_webpage.text, "html.parser")

        title = get_title(new_soup)
        price = get_price(new_soup)
        rating = get_rating(new_soup)
        reviews = get_review_count(new_soup)
        availability = get_availability(new_soup)

        print("Title:", title)
        print("Price:", price)
        print("Rating:", rating)
        print("Reviews:", reviews)
        print("Availability:", availability)
        print("-----")

        d['title'].append(title)
        d['price'].append(price)
        d['rating'].append(rating)
        d['reviews'].append(reviews)
        d['availability'].append(availability)

    amazon_df = pd.DataFrame.from_dict(d)
    # get_title returns "" when no title was found; turn those into NaN so the
    # rows can be dropped.
    amazon_df['title'] = amazon_df['title'].replace('', np.nan)
    amazon_df = amazon_df.dropna(subset=['title'])

    project_folder = os.getcwd()
    csv_file_path = os.path.join(project_folder, "amazon_laptops_data.csv")
    # UTF-8 is already the to_csv default; it is spelled out to make the file
    # encoding explicit.
    amazon_df.to_csv(csv_file_path, header=True, index=False, encoding='utf-8')

# Output the DataFrame
amazon_df

Number of links found: 0
First few links: []


Unnamed: 0,title,price,rating,reviews,availability
