In [101]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [102]:
#Function to extract Product title
def get_title(soup):

    try:
        #Outer tag object
        title=soup.find('span',attrs={'id':'productTitle'})

        #Innter NavigatableString Object
        title_value=title.text

        #Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string='';

    return title_string

#Function to extract Product Price
def get_price(soup):

    try:
        price=soup.find('span',attrs={'class':'a-offscreen'}).string.strip()

    except AttributeError:
        price=soup.find('span',attrs={'class':'a-offscreen'}).string.strip()

    except:
        price=''

    return price

#Function to extract Product rating
def get_rating(soup):

    try:
        rating=soup.find('i',attrs={'class':'a-icon a-icon-star a-star-5 cm-cr-review-stars-spacing-big'}).string.strip()

    except AttributeError:
        try:
            rating=soup.find('span',attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating=''

    return rating

#Function to extract number of reviews for the product
def get_review_count(soup):

    try:
        review_count=soup.find('span',attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count=''

    return review_count

#Function to extract Availability status
def get_availability(soup):

    try:
        available=soup.find('div',attrs={'id':'availability'}).text.strip()
        # available=soup.find('span').string.strip()

    except AttributeError:
        available='Not Available'

    except:
        available=''

    return available

In [103]:
if __name__=='__main__':

    #adding user agent
    HEADERS=({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', 'Accept-Language':'en-US, en;q=0.5'})

    #Webpage URL
    URL='https://www.amazon.com/s?k=playstation+5&crid=3HRCY545HJZ7&sprefix=play%2Caps%2C184&ref=nb_sb_ss_w_hit-vc-lth_playstation-5_k0_1_4'

    #HTTP Request
    webpage=requests.get(URL, headers=HEADERS)

    #Soup object containing all data
    soup = BeautifulSoup(webpage.content, 'html.parser')

    #Fetch links from all tags
    links = soup.find_all('a', attrs={'class':'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})

    #Storing links in a string
    links_list=[]

    #Extracting all links from all tags and storing them in string
    for link in links:
        links_list.append(link.get('href'))
        
    del links_list[-1]
    d={'title':[], 'price':[], 'rating':[], 'reviews':[], 'availability':[]}
    
    #loop for extracting all product information from link list
    for link in links_list:
        new_webpage=requests.get('https://www.amazon.com'+link, headers=HEADERS)

        new_soup=BeautifulSoup(new_webpage.content,'html.parser')

        #Function calls to display all product information
        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))
        


amazon_df=pd.DataFrame.from_dict(d)
amazon_df['title'].replace('',np.nan, inplace=True)
amazon_df=amazon_df.dropna(subset=['title'])
amazon_df.to_csv('amazon_data.csv',header=True, index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  amazon_df['title'].replace('',np.nan, inplace=True)


In [104]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability
0,PlayStation®5 console (slim),$494.65,4.5 out of 5 stars,787 ratings,In Stock
1,PlayStation 5 Console (PS5),$463.98,4.8 out of 5 stars,"8,186 ratings",Only 1 left in stock - order soon.
2,PlayStation 5 Console - Marvel’s Spider-Man 2 ...,$479.99,4.8 out of 5 stars,"4,034 ratings",Only 1 left in stock - order soon.
3,PlayStation 5 Digital Edition – Marvel’s Spide...,$479.99,4.7 out of 5 stars,420 ratings,Only 3 left in stock - order soon.
4,Playstation 5 Disc Version PS5 Console - Addit...,$458.99,3.5 out of 5 stars,23 ratings,
5,EA SPORTS College Football 25 - PlayStation 5,$69.00,3.5 out of 5 stars,14 ratings,In Stock
6,PlayStation 5 Digital Edition – Marvel’s Spide...,Page 1 of 1,Previous page of related Sponsored Products,,
7,PlayStation DualSense® Wireless Controller - M...,$69.00,4.5 out of 5 stars,"1,812 ratings",In Stock
8,PlayStation Portal Remote Player - PlayStation 5,$199.00,4.5 out of 5 stars,"1,130 ratings",In Stock
9,PlayStation®5 console (slim) (Renewed),$458.99,4.5 out of 5 stars,11 ratings,Only 2 left in stock - order soon.
