In [71]:
# import necessary libraries
import numpy as np
import pandas as pd
import requests

from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import re

In [72]:
def get_product_name(soup) -> str:
    """
    Function to extract Product Name
    
    Args:
        soup (BeautifulSoup Object): html format text output result after perfoming html soup parser
        
    Return:
        title_string (str): String containing the Product Name
    """

    try:
        # Outer Tag Object
        title = soup.find("span", attrs={"id": 'productTitle'})
        
        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string


def get_price(soup) -> str:
    """
    Function to extract the Price of the product
    
    Args:
        soup (BeautifulSoup Object): html format text output result after perfoming html soup parser
        
    Return:
        price (str): String containing the Product price 
    """
    
    try:
        # 
        price=soup.find("span", attrs={"class":'a-price-whole' }).text
    except AttributeError:
            price = ""

    return price


def get_description(soup) -> str:
    """
    Function to extract Product Description
    
    Args:
        soup (BeautifulSoup Object): html format text output result after perfoming html soup parser
        
    Return:
        description (str): String containing the Product description
    """

    try:
        #
        description = soup.find("div", attrs={"id": 'productDescription'}).text.strip()

    except AttributeError:

            description = ""

    return description


def get_rating(soup):
    """
    Function to extract user rating
    
    Args:
        soup (BeautifulSoup Object): html format text output result after perfoming html soup parser
        
    Return:
        rating (str): String containing the user rating
    """

    try:
        #
        rating = soup.find("span", attrs={"class": 'a-size-base a-nowrap'}).text.strip()
    
    except AttributeError:
            rating = ""	

    return rating


def get_review_other(soup) -> str:
    """
    Function to extract user review
    
    Args:
        soup (BeautifulSoup Object): html format text output result after perfoming html soup parser
        
    Return:
        revother (str): String containing the user review
    """
    
    try:
        #
        revother = soup.find("span", attrs = {"class": 'a-size-base review-text'}).text.strip()

    except AttributeError:
        revother = ""	

    return revother


def get_imgurl(soup) -> str:
    """
    Function to extract Image URL
    
    Args:
        soup (BeautifulSoup Object): html format text output result after perfoming html soup parser
        
    Return:
        imgurl (str): String containing the Image URL
    """
    
    try:
        #
        imgurl = new_soup.find("img", attrs={"class": 'a-dynamic-image a-stretch-vertical'}).get("src")
        
    except AttributeError:
        
        try:
            imgurl = new_soup.find("img", attrs={"class": 'a-dynamic-image a-stretch-horizontal'}).get("src")
        except AttributeError:
            imgurl = ""

    return imgurl

In [73]:
# Driver code
if __name__ == '__main__':
    
    #Defining the HTTP Header.Need to define the user agent of the user. 
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # Creating a list to store urls for watches, dresses and earrings
    categories=[]
    
    # Appending urls of watches, dresses and earrings
    categories.append("https://www.amazon.in/s?k=watches&crid=3KVA3VJWH31VK&sprefix=watches%2Caps%2C209&ref=nb_sb_ss_ts-doa-p_3_7")
    categories.append("https://www.amazon.in/s?k=dresses&crid=2YECVUH0TDRVS&sprefix=dresses+%2Caps%2C219&ref=nb_sb_noss_2")
    categories.append("https://www.amazon.in/s?k=earings&crid=NK0BF2NP73I9&sprefix=earings%2Caps%2C202&ref=nb_sb_noss_2")
    
    d = {"Category":[], "Product Name":[], "Price":[], "Product Description":[], "User Ratings":[], "Top Review": [], "Product Image URL": []}
    
    # Keeping track of the category type
    count=0
    
    # Performing operations over each category
    for cat in categories:
        
        # HTTP Request Session for handling multiple GET requests
        session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        webpage = session.get(cat, headers=HEADERS)

        # Status Code check for 200 or 503
        if webpage.status_code == 200:
        
            # Soup Object containing all data
            soup = BeautifulSoup(webpage.content, "html.parser")

            # Fetch links as List of Tag Objects
            links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

            # Store the links
            links_list = []
            
            # Storing items from each category in the list
            for s in range(len(links)):
                if s < 400:
                    links_list.append(links[s].get('href'))

            # Loop for extracting product details from each link 
            for link in links_list:

                try:
                    new_webpage = requests.get("https://www.amazon.in" + link, headers=HEADERS)
                    
                except Exception as e:
                    print(f'Exception Caught: {e}')
                    continue

                # New Soup object to get the data of the current specific product
                new_soup = BeautifulSoup(new_webpage.content, "html.parser")

                if count==0:
                    d['Category'].append("Watches")
                elif count==1:
                    d['Category'].append("Dresses")
                elif count==2:
                    d['Category'].append("Earings")
                    
                # Fetching the necessary product information and storing in dictionary object
                d['Product Name'].append(get_product_name(new_soup))
                d['Price'].append(get_price(new_soup))
                d['Product Description'].append(get_description(new_soup))
                d['User Ratings'].append(get_rating(new_soup))
                d['Top Review'].append(get_review_other(new_soup))
                d['Product Image URL'].append(get_imgurl(new_soup))
            
            count += 1

        elif webpage.status_code == 503:
            print(f'Status Code 503 error from webpage.')
            print('Re-run the program again after some time.')
            exit
        
        # Storing the entire data in a Pandas dataframe
        amazon_df = pd.DataFrame.from_dict(d)
        amazon_df.to_csv("amazon_data.csv", header=True, index=False)


Exception Caught: HTTPSConnectionPool(host='www.amazon.inhttps', port=443): Max retries exceeded with url: //aax-eu.amazon.in/x/c/RI4mF9UqbHr9gO8XobO7bm0AAAGOK_D_UQMAAAH2AQBvbm9fdHhuX2JpZDMgICBvbm9fdHhuX2ltcDEgICCv4ZRj/https://www.amazon.in/EthnicJunction-Chanderi-Jacquard-Straight-SKD34-Sita-Grey_S/dp/B0C77QH3F7/ref=sxin_16_sbv_search_btf?content-id=amzn1.sym.4fe35c1d-da6a-41a4-967c-7a2315a8b22e%3Aamzn1.sym.4fe35c1d-da6a-41a4-967c-7a2315a8b22e&crid=2YECVUH0TDRVS&cv_ct_cx=dresses&dib=eyJ2IjoiMSJ9.dMkOu_95L3wMayyzAaXg1g.iq8lcwgVA9adCqpnsRbneeO_rRbGIGBvFCy0xoOlxGc&dib_tag=se&keywords=dresses&pd_rd_i=B0C77QH3F7&pd_rd_r=945cf4ed-e3a3-4db9-ba6d-7828d1a84c46&pd_rd_w=w4LbZ&pd_rd_wg=R5da9&pf_rd_p=4fe35c1d-da6a-41a4-967c-7a2315a8b22e&pf_rd_r=52NTNAP4P2MJ5GBY4BVK&qid=1710134198&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=dresses+%2Caps%2C219&sr=1-1-b614a050-8d70-4320-9c19-cc457ab2a351 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002529F280910>: Failed to e

In [96]:
#printing the data frame
amazon_df

Unnamed: 0,Category,Product Name,Price,Product Description,User Ratings,Top Review,Product Image URL
0,Watches,Daniel Wellington Iconic Analog Black Dial Uni...,21199,,3.8 out of 5,Beautiful watch loved every bit of it. It is a...,https://m.media-amazon.com/images/I/31mYGwp5yK...
1,Watches,Daniel Wellington Men Analogue Graphite Grey R...,22599,The Iconic Chronograph Link will have you look...,,,https://m.media-amazon.com/images/I/31Rm9NGypN...
2,Watches,OLEVS Watches for Men Chronograph Business Wri...,2805,OLEVS Watches For Men Chronograph Business Wri...,4.6 out of 5,"I like this watch, I like the gold band. Easy ...",https://m.media-amazon.com/images/I/51pvCriTdO...
3,Watches,Daniel Wellington Classic 40 Sterling Black Me...,19799,Classic Sterlig Silver Watch For Men. A Classi...,5 out of 5,,https://m.media-amazon.com/images/I/31RcwxYpgI...
4,Watches,beatXP Flare Pro 1.39” HD Display Bluetooth Ca...,999.,,3.7 out of 5,I Liked the quality. colour is little bit ligh...,https://m.media-amazon.com/images/I/51fMcMB6HI...
...,...,...,...,...,...,...,...
200,Earings,Estele 24 Kt Gold Plated Wicket Dangle Earring...,249,Description,4.2 out of 5,I loved the earrings. They r lightweight. Have...,https://m.media-amazon.com/images/I/21dsafqRw7...
201,Earings,Estele 24Kt Gold Tone Plated Metal Brass Flowe...,279,Estele is not simply a brand or a jewellery li...,4.1 out of 5,Quality is really good 👍\nRead more,https://m.media-amazon.com/images/I/41ixiW-v33...
202,Earings,Zaveri Pearls Green & Pink Meenakari Lotus Des...,311,Earrings Are A Great Way To Spice Up Any Outfi...,4.1 out of 5,Very beautiful earings....\nRead more,https://m.media-amazon.com/images/I/41TwFvQSct...
203,Earings,LeCalla 925 Sterling Silver BIS Hallmarked Ant...,1399,,4.1 out of 5,These earrings are well made and pretty. I re...,https://m.media-amazon.com/images/I/41mVM38C3b...


In [97]:
#Performing data cleaning on this dataset
#Removing all NA values
from copy import deepcopy
df = deepcopy(amazon_df)
df.replace('', np.nan, inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,Category,Product Name,Price,Product Description,User Ratings,Top Review,Product Image URL
2,Watches,OLEVS Watches for Men Chronograph Business Wri...,2805,OLEVS Watches For Men Chronograph Business Wri...,4.6 out of 5,"I like this watch, I like the gold band. Easy ...",https://m.media-amazon.com/images/I/51pvCriTdO...
9,Watches,Matrix Antique 2.0 Day & Date Softest Silicone...,299,Matrix Watch,4.2 out of 5,It's price is reasonable and is of good qualit...,https://m.media-amazon.com/images/I/41mzQ4Bg5h...
16,Watches,TIMEWEAR Analog Day Date Functioning Stainless...,319,Stay fashionable with this watch from TIMEWEAR...,3.8 out of 5,"It is really nice product. Looks sturdy, forma...",https://m.media-amazon.com/images/I/415wDnu8sC...
20,Watches,NIBOSI Men Watches Analog Quartz Stainless Ste...,2499,NIBOSI Stainless Steel Men's Watch Classic Bus...,4.7 out of 5,This gold color NIBOSI watch is quite attracti...,https://m.media-amazon.com/images/I/416wu+VLPJ...
21,Watches,Carlington Analog-Digital Sports Watch: Chrono...,1119,Introducing the Carlington Endurance Series An...,3.8 out of 5,"Amazing watch to wear. Best watch to buy, Its ...",https://m.media-amazon.com/images/I/51fzc9GthK...


In [98]:

#Removing /nRead more from top review
# Define a function to remove the substring "\nRead more" from each value
def remove_substring(value):
    return value.replace("\nRead more", "")

# Apply the function to all values of the Top Review column 
df['Top Review'] = df['Top Review'].apply(lambda x: remove_substring(x))
df.head()

Unnamed: 0,Category,Product Name,Price,Product Description,User Ratings,Top Review,Product Image URL
2,Watches,OLEVS Watches for Men Chronograph Business Wri...,2805,OLEVS Watches For Men Chronograph Business Wri...,4.6 out of 5,"I like this watch, I like the gold band. Easy ...",https://m.media-amazon.com/images/I/51pvCriTdO...
9,Watches,Matrix Antique 2.0 Day & Date Softest Silicone...,299,Matrix Watch,4.2 out of 5,It's price is reasonable and is of good quality,https://m.media-amazon.com/images/I/41mzQ4Bg5h...
16,Watches,TIMEWEAR Analog Day Date Functioning Stainless...,319,Stay fashionable with this watch from TIMEWEAR...,3.8 out of 5,"It is really nice product. Looks sturdy, forma...",https://m.media-amazon.com/images/I/415wDnu8sC...
20,Watches,NIBOSI Men Watches Analog Quartz Stainless Ste...,2499,NIBOSI Stainless Steel Men's Watch Classic Bus...,4.7 out of 5,This gold color NIBOSI watch is quite attracti...,https://m.media-amazon.com/images/I/416wu+VLPJ...
21,Watches,Carlington Analog-Digital Sports Watch: Chrono...,1119,Introducing the Carlington Endurance Series An...,3.8 out of 5,"Amazing watch to wear. Best watch to buy, Its ...",https://m.media-amazon.com/images/I/51fzc9GthK...


In [111]:
#Removing out of 5 from the elements in the user ratings column
df.rename(columns={'User Ratings': 'User Ratings (out of 5)'}, inplace=True)
# Define a function to remove the "out of 5" substring
def remove_substring(value):
    try:
        return value.replace(" out of 5", "")
    except:
        return value

# Apply the function to all values of the User Ratings column 
df['User Ratings (out of 5)'] = df['User Ratings (out of 5)'].apply(lambda x: remove_substring(x))

df.head()

Unnamed: 0,Category,Product Name,Price(₹),Product Description,User Ratings (out of 5),Top Review,Product Image URL
2,Watches,OLEVS Watches for Men Chronograph Business Wri...,2805,OLEVS Watches For Men Chronograph Business Wri...,4.6,"I like this watch, I like the gold band. Easy ...",https://m.media-amazon.com/images/I/51pvCriTdO...
9,Watches,Matrix Antique 2.0 Day & Date Softest Silicone...,299,Matrix Watch,4.2,It's price is reasonable and is of good quality,https://m.media-amazon.com/images/I/41mzQ4Bg5h...
16,Watches,TIMEWEAR Analog Day Date Functioning Stainless...,319,Stay fashionable with this watch from TIMEWEAR...,3.8,"It is really nice product. Looks sturdy, forma...",https://m.media-amazon.com/images/I/415wDnu8sC...
20,Watches,NIBOSI Men Watches Analog Quartz Stainless Ste...,2499,NIBOSI Stainless Steel Men's Watch Classic Bus...,4.7,This gold color NIBOSI watch is quite attracti...,https://m.media-amazon.com/images/I/416wu+VLPJ...
21,Watches,Carlington Analog-Digital Sports Watch: Chrono...,1119,Introducing the Carlington Endurance Series An...,3.8,"Amazing watch to wear. Best watch to buy, Its ...",https://m.media-amazon.com/images/I/51fzc9GthK...


In [112]:
#renaming the price column
df.rename(columns={'Price': 'Price(₹)'}, inplace=True)
df.head()

Unnamed: 0,Category,Product Name,Price(₹),Product Description,User Ratings (out of 5),Top Review,Product Image URL
2,Watches,OLEVS Watches for Men Chronograph Business Wri...,2805,OLEVS Watches For Men Chronograph Business Wri...,4.6,"I like this watch, I like the gold band. Easy ...",https://m.media-amazon.com/images/I/51pvCriTdO...
9,Watches,Matrix Antique 2.0 Day & Date Softest Silicone...,299,Matrix Watch,4.2,It's price is reasonable and is of good quality,https://m.media-amazon.com/images/I/41mzQ4Bg5h...
16,Watches,TIMEWEAR Analog Day Date Functioning Stainless...,319,Stay fashionable with this watch from TIMEWEAR...,3.8,"It is really nice product. Looks sturdy, forma...",https://m.media-amazon.com/images/I/415wDnu8sC...
20,Watches,NIBOSI Men Watches Analog Quartz Stainless Ste...,2499,NIBOSI Stainless Steel Men's Watch Classic Bus...,4.7,This gold color NIBOSI watch is quite attracti...,https://m.media-amazon.com/images/I/416wu+VLPJ...
21,Watches,Carlington Analog-Digital Sports Watch: Chrono...,1119,Introducing the Carlington Endurance Series An...,3.8,"Amazing watch to wear. Best watch to buy, Its ...",https://m.media-amazon.com/images/I/51fzc9GthK...


In [108]:
#Number of products from each product category in the final clean and consistent data frame
df['Category'].value_counts()

Category
Dresses    46
Earings    41
Watches    25
Name: count, dtype: int64

In [110]:
#Saving the csv file in my system
file_path = 'C:/Users/avani/OneDrive/Documents/WebScrapingProject/CSVofData.csv'  
df.to_csv(file_path, index=False)