In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
def get_my_url_from_card(dynamicURL):
    """Build an absolute Amazon.in URL from a link taken off a result card.

    The value is appended to the host verbatim, with no separator added,
    so it has to bring its own leading slash.

    Args:
        dynamicURL: Site-relative link (for example an anchor's ``href``).

    Returns:
        The host followed directly by ``dynamicURL``.
    """
    return "https://www.Amazon.in{}".format(dynamicURL)

In [3]:
def get_my_url(dynamicURL):
    """Build an absolute Amazon.in URL from a path that has no leading slash.

    A single ``/`` is inserted between the host and ``dynamicURL``.

    Args:
        dynamicURL: Path and/or query string relative to the site root.

    Returns:
        The host, a slash, then ``dynamicURL``.
    """
    return "https://www.Amazon.in/{}".format(dynamicURL)

In [4]:
# Functions to extract product details from individual product links
    
def get_title_from_link(soup):
    """Return the product title found on a parsed product page.

    Looks up the ``<span id="productTitle">`` element and returns its text
    with surrounding whitespace removed.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped title, or ``""`` when the lookup hits an
        ``AttributeError`` (for example because the element is missing).
    """
    try:
        return soup.find("span", attrs={"id":'productTitle'}).text.strip()
    except AttributeError:
        # find() returned None (or soup itself is None): no title available.
        return ""

# Function to extract Product Price
def get_price_from_link(soup):
    """Return the product price shown on a parsed product page.

    Two locations are tried in order:

    1. the ``a-offscreen`` span nested inside the span whose class is
       ``a-price aok-align-center``;
    2. the span with id ``priceblock_dealprice``.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The price text with surrounding whitespace removed, or ``""`` when
        neither element is present.
    """
    try:
        price_span = soup.find("span", class_ = 'a-price aok-align-center')
        # Stripped so the value is trimmed the same way as the fallback below.
        price = price_span.find("span", class_ = 'a-offscreen').get_text().strip()

    except AttributeError:

        try:
            # If there is some deal price
            price = soup.find("span", attrs={'id':'priceblock_dealprice'}).string.strip()

        except AttributeError:
            # Only "element not found" is treated as a missing price; anything
            # else (KeyboardInterrupt, genuine bugs) is allowed to propagate.
            price = ""

    return price

# Function to extract Product Rating
def get_rating_from_link(soup):
    """Return the rating text shown on a parsed product page.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped rating text, or ``""`` when neither lookup finds a
        usable element.
    """
    try:
        # NOTE(review): this selector names the 'a-star-4-5' icon class only,
        # so it presumably matches a single rating bucket and every other
        # rating relies on the fallback below - confirm.
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()

    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except AttributeError:
            # Only "element not found" is treated as a missing rating; anything
            # else (KeyboardInterrupt, genuine bugs) is allowed to propagate.
            rating = ""

    return rating

# Function to extract Number of User Reviews
def get_review_count_from_link(soup):
    """Return the review-count text shown on a parsed product page.

    Reads the ``.string`` of the ``<span id="acrCustomerReviewText">``
    element and strips surrounding whitespace.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped text, or ``""`` when the lookup hits an
        ``AttributeError`` (element missing, or its ``.string`` is None).
    """
    try:
        count_tag = soup.find("span", attrs={'id':'acrCustomerReviewText'})
        return count_tag.string.strip()
    except AttributeError:
        return ""

# Function to extract description
def get_desc_from_link(soup):
    """Return the product description text from a parsed product page.

    Reads the text of the ``<div id="productDescription">`` element and
    strips surrounding whitespace.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped description, or ``''`` when the lookup hits an
        ``AttributeError`` (for example because the element is missing).
    """
    try:
        description_div = soup.find('div',attrs={'id':'productDescription'})
        return description_div.text.strip()
    except AttributeError:
        return ''

In [5]:
# Functions to get details from a single page
def get_brand(card):
    """Return the brand text displayed on a result card.

    Reads the text of the span whose class is
    ``a-size-base-plus a-color-base`` and strips surrounding whitespace.

    Args:
        card: Result-card element exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped brand text, or ``""`` when the lookup hits an
        ``AttributeError`` (for example because the span is missing).
    """
    try:
        brand_span=card.find('span',attrs={'class':'a-size-base-plus a-color-base'})
        return brand_span.text.strip()
    except AttributeError:
        return ""

def get_title(card,newSoup):
    """Return the product title, preferring the result card over the product page.

    Args:
        card: Result-card element exposing a BeautifulSoup-style ``find``.
        newSoup: Parsed product page, consulted only when the card has no
            title span.

    Returns:
        The stripped title from the card, otherwise whatever
        ``get_title_from_link(newSoup)`` returns.
    """
    try:
        title=card.find('span',attrs={'class':'a-size-base-plus a-color-base a-text-normal'}).text.strip()
    except AttributeError:
        # The helper already returns "" for a missing element, so it is called
        # without a blanket except: a bare except here would also hide
        # NameError (helper cell not run) and KeyboardInterrupt.
        title=get_title_from_link(newSoup)
    return title

def get_rating(card,newSoup):
    """Return the product rating, preferring the result card over the product page.

    On the card, the rating is read from the ``a-size-base`` span nested in
    the div whose class is ``a-section a-spacing-none a-spacing-top-micro``.

    Args:
        card: Result-card element exposing a BeautifulSoup-style ``find``.
        newSoup: Parsed product page, consulted only when the card lookup
            fails.

    Returns:
        The stripped rating text from the card, otherwise whatever
        ``get_rating_from_link(newSoup)`` returns.
    """
    try:
        rating_section=card.find('div',attrs={'class':'a-section a-spacing-none a-spacing-top-micro'})
        rating=rating_section.find('span',attrs={'class':'a-size-base'}).text.strip()
    except AttributeError:
        # The helper already returns "" for a missing element, so it is called
        # without a blanket except: a bare except here would also hide
        # NameError (helper cell not run) and KeyboardInterrupt.
        rating=get_rating_from_link(newSoup)
    return rating

def get_price(card,newSoup):
    """Return the product price, preferring the result card over the product page.

    Args:
        card: Result-card element exposing a BeautifulSoup-style ``find``.
        newSoup: Parsed product page, consulted only when the card has no
            ``a-price-whole`` span.

    Returns:
        The stripped ``a-price-whole`` text from the card, otherwise whatever
        ``get_price_from_link(newSoup)`` returns.
    """
    # NOTE(review): the card value and the product-page value come from
    # different elements and may be formatted differently - confirm before
    # treating the column as uniform.
    try:
        price=card.find('span',attrs={'class':'a-price-whole'}).text.strip()
    except AttributeError:
        # The helper already returns "" for a missing element, so it is called
        # without a blanket except: a bare except here would also hide
        # NameError (helper cell not run) and KeyboardInterrupt.
        price=get_price_from_link(newSoup)
    return price
def get_review_count(card,newSoup):
    """Return the review count, preferring the result card over the product page.

    Args:
        card: Result-card element exposing a BeautifulSoup-style ``find``.
        newSoup: Parsed product page, consulted only when the card has no
            ``a-size-base s-underline-text`` span.

    Returns:
        The card text with its first and last characters removed, otherwise
        whatever ``get_review_count_from_link(newSoup)`` returns.
    """
    try:
        # [1:-1] drops the first and last character of the stripped text -
        # presumably a surrounding pair of parentheses; confirm against the
        # live markup, since bare digits would be truncated.
        review_count=card.find('span',attrs={'class':'a-size-base s-underline-text'}).text.strip()[1:-1]
    except AttributeError:
        # The helper already returns "" for a missing element, so it is called
        # without a blanket except: a bare except here would also hide
        # NameError (helper cell not run) and KeyboardInterrupt.
        review_count=get_review_count_from_link(newSoup)
    return review_count
    

In [6]:
# Column-oriented accumulator for the scraped rows: one independent list per
# per-product field, plus a single category string shared by the whole run.
d = {column: [] for column in ("title", "brand", "price", "rating", "reviewCount", "description")}
d["categories"] = "Kitchen-And-Home-Appliances"

In [7]:
# Crawl listing pages 100-129 of the category, open every product linked from a
# result card, and append one row per product to the column lists in `d`.
HEADERS=({'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'})
# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection blocks the cell indefinitely.
REQUEST_TIMEOUT=30
for pageNo in range(100,130):
    dynamicURL=f"s?rh=n%3A4951860031&fs=true&ref=lp_4951860031_sar&page={pageNo}"
    URL=get_my_url(dynamicURL)
    try:
        webpage=requests.get(URL,headers=HEADERS,timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # One unreachable listing page should not abort the whole crawl.
        print(f"Skipping listing page {pageNo}: {err}")
        continue
    soup=BeautifulSoup(webpage.content,'html.parser')
    cardList=soup.find_all('div',attrs={'class':'s-card-container s-overflow-hidden aok-relative puis-expand-height puis-include-content-margin puis s-latency-cf-section s-card-border'})
    for card in cardList:
        link=card.find('a',attrs={'class':'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})
        linkHref=link.get('href') if link is not None else None
        if not linkHref:
            # No product anchor (or no href) on this card: nothing to follow.
            continue
        newUrl=get_my_url_from_card(linkHref)
        try:
            newWebpage=requests.get(newUrl,headers=HEADERS,timeout=REQUEST_TIMEOUT)
            productHtml=newWebpage.content
        except requests.RequestException as err:
            # Keep whatever the card itself provides; the product-page
            # fallbacks will see an empty document and return "".
            print(f"Could not fetch {newUrl}: {err}")
            productHtml=b""
        newSoup=BeautifulSoup(productHtml,'html.parser')
        # Extract every field first and append afterwards, so an exception in
        # one extractor cannot leave the column lists with different lengths.
        row={
            'title':get_title(card,newSoup),
            'brand':get_brand(card),
            'price':get_price(card,newSoup),
            'rating':get_rating(card,newSoup),
            'reviewCount':get_review_count(card,newSoup),
            'description':get_desc_from_link(newSoup),
        }
        for column,value in row.items():
            d[column].append(value)

In [8]:
# Build the result table from the scraped columns, drop products whose title
# could not be scraped, and write the remainder to CSV.
home_kitapp_df = pd.DataFrame.from_dict(d)
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form is not guaranteed to update the
# frame (it is a no-op under pandas Copy-on-Write), which would leave the
# blank titles in place and make the dropna below ineffective.
home_kitapp_df['title'] = home_kitapp_df['title'].replace('', np.nan)
home_kitapp_df = home_kitapp_df.dropna(subset=['title'])
home_kitapp_df.to_csv("amazon_data_kitchen_and_home_appliances_100_to_130.csv", header=True, index=True)

In [None]:
# Colab-only cell: make Google Drive available on the runtime's filesystem.
from google.colab import drive
# drive.mount() expects a local directory to mount onto, not a Drive folder ID.
# The previous argument, '/content/drive/folders/1z5yR1dTcrGyGvA4R5SllA6t-SPf_Sj-S',
# embedded a folder ID in the mount path.
# NOTE(review): once mounted, reach the intended folder through its path under
# the mount point - confirm which folder that ID refers to.
drive.mount('/content/drive')