In [1]:
# imports
from bs4 import BeautifulSoup as bs
import requests as req
import pandas as pd

In [119]:
# for Testing
product_page_url ='https://www.amazon.in/ASUS-i3-1005G1-Integrated-Transparent-K413JA-EK289T/dp/B08S4TY7WL/ref=sr_1_3?crid=1H2ECW5THL6OK&dchild=1&keywords=asus+laptops&qid=1612252842&sprefix=asus+la%2Caps%2C407&sr=8-3'

In [65]:
page_content = req.get(product_page_url,headers={"User-Agent": "Requests"}).content
bsoup_object = bs(page_content)
useful_content = bsoup_object.find('div',{'id':'centerCol'})

ASUS VivoBook Ultra K14 (2020), Intel Core i3-1005G1 10th Gen, 14-Inch FHD Thin and Light Laptop (8GB RAM/512GB SSD/Windows 10/Integrated Graphics/Transparent Silver/1.4 kg), K413JA-EK289T
[{'M.R.P.:': ' ₹\xa049,990.00'}, {'Price:': '₹\xa042,295.00'}]


In [2]:
# this fuction return product name from given product page
def get_product_name(content):
    block = content.find('div',{'id':'title_feature_div'})
    if block.h1 != None :
        h1 =block.h1
        if h1.find('span') :
            return h1.find('span').text.strip()

In [3]:
# this fuction return product MRP and price from given product page
def get_product_prices(content):
    block = content.find('div',{'id':'desktop_unifiedPrice'})
    try:
        if block.find('table'):
            table = block.find('table')
            table_rows = table.find_all('tr')

            prices = []
            for row in table_rows:
                 if row.find('td'):
                    row_data = row.find_all('td')
                    if len(row_data) ==2:
                        price_key =""
                        for data in row_data:
                            if data.find('span') and price_key !='':
                                 prices.append({price_key:data.span.text})   
                            else :
                                price_key = data.text
            return prices
    except : 
        return

In [4]:
# this function takes product page url and return product name and MRP and price
def get_product_details(product_page_url):
    page_content = req.get(product_page_url,headers={"User-Agent": "Requests"}).content
    bsoup_object = bs(page_content)
    useful_content = bsoup_object.find('div',{'id':'centerCol'})
    return { get_product_name(useful_content):get_product_prices(useful_content)}

## All functions are ready for scrape specific product information 
### Let use this functions for search specific result on amazon

In [5]:
base_url = 'https://www.amazon.in'

In [6]:
# this function takes a string we want to search and return a query for url 
def get_search_query(search):
    template = '/s?k={}'
    query = search.replace(' ','+')
    return template.format(query)

In [7]:
# this function return all possible product urls that you want to search
## Helper Function
def filter_product_url(card):
    product = card.find('h2')
    if product != None and product.find('a'):
        url = product.find('a').get('href')
        return url

In [8]:
# this function return all possible product urls that you want to search using url that contains search query
def get_all_products_url(url):
    page_content = req.get(url,headers={"User-Agent": "Requests"}).content
    # convert into beautiful soup object
    bsoup = bs(page_content)
    product_cards = bsoup.find_all('div' , {'class': 'sg-col-inner'})
    products = []
    for card in product_cards:
        product_name = filter_product_url(card)
        if product_name is not None:
            products.append(product_name)
    return products

In [9]:
# lets test our function , search for asus laptops on amazon
products_pages = get_all_products_url(base_url+get_search_query("asus laptops"))

In [22]:
# how many products pages are found
len(products_pages)

55

In [23]:
# lets see top 5 
products_pages[:5]

['/ASUS-VivoBook-i3-1005G1-Integrated-X409JA-EK011T/dp/B08BV8CBC9/ref=sr_1_1?dchild=1&keywords=asus+laptops&qid=1612359853&sr=8-1',
 '/ASUS-VivoBook-i3-1005G1-Integrated-X409JA-EK011T/dp/B08BV8CBC9/ref=sr_1_1?dchild=1&keywords=asus+laptops&qid=1612359853&sr=8-1',
 '/ASUS-VivoBook-i3-1005G1-Integrated-X409JA-EK011T/dp/B08BV8CBC9/ref=sr_1_1?dchild=1&keywords=asus+laptops&qid=1612359853&sr=8-1',
 '/ASUS-VivoBook-i3-1005G1-Integrated-X409JA-EK011T/dp/B08BV8CBC9/ref=sr_1_1?dchild=1&keywords=asus+laptops&qid=1612359853&sr=8-1',
 '/ASUS-i3-1005G1-15-6-inch-Integrated-X515JA-EJ301T/dp/B08PQVXSVK/ref=sr_1_2?dchild=1&keywords=asus+laptops&qid=1612359853&sr=8-2']

In [11]:
# lets use the above urls for finding the product details on product details page
## First ten product is enough
products_price_details =[]
for index , product in enumerate(products_pages):
    if index ==10 :
        break
    products_price_details.append(get_product_details(base_url+product))


In [13]:
# Format our data into dictionary of lists in order to feed into pandas
def format_data(product_list):
    product_names =[]
    MRPs =[]
    prices =[]
    for product in product_list:
        name = list(product.keys())[0]
        product_names.append(name)
        details = product[name]
        mrp = details[0][list(details[0].keys())[0]]        
        price = details[1][list(details[1].keys())[0]]
        MRPs.append(mrp)
        prices.append(price)
    return {"Product Name" : product_names,"MRP" :MRPs,"Selling Price":prices}

In [14]:
formated_data = format_data(products_price_details)

In [15]:
data_frame = pd.DataFrame(formated_data)
data_frame

Unnamed: 0,Product Name,MRP,Selling Price
0,ASUS VivoBook 14 Intel Core i3-1005G1 10th Gen...,"₹ 40,990.00","₹ 33,694.00"
1,ASUS VivoBook 14 Intel Core i3-1005G1 10th Gen...,"₹ 40,990.00","₹ 33,694.00"
2,ASUS VivoBook 14 Intel Core i3-1005G1 10th Gen...,"₹ 40,990.00","₹ 33,694.00"
3,ASUS VivoBook 14 Intel Core i3-1005G1 10th Gen...,"₹ 40,990.00","₹ 33,694.00"
4,ASUS VivoBook 15 (2020) Intel Core i3-1005G1 1...,"₹ 39,990.00","₹ 36,999.00"
5,ASUS VivoBook 15 (2020) Intel Core i3-1005G1 1...,"₹ 39,990.00","₹ 36,999.00"
6,ASUS VivoBook 15 (2020) Intel Core i3-1005G1 1...,"₹ 39,990.00","₹ 36,999.00"
7,"ASUS ZenBook 14, Intel Core i5-8265U 8th Gen, ...","₹ 1,02,990.00","₹ 54,990.00"
8,"ASUS ZenBook 14, Intel Core i5-8265U 8th Gen, ...","₹ 1,02,990.00","₹ 54,990.00"
9,"ASUS ZenBook 14, Intel Core i5-8265U 8th Gen, ...","₹ 1,02,990.00","₹ 54,990.00"


# Put all logic into One function 

In [17]:
# this function contains all logic. 
# This function takes product name as parameter and written top 10 results from amazon with mrp and price
def get_products_by_name(product_name):
    products_pages = get_all_products_url(base_url+get_search_query(product_name))
    products_price_details =[]
    for index , product in enumerate(products_pages):
        if index ==10 :
            break
        products_price_details.append(get_product_details(base_url+product))
    return format_data(products_price_details)

In [18]:
# calling function in order to search for samsung mobiles
product_list = get_products_by_name("samsung mobiles")
data_frame = pd.DataFrame(product_list)

In [19]:
data_frame

Unnamed: 0,Product Name,MRP,Selling Price
0,"Samsung Galaxy M31 Prime Edition (Space Black,...","₹ 19,999.00","₹ 16,499.00"
1,"Samsung Galaxy M31 Prime Edition (Space Black,...","₹ 19,999.00","₹ 16,499.00"
2,"Samsung Galaxy M31 Prime Edition (Space Black,...","₹ 19,999.00","₹ 16,499.00"
3,"Samsung Galaxy M31 Prime Edition (Space Black,...","₹ 19,999.00","₹ 16,499.00"
4,"Samsung Galaxy M51 (Electric Blue, 6GB RAM, 12...","₹ 28,999.00","₹ 22,999.00"
5,"Samsung Galaxy M51 (Electric Blue, 6GB RAM, 12...","₹ 28,999.00","₹ 22,999.00"
6,"Samsung Galaxy M51 (Electric Blue, 6GB RAM, 12...","₹ 28,999.00","₹ 22,999.00"
7,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...","₹ 15,999.00","₹ 13,999.00"
8,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...","₹ 15,999.00","₹ 13,999.00"
9,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...","₹ 15,999.00","₹ 13,999.00"


In [20]:
# calling function in order to search for external harddisk
data = get_products_by_name("external harddisk")
data_frame_2 = pd.DataFrame(data)

In [21]:
data_frame_2

Unnamed: 0,Product Name,MRP,Selling Price
0,Seagate Backup Plus Portable 4 TB External HDD...,"₹ 11,999.00","₹ 7,999.00"
1,Seagate Backup Plus Portable 4 TB External HDD...,"₹ 11,999.00","₹ 7,999.00"
2,Seagate Backup Plus Portable 4 TB External HDD...,"₹ 11,999.00","₹ 7,999.00"
3,Seagate Backup Plus Portable 4 TB External HDD...,"₹ 11,999.00","₹ 7,999.00"
4,ADATA HV320 1TB Sleek Light Portable USB 3.1 E...,"₹ 4,999.00","₹ 3,979.00"
5,ADATA HV320 1TB Sleek Light Portable USB 3.1 E...,"₹ 4,999.00","₹ 3,979.00"
6,ADATA HV320 1TB Sleek Light Portable USB 3.1 E...,"₹ 4,999.00","₹ 3,979.00"
7,Seagate Expansion 1.5 TB External HDD - USB 3....,"₹ 5,899.00","₹ 4,099.00"
8,Seagate Expansion 1.5 TB External HDD - USB 3....,"₹ 5,899.00","₹ 4,099.00"
9,Seagate Expansion 1.5 TB External HDD - USB 3....,"₹ 5,899.00","₹ 4,099.00"
