# Automated web scraping using Selenium and BeautifulSoup

In [1]:
##importing required libraries
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import requests
import pandas as pd

In [2]:
##Saving the location of Chrome driver app
# Raw string: backslashes in a Windows path must not be read as escape
# sequences ('\P' and '\c' are invalid escapes in a normal string literal).
driver_path = r'C:\Program Files (x86)\chromedriver.exe'

In [3]:
##Creating a class to save all info of an item as an object
class info_saver:
    """Plain record holding the scraped details of one product.

    Attributes:
        name: Product title.
        price: Product price.
        brand: Brand name.
        rating: Average star rating.
        no_of_ratings: Number of ratings the product has received.
    """

    def __init__(self, product_name, price, brand, rating, no_of_ratings):
        self.name = product_name
        self.price = price
        self.brand = brand
        self.rating = rating
        self.no_of_ratings = no_of_ratings

    def __repr__(self):
        # Readable representation so records can be inspected in a notebook
        # or debugger instead of showing only an object address.
        return '{}(name={!r}, price={!r}, brand={!r}, rating={!r}, no_of_ratings={!r})'.format(
            type(self).__name__, self.name, self.price, self.brand,
            self.rating, self.no_of_ratings)

In [4]:
## Creating a function that gives IDs of all the items for a given page (url)
def ID_generator(url):
    """Return the item IDs (``data-asin`` values) found on one results page.

    Args:
        url: Address of the search-results page to download.

    Returns:
        A list of the non-empty ``data-asin`` attribute values of the matching
        result tiles, in page order. The list is empty when the request
        fails, the server does not answer with HTTP 200, or nothing matches.
    """
    # A browser-like User-Agent header is sent with the request.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

    try:
        # Without a timeout, requests can wait forever on a stalled server.
        response = requests.get(url, headers=header, timeout=30)
    except requests.RequestException:
        print('Error with link: ', url)
        return []

    if response.status_code != 200:
        # An error page holds no result tiles, so there is nothing to parse.
        print('Error with link: ', url)
        return []
    print('Request Granted')

    mysoup = BeautifulSoup(response.text, 'html.parser')

    # A multi-word class string is matched against the exact value of the
    # tag's class attribute, so only tiles with precisely this layout match.
    item_ids = mysoup.find_all('div', {'class': "sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col sg-col-4-of-20"})

    # Tiles with a missing or empty data-asin are skipped: indexing would
    # raise KeyError and an empty ID cannot form a product URL.
    return [tile['data-asin'] for tile in item_ids if tile.get('data-asin')]

In [5]:
## Creating a function that returns an object (containing all info of the item) for a given item ID
def _parse_price(text):
    """Convert a price label such as '₹ 1,299.00' to a float, or None if it is not numeric."""
    cleaned = text.replace('₹', '').replace(',', '').strip()
    try:
        return float(cleaned)
    except ValueError:
        return None


def _parse_rating_count(text):
    """Return the leading integer of a label such as '1,297 ratings' (0 if there is none)."""
    tokens = text.split()
    if not tokens:
        return 0
    try:
        return int(tokens[0].replace(',', ''))
    except ValueError:
        return 0


def info_collector(ID):
    """Open the product page for *ID* in Chrome and scrape its details.

    Args:
        ID: Item identifier appended to ``https://www.amazon.in/dp/``.

    Returns:
        An ``info_saver`` holding the product title, price, brand, rating and
        number of ratings. Price and rating are NaN, and the number of
        ratings is 0, when the corresponding element is missing or cannot be
        parsed.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the title or
            brand element is not present on the page.
    """
    # Imported locally so this function carries its own dependencies.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    ID_url = "https://www.amazon.in/dp/" + str(ID)

    options = Options()
    options.headless = False
    driver = webdriver.Chrome(driver_path, options=options)

    # try/finally guarantees the browser is shut down even when scraping
    # raises; otherwise every failed product leaks a Chrome instance.
    try:
        driver.maximize_window()

        # The timeout only applies to loads started after it is set, so it
        # has to be configured before get().
        driver.set_page_load_timeout(10)
        try:
            driver.get(ID_url)
        except TimeoutException:
            # Continue with whatever has rendered so far; the required
            # elements below still raise if they are missing.
            print('Page load timed out: ', ID_url)

        # find_element(by, value) is used because the find_element_by_*
        # shortcuts were removed in Selenium 4.3.

        ##getting product name
        titleText = driver.find_element('xpath', '//*[@id="productTitle"]').text

        ##getting product price (regular price first, then sale price)
        priceText = float('NAN')
        for xpath_price in ('//*[@id="priceblock_ourprice"]',
                            '//*[@id="priceblock_saleprice"]'):
            try:
                parsed_price = _parse_price(driver.find_element('xpath', xpath_price).text)
            except WebDriverException:
                continue
            if parsed_price is not None:
                priceText = parsed_price
                break

        ##getting product brand
        brandText = (driver.find_element('xpath', '//*[@id="bylineInfo"]').text
                     .replace('Brand: ', '')
                     .replace('Visit the ', '')
                     .replace(' Store', ''))

        ##getting product rating
        try:
            rating = driver.find_element('xpath', '//*[@id="acrPopover"]/span[1]/a/i[1]')
            ratingsoup = BeautifulSoup(rating.get_attribute("innerHTML") or '', 'html.parser')
            ratingText = float(ratingsoup.find('span').text.replace('out of 5 stars', ''))
        except (WebDriverException, AttributeError, ValueError):
            ratingText = float('NAN')

        ##getting number of ratings
        # Handled separately from the rating so that an unparseable count
        # does not discard a rating that was read successfully.
        try:
            no_of_ratings = driver.find_element('xpath', '//*[@id="acrCustomerReviewText"]')
            no_of_ratingsText = _parse_rating_count(no_of_ratings.get_attribute("innerHTML") or '')
        except WebDriverException:
            no_of_ratingsText = 0
    finally:
        # quit() ends the whole session (browser and driver process), not
        # just the current window as close() does.
        driver.quit()

    ##saving all info
    return info_saver(titleText, priceText, brandText, ratingText, no_of_ratingsText)

In [6]:
#Creating a function that returns a list of objects (that contains all info of each item)
#the function takes two arguments: 1. Name of the item we want to search on amazon (string)
#                                  2. The number of pages we want to collect information (integer)
def all_info_collector(search_name, no_of_pages):
    """Collect product records for every item on the first result pages.

    Args:
        search_name: Plain-text search term (not URL-encoded).
        no_of_pages: Number of result pages to walk, starting at page 1.

    Returns:
        A list with one ``info_collector`` result per item ID returned by
        ``ID_generator``, in page order. Empty when ``no_of_pages`` is less
        than 1.
    """
    # Imported locally so this function carries its own dependency.
    from urllib.parse import urlencode

    info_array = []
    for page_number in range(1, no_of_pages + 1):
        # urlencode escapes spaces and reserved characters ('&', '#', '+')
        # that would otherwise corrupt the query string.
        query = urlencode({'k': search_name, 'page': page_number})
        page_url = 'https://www.amazon.in/s?' + query
        info_array.extend(info_collector(item_id) for item_id in ID_generator(page_url))
    return info_array

In [7]:
def df_maker(info_obj_list):
    """Build a DataFrame with one row per product record.

    Args:
        info_obj_list: Iterable of objects exposing ``name``, ``brand``,
            ``price``, ``rating`` and ``no_of_ratings`` attributes.

    Returns:
        A DataFrame with the columns 'Product Name', 'Brand', 'Price',
        'Rating' and 'No. of ratings', indexed 0..n-1 in input order.
    """
    columns = ['Product Name', 'Brand', 'Price', 'Rating', 'No. of ratings']

    # All rows are built first and handed to the constructor in one go.
    # Looking up each row position with list.index() is quadratic and
    # overwrites rows when a record repeats or compares equal to an earlier
    # one.
    rows = [
        [info_obj.name, info_obj.brand, info_obj.price,
         info_obj.rating, info_obj.no_of_ratings]
        for info_obj in info_obj_list
    ]

    return pd.DataFrame(rows, columns=columns)
        

In [14]:
## Saving the dataframe as a csv file for later use
def csv_maker(dataframe, file_name):
    """Write *dataframe* to the file '<file_name>.csv'.

    The '.csv' extension is always appended to the string form of
    ``file_name``; the file is written with ``to_csv``'s default settings.
    """
    target_path = ''.join((str(file_name), '.csv'))
    dataframe.to_csv(target_path)

### Demo: Bamboo Toothbrush

In [9]:
# Scrape every product on the first results page for 'bamboo toothbrush'.
Bamboo_Toothbrush_info  = all_info_collector('bamboo toothbrush', 1)

Request Granted


In [10]:
# Turn the list of scraped product records into a DataFrame.
Bamboo_Toothbrush_df = df_maker(Bamboo_Toothbrush_info)

In [11]:
# Preview up to the first 30 rows of the scraped data.
Bamboo_Toothbrush_df.head(30)

Unnamed: 0,Product Name,Brand,Price,Rating,No. of ratings
0,Rusabl Bamboo Toothbrush with Charcoal Activat...,Rusabl,251.0,4.2,297
1,Terrabrush - Happy Mouth Happy Earth Slim Bamb...,Terrabrush - Happy Mouth Happy Earth,245.0,4.0,516
2,Mirakia Bamboo Charcoal Toothbrush Natural Woo...,Generic,235.0,4.1,14
3,Zeco Premium Bamboo Toothbrush - Adult 2-Piece...,ZECO,199.0,4.5,68
4,Zeco Bamboo Toothbrush - Adult 4-Piece BPA Fre...,ZECO,299.99,4.7,59
5,Rusabl Bamboo biodegradable Toothbrush with Ba...,Rusabl,275.0,4.1,136
6,ECO365 Bamboo Toothbrush With Charcoal Infused...,ECO365,239.0,4.0,204
7,Rusabl Bamboo Toothbrush with Charcoal Activat...,Rusabl,180.0,4.2,37
8,Rusabl Bamboo Toothbrush with Charcoal Activat...,Rusabl,274.0,3.8,53
9,Bigbluemarble Toothbrush - Bamboo Toothbrush i...,Generic,499.0,4.4,23


In [15]:
# Writes 'Bamboo_toothbrush_dataset.csv'; csv_maker appends the '.csv' suffix.
csv_maker(Bamboo_Toothbrush_df, 'Bamboo_toothbrush_dataset')