<center style = "color: purple; font-weight: bold"><h1>Scraping - Analystt.ai Assignment</h1></center>

In [1]:
# required modules loading
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Data Scraper 

In [2]:
# class Data Crawling
class DataCrawler:
    '''
    Selenium/Chrome crawler for a search listing and the product pages it links to.

    Part 1 walks the search-result pages and stores one row per product in
    part1_crawled_data.csv. Part 2 opens every collected product URL and stores the
    page details in part2_crawled_data.csv. Both files are written to the current
    working directory.
    '''

    # Locator of the "next page" pagination link below the search results.
    NEXT_PAGE_XPATH = "//a[@class='s-pagination-item s-pagination-next s-pagination-button s-pagination-separator']"
    # Column order of the two CSV files.
    PART1_COLUMNS = ['Product_URL', 'Product_Name', 'Product_Price', 'Product_Rating', 'Product_Reviews']
    PART2_COLUMNS = ['Description', 'ASIN', 'Product_Description', 'Manufacturer']

    def __init__(
        self,
        header = "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.152 Safari/537.36"
    ):
        '''
        Start a maximized Chrome window.

        header: Chrome command-line argument (here a user-agent override) that is
                passed to the browser through ChromeOptions.
        '''
        self.header = header
        self.options = webdriver.ChromeOptions()
        self.options.add_argument(self.header)
        self.driver = webdriver.Chrome(options = self.options)
        self.driver.maximize_window()


    # Part 1 Crawling
    def part1_crawler(self, BASE_URL, num_pages = 20):
        '''
        Part-1 Crawling
        • Product URL
        • Product Name
        • Product Price
        • Rating
        • Number of reviews

        BASE_URL:  URL of the first search-result page.
        num_pages: how many consecutive result pages to crawl (default 20).

        Saves the rows to part1_crawled_data.csv and returns the list of product URLs.
        '''
        records = []
        self.driver.get(BASE_URL)

        for page in range(1, num_pages + 1):
            print(f"Crawling page {page}: ", end = "")
            is_last_page = page == num_pages

            # The last requested page is never followed by a click on "next", so the
            # link is only awaited on earlier pages; waiting for it on the final page
            # could run into the 50 s timeout.
            if not is_last_page:
                WebDriverWait(self.driver, 50).until(EC.element_to_be_clickable(
                    (By.XPATH, self.NEXT_PAGE_XPATH)
                ))

            # every product card listed on the current page
            products = self.driver.find_elements(By.XPATH, '//div[@data-component-type="s-search-result"]')
            for product in products:
                records.extend(self._extract_listing_rows(product))

            print("Completed")

            if not is_last_page:
                self.driver.find_element(By.XPATH, self.NEXT_PAGE_XPATH).click()

        # Build the frame once from the collected records instead of growing it row by row.
        crawled_data = pd.DataFrame(records, columns = self.PART1_COLUMNS)
        crawled_data.to_csv("part1_crawled_data.csv", index=False)
        print(f"crawled data saved at location {os.path.abspath('part1_crawled_data.csv')}")

        return list(crawled_data['Product_URL'])

    def _extract_listing_rows(self, product):
        # Turn one search-result card into its CSV row(s).
        links = product.find_elements(
            By.XPATH, ".//a[@class = 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal']"
        )
        names = product.find_elements(By.XPATH, ".//span[@class = 'a-size-medium a-color-base a-text-normal']")
        prices = product.find_elements(By.XPATH, ".//span[@class='a-price-whole']")
        # first span carries the rating label, second the review count
        ratings_box = product.find_elements(By.XPATH, ".//div[@class='a-row a-size-small']/span")

        # Products without rating/review spans get 0. Each index is guarded on its own
        # because a card may expose the rating span without the review span.
        product_rating = ratings_box[0].get_attribute('aria-label') if len(ratings_box) > 0 else 0
        product_review = ratings_box[1].get_attribute('aria-label') if len(ratings_box) > 1 else 0

        # zip() drops the card when link, name or price is missing.
        return [
            {
                "Product_URL": link.get_attribute('href'),
                "Product_Name": name.text,
                "Product_Price": price.text,
                "Product_Rating": product_rating,
                "Product_Reviews": product_review
            }
            for link, name, price in zip(links, names, prices)
        ]


    # Part 2 Crawling
    def part2_crawler(self, product_URLs):
        '''
        Part-2 Crawling
        • Description
        • ASIN
        • Product Description
        • Manufacturer

        product_URLs: iterable of product-page URLs (as returned by part1_crawler).

        A product whose page cannot be opened or parsed is reported as "Skipped" and
        left out of the CSV. The browser is shut down when the method finishes, whether
        or not it succeeded. Returns "Data crawled successfully".
        '''
        records = []
        try:
            for position, URL in enumerate(product_URLs, start = 1):
                print(f"Crawling {position} product: ", end = "")
                try:
                    self.driver.get(URL)
                    records.append(self._extract_product_row())
                except Exception as error:
                    # Best effort: one broken page must not abort the whole crawl,
                    # but it must not be reported as completed either.
                    print(f"Skipped ({type(error).__name__})")
                else:
                    print("Completed")
                # No navigation back is needed: the next iteration loads its own URL.

            crawled_data = pd.DataFrame(records, columns = self.PART2_COLUMNS)
            crawled_data.to_csv('part2_crawled_data.csv', index=False)
            print(f"crawled data saved at location {os.path.abspath('part2_crawled_data.csv')}")
        finally:
            # quit() ends the whole WebDriver session, not just the current window.
            self.driver.quit()
        return "Data crawled successfully"

    def _extract_product_row(self):
        # Read the detail fields from the product page that is currently open.
        bullet_spans = self.driver.find_elements(By.XPATH, "//div[@id = 'feature-bullets']/ul/li/span")
        description_paragraphs = self.driver.find_elements(By.XPATH, "//div[@id = 'productDescription']/p")
        product_details = self.driver.find_elements(By.XPATH, "//div[@id = 'detailBullets_feature_div']/ul/li/span")

        # NOTE(review): positional lookup - assumes the 3rd and 4th detail bullets are
        # the manufacturer and the ASIN; confirm against the live page layout. Pages
        # with fewer than four bullets raise IndexError and are skipped by the caller.
        return {
            'Description': ''.join(span.text for span in bullet_spans),
            'ASIN': product_details[3].text,
            'Product_Description': ''.join(paragraph.text for paragraph in description_paragraphs),
            'Manufacturer': product_details[2].text
        }


    def main(self, BASE_URL, num_pages = 20):
        '''
        Run Part 1 on BASE_URL (num_pages result pages), then Part 2 on the URLs found.
        '''
        print("Part 1 crawling started......")
        try:
            URLs = self.part1_crawler(BASE_URL, num_pages)
        except BaseException:
            # Part 2 (which normally shuts the browser down) will not run: release it here.
            self.driver.quit()
            raise
        print("Part 1 crawling completed successfully......")

        print("Part 2 crawling started.....")
        self.part2_crawler(URLs)
        print("Part 2 crawling completed successfully.....")

### Base URL 

In [3]:
# First search-result page for the keyword "bags"; adjacent literals are joined
# at compile time, so the value is one unbroken URL.
BASE_URL = (
    'https://www.amazon.in/s'
    '?k=bags'
    '&crid=2M096C61O4MLT'
    '&qid=1653308124'
    '&sprefix=ba%2Caps%2C283'
    '&ref=sr_pg_1'
)

### starting crawling 

In [4]:
# Constructing the crawler opens a Chrome window; main() then runs Part 1 and
# Part 2 back to back and writes both CSV files to the working directory.
crawler = DataCrawler()
crawler.main(BASE_URL = BASE_URL)

Part 1 crawling started......
Crawling page 1: Completed
Crawling page 2: Completed
Crawling page 3: Completed
Crawling page 4: Completed
Crawling page 5: Completed
Crawling page 6: Completed
Crawling page 7: Completed
Crawling page 8: Completed
Crawling page 9: Completed
Crawling page 10: Completed
Crawling page 11: Completed
Crawling page 12: Completed
Crawling page 13: Completed
Crawling page 14: Completed
Crawling page 15: Completed
Crawling page 16: Completed
Crawling page 17: Completed
Crawling page 18: Completed
Crawling page 19: Completed
Crawling page 20: Completed
crawled data saved at location W:\analystt.ai/part1_crawled_data.csv
Part 1 crawling completed successfully......
Part 2 crawling started.....
Crawling 1 product: Completed
Crawling 2 product: Completed
Crawling 3 product: Completed
Crawling 4 product: Completed
Crawling 5 product: Completed
Crawling 6 product: Completed
Crawling 7 product: Completed
Crawling 8 product: Completed
Crawling 9 product: Completed
Crawli

<center style = "color: purple; font-weight: bold"><i><h2>This is from my side...</h2></i></center>
<center style = "color: purple; font-weight: bold"><i><h3>Hope this notebook will be informative</h3></i></center>