In [163]:
import re
import time
from urllib.parse import quote_plus, urlencode

import requests
from matplotlib import pyplot as plt
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [164]:
def construct_url_params(url, kwargs):
    """
        Construct new url by adding query params to the url.

        Args:
            url: base url, with or without an existing query string.
            kwargs: mapping of query-parameter names to values. Names and
                values are percent-encoded, so characters such as spaces,
                '&' or '=' inside a value cannot corrupt the query string.

        Returns: new url (the url is returned untouched when kwargs is empty)
    """

    # nothing to append: return as-is instead of leaving a dangling '?'
    if not kwargs:
        return url

    # pick the joiner: none if the url already ends with one, '&' if a query
    # string is already present, '?' otherwise
    if url.endswith(('?', '&')):
        separator = ''
    elif '?' in url:
        separator = '&'
    else:
        separator = '?'

    return f'{url}{separator}{urlencode(kwargs)}'

In [165]:
# Sanity check: this url already carries a query string, so the extra parameter is joined with '&'.
construct_url_params("https://www.amazon.com/s?k=smart+plug+wifi", {'page': '2'})

'https://www.amazon.com/s?k=smart+plug+wifi&page=2'

In [166]:
class AmazonScraper:
    """
        Selenium-driven scraper for Amazon search results and product pages.

        Creating an instance starts a Chrome session and loads the search
        results for the given product name. The parse*/getLeftImage methods
        read whatever page `self.driver` currently shows, so the caller is
        expected to navigate the driver to a product page before calling them.
        Call quit() when done to close the browser.
    """
    base_url = "https://www.amazon.com/s?k="

    # ids of the sections below the main product block that parseBottomDivs ignores
    _SKIPPED_BOTTOM_DIV_IDS = frozenset([
        "sims-themis-sponsored-products-2_feature_div",
        "ask-btf_feature_div",
        "discovery-and-inspiration_feature_div",
        "dp-ads-center-promo_feature_div",
        "ad-endcap-1_feature_div",
        "btfSubNavDesktop_feature_div",
        "HLCXComparisonWidgetTechnical_feature_div",
        "legal-compliance-card_feature_div",
        "similarities_feature_div",
        "customer-reviews_feature_div",
        "va-related-videos-widget_feature_div",
    ])

    def __init__(self, product_name, chrome_path=r"C:\Users\moink\Downloads\chromedriver-win64\chromedriver.exe"):
        """
            Start Chrome and open the search results page for product_name.

            Args:
                product_name: free-text search term; runs of whitespace become
                    a single '+', everything else is percent-encoded.
                chrome_path: path to the chromedriver executable.
                    NOTE(review): the default is a machine-specific path kept
                    only for backward compatibility; pass your own path.
        """
        service = webdriver.chrome.service.Service(executable_path=chrome_path)
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")

        self.driver = webdriver.Chrome(service=service, options=options)

        # collapse whitespace first so every run maps to exactly one '+',
        # then encode so characters like '&' or '+' in the term survive
        self.search_url = self.base_url + quote_plus(re.sub(r"\s+", " ", product_name))
        try:
            self.driver.get(self.search_url)
        except Exception:
            # do not leave an orphaned browser behind if the first load fails
            self.driver.quit()
            raise

    def _getTotalPages(self):
        """
            Read the last page number from the pagination bar.

            Returns: the first numeric label among the disabled pagination
            items, or 1 when there is none.
        """
        # The XPath is absolute ('//'), i.e. document-wide, which is also what
        # the previous chained lookup effectively did.
        labels = self.driver.find_elements(By.XPATH, "//span[contains(@class, 's-pagination-item s-pagination-disabled')]")
        for label in labels:
            try:
                return max(int(label.text), 1)
            except ValueError:
                continue
        return 1

    def getProductURLList(self):
        """
            Collect product links from every page of the search results.

            Waits up to 20 seconds for the first results page, then loads each
            following page in the same browser session.

            Returns: list of href strings in page order (links without an href
            are skipped).
        """
        WebDriverWait(self.driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'a-section a-spacing-none a-spacing-top-small s-title-instructions-style')]")))

        qid = self.driver.find_element(By.XPATH, "//input[@name='qid']").get_attribute("value")
        total_page = self._getTotalPages()

        product_urls = []
        for curr_page in range(1, total_page + 1):
            # page 1 is already loaded by __init__
            if curr_page > 1:
                next_page_url = construct_url_params(self.search_url, {'page': curr_page, 'qid': qid, 'ref': f'sr_pg_{curr_page}'})
                self.driver.get(next_page_url)

            for product in self.driver.find_elements(By.XPATH, "//a[contains(@class, 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal')]"):
                href = product.get_attribute("href")
                if href:
                    product_urls.append(href)

        return product_urls

    def getLeftImage(self):
        """
            Returns: list of 'src' values of the images in the image block of
            the page currently loaded in this scraper's own driver.
        """
        image_elements = self.driver.find_elements(By.XPATH, "//div[@id='imageBlock']//div[@class='imgTagWrapper']//img")
        return [img.get_attribute('src') for img in image_elements]

    def parseCenterDiv(self):
        """
            Parse the main product block of the currently loaded product page.

            Returns: dict with keys 'product_title', 'product_brand',
            'product_brand_url', 'product_overview' (dict, empty when the
            overview section or its table rows are absent) and 'product_about'.

            Raises: whatever Selenium raises when the main block, title,
            byline or feature-bullet list cannot be found.
        """
        product_detail = {}
        centerDiv = self.driver.find_element(By.XPATH, "//div[@id='ppd' and div[@id='centerCol']]")

        #get product title
        product_detail['product_title'] = centerDiv.find_element(By.ID, 'productTitle').text

        #get product brand
        byline = centerDiv.find_element(By.ID, 'bylineInfo')
        product_detail['product_brand'] = byline.text
        product_detail['product_brand_url'] = byline.get_attribute('href')

        #get product overview
        # The XPath is absolute, so it is issued on the driver; find_elements
        # lets a page without this section yield an empty overview.
        overview = {}
        for section in self.driver.find_elements(By.XPATH, "//div[@id='productOverview_feature_div']")[:1]:
            for row in section.find_elements(By.XPATH, "*//table//tr"):
                cells = row.find_elements(By.TAG_NAME, 'td')
                # skip rows that are not a label/value pair
                if len(cells) >= 2:
                    overview[cells[0].text] = cells[1].text
        product_detail['product_overview'] = overview

        #parse about section
        #drop non ascii characters, then collapse continuous whitespace
        about_text = centerDiv.find_element(By.XPATH, ".//div[@id='featurebullets_feature_div']//ul").text
        product_detail['product_about'] = re.sub(r"\s+", " ", re.sub(r'[^\x00-\x7F]+', "", about_text))

        return product_detail

    def parseBottomDivs(self, verbose=True):
        """
            Parse the sections that follow the 'bottomRow' div on the
            currently loaded product page.

            For every section that is not in the skip list: if it contains a
            table, the first table is read as {first cell text: second cell
            text}; otherwise the section's text is used.

            Args:
                verbose: when True (default) print the index and id of every
                    section that is parsed.

            Returns: (product_details, images) where product_details maps
            section id to its parsed data (sections with no data are left out)
            and images is the list of 'src' values of all <img> elements in
            the parsed sections.
        """
        # Find the reference element
        reference_element = self.driver.find_element(By.XPATH, "//div[@id='bottomRow']")

        # Find all elements that come after the reference element
        following_elements = reference_element.find_elements(By.XPATH, "following-sibling::div[@data-feature-name and div and normalize-space()]")

        product_details = {}
        images = []
        for i, div in enumerate(following_elements):
            div_id = div.get_attribute('id')
            if div_id in self._SKIPPED_BOTTOM_DIV_IDS:
                continue

            if verbose:
                print(i, div_id)

            tables = div.find_elements(By.TAG_NAME, "table")
            if tables:
                data = {}
                for row in tables[0].find_elements(By.TAG_NAME, 'tr'):
                    cells = row.find_elements(By.XPATH, ".//td | .//th")
                    # heading-only or spacer rows have no value cell
                    if len(cells) >= 2:
                        data[cells[0].text] = cells[1].text.strip()
            else:
                data = div.text

            #parse images
            images.extend(img.get_attribute('src') for img in div.find_elements(By.TAG_NAME, "img"))

            #only keep sections that produced some data
            if data:
                product_details[div_id] = data

        return product_details, images

    def quit(self):
        """Close the browser and end the WebDriver session."""
        self.driver.quit()

In [167]:
# Starts a Chrome session (default chromedriver path) and loads the search results page for this query.
scraper = AmazonScraper("smart plug wifi")

In [168]:
# Collects product links from every results page; each further page is loaded in the same browser session.
product_urls = scraper.getProductURLList()

In [169]:
# Number of product links collected across all results pages.
len(product_urls)

438

In [170]:
# Scrape the first ten product urls with the shared `scraper` session and
# collect one record per product, keyed by product title.
# NOTE(review): two products with the same title would overwrite each other here.
product_details= {}
for product_url in product_urls[:10]:
    print(product_url)
    # navigate the shared browser to the product page; the calls below parse that page
    scraper.driver.get(product_url)
    images = scraper.getLeftImage()
    product = scraper.parseCenterDiv()
    
    prod, img = scraper.parseBottomDivs()
    
    # fold the sections below the main block (and their images) into the main record
    product.update(prod)
    images.extend(img)
    
    product_details[product['product_title']] = {'data': product, 'images': images}

https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo2MDA4MTMyMTQ2MzQ2Njg4OjE3MDA0OTY3NzE6c3BfYXRmOjIwMDA2Njg0OTU1NTg2MTo6MDo6&url=%2FAmazon-smart-plug-works-with-Alexa%2Fdp%2FB089DR29T6%2Fref%3Dsr_1_1_ffob_sspa%3Fkeywords%3Dsmart%2Bwifi%2Bplugs%26qid%3D1700496771%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1
0 sac-btf-start_feature_div
1 device-dp-recommendations_feature_div
2 btfContent1_feature_div
3 btfContent5_feature_div
4 btfContent7_feature_div
5 btfContent8_feature_div
6 climatePledgeFriendlyBTF_feature_div
https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo2MDA4MTMyMTQ2MzQ2Njg4OjE3MDA0OTY3NzE6c3BfYXRmOjIwMDAxNjA0MTQ0MjY0MTo6MDo6&url=%2FBN-LINK-Monitoring-Function-Compatible-Assistant%2Fdp%2FB07CVPKD8Z%2Fref%3Dsr_1_2_sspa%3Fkeywords%3Dsmart%2Bwifi%2Bplugs%26qid%3D1700496771%26sr%3D8-2-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1
3 detailBullets_feature_div
4 buffetServiceCard_feature_div
5 productDescription_feature_div
6 importantInformation_feature_div
7 aplu

In [171]:
# Display every scraped record: keys are product titles, each value holds 'data' and 'images'.
product_details

{'Amazon Smart Plug | Works with Alexa | control lights with voice | easy to set up and use': {'data': {'product_title': 'Amazon Smart Plug | Works with Alexa | control lights with voice | easy to set up and use',
   'product_brand': 'Brand: Amazon',
   'product_brand_url': 'https://www.amazon.com/Amazon/b/ref=bl_dp_s_web_20784502011?ie=UTF8&node=20784502011&field-lbr_brands_browse-bin=Amazon',
   'product_overview': {},
   'product_about': "Amazon Smart Plug works with Alexa to add voice control to any outlet. Certified for Humans - Struggle-free, tinker-free, stress-free. No patience neededit's actually simple. Schedule lights, fans, and appliances to turn on and off automatically, or control them remotely when youre away. Simple to set up and useplug in, open the Alexa app, and get started in minutes. Compact design keeps your second outlet free. No smart home hub requiredset up routines and schedules through the Alexa app.",
   'sac-btf-start_feature_div': 'Smart home devices for y

In [162]:
# Look up a single scraped record by its full product title (the dictionary key).
# NOTE(review): this cell's execution count (162) is lower than the cells above it, so the
# output shown comes from an earlier run — confirm it still holds after Restart & Run All.
product_details['Kasa Smart Plug HS103P4, Smart Home Wi-Fi Outlet Works with Alexa, Echo, Google Home & IFTTT, No Hub Required, Remote Control, 15 Amp, UL Certified, 4-Pack, White']

{'data': {'product_title': 'Kasa Smart Plug HS103P4, Smart Home Wi-Fi Outlet Works with Alexa, Echo, Google Home & IFTTT, No Hub Required, Remote Control, 15 Amp, UL Certified, 4-Pack, White',
  'product_brand': 'Visit the Kasa Smart Store',
  'product_brand_url': 'https://www.amazon.com/stores/KasaSmartbyTP-Link/page/9A672382-E380-4360-A698-DFBEAE7E7049?ref_=ast_bln',
  'product_overview': {'Brand': 'Kasa Smart',
   'Color': 'Indoor Plug 4-Pack',
   'Connector Type': 'Plug in',
   'No. of wires': '3',
   'Plug Format': 'Type B'},
  'product_about': 'Voice control: Kasa smart plugs that work with Alexa and Google Home Assistant. Enjoy the hands free convenience of controlling any home electronic appliances with your voice via Amazon Alexa or Google Assistant. Compatible with Android 5.0 or higher and iOS 10.0 or higher Smart Outlet Control from anywhere: Turn electronics on and off your smart home devices from anywhere with your smartphone using the Kasa app, whether you are at home, i