In [2]:
import re
import time
import urllib
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [3]:
def construct_url_params(url, kwargs):
    """
        Construct new url by adding query params to the url.

        Args:
            url: Base url; it may already carry a query string.
            kwargs: Mapping of query-parameter names to values. Names and
                values are percent-encoded, so characters such as ``&``,
                ``=`` or spaces cannot corrupt the resulting query string.

        Returns: new url (``url`` itself, unchanged, when ``kwargs`` is empty)
    """
    # Nothing to append: do not leave a dangling '?' or '&' behind.
    if not kwargs:
        return url

    query = urllib.parse.urlencode(kwargs)

    if '?' not in url:
        # No query string yet, so start one.
        separator = '?'
    elif url.endswith(('?', '&')):
        # The url already ends with a separator; adding another would
        # produce an empty parameter ("?&page=2").
        separator = ''
    else:
        separator = '&'

    return f'{url}{separator}{query}'

In [4]:
# Sanity check: the url already has a query string (`?k=...`), so the extra
# `page` parameter should be appended after an '&'.
construct_url_params("https://www.amazon.com/s?k=smart+plug+wifi", {'page': '2'})

'https://www.amazon.com/s?k=smart+plug+wifi&page=2'

In [167]:
class AmazonScraper:
    """
    Selenium-driven scraper for Amazon search results and product pages.

    Creating an instance starts a Chrome session and opens the search-results
    page for the given product name. ``getProductURLList`` walks the result
    pages; the remaining ``get*``/``parse*`` methods read whichever product
    page ``self.driver`` is currently showing, so the caller must navigate
    (``scraper.driver.get(url)``) before calling them.

    Relies on the module-level ``construct_url_params`` helper for paging.
    """
    base_url = "https://www.amazon.com/s?k="

    def __init__(self, product_name, chrome_path=r"C:\Users\moink\Downloads\chromedriver-win64\chromedriver.exe"):
        """
        Start Chrome and load the search page for ``product_name``.

        Args:
            product_name: Free-text search term, e.g. ``"smart plug wifi"``.
            chrome_path: Path to the chromedriver executable.
        """
        service = webdriver.chrome.service.Service(executable_path=chrome_path)
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        # NOTE(review): "detach" is understood to keep the browser window open
        # once the driver process ends - confirm against the ChromeDriver docs.
        options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(service=service, options=options)

        self.search_url = self._build_search_url(product_name)
        self.driver.get(self.search_url)

    @classmethod
    def _build_search_url(cls, product_name):
        """Return the search url for ``product_name`` with the term URL-encoded."""
        # Collapse whitespace runs first so they become a single '+', then
        # percent-encode everything else ('&', '+', '#', ...) so it cannot be
        # mistaken for url syntax.
        normalized = " ".join(product_name.split())
        return cls.base_url + urllib.parse.quote_plus(normalized)

    def isNullElement(self, element):
        """Return True when ``element`` has neither visible text nor an ``<img src>`` descendant."""
        img = element.find_elements(By.XPATH, ".//img[@src]")
        return not (element.text.strip() or img)

    def find_element(self, element, locator, expression, list=True):
        """
        Look up ``expression`` under ``element`` without raising when nothing matches.

        Args:
            element: Driver or web element to search from.
            locator: Selenium ``By`` strategy.
            expression: Locator expression for that strategy.
            list: When True return every match; when False return the first
                match or ``None``. (The name shadows the builtin but is kept
                because callers may pass it by keyword.)
        """
        result = element.find_elements(locator, expression)
        if list:
            return result

        return result[0] if result else None

    def getProductURLList(self):
        """
        Collect the product links from every page of the current search.

        Waits up to 20 seconds for the first result page to render, reads the
        total page count from the pagination strip and then loads each further
        page through ``construct_url_params``.

        Returns:
            list of product-page urls (may contain duplicates).
        """
        WebDriverWait(self.driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'a-section a-spacing-none a-spacing-top-small s-title-instructions-style')]")))

        qid_input = self.find_element(self.driver, By.XPATH, "//input[@name='qid']", list=False)
        qid = qid_input.get_attribute("value") if qid_input is not None else None

        # The disabled pagination item carries the last page number. The
        # lookup is document-wide, exactly as before: the original chained a
        # second "//..." XPath, which Selenium evaluates against the whole
        # document rather than the navigation div.
        # A search with a single result page has no such item -> 1 page.
        last_page = self.find_element(self.driver, By.XPATH, "//span[contains(@class, 's-pagination-item s-pagination-disabled')]", list=False)
        last_page_text = last_page.text.strip() if last_page is not None else ""
        total_page = int(last_page_text) if last_page_text.isdigit() else 1

        product_urls = []

        curr_page = 1

        while True:
            for product in self.driver.find_elements(By.XPATH, "//a[contains(@class, 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal')]"):
                href = product.get_attribute("href")
                # Anchors without an href cannot be visited later; skip them.
                if href:
                    product_urls.append(href)

            curr_page += 1

            if curr_page > total_page:
                break

            params = {'page': curr_page}
            if qid:
                params['qid'] = qid
            params['ref'] = f'sr_pg_{curr_page}'

            self.driver.get(construct_url_params(self.search_url, params))

        return product_urls

    def getLeftImage(self):
        """Return the ``src`` of every image in the product page's image block."""
        # Uses this instance's driver (previously the notebook-global `scraper`).
        return [img.get_attribute('src') for img in self.driver.find_elements(By.XPATH, "//div[@id='imageBlock']//div[@class='imgTagWrapper']//img")]

    def getProductNameAndIDFromURL(self):
        """
        Derive ``(product_name, product_id)`` from the driver's current url.

        Handles ``/<slug>/dp/<id>/...``, ``/dp/<id>`` and ``/gp/product/<id>``;
        the name is the slug with dashes turned into spaces, or ``""`` when the
        url carries no slug. Any other layout falls back to the positional rule
        (first segment = name, third segment = id).

        Raises:
            IndexError: when the path is too short for the positional fallback.
        """
        path = urllib.parse.urlparse(self.driver.current_url).path
        segments = [segment for segment in path.split('/') if segment]

        if 'dp' in segments:
            marker = segments.index('dp')
            if marker + 1 < len(segments):
                slug = segments[marker - 1] if marker > 0 else ""
                return slug.replace("-", " "), segments[marker + 1]

        if segments[:2] == ['gp', 'product'] and len(segments) > 2:
            return "", segments[2]

        if len(segments) < 3:
            raise IndexError(f"cannot locate a product id in url path {path!r}")

        return segments[0].replace("-", " "), segments[2]

    def parseCenterDiv(self):
        """
        Parse the centre column (``#centerCol``) of the current product page.

        Returns:
            dict with ``product_title``, ``product_brand``,
            ``product_brand_url``, ``product_about``, optionally
            ``customer_reviews``, plus one entry per row of the product
            overview table (row label -> value).
        """
        product_detail = {}

        centerDiv = self.driver.find_elements(By.XPATH, "//div[@id='ppd']")[0].find_element(By.XPATH, ".//div[@id='centerCol']")

        #get product title
        product_detail['product_title'] = centerDiv.find_element(By.ID, 'productTitle').text

        #get product brand; both keys are always set, None when there is no byline
        byline = self.find_element(centerDiv, By.ID, 'bylineInfo', list=False)
        if byline is not None:
            product_detail['product_brand'] = re.sub("^Visit the|^Brand:|store$", "", byline.text, flags=re.IGNORECASE).strip()
            product_detail['product_brand_url'] = byline.get_attribute('href')
        else:
            product_detail['product_brand'] = None
            product_detail['product_brand_url'] = None

        #get customer reviews (first line of the block's text)
        # NOTE: the leading "//" makes this lookup (and the overview lookup
        # below) document-wide rather than relative to centerDiv; kept as-is
        # so the same elements are found as before.
        customer_reviews = centerDiv.find_elements(By.XPATH, "//div[@id='averageCustomerReviews']")
        if customer_reviews:
            product_detail['customer_reviews'] = customer_reviews[0].text.split("\n")[0]

        #get product overview
        product_overview_feature_div = centerDiv.find_elements(By.XPATH, "//div[@id='productOverview_feature_div']")
        if product_overview_feature_div:
            soup = BeautifulSoup(product_overview_feature_div[0].get_attribute('innerHTML'), 'html.parser')
            for row in soup.find_all('tr'):
                cells = row.find_all('td')
                if not cells:
                    continue

                #the two branches below are just for glance icons
                if cells[0].find('table'):
                    nested_cells = cells[0].find_all('td')
                    if not nested_cells:
                        continue
                    cells = nested_cells[-1].find_all('span')
                elif cells[0].find('img'):
                    if len(cells) < 2:
                        continue
                    cells = cells[1].find_all('span')

                # A label/value pair needs two cells; skip malformed rows
                # instead of failing on the whole product.
                if len(cells) < 2:
                    continue

                product_detail[cells[0].text.strip()] = cells[1].text.strip()

        #parse about section
        #drop non ascii characters and collapse continuous whitespace
        product_detail['product_about'] = ""
        product_about = centerDiv.find_elements(By.XPATH, ".//div[@id='featurebullets_feature_div']//ul")

        if product_about:
            product_detail['product_about'] = re.sub(r"\s+", " ", re.sub(r'[^\x00-\x7F]+', "", product_about[0].text))

        return product_detail

    def parseBottomDivs(self, include_misc=False):
        """
        Parse the sections below the fold of the current product page.

        Reads the short description, the detail bullets or product-details
        tables, the A+ content, the A+ brand story and any ``btfContent``
        blocks.

        Args:
            include_misc: When True, also merge the secondary ("misc") entries
                into the returned dict: rows from non tech-spec detail tables
                and any ``extra_information`` text. They were previously
                collected and then dropped; the default keeps that output.

        Returns:
            ``(product_details, images)`` - a dict of descriptions and
            key/value specifications, and a list of image ``src`` values found
            in the parsed sections.
        """
        description_divs = self.driver.find_elements(By.XPATH, "//div[@id='productDescription']")
        short_description = description_divs[0].text.strip() if description_divs else ""

        product_config = {}
        misc = {}
        long_description = ""
        brand_story = ""
        detailBullets = self.driver.find_elements(By.XPATH, "//div[@id='detailBullets_feature_div' and not(@data-feature-name)]")

        if detailBullets:
            for li in detailBullets[0].find_elements(By.TAG_NAME, "li"):
                spans = li.find_elements(By.XPATH, ".//span/span")
                # Bullets without a label/value span pair carry no usable data.
                if len(spans) < 2:
                    continue
                product_config[spans[0].text.replace(":","").strip()] = spans[1].text.strip()
        else:
            productDetails = self.driver.find_elements(By.XPATH, "//div[@id='productDetailsNonPets_feature_div']")
            if productDetails:
                for table in productDetails[0].find_elements(By.TAG_NAME, "table"):
                    # Uses this instance (previously the notebook-global `scraper`).
                    if self.isNullElement(table):
                        continue

                    # Tech-spec tables feed the product configuration; every
                    # other table is secondary information. The id attribute
                    # may be absent, hence the `or ""`.
                    table_id = table.get_attribute('id') or ""
                    target = product_config if "productDetails_techSpec" in table_id else misc

                    for tr in table.find_elements(By.TAG_NAME, "tr"):
                        th = tr.find_elements(By.TAG_NAME, "th")
                        td = tr.find_elements(By.TAG_NAME, "td")
                        if th and td:
                            target[th[0].text.strip()] = td[0].text.strip()

        aplus_feature_div = self.driver.find_elements(By.XPATH, "//div[@id='aplus_feature_div' and div and normalize-space()]")

        images = []
        if aplus_feature_div:
            aplus_body = self.find_element(aplus_feature_div[0], By.XPATH, ".//div[@id='aplus']/div", list=False)
            if aplus_body is not None:
                long_description = aplus_body.text
            images = [i.get_attribute("src") for i in aplus_feature_div[0].find_elements(By.TAG_NAME, "img")]

        aplus_BS_feature_div = self.driver.find_elements(By.XPATH, "//div[@id='aplusBrandStory_feature_div' and div and normalize-space()]")

        if aplus_BS_feature_div:
            brand_body = self.find_element(aplus_BS_feature_div[0], By.XPATH, ".//div[@id='aplus']/div", list=False)
            if brand_body is not None:
                brand_story = brand_body.text

            images.extend([i.get_attribute("src") for i in aplus_BS_feature_div[0].find_elements(By.TAG_NAME, "img")])

        btf_contents = self.driver.find_elements(By.XPATH, "//div[contains(@id, 'btfContent') and div and normalize-space()]")
        btf_description = ""
        for btf_content in btf_contents:
            tables = btf_content.find_elements(By.TAG_NAME, "table")
            if tables:
                # NOTE(review): an earlier comment said "first table is config
                # and second is misc", but every table has always been written
                # to product_config; that behaviour is kept.
                for table in tables:
                    for tr in table.find_elements(By.TAG_NAME, "tr"):
                        td = tr.find_elements(By.TAG_NAME, "td")
                        if len(td) < 2:
                            continue
                        product_config[td[0].text.strip()] = td[1].text.strip()
            else:
                btf_description += btf_content.text.strip()
            images.extend([i.get_attribute("src") for i in btf_content.find_elements(By.TAG_NAME, "img")])

        # The btf text fills the first description slot that is still empty.
        if not long_description:
            long_description = btf_description
        elif not brand_story:
            brand_story = btf_description
        else:
            misc['extra_information'] = btf_description

        product_details = {
            'product_short_description': short_description or None,
            'product_long_desciption': long_description,
            'brand_story': brand_story,
        }

        if include_misc:
            # Merged before product_config so configuration values win on a key clash.
            product_details.update(misc)

        product_details.update(product_config)
        return product_details, images

    def quit(self):
        """Close the browser and end the WebDriver session."""
        self.driver.quit()

In [168]:
# Start the browser session; the constructor opens the search-results page
# for this term right away.
scraper = AmazonScraper("smart plug wifi")

In [169]:
# Walk every result page of the search and gather the product links.
product_urls = scraper.getProductURLList()

In [170]:
# Number of product links collected (the list is not de-duplicated).
len(product_urls)

402

In [172]:
# Visit every collected link and store the parsed page under its product id,
# so a product reached through several links is only scraped once.
product_details = {}

# (url, error) pairs for pages that could not be scraped. One bad page must
# not abort a crawl of several hundred products; inspect this list afterwards.
failed_urls = []

for product_url in product_urls:
    try:
        scraper.driver.get(product_url)

        # The id is read from the loaded page's url (after any redirect),
        # which is why the duplicate check can only happen after navigation.
        product_name, product_id = scraper.getProductNameAndIDFromURL()
        if product_id in product_details:
            continue

        product = {'product_id': product_id, 'product_name': product_name}

        images = scraper.getLeftImage()
        product.update(scraper.parseCenterDiv())

        prod, img = scraper.parseBottomDivs()
        product.update(prod)
        images.extend(img)

        product_details[product_id] = {'data': product, 'images': images}
    except Exception as exc:
        # Deliberately broad: page layouts vary and any parsing error should
        # only cost this one product. KeyboardInterrupt still stops the loop.
        failed_urls.append((product_url, repr(exc)))

In [173]:
# Number of distinct product ids scraped.
len(product_details)

383

In [118]:
# Spot-check a single scraped record, looked up by its product id.
# NOTE(review): this cell's execution count (118) is lower than that of the
# cells above, so the output shown may come from an earlier run - re-run to confirm.
product_details['B079KYHLDZ']

{'data': {'product_name': 'Meross Reliable Connection Mediatek Occupies',
  'product_id': 'B079KYHLDZ',
  'product_title': 'Meross Wi-Fi Smart Plug Mini, 15 Amp & Reliable Wi-Fi Connection, Support Alexa, Google Assistant, Remote Control, Timer, Occupies Only One Socket, 2.4G WiFi Only, 4 Pack',
  'product_brand': 'meross',
  'product_brand_url': 'https://www.amazon.com/stores/meross/page/6BD28492-A54E-43B5-9D93-223D2E453680?ref_=ast_bln',
  'customer_reviews': '4.4',
  'Brand': 'meross',
  'Color': 'white',
  'Voltage': '120 Volts',
  'Material': 'Acrylonitrile Butadiene Styrene (ABS)',
  'Connector Type': 'Plug In',
  'product_about': 'Off-Line Control: When the internet is down, you can still use Meross app to control the devices under the same wifi. Routine Offline Control allows schedule and timer to be running even when internet is down. No hub Required, works with 2.4GHz network. App Remote Control & Scene: The smart plug turn electronics on/off from anywhere anytime with Meross

In [174]:
# Flatten the scraped records into one row per product and save them as CSV.
# Images are not exported; only each record's 'data' dict is used.
# NOTE(review): absolute, machine-specific path - consider a configurable
# output directory so the notebook runs on other machines.
OUTPUT_CSV = r"C:\Users\moink\Downloads\network-security-white-labelling\amazon_products_dataset.csv"

processed_pd = [record['data'] for record in product_details.values()]
df = pd.DataFrame(processed_pd)
df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8')

In [176]:
# Preview the first rows of the exported frame.
df.head()

Unnamed: 0,product_id,product_name,product_title,product_brand,product_brand_url,customer_reviews,product_about,product_short_description,product_long_desciption,brand_story,...,AC Adapter Current,Total USB Ports,Finish Type,Manufacturer recommended age,Number of Ports,Average Battery Life,Shade Material,Fixture Type,Installation Type,Max Number of Supported Devices
0,B089DR29T6,Amazon smart plug works with Alexa,Amazon Smart Plug | Works with Alexa | control...,Amazon,https://www.amazon.com/Amazon/b/ref=bl_dp_s_we...,4.7,Amazon Smart Plug works with Alexa to add voic...,,Make your home smart\nAmazon Smart Plug lets y...,,...,,,,,,,,,,
1,B07RCNB2L3,TP Link Kasa Smart Wifi Plug,"Kasa Smart Plug HS103P4, Smart Home Wi-Fi Outl...",Kasa Smart,https://www.amazon.com/stores/KasaSmartbyTP-Li...,4.6,Voice control: Kasa smart plugs that work with...,The HS103 Kasa smart plug is the easiest way y...,Voice Control\nFree up your hands by using sim...,"Leading Smart Home Brand\nLaunched in 2015, Ka...",...,,,,,,,,,,
2,B0B62LPR5Z,Outlet Compatible Smartthings Control Function,"EIGHTREE Smart Plug, Smart Home WiFi Outlet Co...",EIGHTREE,https://www.amazon.com/stores/EIGHTREE-SmartLi...,4.4,Simplified Setup: Our upgraded smart plug make...,,Voice Control Timer & Schedule Device Sharing\...,Swipe left to learn more ⊳⊳⊳\nAbout Eightree\n...,...,,,,,,,,,,
3,B091FXLMS8,Kasa Smart Required Certified EP10P4,"Kasa Smart Plug Mini 15A, Smart Home Wi-Fi Out...",Kasa Smart,https://www.amazon.com/stores/KasaSmartbyTP-Li...,4.6,Voice control: Kasa smart plugs that work with...,The EP10 is Kasa latest mini smart plug that s...,Play Video\nCompact Design\nThe smallest Kasa ...,"Leading Smart Home Brand\nLaunched in 2015, Ka...",...,,,,,,,,,,
4,B08HQ2N235,meross Compatible Assistant SmartThings Waterp...,meross Outdoor Smart Plug Compatible with Appl...,meross,https://www.amazon.com/stores/meross/page/6BD2...,4.4,Ideal for Outdoor Use: IP44 weatherproof housi...,,2 Independent Outlets\n2 Outlets WORK INDEPEND...,"At Meross, we create products that empower peo...",...,,,,,,,,,,
