In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
import re
from matplotlib import pyplot as plt
import time
import urllib

In [3]:
def construct_url_params(url, kwargs):
    """
        Construct new url by adding query params to the url.
        
        Returns: new url
    """
    
    #if no query params add ? else add & at the end of url
    url+='?' if not '?' in url else '&'
    for i,j in kwargs.items():
        url+=f'{i}={j}&'
        
    #strip extra & at the end
    return url.rstrip('&')

In [4]:
construct_url_params("https://www.amazon.com/s?k=smart+plug+wifi", {'page': '2'})

'https://www.amazon.com/s?k=smart+plug+wifi&page=2'

In [142]:
class AmazonScraper:
    base_url = "https://www.amazon.com/s?k="

    def __init__(self, product_name, chrome_path=r"C:\Users\moink\Downloads\chromedriver-win64\chromedriver.exe"):
        
        service = webdriver.chrome.service.Service(executable_path=chrome_path)
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(service=service, options=options)

        self.search_url = self.base_url+ re.sub("\s+", "+", product_name)
        self.driver.get(self.search_url)
        
        
        
    def isNullElement(self, element):
        img = element.find_elements(By.XPATH, ".//img[@src]")
        if element.text.strip() or img:
            return False
        return True
    
    def find_element(self, element, locator, expression, list=True):
        result = element.find_elements(locator, expression)
        if list:
            return result
        
        return result[0] if result else None

    def getProductURLList(self):
        WebDriverWait(self.driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'a-section a-spacing-none a-spacing-top-small s-title-instructions-style')]")))
        
        qid = self.driver.find_element(By.XPATH, "//input[@name='qid']").get_attribute("value")
        total_page = int(self.driver.find_element(By.XPATH, "//div[@role='navigation']").find_element(By.XPATH, "//span[contains(@class, 's-pagination-item s-pagination-disabled')]").text)

        product_urls = []
        
        curr_page = 1
        
        while True: 
            for product in self.driver.find_elements(By.XPATH, "//a[contains(@class, 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal')]"):
                product_urls.append(product.get_attribute("href"))


            curr_page +=1
            
            if curr_page > total_page:
                break

            next_page_url = construct_url_params(self.search_url, {'page': curr_page, 'qid': qid, 'ref': f'sr_pg_{curr_page}'})
            self.driver.get(next_page_url)
        
        return product_urls
    
    def getLeftImage(self):
        return [img.get_attribute('src') for img in scraper.driver.find_elements(By.XPATH, "//div[@id='imageBlock']//div[@class='imgTagWrapper']//img")]

    def getProductNameAndIDFromURL(self):
        product_path = urllib.parse.urlparse(self.driver.current_url).path.strip('/').split('/')

        product_name = product_path[0].replace("-", " ")
        product_id = product_path[2]
        
        return product_name, product_id

    def parseCenterDiv(self):
        #parse centerDiv

        product_detail = {}
    #     required_div = ["featurebullets_feature_div", "bylineInfo_feature_div", "title_feature_div", "productOverview_feature_div"]
        centerDiv = self.driver.find_element(By.XPATH, "//div[@id='ppd' and div[@id='centerCol']]")

        #get product title
        product_detail['product_title'] = centerDiv.find_element(By.ID, 'productTitle').text

        #get product brand
        product_detail['product_brand'] = re.sub("^Visit the|^Brand:|store$", "", centerDiv.find_element(By.ID, 'bylineInfo').text, flags=re.IGNORECASE).strip()
        product_detail['product_brand_url'] = centerDiv.find_element(By.ID, 'bylineInfo').get_attribute('href')

        #get customer reviews
        customer_reviews = centerDiv.find_elements(By.XPATH, "//div[@id='averageCustomerReviews']")
        if customer_reviews:
            product_detail['customer_reviews'] = customer_reviews[0].text.split("\n")[0]

        #get prdouct overview
#         product_detail['product_overview'] = {}
        
        product_overview_feature_div = centerDiv.find_element(By.XPATH, "//div[@id='productOverview_feature_div']")
        
        soup = BeautifulSoup(product_overview_feature_div.get_attribute('innerHTML'), 'html.parser')
        for i in soup.findAll('tr'):
            td = i.findChildren('td')

            #below if elif are just for glance icons
            if td[0].find('table'):
                td = td[0].findAll('td')[-1].findAll('span')
            elif td[0].find('img'):
                td = td[1].findAll('span')

            product_detail[td[0].text.strip()] = td[1].text.strip()
                
#             product_detail['product_overview'][td[0].text.strip()] = td[1].text.strip()

#         product_detail['product_overview'] = product_overview
        #parse about section
        #replace non ascii characters and continous spaces
        product_detail['product_about'] = re.sub("\s+", " ", re.sub(r'[^\x00-\x7F]+', "", centerDiv.find_element(By.XPATH, ".//div[@id='featurebullets_feature_div']//ul").text))


        return product_detail

    def parseBottomDivs(self):

        productDescription = self.driver.find_elements(By.XPATH, "//div[@id='productDescription']")
        if productDescription:
            productDescription = productDescription[0].text.strip()
        product_config = {}
        misc = {}
        long_description = ""
        brand_story = ""
        detailBullets = self.driver.find_elements(By.XPATH, "//div[@id='detailBullets_feature_div' and not(@data-feature-name)]")

        if detailBullets:
            for li in detailBullets[0].find_elements(By.TAG_NAME, "li"):
                spans = li.find_elements(By.XPATH, ".//span/span")
                product_config[spans[0].text.replace(":","").strip()] = spans[1].text.strip()
        else:
            productDetails = self.driver.find_elements(By.XPATH, "//div[@id='productDetailsNonPets_feature_div']")
            if productDetails:
                tables = productDetails[0].find_elements(By.TAG_NAME, "table")
                for table in tables:
                    if scraper.isNullElement(table):
                        continue

                    if "productDetails_techSpec" in table.get_attribute('id'):
                        for tr in table.find_elements(By.TAG_NAME, "tr"):
                            th = tr.find_element(By.TAG_NAME, "th")
                            td = tr.find_element(By.TAG_NAME, "td")
                            product_config[th.text.strip()] = td.text.strip()

                    else:
                        for tr in table.find_elements(By.TAG_NAME, "tr"):
                            th = tr.find_element(By.TAG_NAME, "th")
                            td = tr.find_element(By.TAG_NAME, "td")
                            misc[th.text.strip()] = td.text.strip()


        aplus_feature_div = self.driver.find_elements(By.XPATH, "//div[@id='aplus_feature_div' and div and normalize-space()]")

        images = []
        if aplus_feature_div:

            long_description = aplus_feature_div[0].find_element(By.XPATH, ".//div[@id='aplus']/div").text
            images = [i.get_attribute("src") for i in aplus_feature_div[0].find_elements(By.TAG_NAME, "img")]

        aplus_BS_feature_div = self.driver.find_elements(By.XPATH, "//div[@id='aplusBrandStory_feature_div' and div and normalize-space()]")

        if aplus_BS_feature_div:
            brand_story = aplus_BS_feature_div[0].find_element(By.XPATH, ".//div[@id='aplus']/div").text

            images.extend([i.get_attribute("src") for i in aplus_BS_feature_div[0].find_elements(By.TAG_NAME, "img")])


        btf_contents = self.driver.find_elements(By.XPATH, "//div[contains(@id, 'btfContent') and div and normalize-space()]") 
        btf_description = ""
        for btf_content in btf_contents:
            tables =  btf_content.find_elements(By.TAG_NAME, "table")
            if tables:
                #first table is config and second is misc

                for i, table in enumerate(tables):
                    for tr in table.find_elements(By.TAG_NAME, "tr"):
                        td = tr.find_elements(By.TAG_NAME, "td")
                        product_config[td[0].text.strip()] = td[1].text.strip()

            else:
                btf_description += btf_content.text.strip()
            images.extend([i.get_attribute("src") for i in btf_content.find_elements(By.TAG_NAME, "img")])

        if not long_description:
            long_description = btf_description
        elif not brand_story:
            brand_story = btf_description
        else:
            misc['extra_information'] = btf_description

        product_details = {
            'product_short_description': productDescription if productDescription else None, 
            'product_long_desciption':long_description,
            'brand_story': brand_story,

        }        

        product_details.update(product_config)
        return product_details, images

    def quit(self):
        self.driver.quit()

In [143]:
scraper = AmazonScraper("smart plug wifi")

In [144]:
product_urls = scraper.getProductURLList()

In [145]:
len(product_urls)

432

In [146]:
product_details= {}
for product_url in product_urls:

    scraper.driver.get(product_url)
    product = {}
    product_name, product_id = scraper.getProductNameAndIDFromURL()

    product['product_id'] = product_id
    product['product_name'] = product_name

    images = scraper.getLeftImage()
    product.update(scraper.parseCenterDiv())
    
    prod, img = scraper.parseBottomDivs()
    product.update(prod)
    images.extend(img)
    product_details[product_id] = {'data': product, 'images': images}

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//div[@id='featurebullets_feature_div']//ul"}
  (Session info: chrome=119.0.6045.160); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7A6C582B2+55298]
	(No symbol) [0x00007FF7A6BC5E02]
	(No symbol) [0x00007FF7A6A805AB]
	(No symbol) [0x00007FF7A6AC175C]
	(No symbol) [0x00007FF7A6AC18DC]
	(No symbol) [0x00007FF7A6AB819C]
	(No symbol) [0x00007FF7A6AE20EF]
	(No symbol) [0x00007FF7A6AB80CF]
	(No symbol) [0x00007FF7A6AE22C0]
	(No symbol) [0x00007FF7A6AFAAA4]
	(No symbol) [0x00007FF7A6AE1E83]
	(No symbol) [0x00007FF7A6AB670A]
	(No symbol) [0x00007FF7A6AB7964]
	GetHandleVerifier [0x00007FF7A6FD0AAB+3694587]
	GetHandleVerifier [0x00007FF7A702728E+4048862]
	GetHandleVerifier [0x00007FF7A701F173+4015811]
	GetHandleVerifier [0x00007FF7A6CF47D6+695590]
	(No symbol) [0x00007FF7A6BD0CE8]
	(No symbol) [0x00007FF7A6BCCF34]
	(No symbol) [0x00007FF7A6BCD062]
	(No symbol) [0x00007FF7A6BBD3A3]
	BaseThreadInitThunk [0x00007FFA2397257D+29]
	RtlUserThreadStart [0x00007FFA241AAA58+40]


In [147]:
len(product_details)

143

In [118]:
product_details['B079KYHLDZ']

{'data': {'product_name': 'Meross Reliable Connection Mediatek Occupies',
  'product_id': 'B079KYHLDZ',
  'product_title': 'Meross Wi-Fi Smart Plug Mini, 15 Amp & Reliable Wi-Fi Connection, Support Alexa, Google Assistant, Remote Control, Timer, Occupies Only One Socket, 2.4G WiFi Only, 4 Pack',
  'product_brand': 'meross',
  'product_brand_url': 'https://www.amazon.com/stores/meross/page/6BD28492-A54E-43B5-9D93-223D2E453680?ref_=ast_bln',
  'customer_reviews': '4.4',
  'Brand': 'meross',
  'Color': 'white',
  'Voltage': '120 Volts',
  'Material': 'Acrylonitrile Butadiene Styrene (ABS)',
  'Connector Type': 'Plug In',
  'product_about': 'Off-Line Control: When the internet is down, you can still use Meross app to control the devices under the same wifi. Routine Offline Control allows schedule and timer to be running even when internet is down. No hub Required, works with 2.4GHz network. App Remote Control & Scene: The smart plug turn electronics on/off from anywhere anytime with Meross

In [148]:
processed_pd = [product_details[product_detail]['data'] for product_detail in product_details]
df = pd.DataFrame(processed_pd)
df.to_csv(r"C:\Users\moink\Downloads\network-security-white-labelling\amazon_products_dataset.csv", index=False, encoding='utf-8')

In [63]:
df.to_csv(r"C:\Users\moink\Downloads\network-security-white-labelling\amazon_products_dataset.csv", index=False, encoding='utf-8')

In [44]:
import pandas as pd

product_description = {
    "product_A": {
        'brand_story': '',
        'Size': '3.2” x 1.5” x 2.2” (80 mm x 38 mm x 57 mm)',
        'Weight': '3.1 oz. (87 grams) Actual size and weight may vary by manufacturing process',
        'Electrical Ratings': 'Input: 120VAC, 60Hz 15A\nMax Output: 15A Max',
        'Network Connectivity': '2.4 GHz only, 802.11 b/g/n. Does not support 5GHz networks or connecting to ad-hoc (or peer-to-peer) Wi-Fi networks.',
        'Use': 'For indoor use only.',
    },
    "product_B": {
        'brand_story': 'Swipe left to learn more ⊳⊳⊳\nAbout Eightree\nEightree is an up-and-coming brand established in 2021, dedicated to product development and manufacturing in the field of smart home.\nOur mission is to provide more convenient, high quality but affordable smart home solutions for people around the world.\nSmart home never needs to be expensive, just enjoy the convenience that smart home brings to your life. Eightree is always at your service.\nSmart Plugs Alexa Compatible',
        'Product Dimensions': '4.06 x 4.06 x 2.28 inches; 8.15 Ounces',
        'Item model number': 'ET01B-4',
        'Date First Available': 'July 8, 2022',
        'Manufacturer': 'EIGHTREE',
        'ASIN': 'B0B62LPR5Z',
    },
}

# Create DataFrames for each product
df_product_A = pd.DataFrame(product_description.get("product_A", {}), index=[0])
df_product_B = pd.DataFrame(product_description.get("product_B", {}), index=[0])

# Merge DataFrames
result_df = pd.merge(df_product_A, df_product_B, how='outer', left_index=True, right_index=True, suffixes=('_product_A', '_product_B'))

# Display the merged DataFrame
print(result_df.head())


  brand_story_product_A                                        Size   
0                        3.2” x 1.5” x 2.2” (80 mm x 38 mm x 57 mm)  \

                                              Weight   
0  3.1 oz. (87 grams) Actual size and weight may ...  \

                             Electrical Ratings   
0  Input: 120VAC, 60Hz 15A\nMax Output: 15A Max  \

                                Network Connectivity                   Use   
0  2.4 GHz only, 802.11 b/g/n. Does not support 5...  For indoor use only.  \

                               brand_story_product_B   
0  Swipe left to learn more ⊳⊳⊳\nAbout Eightree\n...  \

                       Product Dimensions Item model number   
0  4.06 x 4.06 x 2.28 inches; 8.15 Ounces           ET01B-4  \

  Date First Available Manufacturer        ASIN  
0         July 8, 2022     EIGHTREE  B0B62LPR5Z  


In [293]:
not_required_div = ["sims-themis-sponsored-products-2_feature_div","ask-btf_feature_div", "discovery-and-inspiration_feature_div", \
                    "dp-ads-center-promo_feature_div", "ad-endcap-1_feature_div","btfSubNavDesktop_feature_div", \
                    "HLCXComparisonWidgetTechnical_feature_div", "legal-compliance-card_feature_div", "similarities_feature_div", \
                    "customer-reviews_feature_div", "va-related-videos-widget_feature_div", \
                    "ive-videos-for-this-product-widget_feature_div", "importantInformation_feature_div", \
                    "sac-btf-start_feature_div", "bundleV2_feature_div", "device-dp-recommendations_feature_div",]

reference_element = scraper.driver.find_element(By.XPATH, "//div[@id='bottomRow']")

# Find all elements that come after the reference element
following_elements = reference_element.find_elements(By.XPATH, "following::div[contains(@id, 'feature_div') and @data-feature-name and div and normalize-space()]")

product_details = {}
images = []
for i, div in enumerate(following_elements): #.find_elements(By.XPATH, "//div[@data-feature-name and div]"):
    if div.get_attribute('id') not in not_required_div:
        print(div.get_attribute('id'))

HLCXComparisonWidget_feature_div
productDetails_feature_div
aplusBrandStory_feature_div
aplus_feature_div
productDocuments_feature_div
productDescription_feature_div
postsSameBrandCard_feature_div


In [300]:
scraper.driver.find_element(By.ID, "postsSameBrandCard_feature_div")

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="postsSameBrandCard_feature_div"]"}
  (Session info: chrome=119.0.6045.160); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7A6C582B2+55298]
	(No symbol) [0x00007FF7A6BC5E02]
	(No symbol) [0x00007FF7A6A805AB]
	(No symbol) [0x00007FF7A6AC175C]
	(No symbol) [0x00007FF7A6AC18DC]
	(No symbol) [0x00007FF7A6AFCBC7]
	(No symbol) [0x00007FF7A6AE20EF]
	(No symbol) [0x00007FF7A6AFAAA4]
	(No symbol) [0x00007FF7A6AE1E83]
	(No symbol) [0x00007FF7A6AB670A]
	(No symbol) [0x00007FF7A6AB7964]
	GetHandleVerifier [0x00007FF7A6FD0AAB+3694587]
	GetHandleVerifier [0x00007FF7A702728E+4048862]
	GetHandleVerifier [0x00007FF7A701F173+4015811]
	GetHandleVerifier [0x00007FF7A6CF47D6+695590]
	(No symbol) [0x00007FF7A6BD0CE8]
	(No symbol) [0x00007FF7A6BCCF34]
	(No symbol) [0x00007FF7A6BCD062]
	(No symbol) [0x00007FF7A6BBD3A3]
	BaseThreadInitThunk [0x00007FFA2397257D+29]
	RtlUserThreadStart [0x00007FFA241AAA58+40]


In [424]:
scraper.driver.get("https://www.amazon.com/Command-Control-Schedulete-Connection-Bluetooth/dp/B0BKPX6BPZ/ref=sxin_17_pa_sp_search_thematic_sspa?content-id=amzn1.sym.9e5188ef-9cc8-48bb-b834-24761033aedf%3Aamzn1.sym.9e5188ef-9cc8-48bb-b834-24761033aedf&cv_ct_cx=smart%2Bplug%2Bwifi&keywords=smart%2Bplug%2Bwifi&pd_rd_i=B0BKPX6BPZ&pd_rd_r=295b778e-d22d-434b-b996-b805c0bdbe62&pd_rd_w=8H7Ov&pd_rd_wg=QhhSt&pf_rd_p=9e5188ef-9cc8-48bb-b834-24761033aedf&pf_rd_r=67AKD62GR4KDBVVAMKJ0&qid=1700982782&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sr=1-2-364cf978-ce2a-480a-9bb0-bdb96faa0f61-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9zZWFyY2hfdGhlbWF0aWM&th=1")

In [27]:
#parse bottom divs

#get product Description
# reference_element = scraper.driver.find_element(By.XPATH, "//div[@id='bottomRow']")

productDescription = scraper.driver.find_elements(By.XPATH, "//div[@id='productDescription']")
if productDescription:
    productDescription = productDescription[0].text.strip()
product_config = {}
misc = {}
long_description = ""
brand_story = ""
customer_reviews = None
detailBullets = scraper.driver.find_elements(By.XPATH, "//div[@id='detailBullets_feature_div' and not(@data-feature-name)]")

if detailBullets:
    for li in detailBullets[0].find_elements(By.TAG_NAME, "li"):
        spans = li.find_elements(By.XPATH, ".//span/span")
        print(spans[0].text, spans[1].text)
        product_config[spans[0].text.strip()] = spans[1].text.strip()
        customer_reviews = scraper.driver.find_element(By.XPATH, "//div[@id='detailBullets_averageCustomerReviews']").text.split("\n")[0]
else:
    productDetails = scraper.driver.find_elements(By.XPATH, "//div[@id='productDetailsNonPets_feature_div']")
    if productDetails:
        customer_reviews = scraper.driver.find_element(By.XPATH, "//div[@id='averageCustomerReviews']").text.split("\n")[0]
        tables = productDetails[0].find_elements(By.TAG_NAME, "table")
        for table in tables:
            if scraper.isNullElement(table):
                continue
        
            if "productDetails_techSpec" in table.get_attribute('id'):
                for tr in table.find_elements(By.TAG_NAME, "tr"):
                    th = tr.find_element(By.TAG_NAME, "th")
                    td = tr.find_element(By.TAG_NAME, "td")
                    product_config[th.text.strip()] = td.text.strip()

            else:
                for tr in table.find_elements(By.TAG_NAME, "tr"):
                    th = tr.find_element(By.TAG_NAME, "th")
                    td = tr.find_element(By.TAG_NAME, "td")
                    misc[th.text.strip()] = td.text.strip()

    
aplus_feature_div = scraper.driver.find_elements(By.XPATH, "//div[@id='aplus_feature_div and div']")

images = []
if aplus_feature_div:

    long_description = aplus_feature_div[0].find_element(By.XPATH, ".//div[@id='aplus']/div").text
    images = [i.get_attribute("src") for i in aplus_feature_div[0].find_elements(By.TAG_NAME, "img")]
    
aplus_BS_feature_div = scraper.driver.find_elements(By.XPATH, "//div[@id='aplusBrandStory_feature_div and div']")

if aplus_BS_feature_div:
    brand_story = aplus_BS_feature_div[0].find_element(By.XPATH, ".//div[@id='aplus']/div").text
    
    images.extend([i.get_attribute("src") for i in aplus_feature_div[0].find_elements(By.TAG_NAME, "img")])


btf_contents = scraper.driver.find_elements(By.XPATH, "//div[contains(@id, 'btfContent') and div and normalize-space()]") 
btf_description = ""
for btf_content in btf_contents:
    tables =  btf_content.find_elements(By.TAG_NAME, "table")
    if tables:
        #first table is config and second is misc
        tables_dict = {}

        for i, table in enumerate(tables):
            tables_dict[f'table_{i+1}'] = {}
            for tr in table.find_elements(By.TAG_NAME, "tr"):
                td = tr.find_elements(By.TAG_NAME, "td")
                tables_dict[f'table_{i+1}'][td[0].text.strip()] = td[1].text.strip()
                
        if not product_config:
            product_config = tables_dict['table_1']
            del tables_dict['table_1']
            
        for t in tables_dict:
            misc.update(tables_dict[t])
    else:
        btf_description += btf_content.text.strip()
    images.extend([i.get_attribute("src") for i in btf_content.find_elements(By.TAG_NAME, "img")])
                    
if not long_description:
    long_description = btf_description
elif not brand_story:
    brand_story = btf_description
else:
    misc['extra_information'] = btf_description

product_details = {
    'product_short_description': productDescription, 
    'product_configuration': product_config,
    'product_long_desciption':long_description,
    'brand_story': brand_story,
    'customer_reviews': customer_reviews,
    'misc': misc,
    'images': images
    
}


In [29]:
aplus_BS_feature_div = scraper.driver.find_elements(By.XPATH, "//div[@id='aplusBrandStory_feature_div'  and div]")


IndexError: list index out of range

In [26]:
btf_content = scraper.driver.find_elements(By.XPATH, "//div[contains(@id, 'btfContent') and div and normalize-space()]")    

In [30]:
for i in btf_content:
    print(i.get_attribute("id"))
    print(len(i.find_elements(By.TAG_NAME, "table")))

btfContent1_feature_div
0
btfContent5_feature_div
0
btfContent7_feature_div
0
btfContent8_feature_div
2


In [385]:
detailBullets[0].get_attribute('innerHTML')

'\n             <ul class="a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list">        <li><span class="a-list-item"> <span class="a-text-bold">Product Dimensions\n                                    \u200f\n                                        :\n                                    \u200e\n                                </span> <span>4 x 1.3 x 1.5 inches; 1.6 Ounces</span> </span></li>          <li><span class="a-list-item"> <span class="a-text-bold">Item model number\n                                    \u200f\n                                        :\n                                    \u200e\n                                </span> <span>H5082</span> </span></li>          <li><span class="a-list-item"> <span class="a-text-bold">Date First Available\n                                    \u200f\n                                        :\n                                    \u200e\n                                </span> <span>July 22, 2021</span> </span></li

In [456]:
"From the manufacturer\nVoice Control\nFree up your hands by using simple voice commands with Alexa and Google Assistant.\nVoice Control Control from Anywhere Grouping\nSmart Actions\nCreate interactions with your plug and other Kasa products. Connect a lamp to your plug so that when your camera detects motion, your light turns on.\nApp Features\nScheduling\nSchedule your connected devices to turn on and off automatically at specified times.\nCount Down Timer\nSet a timer for your connected appliance to automatically turn off when the time runs out.\nAway Mode\nAway Mode will automatically turn on and off connected appliances, like lamps, to make it appear as if you're home.\nRun Time & Usage\nView run times to track usage and cut down on energy consumption.\nSimple Setup".replace("\n", "")

"From the manufacturerVoice ControlFree up your hands by using simple voice commands with Alexa and Google Assistant.Voice Control Control from Anywhere GroupingSmart ActionsCreate interactions with your plug and other Kasa products. Connect a lamp to your plug so that when your camera detects motion, your light turns on.App FeaturesSchedulingSchedule your connected devices to turn on and off automatically at specified times.Count Down TimerSet a timer for your connected appliance to automatically turn off when the time runs out.Away ModeAway Mode will automatically turn on and off connected appliances, like lamps, to make it appear as if you're home.Run Time & UsageView run times to track usage and cut down on energy consumption.Simple Setup"

In [39]:
a = {'a':1, 'b':2}

In [40]:
a.update({'e':2, 'b':3})

In [41]:
a

{'a': 1, 'b': 3, 'e': 2}

In [55]:
class AmazonScraper:
    base_url = "https://www.amazon.com/s?k="

    def __init__(self, product_name, chrome_path=r"C:\Users\moink\Downloads\chromedriver-win64\chromedriver.exe"):
        
        service = webdriver.chrome.service.Service(executable_path=chrome_path)
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")

        self.driver = webdriver.Chrome(service=service, options=options)

        self.search_url = self.base_url+ re.sub("\s+", "+", product_name)
        self.driver.get(self.search_url)
        
        
        
    def isNullElement(self, element):
        img = element.find_elements(By.XPATH, ".//img[@src]")
        if element.text.strip() or img:
            return False
        return True
    
    def find_element(self, element, locator, expression, list=True):
        result = element.find_elements(locator, expression)
        if list:
            return result
        
        return result[0] if result else None

    def getProductURLList(self):
        WebDriverWait(self.driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'a-section a-spacing-none a-spacing-top-small s-title-instructions-style')]")))
        
        qid = self.driver.find_element(By.XPATH, "//input[@name='qid']").get_attribute("value")
        total_page = int(self.driver.find_element(By.XPATH, "//div[@role='navigation']").find_element(By.XPATH, "//span[contains(@class, 's-pagination-item s-pagination-disabled')]").text)

        product_urls = []
        
        curr_page = 1
        
        while True: 
            for product in self.driver.find_elements(By.XPATH, "//a[contains(@class, 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal')]"):
                product_urls.append(product.get_attribute("href"))


            curr_page +=1
            
            if curr_page > total_page:
                break

            next_page_url = construct_url_params(self.search_url, {'page': curr_page, 'qid': qid, 'ref': f'sr_pg_{curr_page}'})
            self.driver.get(next_page_url)
        
        return product_urls
    
    def getLeftImage(self):
        return [img.get_attribute('src') for img in scraper.driver.find_elements(By.XPATH, "//div[@id='imageBlock']//div[@class='imgTagWrapper']//img")]

    def parseCenterDiv(self):
        #parse centerDiv

        product_detail = {}
    #     required_div = ["featurebullets_feature_div", "bylineInfo_feature_div", "title_feature_div", "productOverview_feature_div"]
        centerDiv = self.driver.find_element(By.XPATH, "//div[@id='ppd' and div[@id='centerCol']]")

        #get product title
        product_detail['product_title'] = centerDiv.find_element(By.ID, 'productTitle').text

        #get product brand
        product_detail['product_brand'] = centerDiv.find_element(By.ID, 'bylineInfo').text
        product_detail['product_brand_url'] = centerDiv.find_element(By.ID, 'bylineInfo').get_attribute('href')

        #get customer reviews
        product_detail['customer_reviews'] = centerDiv.find_element(By.XPATH, "//div[@id='averageCustomerReviews']").text.split("\n")[0]

        #get prdouct overview
#         product_detail['product_overview'] = {}
        
        product_overview_feature_div = centerDiv.find_element(By.XPATH, "//div[@id='productOverview_feature_div']")
        
        soup = BeautifulSoup(product_overview_feature_div.get_attribute('innerHTML'), 'html.parser')
        for i in soup.findAll('tr'):
            td = i.findChildren('td')

            #below if elif are just for glance icons
            if td[0].find('table'):
                td = td[0].findAll('td')[-1].findAll('span')
            elif td[0].find('img'):
                td = td[1].findAll('span')

            product_detail[td[0].text.strip()] = td[1].text.strip()
                
#             product_detail['product_overview'][td[0].text.strip()] = td[1].text.strip()

#         product_detail['product_overview'] = product_overview
        #parse about section
        #replace non ascii characters and continous spaces
        product_detail['product_about'] = re.sub("\s+", " ", re.sub(r'[^\x00-\x7F]+', "", centerDiv.find_element(By.XPATH, ".//div[@id='featurebullets_feature_div']//ul").text))


        return product_detail

    def parseBottomDivs(self):
        #parse bottom divs
        #get product Description
#         reference_element = self.driver.find_element(By.XPATH, "//div[@id='bottomRow']")

        productDescription = self.driver.find_elements(By.XPATH, "//div[@id='productDescription']")
        if productDescription:
            productDescription = productDescription[0].text.strip()
        product_config = {}
        misc = {}
        long_description = ""
        brand_story = ""
        detailBullets = self.driver.find_elements(By.XPATH, "//div[@id='detailBullets_feature_div' and not(@data-feature-name)]")

        if detailBullets:
            for li in detailBullets[0].find_elements(By.TAG_NAME, "li"):
                spans = li.find_elements(By.XPATH, ".//span/span")
                print(spans[0].text, spans[1].text)
                product_config[spans[0].text.replace(":","").strip()] = spans[1].text.strip()
        else:
            productDetails = self.driver.find_elements(By.XPATH, "//div[@id='productDetailsNonPets_feature_div']")
            if productDetails:
                tables = productDetails[0].find_elements(By.TAG_NAME, "table")
                for table in tables:
                    if scraper.isNullElement(table):
                        continue

                    if "productDetails_techSpec" in table.get_attribute('id'):
                        for tr in table.find_elements(By.TAG_NAME, "tr"):
                            th = tr.find_element(By.TAG_NAME, "th")
                            td = tr.find_element(By.TAG_NAME, "td")
                            product_config[th.text.strip()] = td.text.strip()

                    else:
                        for tr in table.find_elements(By.TAG_NAME, "tr"):
                            th = tr.find_element(By.TAG_NAME, "th")
                            td = tr.find_element(By.TAG_NAME, "td")
                            misc[th.text.strip()] = td.text.strip()


        aplus_feature_div = self.driver.find_elements(By.XPATH, "//div[@id='aplus_feature_div' and div and normalize-space()]")

        images = []
        if aplus_feature_div:

            long_description = aplus_feature_div[0].find_element(By.XPATH, ".//div[@id='aplus']/div").text
            images = [i.get_attribute("src") for i in aplus_feature_div[0].find_elements(By.TAG_NAME, "img")]

        aplus_BS_feature_div = self.driver.find_elements(By.XPATH, "//div[@id='aplusBrandStory_feature_div' and div and normalize-space()]")

        if aplus_BS_feature_div:
            brand_story = aplus_BS_feature_div[0].find_element(By.XPATH, ".//div[@id='aplus']/div").text

            images.extend([i.get_attribute("src") for i in aplus_feature_div[0].find_elements(By.TAG_NAME, "img")])


        btf_contents = self.driver.find_elements(By.XPATH, "//div[contains(@id, 'btfContent') and div and normalize-space()]") 
        btf_description = ""
        for btf_content in btf_contents:
            tables =  btf_content.find_elements(By.TAG_NAME, "table")
            if tables:
                #first table is config and second is misc
                tables_dict = {}

                for i, table in enumerate(tables):
                    tables_dict[f'table_{i+1}'] = {}
                    for tr in table.find_elements(By.TAG_NAME, "tr"):
                        td = tr.find_elements(By.TAG_NAME, "td")
                        tables_dict[f'table_{i+1}'][td[0].text.strip()] = td[1].text.strip()

                if not product_config:
                    product_config = tables_dict['table_1']
                    del tables_dict['table_1']

                for t in tables_dict:
                    misc.update(tables_dict[t])
            else:
                btf_description += btf_content.text.strip()
            images.extend([i.get_attribute("src") for i in btf_content.find_elements(By.TAG_NAME, "img")])

        if not long_description:
            long_description = btf_description
        elif not brand_story:
            brand_story = btf_description
        else:
            misc['extra_information'] = btf_description

        product_details = {
            'product_short_description': productDescription if productDescription else None, 
            'product_long_desciption':long_description,
            'brand_story': brand_story,
            'misc': misc

        }        

        product_details.update(product_config)
        return product_details, images


#     def parseBottomDivs(self):

#         # parse div below ppd
#         not_required_div = ["sims-themis-sponsored-products-2_feature_div","ask-btf_feature_div", "discovery-and-inspiration_feature_div", \
#                             "dp-ads-center-promo_feature_div", "ad-endcap-1_feature_div","btfSubNavDesktop_feature_div", \
#                             "HLCXComparisonWidgetTechnical_feature_div", "legal-compliance-card_feature_div", "similarities_feature_div", \
#                             "customer-reviews_feature_div", "va-related-videos-widget_feature_div", \
#                             "ive-videos-for-this-product-widget_feature_div", "importantInformation_feature_div", \
#                             "sac-btf-start_feature_div", "bundleV2_feature_div", "device-dp-recommendations_feature_div",\
#                            "productDocuments_feature_div", "HLCXComparisonWidget_feature_div", "postPurchaseWhatsInTheBox_MP_feature_div"]

#     #     essential_divs = ["detailBullets_feature_div", "aplusBrandStory_feature_div", "aplus_feature_div", "productDescription_feature_div"]
#         # Find the reference element
#         reference_element = self.driver.find_element(By.XPATH, "//div[@id='bottomRow']")

#         # Find all elements that come after the reference element
#         following_elements = reference_element.find_elements(By.XPATH, "following::div[contains(@id, 'feature_div') and @data-feature-name and div and normalize-space()]")

#         product_details = {}
#         images = []
#         for i, div in enumerate(following_elements): #.find_elements(By.XPATH, "//div[@data-feature-name and div]"):
#             if div.get_attribute('id') not in not_required_div:
#                 print(i, div.get_attribute('id'))
#                 div_id = div.get_attribute('id')
#                 #parse table

#                 data = {}
# #                 if div.find_elements(By.TAG_NAME, "table"):
# #                     data = {}
# #                     table = div.find_element(By.TAG_NAME, "table")
# #                     for row in table.find_elements(By.TAG_NAME, 'tr'):
# #                         tds = row.find_elements(By.XPATH, ".//td | .//th")

# #                         data[tds[0].text] = tds[1].text.strip()

# #                 else:
# #                     data = div.text


#                 #parse images
#                 images.extend([img.get_attribute('src') for img in div.find_elements(By.TAG_NAME, "img")])

#                 #either present, then include the div
# #                 if data:
#                 product_details[div_id] = data


#         return product_details, images
        
                

    
    def quit(self):
        self.driver.quit()

In [85]:
brand = 'Visit the EIGHTREE store'
# brand = "EIGHTREE Store"
re.sub("^Visit the|^Brand:|store$", "", brand, flags=re.IGNORECASE).strip()

'EIGHTREE'

In [95]:
import urllib
# urllib.parse.parse_qs(urllib.parse.urlparse(last_page_url).query)['page'][0]


In [103]:
url = "https://www.amazon.com/meross-Smart-Plug-HomeKit-Pack/dp/B084JHJBQT/ref=sr_1_1_sspa?crid=3V4JP95AN0I92&keywords=Meross%2BSmart%2BPlug%2BMini%2C%2B15A%2B%26%2BReliable%2BWi-Fi%2C%2BSupport%2BApple%2BHomeKit%2C%2BSiri%2C%2BAlexa%2C%2BEcho%2C%2BGoogle%2BAssistant%2Band%2BNest%2BHub%2C%2BApp%2BControl%2C%2BTimer%2C%2BNo%2BHub%2BNeeded%2C%2B2.4G%2BWiFi%2BOnly%2C%2B4%2BPack&qid=1701051258&sprefix=meross%2Bsmart%2Bplug%2Bmini%2C%2B15a%2B%26%2Breliable%2Bwi-fi%2C%2Bsupport%2Bapple%2Bhomekit%2C%2Bsiri%2C%2Balexa%2C%2Becho%2C%2Bgoogle%2Bassistant%2Band%2Bnest%2Bhub%2C%2Bapp%2Bcontrol%2C%2Btimer%2C%2Bno%2Bhub%2Bneeded%2C%2B2.4g%2Bwifi%2Bonly%2C%2B4%2Bpack%2Caps%2C98&sr=8-1-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9hdGY&th=1"
product_path = urllib.parse.urlparse(url).path.strip('/').split('/')

product_name = product_path[0].replace("-", " ")
product_id = product_path[2]

In [107]:
product_name.title(), product_id

('Meross Smart Plug Homekit Pack', 'B084JHJBQT')