In [1]:
import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import re
import time

In [2]:
# Configure Chrome to run headless (no visible browser window).
options = Options()
options.add_argument("--headless")

# Single browser session; later cells and scrape_product_page reuse this `driver`.
driver = webdriver.Chrome(options=options)

In [3]:
# Amazon.sa best-seller listing pages that serve as crawl entry points.
categories = [
    'https://www.amazon.sa/-/en/gp/bestsellers/sports-goods/17007473031/',
    'https://www.amazon.sa/-/en/gp/bestsellers/electronics/',
    'https://www.amazon.sa/-/en/gp/bestsellers/pet-supplies/',
]

In [4]:
# categories

In [5]:
# The cells below load only this first category listing.
first_url = categories[0]

In [6]:
# Load the category page in the browser, then wrap the <body> markup in a
# requests_html HTML object so its links can be read from `html_obj.links`.
driver.get(first_url)
body_el = driver.find_element(By.CSS_SELECTOR, 'body')
# Snapshot of the body's markup as currently held by the browser.
html_str = body_el.get_attribute("innerHTML")
html_obj = HTML(html=html_str)

In [7]:
# Keep only site-relative hrefs and turn each one into an absolute amazon.sa URL.
page_links = ["https://www.amazon.sa" + href for href in html_obj.links if href[:1] == "/"]
# new_links = [x for x in new_links if "product-reviews/" not in x]

In [8]:
# page_links

In [9]:
def scrape_product_page(url, title_lookup='#productTitle',
                        price_lookup='.a-offscreen', wait_seconds=0.5):
    """Open a product page in the shared Selenium ``driver`` and read its title and price.

    Args:
        url: Product page URL to load.
        title_lookup: CSS selector for the title element.
        price_lookup: CSS selector for the price element; the first match
            on the page is used.
        wait_seconds: Pause after navigation before reading the DOM.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        A ``(title, price)`` tuple of strings. Each entry is ``None`` when
        its selector matches nothing on the page, so a title that was found
        is still returned even if the price is missing (and vice versa).

    Raises:
        Whatever Selenium raises if the page cannot be loaded or has no
        ``<body>`` element.
    """
    driver.get(url)
    # Give the page a moment to finish rendering before snapshotting it.
    time.sleep(wait_seconds)
    body_el = driver.find_element(By.CSS_SELECTOR, 'body')
    html_obj = HTML(html=body_el.get_attribute("innerHTML"))

    # find(..., first=True) yields None when nothing matches; guard before
    # touching `.text` so a missing element does not raise AttributeError.
    title_el = html_obj.find(title_lookup, first=True)
    price_el = html_obj.find(price_lookup, first=True)
    product_title = title_el.text if title_el is not None else None
    product_price = price_el.text if price_el is not None else None
    return product_title, product_price

In [10]:
# https://www.amazon.sa/-/en/GEVICONT-Performance-Salt-Water-Resistant-Superline/dp/B07GW3LX9Z/
# https://www.amazon.sa/-/en/Rovyfota-Upgraded-Beginner-Practice-Accessories/dp/B0B2JB2FTR/

# Most product URLs follow this layout:
# <base-url>/<slug>/dp/<product-id>/

In [11]:
# my_regex_pattern = r'https://www.amazon.sa/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
# my_url = 'https://www.amazon.sa/Rovyfota-Upgraded-Beginner-Practice-Accessories/dp/B0B2JB2FTR/'

In [12]:
# regex = re.compile(my_regex_pattern)

In [13]:
# my_match = regex.match(my_url)
# print(my_match)
# print(my_match['slug'])
# print(my_match['product_id'])

In [14]:
# URL layouts recognised as product pages. Each pattern should expose the
# product ID through a named group called `product_id`. The dots in the host
# name are escaped so they match a literal "." rather than any character.
regex_options = [
    r'https://www\.amazon\.sa/-/en/gp/product/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.sa/-/en/dp/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.sa/-/en/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/',
]

def extract_product_id_from_url(url):
    """Return the product ID embedded in an Amazon.sa product URL.

    Every pattern in ``regex_options`` is tried against the start of ``url``.
    If more than one pattern matches, the ID from the last matching pattern
    is returned.

    Args:
        url: Absolute URL string to inspect.

    Returns:
        The captured ``product_id`` string, or ``None`` if no pattern matches.
    """
    product_id = None
    for regex_str in regex_options:
        # re.match uses the module's compiled-pattern cache, so this does not
        # recompile the pattern on every call.
        match = re.match(regex_str, url)
        if match is None:
            continue
        try:
            product_id = match['product_id']
        except IndexError:
            # The pattern matched but defines no `product_id` group; skip it.
            continue
    return product_id

In [15]:

# page_links = [x for x in page_links if extract_product_id_from_url(x) !=None]
def clean_page_links(page_links=None):
    """Keep only the URLs that point at product pages.

    Args:
        page_links: Iterable of URL strings. ``None`` (the default) is
            treated as an empty collection.

    Returns:
        A list of ``{"url": ..., "product_id": ...}`` dicts, one per URL for
        which ``extract_product_id_from_url`` returned an ID, in input order.
    """
    # A None default avoids sharing one mutable list between calls.
    if page_links is None:
        return []
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id is not None:
            final_page_links.append({"url": url, "product_id": product_id})
    return final_page_links

# Reduce the raw link list to product pages, each paired with its product ID.
cleaned_links = clean_page_links(page_links)

In [16]:
len(page_links) # every site-relative link on the page, product page or not


118

In [17]:
# Number of links for which a product ID could be extracted.
len(cleaned_links)

30

In [18]:
def perform_scrape(cleaned_items=None):
    """Scrape title and price for each product link, best-effort.

    Args:
        cleaned_items: Iterable of dicts that each carry a ``"url"`` key.
            ``None`` (the default) is treated as an empty collection.

    Returns:
        A list with one ``{"url", "title", "price"}`` dict per input item, in
        input order. ``title`` and ``price`` are both ``None`` for an item
        whose page could not be scraped.
    """
    # A None default avoids sharing one mutable list between calls.
    if cleaned_items is None:
        return []
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        title, price = None, None
        try:
            title, price = scrape_product_page(link)
        except Exception as exc:
            # Stay best-effort, but report the failure instead of hiding it.
            # Catching Exception rather than using a bare `except:` lets
            # KeyboardInterrupt through, so a long run can still be stopped.
            print(f"Failed to scrape {link}: {type(exc).__name__}")
        if title is not None and price is not None:
            print(link, title, price)
        data_extracted.append({
            "url": link,
            "title": title,
            "price": price,
        })
    return data_extracted

In [19]:
# Visit every product link in the browser; successfully scraped items are printed as they complete.
extracted_data = perform_scrape(cleaned_items=cleaned_links)

https://www.amazon.sa/-/en/SCIENISH-Glow-Fishing-Lure-Bait/dp/B09C53FCTZ/ref=zg_bs_17007473031_sccl_4/260-8022584-3068325?psc=1 SCIENISH Glow Fishing Lure Bait Kit - 5pcs Luminous Fishing Bait VIB Popper Crank Minnow Pencil Artificial Lures with Tackle Box SAR25.00
https://www.amazon.sa/-/en/Fishing-Saltwater-Sabiki-Trolling-Herring/dp/B09V5S6QHB/ref=zg_bs_17007473031_sccl_8/260-8022584-3068325?psc=1 Sea Fishing Saltwater Sabiki Hook Rigs Blue Tube Tuna Lures Trolling Fishing Herring Bait Jig soft lure Pesca Sabiki lure SAR21.99
https://www.amazon.sa/-/en/MUSTAD-fishing-circle-carbon-Jigging/dp/B075N5TNB6/ref=zg_bs_17007473031_sccl_14/260-8022584-3068325?psc=1 MUSTAD 39951 fishing hooks demon circle high carbon steel Jigging Hook SAR23.99
https://www.amazon.sa/-/en/10-Holes-Fishing-Trap-Automatic/dp/B09V2T67TN/ref=zg_bs_17007473031_sccl_18/260-8022584-3068325?psc=1 10 Holes Fishing Trap - Eacam Portable 4/6/10 Holes Automatic Fishing Trap Net Shrimp Crab Mesh Trap Cage SAR37.99
https:/

In [20]:
# Dump every record; entries whose title and price are None are pages where scraping failed.
print(extracted_data)

[{'url': 'https://www.amazon.sa/-/en/Glasses-Tactical-Military-Shooting-Protective/dp/B07M83Y496/ref=zg_bs_17007473031_sccl_13/260-8022584-3068325?psc=1', 'title': None, 'price': None}, {'url': 'https://www.amazon.sa/-/en/Travel-Spinning-Fishing-Rod-Lightweight/dp/B09SHP8WKN/ref=zg_bs_17007473031_sccl_6/260-8022584-3068325?psc=1', 'title': None, 'price': None}, {'url': 'https://www.amazon.sa/-/en/Labymos-Equipped-Resistant-Air-conditioning-Electric/dp/B0BLHCBTY2/ref=zg_bs_17007473031_sccl_16/260-8022584-3068325?psc=1', 'title': None, 'price': None}, {'url': 'https://www.amazon.sa/-/en/Guoqunshop-Fishing-Multi-Pocket-Cotton-Waistcoat/dp/B089SGPGSY/ref=zg_bs_17007473031_sccl_17/260-8022584-3068325?psc=1', 'title': None, 'price': None}, {'url': 'https://www.amazon.sa/-/en/Fishing-Spinning-Powerful-Saltwater-Freshwater/dp/B077GMG73W/ref=zg_bs_17007473031_sccl_26/260-8022584-3068325?psc=1', 'title': None, 'price': None}, {'url': 'https://www.amazon.sa/-/en/Military-Tactical-Backpack-Createy