In [3]:
import requests
from requests_html import HTML
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import time 
import re 
import pprint

In [4]:
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options = options)

In [5]:
# list of all parent url
categories = [
    {"name":"baby","url":"https://www.amazon.in/gp/bestsellers/baby"},
    {"name":"electronics","url":"https://www.amazon.in/gp/bestsellers/electronics/"},
    {"name":"fashion","url":"https://www.amazon.in/gp/bestsellers/books/"} 
]


In [6]:
# list of all possible valid regex pattern
regex_options = [
    r'https://www.amazon.in/gp/product/(?P<product_id>[\w-]+)/',
    r'https://www.amazon.in/dp/(?P<product_id>[\w-]+)/',
    r'https://www.amazon.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
]

# extracts the product id from an url 
def extract_product_id_from_url(url):
    product_id = None
    for regex_str in regex_options:
        regex = re.compile(regex_str)
        match = regex.match(url)
        if match != None:
            try:
                product_id = match['product_id']
            except:
                pass
    return product_id

In [8]:


# we grab the valid product page links along with the product ids
def clean_page_links(page_links=[]):   
    # list of possible valid links 
#     page_links = [f" for x in new_links]
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id != None:
            final_page_links.append({'url':url,'product_id':product_id})
    return final_page_links

In [16]:
# scarpes the required info form the product page 
def scrape_product_page(link,title_lookup = "#productTitle",price_lookup = "#priceblock_ourprice",rating_lookup="#acrCustomerReviewText"):
    driver.get(link)
    time.sleep(1.2)
    body_el = driver.find_element_by_css_selector("body")
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html=html_str)
    product_title = html_obj.find(title_lookup, first=True).text
    product_price = html_obj.find(price_lookup, first=True).text
    product_noOfRating = html_obj.find(rating_lookup,first=True).text
    return link,product_title,product_price,product_noOfRating


In [21]:
# driver loop 
def perform_scrape(cleaned_items=[]):  
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title,price,noOfRating = (None,None,None)
        try:
            product_link,title,price,noOfRating = scrape_product_page(link)
        except Exception as e:
            pass
        if title != None and price != None:
            print(product_link,title,price)
        product_data = {
            'url':link,
            'product_id':product_id,
            'title':title,
            'price':price,
            'rating':noOfRating
        }
        data_extracted.append(product_data)
    return data_extracted
            
# extracted_data = perform_scrape(cleaned_items=cleaned_links)

In [22]:
def scarpe_category_product_links(categories=[]):
    all_product_links = []
    for category in categories:
        time.sleep(1.5)
        url = category['url']
        driver.get(url)
        body_el = driver.find_element_by_css_selector("body")
        html_str = body_el.get_attribute("innerHTML")
        html_obj = HTML(html = html_str)
        new_links = [f"https://www.amazon.in{x}" for x in html_obj.links if x.startswith("/") and "product-reviews/" not in x]
        cleaned_links = clean_page_links(new_links) 
        all_product_links += cleaned_links
    return all_product_links

all_product_links = scarpe_category_product_links(categories=categories)
extracted_data = perform_scrape(all_product_links)

https://www.amazon.in/Himalaya-4003F-Baby-Lotion-400ml/dp/B008YD5500/ref=zg_bs_baby_25?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Himalaya Baby Body Lotion, For All Skin Types (400 ml) ₹227.00
https://www.amazon.in/Himalaya-Baby-Massage-Oil-500ml/dp/B00NOKRPD8/ref=zg_bs_baby_39?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Himalaya Face Body Oil Baby Massage Oil For All Skin Types (500 ML) ₹322.00
https://www.amazon.in/SebaMed-Sebamed-Baby-Lotion-400ml/dp/B00VEEHIEM/ref=zg_bs_baby_33?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 SebaMed Baby Body Lotion, For All Skin Types, 400 ml ₹868.00
https://www.amazon.in/Pampers-Active-Diapers-Small-Count/dp/B081QTNF7V/ref=zg_bs_baby_45?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Pampers Active Baby Taped Diapers, Small size diapers, (SM) 92 count, taped style custom fit ₹1,349.00
https://www.amazon.in/Pampers-Diapers-Pants-X-Small-Count/dp/B07DP27JKB/ref=zg_bs_baby_19?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Pampers All 

https://www.amazon.in/Pampers-Premium-Care-Diapers-Monthly/dp/B07F2HMCQ2/ref=zg_bs_baby_18?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Pampers Premium Care Pants, Extra Large size baby diapers (XL), 72 Count, Softest ever Pampers pants ₹1,655.00
https://www.amazon.in/Pampers-Active-Baby-Diapers-Count/dp/B0781Z3BW6/ref=zg_bs_baby_21?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Pampers Active Baby Diapers, New Born, Extra Small, (NB, XS) size, 72 Count, Taped style diaper ₹989.00
https://www.amazon.in/Himalaya-Baby-Shampoo-400-ml/dp/B00H5NMNXC/ref=zg_bs_baby_23?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Himalaya Baby Shampoo (400 ml) ₹232.00
https://www.amazon.in/Supples-Pants-Diapers-Medium-Count/dp/B07Q2F37JN/ref=zg_bs_baby_3?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Supples Baby Pants Diapers, Medium (7-12 kg), 72 Count ₹622.00
https://www.amazon.in/TEDIBAR-TDB5-Tedibar-2s-Pack/dp/B07ZJVXRCW/ref=zg_bs_baby_35?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Tedib

https://www.amazon.in/Nokia-105-2019-Single-Black/dp/B07YYNX5X6/ref=zg_bs_electronics_29?_encoding=UTF8&psc=1&refRID=KCB727PQZQKBGQ1YGHMY Nokia 105 Single SIM (Black) ₹1,249.00
https://www.amazon.in/Boat-Bassheads-242-Earphones-Resistance/dp/B07S9S86BF/ref=zg_bs_electronics_11?_encoding=UTF8&psc=1&refRID=KCB727PQZQKBGQ1YGHMY boAt Bassheads 242 in Ear Wired Earphones with Mic(Active Black) ₹499.00
https://www.amazon.in/boAt-Rockerz-255-Pro-Earphones/dp/B08TSSCZR8/ref=zg_bs_electronics_33?_encoding=UTF8&psc=1&refRID=KCB727PQZQKBGQ1YGHMY boAt Rockerz 255 Pro+ Wireless Bluetooth in Ear Neckband Earphone with Mic (Teal Green) ₹1,399.00
https://www.amazon.in/Test-Exclusive_2020_1113-Multi-3GB-Storage/dp/B089MS8XQ3/ref=zg_bs_electronics_37?_encoding=UTF8&psc=1&refRID=KCB727PQZQKBGQ1YGHMY Redmi 9 Power (Blazing Blue, 4GB RAM, 64GB Storage) - 6000mAh Battery |FHD+ Screen| 48MP Quad Camera | Alexa Hands-Free Capable ₹11,499.00
https://www.amazon.in/255-Bluetooth-Wireless-Earphone-Immersive/dp/B0

In [15]:
len(all_product_links)

149

In [24]:
pprint.pprint(extracted_data)

[{'price': '₹227.00',
  'product_id': 'B008YD5500',
  'rating': '32,903 ratings',
  'title': 'Himalaya Baby Body Lotion, For All Skin Types (400 ml)',
  'url': 'https://www.amazon.in/Himalaya-4003F-Baby-Lotion-400ml/dp/B008YD5500/ref=zg_bs_baby_25?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9'},
 {'price': '₹322.00',
  'product_id': 'B00NOKRPD8',
  'rating': '21,516 ratings',
  'title': 'Himalaya Face Body Oil Baby Massage Oil For All Skin Types (500 '
           'ML)',
  'url': 'https://www.amazon.in/Himalaya-Baby-Massage-Oil-500ml/dp/B00NOKRPD8/ref=zg_bs_baby_39?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9'},
 {'price': '₹868.00',
  'product_id': 'B00VEEHIEM',
  'rating': '10,045 ratings',
  'title': 'SebaMed Baby Body Lotion, For All Skin Types, 400 ml',
  'url': 'https://www.amazon.in/SebaMed-Sebamed-Baby-Lotion-400ml/dp/B00VEEHIEM/ref=zg_bs_baby_33?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9'},
 {'price': '₹1,349.00',
  'product_id': 'B081QTNF7V',
  'rating': '9,605 ra

In [16]:
# len(All_page_links)