In [6]:
import pprint
import re
import time

import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [7]:
# Configure Chrome to run headless (no visible window) and start the shared
# browser session that the later cells use to load pages.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options = options)

In [21]:
# Bestseller category URLs on amazon.in; the first entry is the one loaded
# by the following cells.
categories = [
    "https://www.amazon.in/gp/bestsellers/baby",
    "https://www.amazon.in/gp/bestsellers/electronics/",
    "https://www.amazon.in/gp/bestsellers/books/"    
]

# Show the index of the last "/" in the first category URL.
print(categories[0].rindex("/"))

41


In [9]:
# Select the first category URL as the page to scrape; the bare name on the
# last line displays it as the cell output.
url = categories[0]
url

'https://www.amazon.in/gp/bestsellers/baby'

In [10]:
# Navigate the browser session to the selected category page.
driver.get(url)

In [11]:
# Grab the markup of the <body> element from the live browser session.
# find_element(By.CSS_SELECTOR, ...) is used instead of the deprecated
# find_element_by_css_selector helper, which current Selenium 4 releases
# no longer provide.
body_el = driver.find_element(By.CSS_SELECTOR, "body")
html_str = body_el.get_attribute("innerHTML")


In [12]:
# Wrap the captured markup in a requests_html HTML object so its links can
# be queried in the next cell.
html_obj = HTML(html = html_str)

In [13]:
# Keep only site-relative links (those starting with "/") and drop the
# customer-review pages.
new_links = [
    href
    for href in html_obj.links
    if href.startswith("/") and "product-reviews/" not in href
]

In [9]:
# For reference: what a valid product-page URL looks like
# valid URL: https://www.amazon.in/Himalaya-Baby-Massage-Oil-500ml/dp/B00NOKRPD8/

# valid URL skeleton: <base-url>/<slug>/dp/<product_id>/


In [10]:
# A quick demo of how the regex pattern matching works (optional to run).
# The dots in the host name are escaped so that they match a literal "."
# only; an unescaped "." would match any character.
my_regex_pattern = r'https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
my_url= 'https://www.amazon.in/Himalaya-Baby-Massage-Oil-500ml/dp/B00NOKRPD8/'

regex = re.compile(my_regex_pattern)

# match() anchors at the start of the string; named groups are then
# available by subscripting the match object.
my_match = regex.match(my_url)
print(my_match)
my_match['product_id']

<re.Match object; span=(0, 68), match='https://www.amazon.in/Himalaya-Baby-Massage-Oil-5>


'B00NOKRPD8'

In [15]:
# Regex patterns for the product-page URL shapes that are recognised.
# Dots in the host name are escaped so they match a literal "." only.
regex_options = [
    r'https://www\.amazon\.in/gp/product/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.in/dp/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
]


def extract_product_id_from_url(url):
    """Extract the product id from a product-page URL.

    The URL is tried against every pattern in ``regex_options``. Patterns are
    anchored at the start of the string only, so trailing path segments and
    query strings after the id are allowed.

    Args:
        url: Absolute URL string. ``None`` or an empty string is accepted.

    Returns:
        The text captured by the ``product_id`` group, or ``None`` when no
        pattern matches (or when ``url`` is ``None``/empty).
    """
    if not url:
        return None
    product_id = None
    for regex_str in regex_options:
        # re.match caches compiled patterns, so nothing is recompiled per call.
        match = re.match(regex_str, url)
        if match is None:
            continue
        # groupdict() tolerates patterns without a product_id group, so no
        # blanket try/except is needed.
        found = match.groupdict().get('product_id')
        if found is not None:
            # As before, a later matching pattern overrides an earlier one.
            product_id = found
    return product_id

# print(extract_product_id_from_url('https://www.amazon.in/Himalaya-Baby-Massage-Oil-500ml/dp/B00NOKRPD8/'))

In [16]:
# Turn the site-relative paths into absolute amazon.in URLs
# (candidates only; they are filtered for real product pages afterwards).
page_links = [
    f"https://www.amazon.in{relative_path}"
    for relative_path in new_links
]

# only valid product_page links 
# page_links = [x for x in page_links if extract_product_id_from_url(x) != None]

# we grab the valid product page links along with the product ids
def clean_page_links(page_links=[]):
    """Pair each product-page URL with its product id, dropping the rest.

    Args:
        page_links: Iterable of absolute URLs to inspect.

    Returns:
        A list of ``{'url': ..., 'product_id': ...}`` dicts, in input order,
        containing only the URLs for which ``extract_product_id_from_url``
        returned an id.
    """
    # Lazily build (url, id) pairs so every URL is parsed exactly once.
    url_id_pairs = (
        (candidate, extract_product_id_from_url(candidate))
        for candidate in page_links
    )
    return [
        {'url': candidate, 'product_id': found_id}
        for candidate, found_id in url_id_pairs
        if found_id is not None
    ]

# Keep only the links that yield a product id, each paired with that id.
cleaned_links = clean_page_links(page_links)


In [17]:
# Scrapes the required info from the product page
def scrape_product_page(link,title_lookup = "#productTitle",price_lookup = "#priceblock_ourprice",rating_lookup="#acrCustomerReviewText",wait_seconds=1.2):
    """Load a product page in the shared browser session and read its key fields.

    Args:
        link: Absolute URL of the product page to load.
        title_lookup: CSS selector of the element holding the product title.
        price_lookup: CSS selector of the element holding the price.
        rating_lookup: CSS selector of the element holding the review-count text.
        wait_seconds: Seconds to pause after navigation before reading the page.

    Returns:
        Tuple ``(link, title, price, rating_text)``. A field is ``None`` when
        its selector matches nothing on the page, so one missing element no
        longer discards the fields that were found.
    """
    driver.get(link)
    # Presumably gives the page time to finish rendering before the markup
    # is read -- TODO confirm whether an explicit wait would be more reliable.
    time.sleep(wait_seconds)
    # find_element(By.CSS_SELECTOR, ...) replaces the deprecated
    # find_element_by_css_selector helper removed from current Selenium 4.
    body_el = driver.find_element(By.CSS_SELECTOR, "body")
    html_obj = HTML(html=body_el.get_attribute("innerHTML"))

    def _text_or_none(selector):
        # find(..., first=True) yields None when nothing matches.
        element = html_obj.find(selector, first=True)
        return element.text if element is not None else None

    product_title = _text_or_none(title_lookup)
    product_price = _text_or_none(price_lookup)
    product_noOfRating = _text_or_none(rating_lookup)
    return link,product_title,product_price,product_noOfRating


In [18]:
# Driver loop: scrape every cleaned product link in turn
def perform_scarpe(cleaned_items=None, scraper=None):
    """Scrape each product link and collect one record per product.

    Args:
        cleaned_items: Iterable of dicts with ``'url'`` and ``'product_id'``
            keys. ``None`` or an empty iterable yields an empty list.
        scraper: Optional callable taking a URL and returning
            ``(link, title, price, rating_text)``. Defaults to
            ``scrape_product_page``.

    Returns:
        A list of dicts with keys ``'url'``, ``'product_id'``, ``'title'``,
        ``'price'`` and ``'rating'``. A product whose page could not be
        scraped is still included, with the scraped fields left as ``None``.
    """
    if scraper is None:
        scraper = scrape_product_page
    data_extracted = []
    for obj in cleaned_items or []:
        link = obj['url']
        product_id = obj['product_id']
        title, price, noOfRating = (None, None, None)
        try:
            product_link, title, price, noOfRating = scraper(link)
        except Exception as exc:
            # Best-effort scraping: report the failure instead of hiding it,
            # then carry on with the next product.
            print(f"[scrape failed] {link}: {exc!r}")
        if title is not None and price is not None:
            print(product_link, title, price)
        product_data = {
            'url': link,
            'product_id': product_id,
            'title': title,
            'price': price,
            'rating': noOfRating
        }
        data_extracted.append(product_data)
    return data_extracted
            
# Scrape every cleaned product link; each page is loaded in the browser one
# after another, so this cell can take a while.
extracted_data = perform_scarpe(cleaned_items=cleaned_links)

https://www.amazon.in/Littles-Soft-Cleansing-Baby-Wipes/dp/B084S5JSZ1/ref=zg_bs_baby_6/261-3054441-6138763?_encoding=UTF8&psc=1&refRID=NYPNWJ6Q236KTM1BS81Q Little's Soft Cleansing Baby Wipes Lid Pack (80 Wipes) ₹99.00
https://www.amazon.in/Pampers-Diaper-Pants-Count-Diapers/dp/B08QSHJ7CR/ref=zg_bs_baby_19/261-3054441-6138763?_encoding=UTF8&psc=1&refRID=NYPNWJ6Q236KTM1BS81Q Pampers Diaper Pants, Baby, 86 Count & Pampers Diapers Pants, Small, 56 Count ₹1,241.00


KeyboardInterrupt: 

In [None]:
# Pretty-print the scraped product records.
pprint.pprint(extracted_data)

In [16]:
# len(All_page_links)