In [13]:
import requests
from requests_html import HTML
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import time 
import re 
import pprint
import datetime
import pandas as pd 
from pathlib import Path

In [2]:
# Project paths: everything is resolved relative to the directory the
# notebook kernel was started in.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"

# mkdir(exist_ok=True) is already a no-op when the directory exists, so a
# separate exists() check is unnecessary (and avoids a check-then-create race).
DATA_DIR.mkdir(exist_ok=True)

# CSV file the scraped product records are written to.
product_output = DATA_DIR / "product.csv"

In [3]:
# Configure Chrome to run without a visible window ("--headless").
options = Options()
options.add_argument("--headless")
# Module-level browser session; code in this notebook that loads pages does
# so through this `driver` object.
driver = webdriver.Chrome(options = options)

In [4]:
# Best-seller category pages to crawl; each entry's "url" is the parent page
# from which product links are collected.
# NOTE(review): the entry named "fashion" points at the books best-seller
# URL -- confirm whether the name or the URL is the intended one.
categories = [
    {"name":"baby","url":"https://www.amazon.in/gp/bestsellers/baby"},
    {"name":"electronics","url":"https://www.amazon.in/gp/bestsellers/electronics/"},
    {"name":"fashion","url":"https://www.amazon.in/gp/bestsellers/books/"} 
]


In [5]:
# URL shapes that identify an Amazon.in product page. Every pattern captures
# the product id in the named group `product_id`. The dots of the host name are
# escaped so they match a literal "." only (an unescaped "." matches any
# character).
regex_options = [
    r'https://www\.amazon\.in/gp/product/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.in/dp/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
]


def extract_product_id_from_url(url):
    """Extract the product id from a product-page URL.

    The URL is matched (anchored at its start) against every pattern in
    ``regex_options``. If more than one pattern matches, the one that appears
    last in ``regex_options`` wins.

    Args:
        url: absolute URL string. Any non-string value (e.g. ``None``) is
            treated as "no product id".

    Returns:
        The text captured by the ``product_id`` group, or ``None`` when no
        pattern matches.
    """
    if not isinstance(url, str):
        return None
    # Walk the patterns back to front and stop at the first hit: this yields
    # the last matching pattern without testing the remaining ones.
    for regex_str in reversed(regex_options):
        # re.match() reuses the module's internal cache of compiled patterns,
        # so no explicit re.compile() per call is needed.
        match = re.match(regex_str, url)
        if match is not None:
            return match['product_id']
    return None

In [6]:


def clean_page_links(page_links=None):
    """Keep only the links that point at a product page.

    Args:
        page_links: iterable of URL strings. ``None`` (the default) is treated
            as an empty collection.

    Returns:
        A new list holding one ``{'url': ..., 'product_id': ...}`` dict for
        every link for which ``extract_product_id_from_url`` returned a
        product id, in iteration order.
    """
    # A None default (instead of a shared `[]`) avoids the mutable-default
    # pitfall and lets callers pass None explicitly.
    if page_links is None:
        page_links = []
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id is not None:
            final_page_links.append({'url': url, 'product_id': product_id})
    return final_page_links

In [7]:
def scrape_product_page(link, title_lookup="#productTitle", price_lookup="#priceblock_ourprice", rating_lookup="#acrCustomerReviewText", wait_seconds=1.2):
    """Scrape the title, price and rating text from one product page.

    The page is loaded in the module-level ``driver``; after a short pause the
    ``<body>`` markup is handed to ``requests_html.HTML`` and queried with the
    given CSS selectors.

    Args:
        link: URL of the product page.
        title_lookup: CSS selector of the product-title element.
        price_lookup: CSS selector of the price element.
        rating_lookup: CSS selector of the ratings-count element.
        wait_seconds: pause between loading the page and reading its markup.

    Returns:
        ``(link, title, price, rating_text)``. A field whose selector matches
        nothing is ``None``, so one missing element no longer discards the
        fields that were found.
    """
    driver.get(link)
    time.sleep(wait_seconds)
    # find_element(by, value) exists in Selenium 3 and 4 alike; the
    # find_element_by_* helpers were removed in Selenium 4.3.
    # "css selector" is the value of By.CSS_SELECTOR.
    body_el = driver.find_element("css selector", "body")
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html=html_str)

    def _text_or_none(selector):
        # find(..., first=True) yields None when nothing matches the selector.
        element = html_obj.find(selector, first=True)
        return element.text if element is not None else None

    product_title = _text_or_none(title_lookup)
    product_price = _text_or_none(price_lookup)
    product_noOfRating = _text_or_none(rating_lookup)
    return link, product_title, product_price, product_noOfRating


In [9]:
def perform_scrape(cleaned_items=None):
    """Scrape every product in ``cleaned_items`` and collect one record each.

    Args:
        cleaned_items: iterable of dicts with the keys ``'url'`` and
            ``'product_id'``. ``None`` (the default) is treated as empty.

    Returns:
        A list with one dict per input item, holding the keys ``url``,
        ``product_id``, ``title``, ``price``, ``rating`` and ``timestamp``.
        When scraping an item fails, its ``title``, ``price`` and ``rating``
        are ``None`` and the failure is printed instead of being dropped
        silently.
    """
    if cleaned_items is None:
        cleaned_items = []
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price, noOfRating = (None, None, None)
        try:
            product_link, title, price, noOfRating = scrape_product_page(link)
        except Exception as e:
            # Best effort: keep going with the next product, but say why this
            # one yielded no data.
            print(f"Failed to scrape {link}: {e!r}")
        # title is only non-None after a successful scrape, so product_link is
        # guaranteed to be bound for the current item here.
        if title is not None and price is not None:
            print(product_link, title, price)
        product_data = {
            'url': link,
            'product_id': product_id,
            'title': title,
            'price': price,
            'rating': noOfRating,
            'timestamp': datetime.datetime.now()
        }
        data_extracted.append(product_data)
    return data_extracted
            
# extracted_data = perform_scrape(cleaned_items=cleaned_links)

In [10]:
def scarpe_category_product_links(categories=None):
    """Collect the product links found on each category page.

    Args:
        categories: iterable of dicts that carry the category page under the
            key ``'url'``. ``None`` (the default) is treated as empty.

    Returns:
        A list of the ``{'url': ..., 'product_id': ...}`` dicts produced by
        ``clean_page_links`` for all categories, concatenated in category
        order.
    """
    if categories is None:
        categories = []
    all_product_links = []
    for category in categories:
        # Pause before every page load to space out the requests.
        time.sleep(1.5)
        driver.get(category['url'])
        # find_element(by, value) exists in Selenium 3 and 4 alike; the
        # find_element_by_* helpers were removed in Selenium 4.3.
        # "css selector" is the value of By.CSS_SELECTOR.
        body_el = driver.find_element("css selector", "body")
        html_str = body_el.get_attribute("innerHTML")
        html_obj = HTML(html=html_str)
        # Only site-relative links are kept (made absolute here); links to
        # review pages are skipped.
        new_links = [
            f"https://www.amazon.in{x}"
            for x in html_obj.links
            if x.startswith("/") and "product-reviews/" not in x
        ]
        all_product_links.extend(clean_page_links(new_links))
    return all_product_links



In [11]:
def abc(categories=[]):
    """Run the whole pipeline for ``categories``.

    Product links are first gathered with ``scarpe_category_product_links``;
    the resulting items are then passed to ``perform_scrape``, whose return
    value is handed back unchanged.
    """
    return perform_scrape(scarpe_category_product_links(categories=categories))

In [None]:
# Scrape every configured category, then persist the records as CSV
# (without the DataFrame index column).
# NOTE(review): no visible cell ever calls driver.quit(); consider closing the
# browser session once scraping is finished.
extracted_data = abc(categories=categories)
all_product_df = pd.DataFrame(data=extracted_data)
all_product_df.to_csv(path_or_buf=product_output, index=False)

https://www.amazon.in/Pampers-Large-Diapers-Pants-Count/dp/B07CXFJT8Q/ref=zg_bs_baby_5?_encoding=UTF8&psc=1&refRID=4QNS0W32JA4R97J0K74J Pampers All round Protection Pants, Large size baby diapers (LG) 64 Count, Lotion with Aloe Vera ₹785.00
https://www.amazon.in/Huggies-Nature-Pants-Small-Diaper/dp/B08JVK3ZRD/ref=zg_bs_baby_27?_encoding=UTF8&psc=1&refRID=4QNS0W32JA4R97J0K74J Huggies Nature Care Pants, Small (S) Size Baby Diaper Pants, 28 Count, Nature’s gentle protection with organic cotton ₹299.00
https://www.amazon.in/Himalaya-4004G-Baby-Powder-400g/dp/B008YD57OO/ref=zg_bs_baby_49?_encoding=UTF8&psc=1&refRID=4QNS0W32JA4R97J0K74J Himalaya Baby Powder (400g) ₹171.00
https://www.amazon.in/Himalaya-Gift-Pack/dp/B075B7WDJY/ref=zg_bs_baby_44?_encoding=UTF8&psc=1&refRID=4QNS0W32JA4R97J0K74J Himalaya Gift Pack, 1 Count ₹708.00
https://www.amazon.in/Sebamed-Baby-Cleansing-Bar-150g/dp/B01KE8VPPG/ref=zg_bs_baby_34?_encoding=UTF8&psc=1&refRID=4QNS0W32JA4R97J0K74J Sebamed Baby Cleansing Bar (150g

https://www.amazon.in/Pampers-Premium-Care-Diapers-Monthly/dp/B07FHD5WNB/ref=zg_bs_baby_17?_encoding=UTF8&psc=1&refRID=4QNS0W32JA4R97J0K74J Pampers Premium Care Pants, Large size baby diapers (LG), 88 Count, Softest ever Pampers pants ₹1,655.00
https://www.amazon.in/SebaMed-Sebamed-Baby-Lotion-400ml/dp/B00VEEHIEM/ref=zg_bs_baby_33?_encoding=UTF8&psc=1&refRID=4QNS0W32JA4R97J0K74J SebaMed Baby Body Lotion, For All Skin Types, 400 ml ₹868.00
https://www.amazon.in/Pampers-Premium-Care-Pants-Diapers/dp/B082LF4L7X/ref=zg_bs_baby_30?_encoding=UTF8&psc=1&refRID=4QNS0W32JA4R97J0K74J Pampers Premium Care Pants, Double Extra Large size baby diapers (XXL), 60 Count, Softest ever Pampers pants ₹1,678.00
https://www.amazon.in/Himalaya-Total-Pants-Diapers-Large/dp/B07MBYCL5Z/ref=zg_bs_baby_9?_encoding=UTF8&psc=1&refRID=4QNS0W32JA4R97J0K74J Himalaya Total Care Baby Pants Diapers, Extra Large (12 - 17 kg), 74 Count ₹831.00
https://www.amazon.in/Himalaya-gentle-Wipes-72Napkins-packs/dp/B017VOMJ6A/ref=zg

In [None]:
# Pretty-print every scraped record for a quick visual check.
pprint.pprint(extracted_data)

In [None]:
# len(All_page_links)