In [25]:
import datetime
import pprint
import re
import time
from pathlib import Path

import pandas as pd
import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [26]:
# Output locations, resolved relative to the directory the notebook runs from.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"

# mkdir(exist_ok=True) is already a no-op when the directory exists, so a
# separate exists() check is redundant (and racy). If "data" exists but is a
# regular file this raises FileExistsError here instead of failing later when
# the CSV is written.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# CSV file the scraped product records are written to.
product_output = DATA_DIR / "product.csv"

In [4]:
# Start a Chrome session through Selenium with the "--headless" argument set.
# `driver` is the module-level browser handle that the scraping functions in
# this notebook call (driver.get / element lookup).
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options = options)

In [5]:
# Best-seller listing pages to crawl, one entry per category.
# "name" is a human-readable label; "url" is the page that actually gets fetched.
categories = [
    {"name": "baby", "url": "https://www.amazon.in/gp/bestsellers/baby"},
    {"name": "electronics", "url": "https://www.amazon.in/gp/bestsellers/electronics/"},
    # This entry used to be labelled "fashion" while pointing at the books
    # listing; the label now matches the URL that is fetched.
    {"name": "books", "url": "https://www.amazon.in/gp/bestsellers/books/"},
]


In [6]:
# URL shapes that identify a product page. Each pattern captures the product id
# in the named group "product_id". Dots in the host name are escaped so that
# "." only matches a literal dot rather than any character.
regex_options = [
    r'https://www\.amazon\.in/gp/product/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.in/dp/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
]


def extract_product_id_from_url(url):
    """Return the product id embedded in a product-page URL.

    Args:
        url: Absolute URL to inspect. Non-string values are treated as
            "no match" instead of raising.

    Returns:
        The text captured by the ``product_id`` group of a pattern from
        ``regex_options`` that matches the start of ``url``, or ``None`` when
        no pattern matches.
    """
    if not isinstance(url, str):
        return None
    # When several patterns match the same URL, the one listed last in
    # regex_options takes precedence, so try them in reverse order and stop at
    # the first hit instead of scanning every pattern.
    for regex_str in reversed(regex_options):
        match = re.match(regex_str, url)
        if match is None:
            continue
        # A pattern without a "product_id" group is skipped rather than raising.
        product_id = match.groupdict().get('product_id')
        if product_id is not None:
            return product_id
    return None

In [8]:


# Keep only the links that are recognised as product pages, with their product ids.
def clean_page_links(page_links=None):
    """Filter candidate URLs down to recognised product pages.

    Args:
        page_links: Iterable of absolute URL strings. ``None`` (the default)
            is treated as an empty collection.

    Returns:
        A list of ``{'url': ..., 'product_id': ...}`` dicts, one for each input
        URL for which ``extract_product_id_from_url`` returned an id, in input
        order. URLs without a recognisable id are dropped.
    """
    final_page_links = []
    # `or ()` avoids a shared mutable default and tolerates None input.
    for url in page_links or ():
        product_id = extract_product_id_from_url(url)
        if product_id is not None:
            final_page_links.append({'url': url, 'product_id': product_id})
    return final_page_links

In [16]:
def _first_text(html_obj, selector):
    """Return the text of the first element matching ``selector``, or None if there is none."""
    element = html_obj.find(selector, first=True)
    return element.text if element is not None else None


def scrape_product_page(link, title_lookup="#productTitle", price_lookup="#priceblock_ourprice", rating_lookup="#acrCustomerReviewText"):
    """Scrape the required info from a single product page.

    Loads ``link`` in the module-level ``driver``, parses the rendered body and
    reads three fields by CSS selector.

    Args:
        link: URL of the product page to load.
        title_lookup: CSS selector for the product title element.
        price_lookup: CSS selector for the price element.
        rating_lookup: CSS selector for the customer-review-count element.

    Returns:
        A tuple ``(link, title, price, rating_text)``. Any field whose element
        is not present on the page is ``None``, so one missing field (e.g. no
        price shown) no longer discards the fields that were found.

    Raises:
        Whatever the Selenium driver raises if the page cannot be loaded or
        the body element cannot be located.
    """
    driver.get(link)
    # Fixed pause after navigation before the DOM is read.
    time.sleep(1.2)
    # find_element(By..., ...) is available in both Selenium 3 and 4, whereas
    # find_element_by_css_selector was removed in Selenium 4.3.
    body_el = driver.find_element(By.CSS_SELECTOR, "body")
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html=html_str)
    product_title = _first_text(html_obj, title_lookup)
    product_price = _first_text(html_obj, price_lookup)
    product_noOfRating = _first_text(html_obj, rating_lookup)
    return link, product_title, product_price, product_noOfRating


In [21]:
# Driver loop: scrape every cleaned product link and collect one record per link.
def perform_scrape(cleaned_items=None):
    """Scrape each product link and return one record per input item.

    Args:
        cleaned_items: Iterable of dicts with ``'url'`` and ``'product_id'``
            keys. ``None`` (the default) is treated as empty.

    Returns:
        A list of dicts with keys ``url``, ``product_id``, ``title``,
        ``price``, ``rating`` and ``timestamp``. A record is produced for every
        input item; when scraping an item fails, its ``title``, ``price`` and
        ``rating`` are ``None``.
    """
    data_extracted = []
    for obj in cleaned_items or ():
        link = obj['url']
        product_id = obj['product_id']
        title, price, noOfRating = None, None, None
        try:
            # The first returned element is the link itself, which we already have.
            _, title, price, noOfRating = scrape_product_page(link)
        except Exception as e:
            # Best-effort scraping: one bad page must not abort the run, but
            # the failure is reported instead of being silently discarded.
            print(f"Failed to scrape {link}: {e!r}")
        if title is not None and price is not None:
            print(link, title, price)
        product_data = {
            'url': link,
            'product_id': product_id,
            'title': title,
            'price': price,
            'rating': noOfRating,
            'timestamp': datetime.datetime.now(),
        }
        data_extracted.append(product_data)
    return data_extracted
            
# extracted_data = perform_scrape(cleaned_items=cleaned_links)

In [22]:
def scarpe_category_product_links(categories=None):
    """Collect product links from every category listing page.

    The (misspelt) function name is kept as-is because other cells call it.

    Args:
        categories: Iterable of dicts with a ``'url'`` key pointing at a
            listing page. ``None`` (the default) is treated as empty.

    Returns:
        A single list of ``{'url': ..., 'product_id': ...}`` dicts, as produced
        by ``clean_page_links``, concatenated across all categories in order.
    """
    all_product_links = []
    for category in categories or ():
        # Pause before each page load.
        time.sleep(1.5)
        driver.get(category['url'])
        # find_element(By..., ...) is available in both Selenium 3 and 4, whereas
        # find_element_by_css_selector was removed in Selenium 4.3.
        body_el = driver.find_element(By.CSS_SELECTOR, "body")
        html_str = body_el.get_attribute("innerHTML")
        html_obj = HTML(html=html_str)
        # Only site-relative links are kept (made absolute here); links
        # containing "product-reviews/" are skipped.
        new_links = [
            f"https://www.amazon.in{x}"
            for x in html_obj.links
            if x.startswith("/") and "product-reviews/" not in x
        ]
        all_product_links.extend(clean_page_links(new_links))
    return all_product_links



https://www.amazon.in/Himalaya-4003F-Baby-Lotion-400ml/dp/B008YD5500/ref=zg_bs_baby_25?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Himalaya Baby Body Lotion, For All Skin Types (400 ml) ₹227.00
https://www.amazon.in/Himalaya-Baby-Massage-Oil-500ml/dp/B00NOKRPD8/ref=zg_bs_baby_39?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Himalaya Face Body Oil Baby Massage Oil For All Skin Types (500 ML) ₹322.00
https://www.amazon.in/SebaMed-Sebamed-Baby-Lotion-400ml/dp/B00VEEHIEM/ref=zg_bs_baby_33?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 SebaMed Baby Body Lotion, For All Skin Types, 400 ml ₹868.00
https://www.amazon.in/Pampers-Active-Diapers-Small-Count/dp/B081QTNF7V/ref=zg_bs_baby_45?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Pampers Active Baby Taped Diapers, Small size diapers, (SM) 92 count, taped style custom fit ₹1,349.00
https://www.amazon.in/Pampers-Diapers-Pants-X-Small-Count/dp/B07DP27JKB/ref=zg_bs_baby_19?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Pampers All 

https://www.amazon.in/Pampers-Premium-Care-Diapers-Monthly/dp/B07F2HMCQ2/ref=zg_bs_baby_18?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Pampers Premium Care Pants, Extra Large size baby diapers (XL), 72 Count, Softest ever Pampers pants ₹1,655.00
https://www.amazon.in/Pampers-Active-Baby-Diapers-Count/dp/B0781Z3BW6/ref=zg_bs_baby_21?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Pampers Active Baby Diapers, New Born, Extra Small, (NB, XS) size, 72 Count, Taped style diaper ₹989.00
https://www.amazon.in/Himalaya-Baby-Shampoo-400-ml/dp/B00H5NMNXC/ref=zg_bs_baby_23?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Himalaya Baby Shampoo (400 ml) ₹232.00
https://www.amazon.in/Supples-Pants-Diapers-Medium-Count/dp/B07Q2F37JN/ref=zg_bs_baby_3?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Supples Baby Pants Diapers, Medium (7-12 kg), 72 Count ₹622.00
https://www.amazon.in/TEDIBAR-TDB5-Tedibar-2s-Pack/dp/B07ZJVXRCW/ref=zg_bs_baby_35?_encoding=UTF8&psc=1&refRID=YEH0WHVBHC90WKY9Q4R9 Tedib

https://www.amazon.in/Nokia-105-2019-Single-Black/dp/B07YYNX5X6/ref=zg_bs_electronics_29?_encoding=UTF8&psc=1&refRID=KCB727PQZQKBGQ1YGHMY Nokia 105 Single SIM (Black) ₹1,249.00
https://www.amazon.in/Boat-Bassheads-242-Earphones-Resistance/dp/B07S9S86BF/ref=zg_bs_electronics_11?_encoding=UTF8&psc=1&refRID=KCB727PQZQKBGQ1YGHMY boAt Bassheads 242 in Ear Wired Earphones with Mic(Active Black) ₹499.00
https://www.amazon.in/boAt-Rockerz-255-Pro-Earphones/dp/B08TSSCZR8/ref=zg_bs_electronics_33?_encoding=UTF8&psc=1&refRID=KCB727PQZQKBGQ1YGHMY boAt Rockerz 255 Pro+ Wireless Bluetooth in Ear Neckband Earphone with Mic (Teal Green) ₹1,399.00
https://www.amazon.in/Test-Exclusive_2020_1113-Multi-3GB-Storage/dp/B089MS8XQ3/ref=zg_bs_electronics_37?_encoding=UTF8&psc=1&refRID=KCB727PQZQKBGQ1YGHMY Redmi 9 Power (Blazing Blue, 4GB RAM, 64GB Storage) - 6000mAh Battery |FHD+ Screen| 48MP Quad Camera | Alexa Hands-Free Capable ₹11,499.00
https://www.amazon.in/255-Bluetooth-Wireless-Earphone-Immersive/dp/B0

In [27]:
def abc(categories=[]):
    """Run the whole pipeline for the given categories.

    Gathers the product links of every category and then scrapes each of
    those product pages, returning the list of scraped records.
    """
    return perform_scrape(scarpe_category_product_links(categories=categories))

In [29]:
# Run the pipeline over all categories, tabulate the records and save them as CSV.
extracted_data = abc(categories)
all_product_df = pd.DataFrame(data=extracted_data)
all_product_df.to_csv(path_or_buf=product_output, index=False)

https://www.amazon.in/Pampers-Active-Medium-Diapers-Count/dp/B077HRCSXR/ref=zg_bs_baby_46?_encoding=UTF8&psc=1&refRID=E75Q8HJQKXNC6NJBN651 Pampers Active Baby Taped Diapers, Medium size diapers, (MD) 90 count, taped style custom fit ₹1,303.00
https://www.amazon.in/Pampers-Active-Large-Diapers-Count/dp/B077HV19LS/ref=zg_bs_baby_43?_encoding=UTF8&psc=1&refRID=E75Q8HJQKXNC6NJBN651 Pampers Active Baby Taped Diapers, Large size diapers, (LG) 78 count, taped style custom fit ₹1,394.00
https://www.amazon.in/Mamaearth-Percent-Natural-Berry-Toothpaste/dp/B07BGT4J55/ref=zg_bs_baby_42?_encoding=UTF8&psc=1&refRID=E75Q8HJQKXNC6NJBN651 Mamaearth 100% Natural Berry Blast Kids Toothpaste 50 Gm, Fluoride Free, SLS Free, No Artificial Flavours, Best for baby ₹124.00
https://www.amazon.in/Bey-Bee-Waterproof-Protector-Sheet/dp/B00VZQP6PM/ref=zg_bs_baby_22?_encoding=UTF8&psc=1&refRID=E75Q8HJQKXNC6NJBN651 BeyBee Bed Protector Sheet (Small (50cm X 70cm), Salmon Rose) ₹139.00
https://www.amazon.in/Littles-Sof

https://www.amazon.in/Pampers-Diapers-Pants-XX-Large-Count/dp/B07P226KH4/ref=zg_bs_baby_14?_encoding=UTF8&psc=1&refRID=E75Q8HJQKXNC6NJBN651 Pampers All round Protection Pants, Double Extra Large size baby diapers (XXL) 28 Count, Lotion with Aloe Vera ₹587.00
https://www.amazon.in/Himalaya-Total-Pants-Diapers-Large/dp/B07MBYCL5Z/ref=zg_bs_baby_9?_encoding=UTF8&psc=1&refRID=E75Q8HJQKXNC6NJBN651 Himalaya Total Care Baby Pants Diapers, Extra Large (12 - 17 kg), 74 Count ₹831.00
https://www.amazon.in/Pampers-New-Diapers-Pants-Count/dp/B07CXGJKXL/ref=zg_bs_baby_4?_encoding=UTF8&psc=1&refRID=E75Q8HJQKXNC6NJBN651 Pampers All round Protection Pants, Extra Large size baby diapers (XL) 56 Count, Lotion with Aloe Vera ₹851.00
https://www.amazon.in/SebaMed-Sebamed-Baby-Lotion-400ml/dp/B00VEEHIEM/ref=zg_bs_baby_33?_encoding=UTF8&psc=1&refRID=E75Q8HJQKXNC6NJBN651 SebaMed Baby Body Lotion, For All Skin Types, 400 ml ₹868.00
https://www.amazon.in/Himalaya-4004G-Baby-Powder-400g/dp/B008YD57OO/ref=zg_bs_

https://www.amazon.in/OnePlus-Bullets-Wireless-Bass-Black/dp/B092ZJVB6Z/ref=zg_bs_electronics_2?_encoding=UTF8&psc=1&refRID=KSPTZ55VVC590PKH9CRJ OnePlus Bullets Wireless Z Bass Edition (Bold Black) ₹1,999.00
https://www.amazon.in/Logitech-B170-Wireless-Mouse-Black/dp/B01J0XWYKQ/ref=zg_bs_electronics_47?_encoding=UTF8&psc=1&refRID=KSPTZ55VVC590PKH9CRJ Logitech B170 Wireless Mouse, 2.4 GHz with USB Nano Receiver, Optical Tracking, 12-Months Battery Life, Ambidextrous, PC/Mac/Laptop - Black ₹645.00
https://www.amazon.in/HP-Original-Advantage-Cartridge-Black/dp/B014AKZMNK/ref=zg_bs_electronics_36?_encoding=UTF8&psc=1&refRID=KSPTZ55VVC590PKH9CRJ HP 680 Original Ink Advantage Cartridge (Black) ₹851.00
https://www.amazon.in/OFIXO-Multi-Purpose-Foldable-Portable-Writing/dp/B08HQL67D6/ref=zg_bs_electronics_17?_encoding=UTF8&psc=1&refRID=KSPTZ55VVC590PKH9CRJ OFIXO Multi-Purpose Laptop Table/Study Table/Bed Table/Foldable and Portable Wooden/Writing Desk (Wooden) ₹500.00
https://www.amazon.in/Gen

In [30]:
# Pretty-print the value held in `extracted_data` (the result of abc(categories)).
pprint.pprint(extracted_data)

None


In [16]:
# len(All_page_links)