In [1]:
import datetime
import re
import time
from pathlib import Path

import pandas as pd
import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [2]:
# Resolve the working directory and make sure the output folder exists.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
# One idempotent call: exist_ok tolerates an existing directory (no separate,
# racy exists() check needed) and parents creates missing ancestors.
# If "data" exists but is a regular file this raises FileExistsError right
# here instead of failing later when the CSVs are written.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# CSV holding the product links harvested from the category pages.
product_category_links_output = DATA_DIR / "category-products.csv"
# CSV holding the scraped per-product details.
product_output = DATA_DIR / "products.csv"

In [3]:
# Configure Chrome to run headless (no visible browser window).
options = Options()
options.add_argument("--headless")

# Module-level WebDriver instance; the scraping functions in this notebook read
# this global `driver` directly rather than taking it as an argument.
# NOTE(review): no cell visible here calls driver.quit() — confirm the browser
# process is shut down somewhere once scraping is finished.
driver = webdriver.Chrome(options=options)

In [4]:
# Best-seller category pages to crawl: a display name plus the landing URL.
categories = [
    {
        "name": "Video Games",
        "url": "https://www.amazon.in/gp/bestsellers/videogames/",
    },
    {
        "name": "electronics",
        "url": "https://www.amazon.in/gp/bestsellers/electronics/",
    },
    {
        "name": "Watches",
        "url": "https://www.amazon.in/gp/bestsellers/watches/2563504031/",
    },
]


In [5]:
# Regex patterns for the product-URL layouts handled by this notebook. Each one
# captures the product id in the named group `product_id`.
# - The dots in the host are escaped so they only match a literal ".".
# - The id may be followed by "/", "?", "#" or the end of the string, so URLs
#   without a trailing slash after the id are recognised too.
regex_options = [
    r'https://www\.amazon\.in/gp/product/(?P<product_id>[\w-]+)(?:[/?#]|$)',
    r'https://www\.amazon\.in/dp/(?P<product_id>[\w-]+)(?:[/?#]|$)',
    r'https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)(?:[/?#]|$)',
]


def extract_product_id_from_url(url):
    """Extract the product id from a product URL.

    The URL is matched (anchored at its start) against each pattern in
    ``regex_options`` in order; the ``product_id`` group of the first pattern
    that matches is returned.

    Args:
        url: the absolute URL to inspect. Non-string values (e.g. ``None``)
            are treated as "no product id" instead of raising.

    Returns:
        The product id string, or ``None`` when no pattern matches.
    """
    if not isinstance(url, str):
        return None
    for regex_str in regex_options:
        # re.match caches compiled patterns internally, so no explicit compile.
        match = re.match(regex_str, url)
        if match is None:
            continue
        # groupdict().get keeps this best-effort if a pattern without a
        # `product_id` group is ever added to regex_options.
        product_id = match.groupdict().get('product_id')
        if product_id is not None:
            return product_id
    return None

In [6]:
# we grab the valid product page links along with the product ids
def clean_page_links(page_links=None):
    """Keep only the links that point at a product page.

    Args:
        page_links: iterable of absolute URLs. ``None`` (the default) is
            treated as an empty collection; a ``None`` default is used instead
            of a shared mutable ``[]``.

    Returns:
        A new list of ``{'url': ..., 'product_id': ...}`` dicts, one for every
        input URL for which ``extract_product_id_from_url`` returned an id,
        in input order.
    """
    if page_links is None:
        return []
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id is not None:
            final_page_links.append({'url': url, 'product_id': product_id})
    return final_page_links

In [7]:
# using the category link we try to grab all the related products
def scrape_category_product_links(categories=None, delay=1.5):
    """Collect product links from every category page.

    Each category URL is loaded in the module-level ``driver``; the rendered
    ``<body>`` markup is parsed with ``requests_html.HTML``. Every link that
    starts with "/" and does not contain "product-reviews/" is prefixed with
    the site origin and filtered through ``clean_page_links``.

    Args:
        categories: iterable of dicts that each have a ``'url'`` key.
            ``None`` (the default) is treated as empty.
        delay: seconds to sleep before each page load (default 1.5, the value
            that used to be hard-coded).

    Returns:
        A list with the ``clean_page_links`` results of all categories
        concatenated in category order.
    """
    all_product_links = []
    if categories is None:
        return all_product_links
    for category in categories:
        time.sleep(delay)
        url = category['url']
        driver.get(url)
        # find_element(By.CSS_SELECTOR, ...) replaces the
        # find_element_by_css_selector helper, which Selenium 4.3 removed.
        body_el = driver.find_element(By.CSS_SELECTOR, "body")
        html_str = body_el.get_attribute("innerHTML")
        html_obj = HTML(html=html_str)
        page_links = [
            f"https://www.amazon.in{x}"
            for x in html_obj.links
            if x.startswith("/") and "product-reviews/" not in x
        ]
        all_product_links += clean_page_links(page_links=page_links)
    return all_product_links

In [8]:
# extract all the valid urls of the categories and save it on a csv file
def extract_categories_and_save(categories=None):
    """Scrape the product links of the given categories and write them to CSV.

    The links returned by ``scrape_category_product_links`` are written to
    ``product_category_links_output`` without the index column.

    Args:
        categories: iterable of category dicts, passed on to
            ``scrape_category_product_links``. ``None`` is treated as empty.

    Returns:
        None. The result is the CSV file on disk.
    """
    if categories is None:
        categories = []
    all_product_links = scrape_category_product_links(categories)
    # Naming the columns explicitly guarantees a header row even when no links
    # were found; a CSV without a header cannot be loaded by pd.read_csv.
    category_df = pd.DataFrame(all_product_links, columns=['url', 'product_id'])
    category_df.to_csv(product_category_links_output, index=False)

In [9]:
# Crawl every configured category page and write the collected product links
# to category-products.csv (this loads each category URL in the browser).
extract_categories_and_save(categories)

In [10]:
# scrapes the required info from the individual product page
def _first_text(html_obj, selector):
    """Return the text of the first element matching ``selector``, or None if there is none."""
    element = html_obj.find(selector, first=True)
    return element.text if element is not None else None


def scrape_product_page(link,title_lookup = "#productTitle",price_lookup = "#priceblock_ourprice",rating_lookup="#acrCustomerReviewText",delay=1.2):
    """Load one product page and pull title, price and rating-count text.

    The page is opened in the module-level ``driver``; after ``delay`` seconds
    the rendered ``<body>`` markup is parsed with ``requests_html.HTML`` and
    the three CSS selectors are looked up.

    Args:
        link: URL of the product page.
        title_lookup: CSS selector of the title element.
        price_lookup: CSS selector of the price element.
        rating_lookup: CSS selector of the ratings-count element.
        delay: seconds to wait after navigation before reading the page
            (default 1.2, the value that used to be hard-coded).

    Returns:
        ``(title, price, rating_text)``. Each item is the matched element's
        text, or ``None`` when the selector matched nothing.
    """
    driver.get(link)
    time.sleep(delay)
    # find_element(By.CSS_SELECTOR, ...) replaces the
    # find_element_by_css_selector helper, which Selenium 4.3 removed.
    body_el = driver.find_element(By.CSS_SELECTOR, "body")
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html=html_str)
    # A missing element is handled explicitly in _first_text instead of
    # through a bare `except: pass`, so unrelated errors are no longer hidden.
    product_title = _first_text(html_obj, title_lookup)
    product_price = _first_text(html_obj, price_lookup)
    product_noOfRating = _first_text(html_obj, rating_lookup)
    return product_title,product_price,product_noOfRating


In [11]:
# scrape one row of the link table; meant for DataFrame.apply(..., axis=1)
def row_scrape_event(row, *args, **kwargs):
    """Scrape the product page of one row and attach the results to it.

    Rows whose ``'scraped'`` value is ``1`` or ``"1"`` are returned untouched
    (after printing "skipped"). Otherwise ``scrape_product_page`` is called
    with ``row['url']`` and the row gains ``title``, ``price``, ``rating``,
    ``scraped`` and ``timestamp`` entries.

    Args:
        row: mapping-like row (pandas Series or dict) with a ``'url'`` entry
            and an optional ``'scraped'`` entry.
        *args, **kwargs: accepted and ignored.

    Returns:
        The same ``row`` object, updated in place unless it was skipped.
    """
    link = row['url']
    # .get covers rows that do not have a 'scraped' entry yet.
    scraped = row.get('scraped', 0)
    if scraped == 1 or scraped == "1":
        print("skipped")
        return row

    title, price, noOfRating = (None, None, None)
    succeeded = False
    try:
        title, price, noOfRating = scrape_product_page(link)
        succeeded = True
    except Exception as e:
        print(e)

    row['title'] = title
    row['price'] = price
    # Only flag the row as done when the page was actually scraped; a failed
    # attempt stays at 0 so a later run retries it instead of skipping it.
    row['scraped'] = 1 if succeeded else 0
    row['rating'] = noOfRating
    row['timestamp'] = datetime.datetime.now()
    print(link, title, price)
    return row

In [12]:
# Load the category product links back from the CSV written by
# extract_categories_and_save and preview the first rows.
df = pd.read_csv(product_category_links_output)
df.head()

Unnamed: 0,url,product_id
0,https://www.amazon.in/Uncharted-Thiefs-Playsta...,B07GLGSHWN
1,https://www.amazon.in/JBL-Quantum-Over-Ear-Hea...,B083X24VG9
2,https://www.amazon.in/PS4-Minecraft-Starter-Co...,B098KP6TZQ
3,https://www.amazon.in/Nitho-Racing-MLT-DP16-K-...,B08LFLHXGC
4,https://www.amazon.in/Assassins-Creed-Valhalla...,B0894KKNVL


In [13]:
# (rows, columns) of the loaded link table
df.shape

(148, 2)

In [14]:
# Work on a copy so `df` keeps the link table exactly as it was loaded.
df_sub = df.copy()

In [15]:
# Scrape every row's product page (one browser page load per row).
# Re-running this cell on the same df_sub skips rows that row_scrape_event
# already flagged with scraped == 1.
df_sub = df_sub.apply(row_scrape_event, axis=1)

https://www.amazon.in/Uncharted-Thiefs-Playstation-Newer-Version/dp/B07GLGSHWN/ref=zg_bs_videogames_28/261-4460124-2798536?_encoding=UTF8&psc=1&refRID=K4VG4XW94BFZK8FYFZF8 Uncharted 4 TE Hit (PS4) ₹984.00
https://www.amazon.in/JBL-Quantum-Over-Ear-Headset-Detachable/dp/B083X24VG9/ref=zg_bs_videogames_9/261-4460124-2798536?_encoding=UTF8&psc=1&refRID=K4VG4XW94BFZK8FYFZF8 JBL Quantum 100 by Harman Wired Over-Ear Gaming Headset with Detachable Mic for PC, Mobile, Laptop, PS4, Xbox, Nintendo Switch, VR (Blue) ₹2,599.00
https://www.amazon.in/PS4-Minecraft-Starter-Collection/dp/B098KP6TZQ/ref=zg_bs_videogames_49/261-4460124-2798536?_encoding=UTF8&psc=1&refRID=K4VG4XW94BFZK8FYFZF8 PS4 Minecraft Starter Collection (PS4) None
https://www.amazon.in/Nitho-Racing-MLT-DP16-K-Compatible-Switch/dp/B08LFLHXGC/ref=zg_bs_videogames_26/261-4460124-2798536?_encoding=UTF8&psc=1&refRID=K4VG4XW94BFZK8FYFZF8 Nitho Drive Pro V16 Racing Wheel & Pedal Set MLT-DP16-K, Compatible with PC/PS3/PS4/Xbox1/Switch ₹7,09

https://www.amazon.in/Thumb-Grips-4-Pack-PS4/dp/B08DPDTP5P/ref=zg_bs_videogames_50/261-4460124-2798536?_encoding=UTF8&psc=1&refRID=K4VG4XW94BFZK8FYFZF8 Thumb Grips 4 Pack for PS4 None
https://www.amazon.in/Cosmic-Byte-H1-Headphone-Laptops/dp/B07GB9SMS6/ref=zg_bs_videogames_36/261-4460124-2798536?_encoding=UTF8&psc=1&refRID=K4VG4XW94BFZK8FYFZF8 Cosmic Byte H1 Gaming Headphone with Mic for PS5, PC, Laptops, Mobile, PS4, Xbox One (Red) ₹849.00
https://www.amazon.in/Sony-Uncharted-Collection-Hits-PS4/dp/B07M5V17R9/ref=zg_bs_videogames_41/261-4460124-2798536?_encoding=UTF8&psc=1&refRID=K4VG4XW94BFZK8FYFZF8 Uncharted Collection Hits (PS4) ₹999.00
https://www.amazon.in/Horizon-Zero-Dawn-Complete-PS4/dp/B07D2NY7RW/ref=zg_bs_videogames_43/261-4460124-2798536?_encoding=UTF8&psc=1&refRID=K4VG4XW94BFZK8FYFZF8 Sony Horizon: Zero Dawn - Complete Edition (Playstation 4) ₹649.00
https://www.amazon.in/UBI-Soft-Assassins-Creed-Odyssey/dp/B07DS1FFHK/ref=zg_bs_videogames_39/261-4460124-2798536?_encoding=U

https://www.amazon.in/OFIXO-Multi-Purpose-Foldable-Portable-Writing/dp/B08HQL67D6/ref=zg_bs_electronics_18?_encoding=UTF8&psc=1&refRID=01BRWPEF77PW5H9TBZPV OFIXO Multi-Purpose Laptop Table/Study Table/Bed Table/Foldable and Portable Wooden/Writing Desk (Wooden) ₹500.00
https://www.amazon.in/Tangent-Lite-Magnetic-Bluetooth-Headphones/dp/B085W8CFLH/ref=zg_bs_electronics_37?_encoding=UTF8&psc=1&refRID=01BRWPEF77PW5H9TBZPV PTron Tangent Lite Wireless Bluetooth In-Ear Neckband Headphone With Mic (Black) None
https://www.amazon.in/Samsung-Galaxy-Ocean-128GB-Storage/dp/B07HG8S7KP/ref=zg_bs_electronics_20?_encoding=UTF8&psc=1&refRID=01BRWPEF77PW5H9TBZPV Samsung Galaxy M31 (Ocean Blue, 8GB RAM, 128GB Storage) 6 Months Free Screen Replacement for Prime ₹16,999.00
https://www.amazon.in/Realme-Buds-Android-Smartphones-Black/dp/B07XMFDHSG/ref=zg_bs_electronics_46?_encoding=UTF8&psc=1&refRID=01BRWPEF77PW5H9TBZPV realme Wired in Ear Buds 2 with Mic for Android Smartphones (Black) ₹599.00
https://www.

https://www.amazon.in/Watch-Company-Digital-Unisex-Birthday/dp/B0972HYQS3/ref=zg_bs_2563504031_17?_encoding=UTF8&psc=1&refRID=4PR8YHZC2A2NTN9S5S7V The Watch Company Digital Black Dial Led Watch for Kids Unisex Birthday Gift Digital Watch - for Boys & Girls ₹79.00
https://www.amazon.in/Fastrack-Casual-Analog-White-Watch-NJ3114PP01C/dp/B00NLASVBQ/ref=zg_bs_2563504031_45?_encoding=UTF8&psc=1&refRID=4PR8YHZC2A2NTN9S5S7V Fastrack Casual Analog White Dial Men's Watch NM3114PP01/NN3114PP01 ₹1,050.00
https://www.amazon.in/SELLORIA-Shock-Digital-Watch-Years/dp/B08X6LNL9Z/ref=zg_bs_2563504031_48?_encoding=UTF8&psc=1&refRID=4PR8YHZC2A2NTN9S5S7V SELLORIA S Shock Digital Kids Watch for Boys [7-30 Years] None
https://www.amazon.in/Sonata-Super-Fibre-Analog-Watch-NH7930PP01C/dp/B00ECWG1NC/ref=zg_bs_2563504031_11?_encoding=UTF8&psc=1&refRID=4PR8YHZC2A2NTN9S5S7V SF Analog Black Round Dial Men's Casual Watch-NL7930PP01 ₹599.00
https://www.amazon.in/Black-Digital-led-Watch-Girls/dp/B098TGS6ZF/ref=zg_bs_2

https://www.amazon.in/V2A-Military-Digital-Multi-Function-Chronograph/dp/B07TVTVC8Z/ref=zg_bs_2563504031_39?_encoding=UTF8&psc=1&refRID=4PR8YHZC2A2NTN9S5S7V V2A Digital Men's & Boy's Watch (Black Dial Black Colored Strap) None
https://www.amazon.in/IIK-Collection-Stainless-Analoge-IIK-2052W-DND/dp/B07YC685FW/ref=zg_bs_2563504031_49?_encoding=UTF8&psc=1&refRID=4PR8YHZC2A2NTN9S5S7V IIK Collection Round Dial Stainless Steel Bracelet Chain Analogue Day & Date Functioning Watch for Women and Girls ₹399.00
https://www.amazon.in/Mens-Watch-Black-Colored-Strap/dp/B08DFYSGDL/ref=zg_bs_2563504031_38?_encoding=UTF8&psc=1&refRID=4PR8YHZC2A2NTN9S5S7V TIMEWEAR Analog Number Dial Leather Strap Watch for Men None
https://www.amazon.in/Maxima-Smartwatch-Monitoring-Customized-Assistance/dp/B091TTTF8W/ref=zg_bs_2563504031_27?_encoding=UTF8&psc=1&refRID=4PR8YHZC2A2NTN9S5S7V Maxima Max Pro Smartwatch with SpO2, Upto 15 Day Battery life, Full-touch Ultra Bright Display of Upto 380 Nits, 10+ Sports Mode, Con

In [16]:
# Finally write the scraped product details to products.csv (without the
# index column) and preview the first 50 rows.
# Note: final_df is another name for df_sub, not a copy.
final_df = df_sub
final_df.to_csv(product_output, index=False)
final_df.head(50)

Unnamed: 0,url,product_id,title,price,scraped,rating,timestamp
0,https://www.amazon.in/Uncharted-Thiefs-Playsta...,B07GLGSHWN,Uncharted 4 TE Hit (PS4),₹984.00,1,"2,150 ratings",2021-09-25 19:02:51.229128
1,https://www.amazon.in/JBL-Quantum-Over-Ear-Hea...,B083X24VG9,JBL Quantum 100 by Harman Wired Over-Ear Gamin...,"₹2,599.00",1,"1,424 ratings",2021-09-25 19:03:05.054446
2,https://www.amazon.in/PS4-Minecraft-Starter-Co...,B098KP6TZQ,PS4 Minecraft Starter Collection (PS4),,1,1 rating,2021-09-25 19:03:19.659052
3,https://www.amazon.in/Nitho-Racing-MLT-DP16-K-...,B08LFLHXGC,Nitho Drive Pro V16 Racing Wheel & Pedal Set M...,"₹7,099.00",1,199 ratings,2021-09-25 19:03:26.747579
4,https://www.amazon.in/Assassins-Creed-Valhalla...,B0894KKNVL,Assassin's Creed Valhalla Drakkar Edition (Fre...,"₹2,799.00",1,331 ratings,2021-09-25 19:03:32.758208
5,https://www.amazon.in/Steam-Wallet-Digital-Del...,B0874Y4SPX,INR 250 Steam Wallet Code (Digital Code- Email...,₹250.00,1,755 ratings,2021-09-25 19:03:38.258147
6,https://www.amazon.in/God-of-War-Remastered-PS...,B00YORZL4A,God of War Remastered (PS4),,1,713 ratings,2021-09-25 19:03:45.521700
7,https://www.amazon.in/Cricket-19-International...,B07QNQN449,Cricket 19 International Edition (PS4),"₹2,116.00",1,708 ratings,2021-09-25 19:03:50.915913
8,https://www.amazon.in/Marvels-Spider-Man-PS4-G...,B07X23YRCS,Marvel's Spider Man (PS4) - Game of the Year E...,"₹1,509.00",1,"1,877 ratings",2021-09-25 19:03:56.052319
9,https://www.amazon.in/Grand-Theft-Auto-V-PS4/d...,B00L8XUDIC,Grand Theft Auto V – Premium,"₹1,469.00",1,"2,783 ratings",2021-09-25 19:04:03.886423
