In [24]:
from pathlib import Path
import datetime
import pandas as pd
import requests
import re
import time
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# All output locations are resolved relative to the directory the notebook runs from.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / 'data'
# exist_ok=True already makes this a no-op when the directory is present, so a
# separate exists() check (and its check-then-create race) is unnecessary.
DATA_DIR.mkdir(exist_ok=True)
# CSV holding the product links discovered on the category pages.
product_category_links_output = DATA_DIR / 'category-products.csv'
# CSV holding the scraped product details.
product_output = DATA_DIR / 'products.csv'

In [3]:
# Configure Chrome to run with the --headless flag (no visible browser window).
options = Options()
options.add_argument('--headless')

# Single browser session shared by the scraping functions defined in later cells.
driver = webdriver.Chrome(options=options)

In [4]:

# Best-seller listing pages to crawl: one {"name", "url"} record per category.
categories = [
    {"name": category_name, "url": category_url}
    for category_name, category_url in (
        ("toys-and-games", "https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/"),
        ("electronics", "https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/"),
        ("fashion", "https://www.amazon.com/Best-Sellers/zgbs/fashion/"),
    )
]

In [5]:
# URL shapes that identify a product page. Every pattern is matched from the start
# of the URL and is expected to expose a `product_id` named group. The dots in the
# host name are escaped so they only match a literal ".".
regex_options = [
    r"https://www\.amazon\.com/gp/product/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.com/dp/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
]

def extract_product_id_from_url(url):
    """Return the product id embedded in a product URL, or None.

    Parameters
    ----------
    url : str
        Absolute URL to inspect. Non-string values yield None.

    Returns
    -------
    str or None
        The `product_id` group of the last pattern in `regex_options` that
        matches the start of `url`; None when no pattern matches.
    """
    if not isinstance(url, str):
        return None
    product_id = None
    for regex_str in regex_options:
        # re.match caches compiled patterns, so no explicit compile per call is needed.
        match = re.match(regex_str, url)
        if match is None:
            continue
        # groupdict().get() tolerates a pattern that lacks a `product_id` group
        # without resorting to a blanket try/except.
        found = match.groupdict().get('product_id')
        if found is not None:
            product_id = found
    return product_id

In [6]:
def clean_page_links(page_links,category=None):
    """Keep only the links that carry a product id.

    Each link in `page_links` is passed to extract_product_id_from_url; links
    for which it returns None are dropped. The result is a list of dicts with
    the keys 'url', 'product_id' and 'category' (the `category` argument,
    stored as given), in the order the links were supplied.
    """
    # Lazily pair every link with its extracted id so each link is parsed once.
    link_id_pairs = ((link, extract_product_id_from_url(link)) for link in page_links)
    return [
        {'url': link, 'product_id': found_id, "category": category}
        for link, found_id in link_id_pairs
        if found_id is not None
    ]

In [13]:
def scrape_category_product_link(categories=None):
    """Collect product links from each category's listing page.

    Parameters
    ----------
    categories : iterable of dict, optional
        Category records; the page to load is read from each record's "url"
        key. Records without a usable "url" are skipped. Defaults to no
        categories.

    Returns
    -------
    list of dict
        The concatenated results of clean_page_links for every category page.
        Each entry's 'category' value is the full category record passed in.
    """
    all_product_links = []
    for category in categories or ():
        url = category.get("url")
        if not url:
            # Nothing to load for this record; skip it rather than hand None to the driver.
            continue
        # Pause before every page load to space out the requests.
        time.sleep(1.5)
        driver.get(url)
        # The locator strategy is given as the plain "css selector" string: the
        # find_element_by_* helpers no longer exist in Selenium 4, while
        # find_element(by, value) is available in both Selenium 3 and 4.
        body_el = driver.find_element("css selector", "body")
        html_str = body_el.get_attribute("innerHTML")
        html_obj = HTML(html=html_str)
        # Only site-relative links are kept; they are made absolute here.
        page_links = [f"https://www.amazon.com{x}" for x in html_obj.links if x.startswith("/")]
        all_product_links.extend(clean_page_links(page_links=page_links, category=category))
    return all_product_links



In [14]:
def extract_categories_and_save(categories=None):
    """Crawl the given categories and write the discovered product links to CSV.

    Parameters
    ----------
    categories : list of dict, optional
        Category records forwarded to scrape_category_product_link. Defaults
        to an empty list.

    The links are written to `product_category_links_output` without the
    DataFrame index. Nothing is returned.
    """
    # The crawler is defined as `scrape_category_product_link` (singular); the
    # previous plural spelling raised NameError on a fresh kernel.
    all_product_links = scrape_category_product_link(categories or [])
    category_df = pd.DataFrame(all_product_links)
    category_df.to_csv(product_category_links_output, index=False)

In [15]:

# Crawl every configured category and write the discovered product links to CSV.
extract_categories_and_save(categories=categories)

In [16]:

def _first_text(html_obj, selector):
    """Return the text of the first element matching `selector`, or None if nothing matches."""
    element = html_obj.find(selector, first=True)
    return element.text if element is not None else None


def scrape_product_page(url, title_lookup = "#productTitle", price_lookup = "#priceblock_ourprice"):
    """Load a product page and read its title and price.

    Parameters
    ----------
    url : str
        Page to load in the shared `driver`.
    title_lookup : str
        CSS selector of the element holding the product title.
    price_lookup : str
        CSS selector of the element holding the product price.

    Returns
    -------
    tuple
        (title, price) as text. Either item is None when its selector matches
        nothing, so a page without a price element still yields its title
        instead of failing on `.text` of a missing element.
    """
    driver.get(url)
    # Short pause after navigation before reading the DOM.
    time.sleep(0.5)
    # The locator strategy is given as the plain "css selector" string: the
    # find_element_by_* helpers no longer exist in Selenium 4, while
    # find_element(by, value) is available in both Selenium 3 and 4.
    body_el = driver.find_element("css selector", "body")
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html=html_str)
    product_title = _first_text(html_obj, title_lookup)
    product_price = _first_text(html_obj, price_lookup)
    return product_title, product_price

# Crawl the category pages and keep the resulting links in memory.
# NOTE(review): extract_categories_and_save, called in an earlier cell, already
# performs this crawl to build the CSV — confirm this second crawl is intended.
all_product_links = scrape_category_product_link(categories)

In [None]:
#page_links

In [17]:
def perform_scrape(cleaned_items=None):
    """Scrape title and price for every product link.

    Parameters
    ----------
    cleaned_items : iterable of dict, optional
        Records with at least the keys 'url' and 'product_id'. Defaults to no
        items.

    Returns
    -------
    list of dict
        One record per input item with the keys 'url', 'product_id', 'title'
        and 'price'. 'title' and 'price' are None when scraping that page
        failed.
    """
    data_extracted = []
    for obj in cleaned_items or ():
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            title, price = scrape_product_page(link)
        except Exception:
            # Best-effort scrape: a failing page is recorded with empty fields
            # rather than aborting the run. Catching Exception (not a bare
            # except) keeps Ctrl-C able to stop a long crawl.
            pass
        if title is not None and price is not None:
            print(link, title, price)
        # Record every item inside the loop, successful or not.
        data_extracted.append({
            'url': link,
            'product_id': product_id,
            'title': title,
            'price': price,
        })
    return data_extracted

In [22]:
def row_scrape_event(row, *args, **kwargs):
    """Scrape one row's product page and record the result on the row.

    Intended for `DataFrame.apply(..., axis=1)`; also works on a plain dict.

    Parameters
    ----------
    row : mapping
        Must provide 'url'. If it carries a 'scraped' value equal to 1 or "1",
        the row is returned untouched.
    *args, **kwargs
        Accepted and ignored.

    Returns
    -------
    The same `row`, with 'title', 'price', 'scraped' (set to 1) and
    'timestamp' (seconds since the epoch at scrape time) filled in. 'title'
    and 'price' are None when scraping failed; the row is still marked as
    scraped in that case.
    """
    link = row['url']
    try:
        scraped = row['scraped']
    except (KeyError, IndexError):
        # No 'scraped' marker on this row yet: treat it as not scraped.
        scraped = 0
    if scraped == 1 or scraped == "1":
        print("skipped")
        return row
    title, price = (None, None)
    try:
        title, price = scrape_product_page(link)
    except Exception:
        # Best-effort scrape: keep going with empty fields. Catching Exception
        # (not a bare except) keeps Ctrl-C able to stop a long run.
        pass
    row['title'] = title
    row['price'] = price
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
    print(link, title, price)
    return row

In [18]:

# Load the product links from the CSV that extract_categories_and_save writes,
# then preview the first rows.
df = pd.read_csv(product_category_links_output)
df.head()

Unnamed: 0,url,product_id,category
0,https://www.amazon.com/Crayola-Wonder-Drawing-...,B07PMLL5L7,"{'name': 'toys-and-games', 'url': 'https://www..."
1,https://www.amazon.com/Play-Doh-Pack-Case-Blue...,B07BC152DC,"{'name': 'toys-and-games', 'url': 'https://www..."
2,https://www.amazon.com/BalanceFrom-Puzzle-Exer...,B01D7SS1XE,"{'name': 'toys-and-games', 'url': 'https://www..."
3,https://www.amazon.com/SunWorks-Construction-P...,B002LARR7Q,"{'name': 'toys-and-games', 'url': 'https://www..."
4,https://www.amazon.com/Intex-River-Lounge-Infl...,B000PEOMC8,"{'name': 'toys-and-games', 'url': 'https://www..."


In [None]:
#extracted_data = perform_scrape(cleaned_links)

In [19]:
# (rows, columns) of the loaded links table.
df.shape

(149, 3)

In [20]:

# Work on a copy so the loaded links in `df` are not modified by the scrape.
df_sub = df.copy() 

In [25]:
# Scrape title and price row by row. row_scrape_event returns rows already
# marked scraped (1 or "1") unchanged, so re-running this cell skips them.
df_sub = df_sub.apply(row_scrape_event, axis=1)

https://www.amazon.com/Crayola-Wonder-Drawing-Cantidad-paquete/dp/B07PMLL5L7/ref=zg_bs_toys-and-games_11?_encoding=UTF8&psc=1&refRID=DSW81JP145KQM8GA3NG2 Crayola Baby Shark Wonder Pages Mess Free Coloring Gift, Kids Indoor Activities at Home $6.97
https://www.amazon.com/Crayola-Wonder-Drawing-Cantidad-paquete/dp/B07PMLL5L7/ref=zg_bs_toys-and-games_11?_encoding=UTF8&psc=1&refRID=DSW81JP145KQM8GA3NG2 None None
https://www.amazon.com/Play-Doh-Pack-Case-Blue/dp/B07BC152DC/ref=zg_bs_toys-and-games_18?_encoding=UTF8&psc=1&refRID=DSW81JP145KQM8GA3NG2 Play-Doh Bulk 12-Pack of Blue Non-Toxic Modeling Compound, 4-Ounce Cans $11.97
https://www.amazon.com/BalanceFrom-Puzzle-Exercise-Interlocking-Tiles/dp/B01D7SS1XE/ref=zg_bs_toys-and-games_48?_encoding=UTF8&psc=1&refRID=DSW81JP145KQM8GA3NG2 BalanceFrom Kid's Puzzle Exercise Play Mat with EVA Foam Interlocking Tiles $15.99
https://www.amazon.com/SunWorks-Construction-Paper-Black-Sheets/dp/B002LARR7Q/ref=zg_bs_toys-and-games_44?_encoding=UTF8&psc=1&

https://www.amazon.com/Inflatable-Splash-Sprinkler-Toddlers-Outdoor/dp/B082WXCM5W/ref=zg_bs_toys-and-games_6?_encoding=UTF8&psc=1&refRID=DSW81JP145KQM8GA3NG2 Inflatable Splash Pad Sprinkler for Kids Toddlers, Kiddie Baby Pool, Outdoor Games Water Mat Toys - Baby Infant Wadin Swimming Pool - Fun Backyard Fountain Play Mat for 1 -12 Year Old Girls Boys (68") $34.95
https://www.amazon.com/Tara-Toys-Barbie-Necklace-Activity/dp/B07WWZBF5G/ref=zg_bs_toys-and-games_5?_encoding=UTF8&psc=1&refRID=DSW81JP145KQM8GA3NG2 Tara Toys Barbie Necklace Activity Set $12.99
https://www.amazon.com/First-Princess-Make-Kit-Everything/dp/B07FPWYY79/ref=zg_bs_toys-and-games_10?_encoding=UTF8&psc=1&refRID=DSW81JP145KQM8GA3NG2 None None
https://www.amazon.com/Aqua-Campania-Ultimate-Adjustable-Inflatable/dp/B07WDM32ZC/ref=zg_bs_toys-and-games_24?_encoding=UTF8&psc=1&refRID=DSW81JP145KQM8GA3NG2 AQUA Campania Ultimate 2 in 1 Recliner & Tanner Pool Lounger with Adjustable Backrest and Caddy, Inflatable Pool Float, Te

https://www.amazon.com/Blink-Smart-Security-Camera/dp/B07MMZ2LTB/ref=zg_bs_electronics_4?_encoding=UTF8&psc=1&refRID=Y799VVD4EK7EQ4G6XSV2 None None
https://www.amazon.com/Sabrent-4-Port-Individual-Switches-HB-UMLS/dp/B00BWF5U0M/ref=zg_bs_electronics_31?_encoding=UTF8&psc=1&refRID=Y799VVD4EK7EQ4G6XSV2 None None
https://www.amazon.com/Bototek-Protector-Charging-Extension-Smartphone/dp/B07MVZZV3G/ref=zg_bs_electronics_25?_encoding=UTF8&psc=1&refRID=Y799VVD4EK7EQ4G6XSV2 Power Strip, Bototek Surge Protector with 10 AC Outlets and 4 USB Charging Ports,1625W/13A, 2100 Joules, 6 Feet Long Extension Cord for Smartphone Tablets Home,Office, Hotel- Black $26.99
https://www.amazon.com/Roku-Premiere-HDR-Streaming-Player-Premium/dp/B07HDBZN7Q/ref=zg_bs_electronics_10?_encoding=UTF8&psc=1&refRID=Y799VVD4EK7EQ4G6XSV2 Roku Premiere | HD/4K/HDR Streaming Media Player, Simple Remote and Premium HDMI Cable $29.00
https://www.amazon.com/All-New-Fire-HD-8-Tablet/dp/B07TMJ1R3X/ref=zg_bs_electronics_18?_encod

https://www.amazon.com/Champion-Jersey-Short-Pockets-Black/dp/B00AKSCXBQ/ref=zg_bs_fashion_2?_encoding=UTF8&psc=1&refRID=NXNHK2QRRRSR2NM1P5RG Champion Men's Jersey Short With Pockets $14.99
https://www.amazon.com/Dickies-Multi-Pack-Dri-Tech-Moisture-Control/dp/B01N6YAVNM/ref=zg_bs_fashion_20?_encoding=UTF8&psc=1&refRID=NXNHK2QRRRSR2NM1P5RG Dickies Men's Dri-tech Moisture Control Crew Socks Multipack $25.98
https://www.amazon.com/Levis-Bandanas-100-Cotton-Headband/dp/B009YQX1SO/ref=zg_bs_fashion_45?_encoding=UTF8&psc=1&refRID=NXNHK2QRRRSR2NM1P5RG Levi's Men's 100% Cotton Multi-Purpose Bandana Gift Sets – Headband, Wrap, Protective Coverage $12.00
https://www.amazon.com/Gildan-Shirts-Multipack-Black-Small/dp/B07JCJMPVN/ref=zg_bs_fashion_28?_encoding=UTF8&psc=1&refRID=NXNHK2QRRRSR2NM1P5RG Gildan Men's A-Shirts Tanks Multipack $12.71
https://www.amazon.com/Next-Level-Womens-Festival-Tank/dp/B079SPMGNN/ref=zg_bs_fashion_40?_encoding=UTF8&psc=1&refRID=NXNHK2QRRRSR2NM1P5RG None None
https://w

In [None]:
# NOTE(review): this writes `df` (the loaded link table) to the products CSV,
# not `df_sub`, which is the frame the scrape results were applied to —
# confirm which frame is intended.
df.to_csv(product_output, index=False)

In [None]:
# Stack `products_df` on top of this run's `df_sub` and save the combined table.
# NOTE(review): `products_df` is not assigned in any cell visible here; on a
# fresh kernel this line raises NameError — confirm where it is loaded from.
final_df = pd.concat([products_df, df_sub])
final_df.to_csv(product_output, index=False)


In [None]:
# Preview the first rows of the combined table.
final_df.head()