In [1]:
import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import re
import time
from pathlib import Path
import pandas as pd
import datetime

In [2]:
# Resolve the working folders relative to wherever the notebook is launched.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath('data')  # equivalent to os.path.join(BASE_DIR, 'data')

# Create the data folder only when it is not there yet.
if not DATA_DIR.exists():
    DATA_DIR.mkdir(exist_ok=True)

# CSV outputs: the links collected from the category pages, and the product table.
product_category_links_output = DATA_DIR.joinpath('category-products.csv')
product_output = DATA_DIR.joinpath('products.csv')

In [3]:
# Configure Chrome with the --headless flag and start the WebDriver session.
# The scraping functions in this notebook use it through the global `driver`.
options = Options()
options.add_argument("--headless")

driver = webdriver.Chrome(options=options)

In [4]:
# Best-seller listing pages to crawl. Each entry pairs a short category name
# with the amazon.sa URL of that category's listing page.
categories = [
    {'name': 'sports-goods', 'url': 'https://www.amazon.sa/-/en/gp/bestsellers/sports-goods/17007473031/'},
    {'name': 'electronics', 'url': 'https://www.amazon.sa/-/en/gp/bestsellers/electronics/'},
    {'name': 'pet-supplies', 'url': 'https://www.amazon.sa/-/en/gp/bestsellers/pet-supplies/'}
]

In [5]:
# URL shapes recognised as amazon.sa product pages. Every pattern exposes the
# product identifier through the named group ``product_id``. The dots in the
# host name are escaped so that they only match a literal ".".
regex_options = [
    r'https://www\.amazon\.sa/-/en/gp/product/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.sa/-/en/dp/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.sa/-/en/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/',
]


def extract_product_id_from_url(url):
    """Return the product id embedded in an amazon.sa product URL.

    The URL is matched from its start against each pattern in
    ``regex_options`` in order; the ``product_id`` group of the first pattern
    that matches is returned.

    Args:
        url: Absolute URL to inspect. Values that are not ``str`` are treated
            as "no product id" instead of raising.

    Returns:
        The product id as a string, or ``None`` when no pattern matches.
    """
    if not isinstance(url, str):
        return None
    for regex_str in regex_options:
        match = re.match(regex_str, url)
        if match is None:
            continue
        try:
            return match['product_id']
        except IndexError:
            # A pattern without a ``product_id`` group cannot supply an id;
            # keep trying the remaining patterns.
            continue
    return None

In [6]:
def clean_page_links(page_links=[], category=None):
    """Keep only the links that point at a product page.

    Each URL in ``page_links`` is passed to ``extract_product_id_from_url``;
    URLs for which it returns ``None`` are dropped. Every remaining URL becomes
    a dict with the keys ``url``, ``product_id`` and ``category`` (the
    ``category`` argument, stored exactly as given).
    """
    # Pair every link with its extracted id, then filter in a single pass.
    candidates = ((link, extract_product_id_from_url(link)) for link in page_links)
    return [
        {"url": link, "product_id": pid, "category": category}
        for link, pid in candidates
        if pid is not None
    ]

In [7]:
def scrape_category_product_links(categories=[]):
    """Collect product links from each category listing page.

    For every category dict the function waits 1.5 seconds, loads the
    category's ``url`` in the global Selenium ``driver``, parses the ``<body>``
    markup, and turns each link starting with "/" into an absolute amazon.sa
    URL. Those URLs are filtered through ``clean_page_links`` and the records
    for all categories are returned as one list.
    """
    collected = []
    for category in categories:
        time.sleep(1.5)  # pause before every page load
        driver.get(category.get('url'))
        body_html = driver.find_element(By.CSS_SELECTOR, 'body').get_attribute("innerHTML")
        parsed = HTML(html=body_html)
        absolute_links = []
        for href in parsed.links:
            if href.startswith("/"):
                absolute_links.append(f"https://www.amazon.sa{href}")
        collected.extend(clean_page_links(page_links=absolute_links, category=category))
    return collected

In [8]:
def extract_categories_and_save(categories=categories):
    """Scrape the product links of ``categories`` and write them to CSV.

    The records returned by ``scrape_category_product_links`` are loaded into
    a DataFrame and saved to ``product_category_links_output`` without the
    index column. The default for ``categories`` is the module-level list as
    it was bound when this function was defined.
    """
    scraped_links = scrape_category_product_links(categories)
    pd.DataFrame(scraped_links).to_csv(product_category_links_output, index=False)

In [9]:
# Run the category crawl and write the collected links to category-products.csv.
extract_categories_and_save(categories=categories)

In [10]:
def scrape_product_page(url, title_lookup='#productTitle',
                        price_lookup='.a-offscreen'):
    """Load a product page and return its title and price text.

    Args:
        url: Address of the product page, opened in the global ``driver``.
        title_lookup: CSS selector used to find the title element.
        price_lookup: CSS selector used to find the price element.

    Returns:
        A ``(product_title, product_price)`` tuple with the text of the first
        element matching each selector. A slot is ``None`` when its selector
        matches nothing, so a page that lacks a price still yields its title.

    Raises:
        Any exception raised by ``driver.get`` or ``driver.find_element``.
    """
    driver.get(url)
    time.sleep(1.5)  # fixed pause after navigation, before the markup is read
    body_el = driver.find_element(By.CSS_SELECTOR, 'body')
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html=html_str)
    title_el = html_obj.find(title_lookup, first=True)
    price_el = html_obj.find(price_lookup, first=True)
    # find(..., first=True) gives None when nothing matches; guard each lookup
    # separately so one missing element does not discard the other value.
    product_title = title_el.text if title_el is not None else None
    product_price = price_el.text if price_el is not None else None
    return product_title, product_price

In [11]:
def perform_scrape(cleaned_items=None):
    """Scrape title and price for every item and return the results.

    Args:
        cleaned_items: Iterable of dicts that each carry a ``url`` key.
            ``None`` (the default) is treated as an empty list.

    Returns:
        A list with one dict per input item, holding ``url``, ``title`` and
        ``price``. ``title`` and ``price`` are ``None`` when scraping that
        item raised an error.
    """
    if cleaned_items is None:
        cleaned_items = []
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        title, price = None, None
        try:
            title, price = scrape_product_page(link)
        except Exception as exc:
            # Best effort: report the failure and continue with the next item.
            # Catching Exception (not a bare except) keeps Ctrl-C / kernel
            # interrupt able to stop a long run.
            print(f"Could not scrape {link}: {exc!r}")
        if title is not None and price is not None:
            print(link, title, price)
        data_extracted.append({
            "url": link,
            "title": title,
            "price": price
        })
    return data_extracted

In [12]:
# extracted_data = perform_scrape(cleaned_items=cleaned_links)
# print(extracted_data)

In [13]:
def row_scrape_event(row, *args, **kwargs):
    """Scrape one row's product page and record the result on the row.

    Written for ``DataFrame.apply(..., axis=1)``. A row whose ``scraped``
    value is already ``1`` or ``'1'`` is returned untouched. Otherwise the
    page at ``row['url']`` is scraped and the row is updated in place.

    Args:
        row: Mapping-like row with a ``url`` entry and an optional ``scraped``
            entry.
        *args, **kwargs: Accepted and ignored.

    Returns:
        The same ``row`` object. After a scrape attempt it carries ``title``,
        ``price``, ``scraped`` (set to 1) and ``timestamp`` (seconds since the
        epoch as a float). ``title`` and ``price`` are ``None`` when the
        attempt raised an error; the row is still marked as scraped.
    """
    link = row['url']
    try:
        scraped = row['scraped']
    except KeyError:
        scraped = 0  # no marker yet, so this row has not been processed
    if scraped == 1 or scraped == '1':
        return row
    title, price = None, None
    try:
        title, price = scrape_product_page(link)
    except Exception as exc:
        # Best effort: report and continue. Catching Exception (not a bare
        # except) keeps a kernel interrupt able to stop a long apply().
        print(f"Could not scrape {link}: {exc!r}")
    row['title'] = title
    row['price'] = price
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
    return row

In [14]:
# Load the saved category links from category-products.csv and preview the first rows.
df = pd.read_csv(product_category_links_output)
df.head()

Unnamed: 0,url,product_id,category
0,https://www.amazon.sa/-/en/Fishing-Game-Fishes...,B091NYZTCH,"{'name': 'sports-goods', 'url': 'https://www.a..."
1,https://www.amazon.sa/-/en/Rovyfota-Upgraded-B...,B0B2JB2FTR,"{'name': 'sports-goods', 'url': 'https://www.a..."
2,https://www.amazon.sa/-/en/Hook-Loop-Suspender...,B07D2D19NP,"{'name': 'sports-goods', 'url': 'https://www.a..."
3,https://www.amazon.sa/-/en/Electric-Fishing-Li...,B099Z9FSZT,"{'name': 'sports-goods', 'url': 'https://www.a..."
4,https://www.amazon.sa/-/en/Strong-Durable-Brea...,B07MX7ZKDJ,"{'name': 'sports-goods', 'url': 'https://www.a..."


In [15]:
# Row and column counts of the loaded links table.
df.shape

(88, 3)

In [16]:
# Scrape every row of a copy of `df`; row_scrape_event adds title, price,
# scraped and timestamp to each row. Use df.head(n=10) in place of df.copy()
# to try a small sample first.
df_sub = df.copy().apply(row_scrape_event, axis=1)

In [17]:
# Write the links table to products.csv without the index column.
# NOTE(review): this saves `df`, which does not carry the scraped columns added to
# `df_sub`, and it overwrites any existing products.csv on every run. Confirm whether
# `df_sub` or a write-only-if-missing guard was intended.
df.to_csv(product_output, index=False)

In [18]:
# Load the current contents of products.csv.
products_df = pd.read_csv(product_output)
# products_df.head()

In [19]:
# Stack the scraped rows under the rows loaded from products.csv and save the
# combined table back to the same file, without the index column.
# NOTE(review): the frames are concatenated without de-duplication, so a product
# present in both appears twice. Confirm this is intended.
final_df = pd.concat([products_df, df_sub])
final_df.to_csv(product_output, index=False)

In [22]:
# Preview the last rows of the combined table.
final_df.tail()

Unnamed: 0,url,product_id,category,title,price,scraped,timestamp
83,https://www.amazon.sa/-/en/Fancy-Feast-Purina-...,B07KX45BNN,"{'name': 'pet-supplies', 'url': 'https://www.a...",Fancy Feast Purina Puree Kiss Tuna Puree with ...,SAR10.14,1.0,1671402000.0
84,https://www.amazon.sa/-/en/Thomas-Cat-Litter-N...,B0037XNXIS,"{'name': 'pet-supplies', 'url': 'https://www.a...",Thomas Cat Litter Non Clumping - 16 L,SAR61.25,1.0,1671402000.0
85,https://www.amazon.sa/-/en/Felix-Naturally-Del...,B08YNJWQXB,"{'name': 'pet-supplies', 'url': 'https://www.a...",Felix Purina Naturally Delicious Countryside S...,SAR38.00,1.0,1671402000.0
86,https://www.amazon.sa/-/en/Brit-Premium-Nature...,B099F27PRZ,"{'name': 'pet-supplies', 'url': 'https://www.a...",Brit Premium by Nature Cat Sticks with Salmon ...,SAR6.00,1.0,1671402000.0
87,https://www.amazon.sa/-/en/Kitekat-Mackerel-Dr...,B076GMVVXM,"{'name': 'pet-supplies', 'url': 'https://www.a...","Kitekat Mackerel, Dry Cat Food, 7Kg",SAR128.95,1.0,1671402000.0
