# Amazon Web Scraping: Bestsellers by Category 🛒


In [1]:
import pandas as pd
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
import warnings
warnings.filterwarnings('ignore')
import asyncio
from tqdm.notebook import tqdm

Some options for the webdriver

In [2]:
# Chrome driver configuration.
opciones = Options()

# Experimental switches: exclude the 'enable-automation' switch and turn the
# automation extension off.
opciones.add_experimental_option("excludeSwitches", ["enable-automation"])
opciones.add_experimental_option("useAutomationExtension", False)

# If True the browser window does not appear (headless = not visible).
# NOTE(review): recent Selenium releases may no longer expose a `headless`
# property on Options -- confirm against the installed version.
opciones.headless = False

# Command-line flags: start maximized.
opciones.add_argument("--start-maximized")

# Optional extras, left disabled:
#   opciones.add_argument('user-data-dir=selenium')        # keeps the cookies
#   opciones.add_extension('driver_folder/adblock.crx')    # ad blocker

# Command-line flags: incognito window.
opciones.add_argument("--incognito")

In [3]:
# Open the bestsellers landing page and collect one URL per category link.
url = 'https://www.amazon.es/gp/bestsellers'

# Pass the options by keyword: Selenium releases before 4.10 interpret the
# first positional argument of Chrome() as the driver executable path.
driver = webdriver.Chrome(options=opciones)
try:
    driver.get(url)

    time.sleep(2)  # crude fixed wait for the page and cookie banner to render

    # Cookies: this link REJECTS all cookies. find_elements returns an empty
    # list when the banner is absent, so a missing banner no longer aborts the
    # cell while the browser is still open.
    rechazar = driver.find_elements(By.XPATH, '//*[@id="sp-cc-rejectall-link"]')
    if rechazar:
        rechazar[0].click()

    # Locate the navigation container by its class name, then read the href
    # of every <a> element inside it.
    div_element = driver.find_element(By.CLASS_NAME, "_p13n-zg-nav-tree-all_style_zg-browse-root__-jwNv")
    link_elements = div_element.find_elements(By.TAG_NAME, 'a')
    links = [link.get_attribute('href') for link in link_elements]
finally:
    # Always release the browser, even when a locator above raises.
    driver.quit()

In [4]:
# Display a full copy of the collected category URLs.
links[:]

['https://www.amazon.es/gp/bestsellers/grocery/ref=zg_bs_nav_grocery_0',
 'https://www.amazon.es/gp/bestsellers/boost/ref=zg_bs_nav_boost_0',
 'https://www.amazon.es/gp/bestsellers/amazon-renewed/ref=zg_bs_nav_amazon-renewed_0',
 'https://www.amazon.es/gp/bestsellers/mobile-apps/ref=zg_bs_nav_mobile-apps_0',
 'https://www.amazon.es/gp/bestsellers/baby/ref=zg_bs_nav_baby_0',
 'https://www.amazon.es/gp/bestsellers/beauty/ref=zg_bs_nav_beauty_0',
 'https://www.amazon.es/gp/bestsellers/tools/ref=zg_bs_nav_tools_0',
 'https://www.amazon.es/gp/bestsellers/music/ref=zg_bs_nav_music_0',
 'https://www.amazon.es/gp/bestsellers/gift-cards/ref=zg_bs_nav_gift-cards_0',
 'https://www.amazon.es/gp/bestsellers/climate-pledge/ref=zg_bs_nav_climate-pledge_0',
 'https://www.amazon.es/gp/bestsellers/automotive/ref=zg_bs_nav_automotive_0',
 'https://www.amazon.es/gp/bestsellers/sports/ref=zg_bs_nav_sports_0',
 'https://www.amazon.es/gp/bestsellers/amazon-devices/ref=zg_bs_nav_amazon-devices_0',
 'https://w

In [5]:
# Number of category URLs collected from the navigation container.
len(links)

34

Scraping groceries

In [6]:
# Target page: grocery bestsellers.
url = 'https://www.amazon.es/gp/bestsellers/grocery/ref=zg_bs_nav_grocery_0'

# Two independent, initially empty module-level lists.
DATOS, CABECERAS = [], []

In [7]:
# Step 1: Start a driver instance and open the page stored in `url`.
# The driver is intentionally left open: the image-scraping cell that follows
# queries the same `driver`.
driver = webdriver.Chrome()
driver.get(url)

# Step 2: Close cookies popup - if needed
try:
    driver.find_element(By.XPATH, '//*[@id="sp-cc-accept"]').click()
    time.sleep(1)
except Exception:
    # Narrowed from a bare `except:` so Ctrl-C / kernel interrupts are not
    # swallowed; any lookup or click failure is still treated as "no banner".
    print("Cookies not needed")

# Step 3: Scroll to the end of the website (currently disabled)
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Step 4: Find product containers and save them
caja_productos = driver.find_elements(By.CLASS_NAME, 'a-cardui._cDEzb_grid-cell_1uMOS.expandableGrid.p13n-grid-content')

# Step 5: Data extraction, one pass per product card.
# The card text is read once per card (each `.text` access is a call to the
# browser) and parsed by line position:
#   line 0 -> "#<rank>", line 1 -> title, line 2 -> review count,
#   line 3 -> price text (first whitespace-separated token, decimal comma).
rank, titulos, precio, num_reviews, ratings = [], [], [], [], []

for card in caja_productos:
    lines = card.text.split('\n')

    rank.append(int(lines[0].split('#')[1]))
    titulos.append(lines[1])
    # '.' is removed before the integer conversion (thousands separator).
    num_reviews.append(int(lines[2].replace('.', '')))

    try:
        precio.append(float(lines[3].split()[0].replace(',', '.')))
    except (IndexError, ValueError):
        # No fourth line, or its first token is not a number: keep the
        # original placeholder value.
        precio.append(0)

    # The rating is looked up inside this card so it stays aligned with the
    # card's other fields; None marks a card without a star element.
    stars = card.find_elements(By.CSS_SELECTOR, "i.a-icon-star-small span.a-icon-alt")
    ratings.append(stars[0].get_attribute("textContent").split(" de ")[0] if stars else None)


Scraping Images

In [5]:
# Collect the product <img> elements on the page currently loaded in `driver`
# and keep the value of each one's `src` attribute.
image_elements = driver.find_elements(
    By.CSS_SELECTOR,
    "div[data-asin] a.a-link-normal img.a-dynamic-image",
)
image_links = list(map(lambda img: img.get_attribute("src"), image_elements))


In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Scrape one bestseller sub-category page into a DataFrame with one row per
# product card: rank, title, price, rating, num_reviews, img_link.

# Step 1: Start a driver instance
url = "https://www.amazon.es/gp/bestsellers/grocery/6347831031/ref=zg_bs_nav_grocery_1"
driver = webdriver.Chrome()

rank, titulos, precio, num_reviews, ratings, image_links = [], [], [], [], [], []

try:
    driver.get(url)

    # Step 2: Close cookies popup - if needed
    try:
        driver.find_element(By.XPATH, '//*[@id="sp-cc-accept"]').click()
        time.sleep(1)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / kernel interrupts are
        # not swallowed.
        print("Cookies not needed")

    # Step 3: Scroll to the end of the website to load all elements.
    # Keep scrolling until the document height stops growing.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # give newly requested content time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Step 4: Find product containers and save them
    caja_productos = driver.find_elements(By.CLASS_NAME, 'a-cardui._cDEzb_grid-cell_1uMOS.expandableGrid.p13n-grid-content')

    # Step 5: Data extraction, one pass per product card.
    # Every field is taken from the card itself so all six lists end up with
    # exactly one entry per card (pd.DataFrame requires equal lengths) and a
    # rating or image can never be attached to the wrong product.
    # Card text is parsed by line position:
    #   line 0 -> "#<rank>", line 1 -> title, line 2 -> review count,
    #   line 3 -> price text (first whitespace-separated token, decimal comma).
    # NOTE(review): the sample output stored with this notebook shows only
    # whole-number prices, so line 3 may not always be the price -- confirm.
    for card in caja_productos:
        lines = card.text.split('\n')

        rank.append(int(lines[0].split('#')[1]))
        titulos.append(lines[1])
        # '.' is removed before the integer conversion (thousands separator).
        num_reviews.append(int(lines[2].replace('.', '')))

        try:
            precio.append(float(lines[3].split()[0].replace(',', '.')))
        except (IndexError, ValueError):
            # No fourth line, or its first token is not a number: keep the
            # original placeholder value.
            precio.append(0)

        stars = card.find_elements(By.CSS_SELECTOR, "i.a-icon-star-small span.a-icon-alt")
        # Rating is kept as the text before " de " (a string); None when the
        # card has no star element.
        ratings.append(stars[0].get_attribute("textContent").split(" de ")[0] if stars else None)

        images = card.find_elements(By.CSS_SELECTOR, "div[data-asin] a.a-link-normal img.a-dynamic-image")
        image_links.append(images[0].get_attribute("src") if images else None)
finally:
    # Close the webdriver when done, even if scraping raised.
    driver.quit()

# Leftover XPath note from the original cell. It was a bare line, which is a
# Python syntax error, so it is kept here as a comment only.
# NOTE(review): presumably a pagination link -- confirm before using it.
# //*[@id="CardInstanceivKun2BDSxhVFrrzzESDbQ"]/div[2]/div[2]/ul/li[4]/a

# Create a dictionary from the per-card lists
data = {
    'rank': rank,
    'title': titulos,
    'price': precio,
    'rating': ratings,
    'num_reviews': num_reviews,
    'img_link': image_links
}

# Create a DataFrame
df = pd.DataFrame(data, columns=['rank', 'title', 'price', 'rating', 'num_reviews', 'img_link'])
df


In [9]:
df

Unnamed: 0,rank,title,price,rating,num_reviews,img_link
0,1,Blemil 2 Optimum ProTech - Leche de continuaci...,2.0,47,170,https://images-eu.ssl-images-amazon.com/images...
1,2,Blevit Plus Bibe 8 Cereales - Papilla de Cerea...,8.0,47,583,https://images-eu.ssl-images-amazon.com/images...
2,3,Blemil Confort ProTech - Fórmula de Inicio en ...,2.0,46,704,https://images-eu.ssl-images-amazon.com/images...
3,4,Blemil 1 Optimum ProTech 3PACK | Leche de Inic...,1.0,40,46,https://images-eu.ssl-images-amazon.com/images...
4,5,Nutribén Innova 2 - Leche de Fórmula en Polvo ...,10.0,47,629,https://images-eu.ssl-images-amazon.com/images...
5,6,Blemil 2 Optimum ProTech - Leche de continuaci...,3.0,48,480,https://images-eu.ssl-images-amazon.com/images...
6,7,Blevit Plus Duplo 8 Cereales y Galletas María ...,2.0,47,475,https://images-eu.ssl-images-amazon.com/images...
7,8,"Almirón Profutura 1 Leche de Inicio en Polvo, ...",17.0,48,511,https://images-eu.ssl-images-amazon.com/images...
8,9,Blevit Plus Superfibra 8 Cereales - Papilla de...,5.0,46,1064,https://images-eu.ssl-images-amazon.com/images...
9,10,Blevit Barriguitas Felices - Infusión digestiv...,11.0,46,1265,https://images-eu.ssl-images-amazon.com/images...
