---

Notebook ini digunakan untuk proses scraping data pada website Amazon

Proses scraping menggunakan bantuan ScraperAPI untuk melakukan scraping website dan Beautiful Soup untuk parsing data

---

# Import Library

Import Library yang dibutuhkan

In [1]:
import pandas as pd
import json
import time
from bs4 import BeautifulSoup
import requests

# API

In [2]:
import os

# ScraperAPI credential used by every request in this notebook.
# The SCRAPERAPI_KEY environment variable takes precedence so the secret does
# not have to live in the notebook; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): this key is stored in plain text here - consider rotating it
# and removing the fallback.
api_key = os.environ.get("SCRAPERAPI_KEY", "d62541de90843bd5ad38c1bc99e40639")

# URL

In [3]:
def nike_url(page=1):
    """Return the Amazon search URL for "nike running shoes" at a given page.

    Args:
        page: Result page number; it is written into both the ``page`` and
            the ``ref=sr_pg_`` query parameters. Defaults to 1.

    Returns:
        The full search URL as a string. All other query parameters are
        fixed values hard-coded in this function.
    """
    return (
        "https://www.amazon.com/s?k=nike+running+shoes"
        f"&page={page}"
        "&xpid=c19ZYZKsdabEP&crid=3UJSBHJW2MC0G&qid=1759744785"
        "&sprefix=nike+running+shoe%2Caps%2C304"
        f"&ref=sr_pg_{page}"
    )

def adidas_url(page=1):
    """Return the Amazon search URL for "adidas running shoes" at a given page.

    Args:
        page: Result page number; it is written into both the ``page`` and
            the ``ref=sr_pg_`` query parameters. Defaults to 1.

    Returns:
        The full search URL as a string. All other query parameters are
        fixed values hard-coded in this function.
    """
    template = (
        "https://www.amazon.com/s?k=adidas+running+shoes&page={page}"
        "&xpid=WdPRXIQzs_yZr&crid=24Y534EW495K2&qid=1759753219"
        "&sprefix=adida+running+shoes%2Caps%2C306&ref=sr_pg_{page}"
    )
    return template.format(page=page)

def nb_url(page=1):
    """Return the Amazon search URL for "new balance running shoes" at a given page.

    Args:
        page: Result page number; it is written into both the ``page`` and
            the ``ref=sr_pg_`` query parameters. Defaults to 1.

    Returns:
        The full search URL as a string. All other query parameters are
        fixed values hard-coded in this function.
    """
    # Query parameters in the exact order they appear in the final URL
    query_parts = [
        "k=new+balance+running+shoes",
        f"page={page}",
        "xpid=cJV8-hCtiPB2m",
        "crid=316I8Y3T8QK73",
        "qid=1759753290",
        "sprefix=new+balancerunning+shoes%2Caps%2C347",
        f"ref=sr_pg_{page}",
    ]
    return "https://www.amazon.com/s?" + "&".join(query_parts)

def puma_url(page=1):
    """Return the Amazon search URL for "puma running shoes" at a given page.

    Args:
        page: Result page number; it is written into both the ``page`` and
            the ``ref=sr_pg_`` query parameters. Defaults to 1.

    Returns:
        The full search URL as a string. All other query parameters are
        fixed values hard-coded in this function.
    """
    # The ref suffix is exactly the page number, matching the other brand URL
    # builders (a stray literal digit previously produced sr_pg_31, sr_pg_32, ...).
    url = f"https://www.amazon.com/s?k=puma+running+shoes&page={page}&xpid=h1o6i4Me3b_CG&crid=7MXYXUX5LFHZ&qid=1759820901&sprefix=pumrunning+shoes%2Caps%2C297&ref=sr_pg_{page}"
    return url

def reebok_url(page=1):
    """Return the Amazon search URL for "reebok running shoes" at a given page.

    Args:
        page: Result page number; it is written into both the ``page`` and
            the ``ref=sr_pg_`` query parameters. Defaults to 1.

    Returns:
        The full search URL as a string. All other query parameters are
        fixed values hard-coded in this function.
    """
    search = "https://www.amazon.com/s?k=reebok+running+shoes"
    fixed_params = "xpid=oTc7T-rOWodFB&crid=2PW6C859EW6TL&qid=1759758846&sprefix=reebokrunning+shoes%2Caps%2C354"
    return f"{search}&page={page}&{fixed_params}&ref=sr_pg_{page}"

# Scraping Nike

## Scraping URL Product

Scraping pertama dilakukan untuk mencari URL produk dari hasil pencarian di website Amazon

In [None]:

# Responses of every search-results page that was fetched successfully
response = []

# Request search-result pages 1 through 5 of the Nike query
for page_number in range(1, 6):
    # ScraperAPI request parameters: API key, target URL, country code, render flag
    request_params = dict(
        api_key=api_key,
        url=nike_url(page=page_number),
        country_code='us',
        render='false',
    )

    # A failed request is reported and skipped so the loop keeps going
    try:
        page_response = requests.get(
            "https://api.scraperapi.com",
            params=request_params,
            timeout=15,
        )
        page_response.raise_for_status()
    except requests.exceptions.RequestException as error:
        print(f"Gagal mengambil halaman {page_number}: {error}")
        continue

    response.append(page_response)
    print(f"Sukses mengambil halaman {page_number}")

    # Pause before the next request (only reached after a successful fetch)
    time.sleep(5)


Sukses mengambil halaman 1
Sukses mengambil halaman 2
Sukses mengambil halaman 3
Sukses mengambil halaman 4
Sukses mengambil halaman 5


## Extract link

In [None]:
from urllib.parse import urljoin

# Extract product links from the fetched search-result pages
product_url = []
base_seen = set()  # product bases (URL part before /dp/) already collected
base_url = "https://www.amazon.com"

# Go through every stored search-results response
for res in response:
    extractor = BeautifulSoup(res.text, 'html.parser')

    # Inspect every hyperlink element that carries an href attribute
    for link in extractor.find_all('a', href=True):
        href = link['href']

        # Keep only /dp/ links; the excluded patterns are presumably
        # ad/redirect variants - TODO confirm
        if (
            "/dp/" not in href
            or href.startswith("/gp/")
            or "aax-us-iad" in href
            or "slredirect" in href
            or "dp/product" in href
        ):
            continue

        # Skip links whose href does not mention the Nike brand
        if "nike" not in href.lower():
            continue

        # Drop the query string, then resolve against the Amazon origin.
        # urljoin keeps an already-absolute href intact instead of gluing the
        # host in front of it a second time.
        product_link = href.split('?')[0]
        full_url = urljoin(base_url, product_link)

        # Dedupe by product family (everything before /dp/)
        base_product = full_url.split("/dp/")[0]
        if base_product not in base_seen:
            base_seen.add(base_product)
            product_url.append(full_url)

# Show the collected product links
for i, url in enumerate(product_url, start=1):
    print(f"{i}. {url}")

## Scraping Data Product

Setelah link untuk setiap produk didapatkan, scraping kedua dilakukan untuk mendapatkan data produk

In [None]:
# Raw responses of every product page that was fetched successfully
scraping_review = []
print(f"url number: {len(product_url)}")

# Fetch every product page listed in product_url
for i, page in enumerate(product_url):
    # ScraperAPI request parameters: API key, target URL, country code, render flag
    payload = {
        'api_key': api_key,
        'url': page,
        'country_code': 'us',
        'render': 'true',
    }

    # A failed request is reported and skipped so the loop keeps going
    try:
        # Bound to its own name so the `response` list holding the
        # search-page results is not overwritten by a single Response object.
        # NOTE(review): 15 s may be short for requests with render enabled -
        # confirm against ScraperAPI's timeout guidance.
        product_response = requests.get(
            "https://api.scraperapi.com", params=payload, timeout=15
        )
        product_response.raise_for_status()
        scraping_review.append(product_response)
        print(f"url ke-{i+1} Sukses: {page}")
    except requests.exceptions.RequestException as e:
        print(f"url ke-{i+1} Gagal: {page}: {e}")
        continue

    # Pause before the next request (only reached after a successful fetch)
    time.sleep(5)


In [None]:
# Parsed product records (one dictionary per product that has reviews)
review_data = []

# Parse every product page response stored in scraping_review
for product in scraping_review:
    parser = BeautifulSoup(product.text, 'html.parser')

    # Product name
    title = parser.find(id='productTitle')
    product_title = title.get_text(strip=True) if title else 'N/A'

    # Product image; .get() falls back to 'N/A' instead of raising KeyError
    # (and aborting the whole loop) when the <img> has no src attribute
    img_tag = parser.select_one('#imgTagWrapperId img')
    image_url = img_tag.get('src', 'N/A') if img_tag else 'N/A'

    # Price: take the first selector that matches anything on the page
    price = (
        parser.select_one('.a-price-whole') or
        parser.select_one('.a-offscreen') or
        parser.select_one('.olpWrapper.a-size-small')
    )
    product_price = price.get_text(strip=True) if price else 'N/A'

    # Product rating: text of the first span with class a-icon-alt
    rating = parser.find('span', {'class': 'a-icon-alt'})
    product_rating = rating.get_text(strip=True) if rating else 'N/A'

    # "Customers say" text: first <p> with class a-spacing-small (None when absent)
    p_tag = parser.find("p", class_="a-spacing-small")
    customer_say = p_tag.get_text(strip=True) if p_tag else None

    # Collect the individual reviews
    reviews = []
    review_blocks = parser.find_all('li', {'data-hook': 'review'})
    for block in review_blocks:

        # Star rating of this review
        rating_tag = block.find('i', {'data-hook': 'review-star-rating'})
        rating_val = 'N/A'
        if rating_tag:
            rating_span = rating_tag.find('span', class_='a-icon-alt')
            rating_val = rating_span.get_text(strip=True) if rating_span else 'N/A'

        # Review text: prefer the inner <span>, otherwise the whole body element
        text_tag = block.find('span', {'data-hook': 'review-body'})
        text_val = 'N/A'
        if text_tag:
            inner_span = text_tag.find('span')
            text_val = inner_span.get_text(strip=True) if inner_span else text_tag.get_text(strip=True)

        # Keep the review only when both text and rating were found
        if text_val != 'N/A' and rating_val != 'N/A':
            reviews.append({
                'review': text_val,
                'rating': rating_val
            })

    # Skip products without any usable review
    if not reviews:
        continue

    # Assemble the product record
    dict_review = {
        "title" : product_title,
        "rating" : product_rating,
        "price" : product_price,
        "img_url": image_url,
        "Brand": "Nike",
        "Customer_say": customer_say,
        "Review": reviews
    }

    # Store the product record in the result list
    review_data.append(dict_review)


In [None]:
# Keep only the records whose title was parsed ('N/A' marks a missing title)
review_data = [
    entry
    for entry in review_data
    if entry["title"] != "N/A"
]
# Bare expression so the notebook displays the filtered list
review_data


In [46]:
# Number of product records currently held in review_data
len(review_data)

156

## Saving Data

Setelah data produk didapatkan, data disimpan dalam format JSON karena terdapat list dictionary yang berisi data reviews

In [47]:
# Write the product records to nike.json (2-space indent, non-ASCII escaped)
with open("nike.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(review_data, indent=2, ensure_ascii=True))