# Data Downloader
The following notebook is a tentative notebook designed to debug the pipeline for data downloading.

In [25]:
from dotenv import load_dotenv
import os
import time
from typing import List, Dict
import random
import requests
from typing import Any

load_dotenv()

True

In [None]:
CLIENT_ID = os.environ.get('CLIENT_ID')
CLIENT_SECRET = os.environ.get('CLIENT_SECRET')
REFRESH_TOKEN = os.environ.get('REFRESH_TOKEN')

BASE_URL = "https://sellingpartnerapi-na.amazon.com" #for sp-api not access token

MARKETPLACE_ID = "ATVPDKIKX0DER"
AWS_ROLE_ARN = os.environ["AWS_ROLE_ARN"]


In [11]:
import requests

LWA_TOKEN_URL = "https://api.amazon.com/auth/o2/token"
def get_lwa_access_token(client_id, client_secret, refresh_token):
    headers = {
        "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
        "Accept": "application/json",
    }
    data = {
        "grant_type": "refresh_token",
        "refresh_token": refresh_token,
        "client_id": client_id,
        "client_secret": client_secret,
    }
    r = requests.post(LWA_TOKEN_URL, headers=headers, data=data, timeout=30)
    if r.status_code != 200:
        raise RuntimeError(f"LWA {r.status_code}: {r.text[:400]}")
    return r.json()["access_token"]


In [18]:
LWA_ACCESS_TOKEN = get_lwa_access_token(CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN)

In [19]:
class SPAPIError(Exception):
    pass
def sp_get(path: str, params: dict, *, access_token: str | None = None,
           timeout: int = 30, max_retries: int = 5) -> dict:
    """
    Minimal SP-API GET using only the LWA access token (no IAM/SigV4).
    Retries on 429/5xx with exponential backoff and Retry-After support.
    """
    if access_token is None:
        access_token = LWA_ACCESS_TOKEN
    if not access_token:
        raise ValueError("Missing LWA access token. Set LWA_ACCESS_TOKEN or pass access_token=...")

    url = f"{BASE_URL}{path}"
    headers = {
        "Authorization": f"Bearer {access_token}",   # works equivalently to x-amz-access-token
        "x-amz-access-token": access_token,          # many examples use this header explicitly
        "Accept": "application/json",
        "Content-Type": "application/json",
        "User-Agent": "spapi-lite/1.0 (+https://example.com)",
    }

    attempt = 0
    while True:
        resp = requests.get(url, headers=headers, params=params, timeout=timeout)

        # success
        if 200 <= resp.status_code < 300:
            try:
                return resp.json()
            except Exception as e:
                raise SPAPIError(f"Bad JSON from SP-API {resp.status_code}: {resp.text[:400]}") from e

        # retryable: throttle or transient
        if resp.status_code in (429, 500, 502, 503, 504) and attempt < max_retries:
            # respect Retry-After header if present (seconds)
            ra = resp.headers.get("Retry-After")
            if ra:
                try:
                    delay = max(0.5, float(ra))
                except Exception:
                    delay = 1.0
            else:
                # exponential backoff with jitter
                delay = min(2 ** attempt, 8) + random.random()
            time.sleep(delay)
            attempt += 1
            continue

        # hard failure: surface Amazon's error payload if available
        try:
            j = resp.json()
            msg = j.get("errors") or j
        except Exception:
            msg = resp.text[:500]
        raise SPAPIError(f"SP-API GET {path} failed: {resp.status_code} {msg}")


In [None]:
def search_catalog_items(keywords: List[str], page_size: int = 20, max_pages: int = 3) -> List[str]:
    """
    Uses GET /catalog/2022-04-01/items (searchCatalogItems) with keywords
    to find candidate ASINs.
    Be mindful of rate limits. page_size max is 20.
    """
    asins: List[str] = []
    for kw in keywords:
        next_token = None
        pages = 0
        while pages < max_pages:
            params = {
                "marketplaceIds": MARKETPLACE_ID,
                "keywords": kw,
                "pageSize": page_size,
                "includedData": "summaries",
                # you can optionally add:
                # "locale": "en_US",
                # "keywordsLocale": "en_US"
            }
            if next_token:
                params["pageToken"] = next_token
            data = sp_get("/catalog/2022-04-01/items", params)
            items = data.get("items", [])
            for it in items:
                asin = it.get("asin")
                if asin:
                    asins.append(asin)
            pagination = data.get("pagination", {})
            next_token = pagination.get("nextToken")
            pages += 1
            if not next_token:  # no more pages
                break
            time.sleep(0.1)  # small delay to avoid rate limit
    seen = set()
    unique_asins: List[str] = []
    for a in asins:
        if a not in seen and a:
            seen.add(a)
            unique_asins.append(a)
    return unique_asins


In [21]:
kws = ["mechanical keyboard", "wireless mouse"]
results = search_catalog_items(kws, page_size=20, max_pages=2)
print("Found ASINs:", results)

Found ASINs: ['B0CQ2MSP2B', 'B0CDWP1D58', 'B0CF3VGQFL', 'B08Z6X4NK3', 'B09LK1P1RD', 'B0D14N2QZF', 'B089YFHYYS', 'B09NCMHTSB', 'B09JG7KRC7', 'B0DBZGH5XM', 'B0CH3MRGK7', 'B0CNT61VMZ', 'B09TR4Y91J', 'B098LG3N6R', 'B0CQ4TBV2N', 'B0FF9WN7R1', 'B0F58SM5BT', 'B0DWMX5TXV', 'B07G11G2X8', 'B0DQPWRDGW', 'B0DKNLMD9D', 'B0C8QYB8W6', 'B0CJ2Q799Z', 'B0CDX5XGLK', 'B0CT2HDJMS', 'B0C9ZJHQHM', 'B09NMM4FY6', 'B0DT43NNNF', 'B01NAI2TXC', 'B0D14L7VZ6', 'B0CQ53BKWT', 'B07KCRTN9Q', 'B07XVCP7F5', 'B0F1ZW5BGT', 'B07QGHK6Q8', 'B08HR74WV4', 'B0DKXH5WCD', 'B0DKNDPKV3', 'B07QQB9VCV', 'B098785CL3', 'B004YAVF8I', 'B087Z5WDJ2', 'B015NBTAOW', 'B005EJH6Z4', 'B09Y8K2HJN', 'B098S48QWM', 'B07CMS5Q6N', 'B0F5HPCLGB', 'B0C4XXH2FV', 'B00ADBY98A', 'B07H8TJMX7', 'B087Z733CM', 'B01JPOLKDW', 'B0D9N62T62', 'B0DKFGJDCN', 'B07Y1KKFBH', 'B0CP3HTHLZ', 'B0BJ6NBYJG', 'B001DHECXA', 'B09KX66ZCD', 'B0787D6SGQ', 'B0BXBC26X8', 'B09JSH81HZ', 'B0CH44BLWM', 'B099T9J1YN', 'B0BYD5H31D', 'B0836GXKKB', 'B0CB7W6W7B', 'B09JG4KJN5', 'B071YZJ1G1', 'B003N

In [30]:

def get_catalog_item(asin: str,
                     included_data: str = "summaries,images,attributes,classifications,salesRanks") -> Dict[str, Any]:
    """
    GET /catalog/2022-04-01/items/{asin}
    included_data controls payload size. If you request too many datasets at once,
    the API may return InvalidInput (seen in the wild) — trim the list if so.
    """
    params = {
        "marketplaceIds": MARKETPLACE_ID,
        "includedData": included_data
    }
    return sp_get(f"/catalog/2022-04-01/items/{asin}", params)


In [31]:
result = get_catalog_item(results[0], included_data="summaries,images,attributes,classifications,salesRanks")

In [32]:
result

{'asin': 'B0CQ2MSP2B',
 'attributes': {'item_weight': [{'unit': 'pounds',
    'value': 3.0,
    'marketplace_id': 'ATVPDKIKX0DER'}],
  'bullet_point': [{'language_tag': 'en_US',
    'value': "Big Features on a Small Screen - Is there anything it can't display? Custom gif image, date. connection mode, WIN/MAC layout, battery status, etc.",
    'marketplace_id': 'ATVPDKIKX0DER'},
   {'language_tag': 'en_US',
    'value': 'Knob Design- Adjust volume, connection mode, backlit brightness/speed, RGB mode/color, all it takes is just a twist or a click.',
    'marketplace_id': 'ATVPDKIKX0DER'},
   {'language_tag': 'en_US',
    'value': 'BT5.0/2.4G/USB-C - Wireless keyboard with stable BT 5.0, hassle-free 2.4Ghz dongle plus USB-C wired mode set no limits about your keyboard connection.',
    'marketplace_id': 'ATVPDKIKX0DER'},
   {'language_tag': 'en_US',
    'value': 'Gaming Friendly Top-Mount Design - Offers a superior tactile consistency, firm feeling, and better noice reducing creamy keyboa

In [33]:
print("Name:", result.get("summaries", [{}])[0].get("itemName"))
print("Brand:", result.get("summaries", [{}])[0].get("brand"))
print("Images:", [img.get("link") for img in result.get("images", [])])
print("Sales rank:", result.get("salesRanks", {}))
print("Classifications:", result.get("classifications", {}))

Name: RK ROYAL KLUDGE S98 Mechanical Keyboard w/Smart Display & Knob, Top Mount 96% Wireless Mechanical Keyboard BT/2.4G/USB-C, Hot Swappable, Software Support, Creamy Sounding, 98 Keys
Brand: RK ROYAL KLUDGE
Images: [None]
Sales rank: [{'marketplaceId': 'ATVPDKIKX0DER', 'classificationRanks': [{'classificationId': '402051011', 'title': 'PC Gaming Keyboards', 'link': 'https://www.amazon.com/gp/bestsellers/videogames/402051011', 'rank': 20}], 'displayGroupRanks': [{'websiteDisplayGroup': 'video_games_display_on_website', 'title': 'Video Games', 'link': 'https://www.amazon.com/gp/bestsellers/videogames', 'rank': 384}]}]
Classifications: [{'marketplaceId': 'ATVPDKIKX0DER', 'classifications': [{'displayName': 'Gaming Keyboards', 'classificationId': '402051011', 'parent': {'displayName': 'Accessories', 'classificationId': '318813011', 'parent': {'displayName': 'PC', 'classificationId': '229575', 'parent': {'displayName': 'Categories', 'classificationId': '11846801', 'parent': {'displayName'

In [None]:
import pandas as pd

def extract_row(payload: Dict[str, Any], asin: str) -> Dict[str, Any]:
    # Summaries
    s = (payload.get("summaries") or [{}])[0]
    title = s.get("itemName")
    brand = s.get("brand")
    model = s.get("modelNumber")
    release_date = s.get("releaseDate")

    # Images
    main_image = None
    all_imgs = []
    for by_mkt in (payload.get("images") or []):
        for im in by_mkt.get("images", []):
            if im.get("variant") == "MAIN" and not main_image:
                main_image = im.get("link")
            all_imgs.append({
                "variant": im.get("variant"),
                "url": im.get("link"),
                "w": im.get("width"),
                "h": im.get("height")
            })

    # Attributes → common text fields (varies by product type)
    attrs = payload.get("attributes") or {}
    bullets = attrs.get("bullet_points") or attrs.get("bullet_point")
    if isinstance(bullets, str):
        bullets = [bullets]
    description = None
    for k in ("product_description","description","long_description","marketing_message"):
        v = attrs.get(k)
        if isinstance(v, dict) and "value" in v:
            v = v["value"]
        if v:
            description = v
            break

    # Classifications (browse nodes)
    nodes = []
    for cls in (payload.get("classifications") or []):
        for c in cls.get("classifications", []):
            nodes.append({"id": c.get("classificationId"), "name": c.get("displayName")})

    # Sales ranks (display group + classification)
    ranks = []
    for sr in (payload.get("salesRanks") or []):
        for g in (sr.get("displayGroupRanks") or []):
            ranks.append({"title": g.get("title"), "rank": g.get("rank"), "link": g.get("link")})
        for g in (sr.get("classificationRanks") or []):
            ranks.append({"title": g.get("title"), "rank": g.get("rank"), "link": g.get("link")})

    return {
        "asin": asin,
        "title": title,
        "brand": brand,
        "model_number": model,
        "release_date": release_date,
        "main_image": main_image,
        "image_list": all_imgs,
        "bullets": bullets,
        "description": description,
        "browse_nodes": nodes,
        "sales_ranks": ranks
    }

def hydrate_asins(asins: List[str],
                  included_data: str = "summaries,images,attributes,classifications,salesRanks",
                  sleep_s: float = 0.2) -> pd.DataFrame:
    rows = []
    for a in asins:
        try:
            p = get_catalog_item(a, included_data=included_data)
        except SPAPIError as e:
            if "InvalidInput" in str(e):
                p = get_catalog_item(a, included_data="summaries,images,attributes")
            else:
                print(f"[WARN] {a}: {e}")
                continue
        rows.append(extract_row(p, a))
        time.sleep(sleep_s)
    df = pd.DataFrame(rows)
    return df


In [36]:
df = hydrate_asins(results)

In [37]:
df.head()

Unnamed: 0,asin,title,brand,model_number,release_date,main_image,image_list,bullets,description,browse_nodes,sales_ranks
0,B0CQ2MSP2B,RK ROYAL KLUDGE S98 Mechanical Keyboard w/Smar...,RK ROYAL KLUDGE,S98,,https://m.media-amazon.com/images/I/41pNC8EVNX...,"[{'variant': 'MAIN', 'url': 'https://m.media-a...","[{'language_tag': 'en_US', 'value': 'Big Featu...","[{'language_tag': 'en_US', 'value': 'RK ROYAL ...","[{'id': '402051011', 'name': 'Gaming Keyboards'}]","[{'title': 'Video Games', 'rank': 384, 'link':..."
1,B0CDWP1D58,"Redragon K668 RGB Gaming Keyboard, 108 Keys Wi...",Redragon,K668,,https://m.media-amazon.com/images/I/31qNb+bxSw...,"[{'variant': 'MAIN', 'url': 'https://m.media-a...","[{'language_tag': 'en_US', 'value': 'Hot-Swapp...","[{'language_tag': 'en_US', 'value': 'REDRAGON ...","[{'id': '402051011', 'name': 'Gaming Keyboards'}]","[{'title': 'Video Games', 'rank': 278, 'link':..."
2,B0CF3VGQFL,"Redragon Mechanical Gaming Keyboard Wired, 11 ...",Redragon,K671,,https://m.media-amazon.com/images/I/41khzfsV4m...,"[{'variant': 'MAIN', 'url': 'https://m.media-a...","[{'language_tag': 'en_US', 'value': 'Brilliant...","[{'language_tag': 'en_US', 'value': 'RGB Progr...","[{'id': '402051011', 'name': 'Gaming Keyboards'}]","[{'title': 'Video Games', 'rank': 174, 'link':..."
3,B08Z6X4NK3,Logitech G413 SE Full-Size Mechanical Gaming K...,Logitech G,920-010433,2022-01-25,https://m.media-amazon.com/images/I/31rtIUUw7b...,"[{'variant': 'MAIN', 'url': 'https://m.media-a...","[{'language_tag': 'en_US', 'value': 'Take your...",,"[{'id': '402051011', 'name': 'Gaming Keyboards...","[{'title': 'Video Games', 'rank': 335, 'link':..."
4,B09LK1P1RD,Logitech MX Mechanical Wireless Illuminated Pe...,Logitech,920-010547,2022-05-24,https://m.media-amazon.com/images/I/41FBNsAaL4...,"[{'variant': 'MAIN', 'url': 'https://m.media-a...","[{'language_tag': 'en_US', 'value': 'Tactile Q...",,"[{'id': '12879431', 'name': 'Keyboards'}]","[{'title': 'Computer Keyboards', 'rank': 36, '..."
