# Data Downloader
This notebook is a work-in-progress scratchpad for debugging the data-download pipeline.

In [42]:
import os
import time
from typing import Any, Dict, List

from dotenv import load_dotenv
# Load key=value pairs from a .env file into the process environment so the
# credentials below can be read through os.environ.
load_dotenv()

True

In [43]:
# Credentials (from Fesal), read from the environment.
# Each value is None when the corresponding variable is not set.
CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN = (
    os.getenv(var_name)
    for var_name in ("CLIENT_ID", "CLIENT_SECRET", "REFRESH_TOKEN")
)

# Host used for SP-API resource calls (not for requesting the access token).
BASE_URL = "https://sellingpartnerapi-na.amazon.com"

# Marketplace identifier sent with the catalog requests.
MARKETPLACE_ID = "ATVPDKIKX0DER"


In [44]:
import requests

# Token endpoint used to exchange the refresh token for an access token.
LWA_TOKEN_URL = "https://api.amazon.com/auth/o2/token"


def get_lwa_access_token(client_id, client_secret, refresh_token):
    """Exchange a refresh token for an LWA access token.

    Sends a form-encoded ``refresh_token`` grant to ``LWA_TOKEN_URL`` and
    returns the ``access_token`` field of the JSON response.

    Args:
        client_id: LWA client identifier.
        client_secret: LWA client secret.
        refresh_token: Refresh token to exchange.

    Returns:
        The access token string from the response.

    Raises:
        RuntimeError: If any credential is missing/empty, if the endpoint
            answers with a non-200 status, or if the response body does not
            contain an ``access_token``.
    """
    credentials = {
        "refresh_token": refresh_token,
        "client_id": client_id,
        "client_secret": client_secret,
    }
    # Fail before hitting the network: unset environment variables arrive here
    # as None and would otherwise only surface as an opaque 401 from the server.
    missing = [name for name, value in credentials.items() if not value]
    if missing:
        raise RuntimeError(
            "LWA credentials missing or empty: "
            + ", ".join(missing)
            + " (check the .env file / environment variables)"
        )

    headers = {
        "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
        "Accept": "application/json",
    }
    data = {"grant_type": "refresh_token", **credentials}
    r = requests.post(LWA_TOKEN_URL, headers=headers, data=data, timeout=30)
    if r.status_code != 200:
        # Truncate the body so a large error page does not flood the output.
        raise RuntimeError(f"LWA {r.status_code}: {r.text[:400]}")
    try:
        return r.json()["access_token"]
    except (ValueError, KeyError, TypeError) as exc:
        # Keep the failure type consistent with the other error paths.
        raise RuntimeError("LWA response did not contain an access_token") from exc


In [36]:
# Keep the token in a variable instead of leaving the call as the cell's last
# expression: a bare call would echo the bearer token into the saved output.
lwa_access_token = get_lwa_access_token(CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN)

RuntimeError: LWA 401: {"error_description":"Client authentication failed","error":"invalid_client"}

In [None]:
def search_catalog_items(keywords: List[str], page_size: int = 20, max_pages: int = 3) -> List[str]:
    """
    Find candidate ASINs by keyword via GET /catalog/2022-04-01/items.

    Each keyword is searched separately, requesting only the ``summaries``
    dataset. Result pages are followed through the ``nextToken`` returned in
    the response's ``pagination`` object, for at most ``max_pages`` pages per
    keyword.

    Args:
        keywords: Search phrases. A single string is treated as one keyword
            rather than being iterated character by character.
        page_size: Value sent as ``pageSize``.
        max_pages: Maximum number of pages requested per keyword.

    Returns:
        The ASINs found, de-duplicated, in first-seen order.

    Note: increase max_pages/page_size carefully; respect rate limits.
    """
    if isinstance(keywords, str):
        keywords = [keywords]

    asins: List[str] = []
    for kw in keywords:
        page_token = None
        pages = 0
        while pages < max_pages:
            params = {
                "marketplaceIds": MARKETPLACE_ID,
                "keywords": kw,
                "pageSize": page_size,
                # for lightweight search we can request summaries
                "includedData": "summaries",
            }
            if page_token:
                # The search operation takes the continuation token as
                # `pageToken`; the token itself comes from pagination.nextToken.
                params["pageToken"] = page_token
            data = sp_get("/catalog/2022-04-01/items", params)
            # `or` guards against keys that are present but null.
            for item in data.get("items") or []:
                asin = item.get("asin")
                if asin:
                    asins.append(asin)
            page_token = (data.get("pagination") or {}).get("nextToken")
            pages += 1
            if not page_token:
                break
            time.sleep(0.2)  # be polite
    # dict preserves insertion order, so this de-duplicates while keeping order.
    return list(dict.fromkeys(asins))

# example_asins = search_catalog_items(["mechanical keyboard", "wireless mouse"], page_size=20, max_pages=2)
# example_asins[:10]


In [None]:
def get_catalog_item(asin: str,
                     included_data: str = "summaries,images,attributes,classifications,salesRanks") -> Dict[str, Any]:
    """
    Fetch one catalog item: GET /catalog/2022-04-01/items/{asin}.

    `included_data` is the comma-separated dataset list sent as `includedData`
    and therefore determines how large the payload is. Asking for too many
    datasets in one call has been observed to produce an InvalidInput error;
    shorten the list when that happens.

    Returns whatever `sp_get` returns for the request.
    """
    query = dict(marketplaceIds=MARKETPLACE_ID, includedData=included_data)
    endpoint = f"/catalog/2022-04-01/items/{asin}"
    return sp_get(endpoint, query)

# Quick smoke test (replace with a known ASIN if you have one)
# test = get_catalog_item("B08N5WRWNW")
# list(test.keys()), test.get("summaries", [{}])[0].get("itemName")


In [None]:
import pandas as pd

def _unwrap_attribute(entry: Any) -> Any:
    """Return ``entry["value"]`` for a ``{"value": ...}`` wrapper, else ``entry`` unchanged."""
    if isinstance(entry, dict) and "value" in entry:
        return entry["value"]
    return entry


def _attribute_entries(raw: Any) -> List[Any]:
    """Normalise one attribute to a list of unwrapped entries (a bare value counts as one entry)."""
    if raw is None:
        return []
    entries = raw if isinstance(raw, (list, tuple)) else [raw]
    return [_unwrap_attribute(entry) for entry in entries]


def extract_row(payload: Dict[str, Any], asin: str) -> Dict[str, Any]:
    """Flatten one catalog-item payload into a single record.

    Reads the ``summaries``, ``images``, ``attributes``, ``classifications``
    and ``salesRanks`` sections of ``payload``; any section that is missing or
    null simply yields ``None`` / an empty list in the result.

    Args:
        payload: Parsed response for one catalog item.
        asin: ASIN the payload belongs to; copied into the record as-is.

    Returns:
        A dict with the keys ``asin``, ``title``, ``brand``, ``model_number``,
        ``release_date``, ``main_image``, ``image_list``, ``bullets``,
        ``description``, ``browse_nodes`` and ``sales_ranks``.
    """
    # Summaries: only the first entry is used.
    summary = (payload.get("summaries") or [{}])[0]

    # Images: the first MAIN variant becomes main_image; every image is listed.
    main_image = None
    image_list = []
    for by_marketplace in payload.get("images") or []:
        for image in by_marketplace.get("images") or []:
            if image.get("variant") == "MAIN" and not main_image:
                main_image = image.get("link")
            image_list.append({
                "variant": image.get("variant"),
                "url": image.get("link"),
                "w": image.get("width"),
                "h": image.get("height"),
            })

    # Attributes → common text fields (varies by product type).
    # Values may be a plain string, a {"value": ...} object, or a list of such
    # objects; all three forms are reduced to the underlying values.
    # NOTE(review): the list-of-objects form is assumed from the SP-API
    # attribute format — confirm against a live payload.
    attrs = payload.get("attributes") or {}
    raw_bullets = attrs.get("bullet_points") or attrs.get("bullet_point")
    bullets = None if raw_bullets is None else _attribute_entries(raw_bullets)

    description = None
    for key in ("product_description", "description", "long_description", "marketing_message"):
        # First non-empty value of the first attribute that has one wins.
        text = next((value for value in _attribute_entries(attrs.get(key)) if value), None)
        if text:
            description = text
            break

    # Classifications (browse nodes)
    browse_nodes = [
        {"id": node.get("classificationId"), "name": node.get("displayName")}
        for group in payload.get("classifications") or []
        for node in group.get("classifications") or []
    ]

    # Sales ranks: display-group ranks first, then classification ranks,
    # for each salesRanks entry in turn.
    sales_ranks = []
    for rank_group in payload.get("salesRanks") or []:
        for section in ("displayGroupRanks", "classificationRanks"):
            for rank in rank_group.get(section) or []:
                sales_ranks.append({
                    "title": rank.get("title"),
                    "rank": rank.get("rank"),
                    "link": rank.get("link"),
                })

    return {
        "asin": asin,
        "title": summary.get("itemName"),
        "brand": summary.get("brand"),
        "model_number": summary.get("modelNumber"),
        "release_date": summary.get("releaseDate"),
        "main_image": main_image,
        "image_list": image_list,
        "bullets": bullets,
        "description": description,
        "browse_nodes": browse_nodes,
        "sales_ranks": sales_ranks,
    }

def _fetch_catalog_payload(asin: str, included_data: str, sleep_s: float) -> Dict[str, Any]:
    """Fetch one item, retrying once with a trimmed dataset list on InvalidInput."""
    try:
        return get_catalog_item(asin, included_data=included_data)
    except SPAPIError as exc:
        if "InvalidInput" not in str(exc):
            raise
    # Pause before the retry as well, so it is throttled like any other request.
    time.sleep(sleep_s)
    return get_catalog_item(asin, included_data="summaries,images,attributes")


def hydrate_asins(asins: List[str],
                  included_data: str = "summaries,images,attributes,classifications,salesRanks",
                  sleep_s: float = 0.2) -> pd.DataFrame:
    """Fetch catalog details for each ASIN and collect them into a DataFrame.

    For every ASIN the item is requested with ``included_data``. If that fails
    with an ``SPAPIError`` whose text contains ``"InvalidInput"``, the request
    is retried once with ``"summaries,images,attributes"``. Any ASIN that still
    fails is reported with a ``[WARN]`` line and skipped, so one bad ASIN never
    aborts the batch.

    Args:
        asins: ASINs to look up, processed in order.
        included_data: Dataset list forwarded to ``get_catalog_item``.
        sleep_s: Seconds to pause after each ASIN (and before a retry),
            whether or not the request succeeded.

    Returns:
        A DataFrame with one row (as built by ``extract_row``) per ASIN that
        was fetched successfully.
    """
    rows = []
    for asin in asins:
        try:
            payload = _fetch_catalog_payload(asin, included_data, sleep_s)
        except SPAPIError as exc:
            print(f"[WARN] {asin}: {exc}")
        else:
            rows.append(extract_row(payload, asin))
        finally:
            # Throttle on failures too; an error is no reason to speed up.
            time.sleep(sleep_s)
    return pd.DataFrame(rows)

# sample_asins = search_catalog_items(["mechanical keyboard"], page_size=20, max_pages=1)
# df = hydrate_asins(sample_asins[:10])
# df.head(3)
