# 0. Descarga y conversión (Amazon SNAP)

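Descarga las categorías públicas de Amazon listadas en `CATEGORIES` desde el repositorio SNAP de Stanford
(https://snap.stanford.edu/data/amazon/productGraph/categoryFiles; véase también https://nijianmo.github.io/amazon/index.html)
y convierte el pseudo-JSON (un literal de diccionario Python por línea, con comillas simples) en JSONL válido.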

## Categorías a usar
Edita la lista para probar otras ramas. Recomendado: limitar a 2 para el TFM.

In [2]:
from pathlib import Path
import urllib.request
import gzip
import shutil
import ast, json

# Categories to download; each entry is the <category> part of a
# meta_<category>.json.gz file name built by download_category().
CATEGORIES = [
    "Cell_Phones_and_Accessories",
    "Sports_and_Outdoors",
    "Pet_Supplies",
]
# Remote base URL under which the meta_<category>.json.gz files are requested.
BASE_URL = "https://snap.stanford.edu/data/amazon/productGraph/categoryFiles"

# Local working directory for this step (relative to the current working
# directory); created up front so the cells below can write into it.
DATA_DIR = Path("data/step_0")
DATA_DIR.mkdir(parents=True, exist_ok=True)


def download_category(cat: str):
    """Build the remote URL and the local file paths for one category.

    Despite its name, this function performs no network or disk access; it
    only derives names from ``BASE_URL`` and ``DATA_DIR``.

    Args:
        cat: Category identifier as used in the remote file name
            (e.g. ``"Pet_Supplies"``).

    Returns:
        A 4-tuple ``(url, dest_gz, dest_txt, dest_jsonl)``: the remote
        ``.json.gz`` URL, then the local paths for the compressed archive,
        the decompressed file and the converted JSONL file.
    """
    return (
        f"{BASE_URL}/meta_{cat}.json.gz",
        DATA_DIR / f"meta_{cat}.json.gz",
        DATA_DIR / f"meta_{cat}.json",
        DATA_DIR / f"meta_{cat}.jsonl",
    )

# Preview the URL and local paths that will be used for every category.
[download_category(c) for c in CATEGORIES]


[('https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Cell_Phones_and_Accessories.json.gz',
  PosixPath('data/step_0/meta_Cell_Phones_and_Accessories.json.gz'),
  PosixPath('data/step_0/meta_Cell_Phones_and_Accessories.json'),
  PosixPath('data/step_0/meta_Cell_Phones_and_Accessories.jsonl')),
 ('https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Sports_and_Outdoors.json.gz',
  PosixPath('data/step_0/meta_Sports_and_Outdoors.json.gz'),
  PosixPath('data/step_0/meta_Sports_and_Outdoors.json'),
  PosixPath('data/step_0/meta_Sports_and_Outdoors.jsonl')),
 ('https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Pet_Supplies.json.gz',
  PosixPath('data/step_0/meta_Pet_Supplies.json.gz'),
  PosixPath('data/step_0/meta_Pet_Supplies.json'),
  PosixPath('data/step_0/meta_Pet_Supplies.jsonl'))]

## Utilidades
- `download_snap_file`: descarga si falta.
- `gunzip`: descomprime.
- `convert_pythonish_json_to_jsonl`: parsea línea a línea con `ast.literal_eval` y escribe JSONL.

In [3]:
def download_snap_file(url: str, dest: Path) -> Path:
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    The payload is first written to a sibling ``<dest>.part`` file and only
    renamed to ``dest`` once the transfer has finished. An interrupted or
    truncated download therefore never leaves a partial ``dest`` behind that
    a later run would mistake for a complete file.

    Args:
        url: Location of the remote file.
        dest: Local path where the file should end up.

    Returns:
        ``dest``, whether it was just downloaded or was already present.

    Raises:
        OSError: Propagated from ``urllib.request.urlretrieve`` (this covers
            ``urllib.error.URLError`` and ``ContentTooShortError``) or from
            the file system.
    """
    if dest.exists():
        print(f"✔ Ya existe: {dest}")
        return dest
    print(f"↓ Descargando {url} → {dest}")
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_suffix(dest.suffix + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
    except BaseException:
        # Also covers KeyboardInterrupt (e.g. a notebook kernel interrupt):
        # drop the incomplete file, then let the original error propagate.
        if partial.exists():
            partial.unlink()
        raise
    partial.replace(dest)
    return dest

def gunzip(src: Path, dest: Path) -> Path:
    """Decompress the gzip file ``src`` into ``dest``.

    The work is skipped when ``dest`` exists and its modification time is not
    older than that of ``src``. Output is written to a sibling ``<dest>.tmp``
    file and renamed on success, so a failed or interrupted decompression
    cannot leave a partial ``dest`` that the freshness check would later
    accept as complete.

    Args:
        src: Path of the ``.gz`` archive.
        dest: Path of the decompressed output.

    Returns:
        ``dest``.

    Raises:
        OSError: If ``src`` is missing or is not valid gzip data.
        EOFError: If the gzip stream ends prematurely.
    """
    if dest.exists() and dest.stat().st_mtime >= src.stat().st_mtime:
        print(f"✔ Ya descomprimido: {dest}")
        return dest
    partial = dest.with_suffix(dest.suffix + ".tmp")
    try:
        with gzip.open(src, "rb") as fin, open(partial, "wb") as fout:
            shutil.copyfileobj(fin, fout)
    except BaseException:
        # Remove the incomplete output (also on KeyboardInterrupt) and
        # re-raise the original error unchanged.
        if partial.exists():
            partial.unlink()
        raise
    partial.replace(dest)
    print(f"✔ Descomprimido: {dest}")
    return dest

def convert_pythonish_json_to_jsonl(src: Path, dest: Path) -> Path:
    """Convert a file of Python-literal records into valid JSON Lines.

    Every non-blank line of ``src`` is parsed with ``ast.literal_eval`` (which
    evaluates literals only, never arbitrary code) and written to ``dest`` as
    one JSON document per line. Output goes to a sibling ``<dest>.tmp`` file
    that is renamed on success and removed on failure, so ``dest`` is never
    left half-written and no stale temp file accumulates.

    Args:
        src: UTF-8 text file with one Python literal per line.
        dest: Path of the JSONL file to create or overwrite.

    Returns:
        ``dest``.

    Raises:
        SyntaxError, ValueError, TypeError: Re-raised unchanged from
            ``ast.literal_eval`` when a line cannot be parsed; the offending
            line number is printed first.
        TypeError: From ``json.dump`` if a parsed value is not JSON
            serializable.
        OSError: On file-system errors.
    """
    total = 0
    dest_tmp = dest.with_suffix(dest.suffix + ".tmp")
    try:
        with open(src, "r", encoding="utf-8") as fin, \
                open(dest_tmp, "w", encoding="utf-8") as fout:
            for lineno, line in enumerate(fin, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = ast.literal_eval(line)
                except (SyntaxError, ValueError, TypeError):
                    # Say where the input is broken, but keep the original
                    # exception type for anyone catching it.
                    print(f"✖ Línea {lineno} no válida en {src}")
                    raise
                json.dump(obj, fout, ensure_ascii=False)
                fout.write("\n")
                total += 1
                if total % 50000 == 0:
                    print(f"… {total} líneas")
    except BaseException:
        # Do not leave a stale temp file behind (also on KeyboardInterrupt).
        if dest_tmp.exists():
            dest_tmp.unlink()
        raise
    dest_tmp.replace(dest)
    print(f"✔ Convertido a JSONL: {dest} ({total} líneas)")
    return dest


## Ejecutar descarga y conversión

In [4]:

# Run download -> gunzip -> JSONL conversion for every configured category and
# collect the resulting JSONL paths for the cells below.
artifacts = []
for cat in CATEGORIES:
    url, gz_path, txt_path, jsonl_path = download_category(cat)
    # Each helper returns the path it produced, so the stages chain directly.
    convert_pythonish_json_to_jsonl(
        gunzip(download_snap_file(url, gz_path), txt_path),
        jsonl_path,
    )
    artifacts.append(jsonl_path)
artifacts


↓ Descargando https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Cell_Phones_and_Accessories.json.gz → data/step_0/meta_Cell_Phones_and_Accessories.json.gz
✔ Descomprimido: data/step_0/meta_Cell_Phones_and_Accessories.json
… 50000 líneas
… 100000 líneas
… 150000 líneas
… 200000 líneas
… 250000 líneas
… 300000 líneas
✔ Convertido a JSONL: data/step_0/meta_Cell_Phones_and_Accessories.jsonl (346793 líneas)
↓ Descargando https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Sports_and_Outdoors.json.gz → data/step_0/meta_Sports_and_Outdoors.json.gz
✔ Descomprimido: data/step_0/meta_Sports_and_Outdoors.json
… 50000 líneas
… 100000 líneas
… 150000 líneas
… 200000 líneas
… 250000 líneas
… 300000 líneas
… 350000 líneas
… 400000 líneas
… 450000 líneas
… 500000 líneas
✔ Convertido a JSONL: data/step_0/meta_Sports_and_Outdoors.jsonl (532197 líneas)
↓ Descargando https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Pet_Supplies.json.gz → data/step

[PosixPath('data/step_0/meta_Cell_Phones_and_Accessories.jsonl'),
 PosixPath('data/step_0/meta_Sports_and_Outdoors.jsonl'),
 PosixPath('data/step_0/meta_Pet_Supplies.jsonl')]

## Validación rápida

In [5]:

import itertools

# Fields kept in the preview; any other key of a record is dropped.
PREVIEW_KEYS = ('asin', 'title', 'category', 'categories', 'brand')

# Read the first three records of each generated JSONL file as a sanity check.
samples = {}
for path in artifacts:
    with open(path, "r", encoding="utf-8") as f:
        rows = list(map(json.loads, itertools.islice(f, 3)))
    samples[path.name] = [
        {key: value for key, value in row.items() if key in PREVIEW_KEYS}
        for row in rows
    ]
samples


{'meta_Cell_Phones_and_Accessories.jsonl': [{'asin': '0110400550',
   'title': 'Pink &amp; White 3d Melt Ice-cream Skin Hard Case Cover for Apple Iphone 4 4s Protect Cell',
   'brand': '',
   'categories': [['Cell Phones & Accessories', 'Cases', 'Basic Cases']]},
  {'asin': '011040047X',
   'categories': [['Cell Phones & Accessories', 'Cases', 'Basic Cases']],
   'title': 'Purple Hard Case Cover for Iphone 4 4s 4g with 3d Sculpture Design Blossom Rose Flower'},
  {'asin': '0195866479',
   'categories': [['Cell Phones & Accessories', 'Cases', 'Basic Cases']],
   'title': 'Hello Kitty Light-weighted Chrome Case Black Color for iphone 4 4G (HOT ITEM)'}],
 'meta_Sports_and_Outdoors.jsonl': [{'asin': '0000032069',
   'title': 'Adult Ballet Tutu Cheetah Pink',
   'brand': 'BubuBibi',
   'categories': [['Sports & Outdoors',
     'Other Sports',
     'Dance',
     'Clothing',
     'Girls',
     'Skirts']]},
  {'asin': '0000031909',
   'title': 'Girls Ballet Tutu Neon Pink',
   'brand': 'Unknow

## Notas
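- Si la red está bloqueada, descarga los `.json.gz` manualmente y colócalos en `DATA_DIR` (`data/step_0/`, relativo al directorio desde el que se ejecuta el notebook) con el nombre `meta_<categoría>.json.gz`; `download_snap_file` los detectará y omitirá la descarga.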
- Mantén la lista `CATEGORIES` corta para que el TFM sea manejable.