# 1) Preparación de datos (Amazon Sports & Outdoors)
Este cuaderno:
- Lee el **fichero bruto** descargado de Amazon (Sports & Outdoors).
- Extrae campos útiles: `title`, `description`, `image`, `categories`.
- Genera los ficheros de trabajo:
  - `meta_<CATEGORY>_sample.jsonl` → subconjunto reducido y normalizado (para trabajar cómodo); se escribe en `notebooks/data/step_1/` y se exporta también a `data/`.
  - `cats_labels.txt` → lista de **categorías destino** (una por línea), a partir de las categorías hoja más frecuentes.

> **Estructura esperada del repo (ejemplo)**  
```
notebooks/
  data/
    step_0/
      meta_Sports_and_Outdoors.jsonl  # <-- archivo bruto grande
    step_1/                           # <-- muestra generada por este cuaderno
  1_preparacion_datos.ipynb           # <-- este cuaderno
  2_taxonomia_piloto.ipynb            # <-- cuaderno de modelado
data/                                 # <-- copia exportada de la muestra
```
Si tu archivo tiene otro nombre/extensión (`.jsonl`, `.gz`), ajusta la variable `RAW_PATH` abajo.


In [6]:
from pathlib import Path
import json, gzip, re
from collections import Counter

# Base paths: when the notebook runs from inside "notebooks/", the project
# root is its parent directory; otherwise the current directory is the root.
CWD = Path.cwd()
if CWD.name == "notebooks":
    PROJECT_ROOT = CWD.parent
else:
    PROJECT_ROOT = CWD
RAW_DIR = PROJECT_ROOT / "notebooks" / "data" / "step_0"
PREP_DIR = PROJECT_ROOT / "notebooks" / "data" / "step_1"
FINAL_DIR = PROJECT_ROOT / "data"
for directory in (RAW_DIR, PREP_DIR, FINAL_DIR):
    # Make sure every working directory exists before anything is read/written.
    directory.mkdir(parents=True, exist_ok=True)

# Pick the downloaded category (run 0_descarga_conversion.ipynb first).
CATEGORY = "Sports_and_Outdoors"
RAW_PATH = RAW_DIR / f"meta_{CATEGORY}.jsonl"
FILE_SAMPLE = PREP_DIR / f"meta_{CATEGORY}_sample.jsonl"
FILE_SAMPLE_EXPORT = FINAL_DIR / f"meta_{CATEGORY}_sample.jsonl"

# If the expected raw file is missing, fall back to the first available
# meta_*.jsonl (alphabetical order) and re-derive the dependent names.
if not RAW_PATH.exists():
    candidates = sorted(RAW_DIR.glob("meta_*.jsonl"))
    if not candidates:
        # Nothing usable: list what is actually there to help debugging.
        available = [p.name for p in sorted(RAW_DIR.glob('*'))]
        raise FileNotFoundError(f"No se encontró ningún meta_*.jsonl en {RAW_DIR}. Ejecuta 0_descarga_conversion.ipynb. Archivos encontrados: {available}")
    RAW_PATH = candidates[0]
    CATEGORY = RAW_PATH.stem.replace("meta_", "")
    FILE_SAMPLE = PREP_DIR / f"meta_{CATEGORY}_sample.jsonl"
    FILE_SAMPLE_EXPORT = FINAL_DIR / f"meta_{CATEGORY}_sample.jsonl"
    print(f"⚠ RAW_PATH no encontrado, usando {RAW_PATH.name}")

print(RAW_PATH, FILE_SAMPLE, FILE_SAMPLE_EXPORT)


⚠ RAW_PATH no encontrado, usando meta_Cell_Phones_and_Accessories.jsonl
/Users/marc/Documents/Projectes/tfm-product-matching/notebooks/data/step_0/meta_Cell_Phones_and_Accessories.jsonl /Users/marc/Documents/Projectes/tfm-product-matching/notebooks/data/step_1/meta_Cell_Phones_and_Accessories_sample.jsonl /Users/marc/Documents/Projectes/tfm-product-matching/data/meta_Cell_Phones_and_Accessories_sample.jsonl


## Utilidades de lectura (json / jsonl / gz)

In [7]:
def open_maybe_gzip(path: Path):
    """Open *path* as UTF-8 text, transparently decompressing ``.gz`` files.

    Args:
        path: File to open. A name ending in ``.gz`` is read through gzip.

    Returns:
        A text-mode file object. In both branches it can be used as a
        context manager, iterated line by line, and passed to ``json.load``.
    """
    if str(path).endswith('.gz'):
        # Text-mode gzip gives a real file object (supports ``with`` and
        # ``.read()``), unlike a generator of decoded lines. Undecodable
        # bytes are dropped, matching the previous 'ignore' decoding.
        return gzip.open(path, 'rt', encoding='utf-8', errors='ignore')
    return open(path, 'r', encoding='utf-8')

def looks_like_jsonl(path: Path) -> bool:
    """Guess whether *path* holds JSON Lines (one JSON object per line).

    The decision is taken on the first non-blank line: the file is treated
    as JSONL when that line starts with ``{``.

    Args:
        path: File to inspect (plain or ``.gz``).

    Returns:
        ``True`` if the first non-blank line starts with ``{``; ``False``
        otherwise, including for an empty or all-blank file.
    """
    with open_maybe_gzip(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Leading blank lines carry no information; the JSONL reader
                # skips them too, so detection must not fail on them.
                continue
            return line.startswith('{')
    return False

def iter_raw_items(path: Path):
    """Yield the raw records stored in *path*.

    Two layouts are handled:

    * JSON Lines (as detected by ``looks_like_jsonl``): every non-blank line
      is parsed on its own; lines that are not valid JSON are skipped.
    * A single JSON document: it is loaded whole and, if the top-level value
      is a list, its elements are yielded. Any other top-level type yields
      nothing.

    Args:
        path: Raw metadata file (plain or ``.gz``).

    Yields:
        The decoded JSON value of each record.
    """
    if looks_like_jsonl(path):
        with open_maybe_gzip(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    yield json.loads(line)
                except json.JSONDecodeError:
                    # Best effort: drop malformed lines, but let unrelated
                    # errors surface instead of hiding them.
                    continue
    else:
        with open_maybe_gzip(path) as f:
            data = json.load(f)
        if isinstance(data, list):
            yield from data

def normalize_text(x):
    """Return *x* as a single-line string with collapsed whitespace.

    Falsy input (``None``, ``''``, ``[]``, ``0``...) becomes ``''``. A list is
    joined with single spaces, skipping falsy elements and stringifying the
    rest. Runs of whitespace are then replaced by one space and the result
    is stripped.
    """
    if not x:
        return ''
    if isinstance(x, list):
        text = ' '.join(str(item) for item in x if item)
    else:
        text = str(x)
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def extract_image(d):
    """Return the first non-empty image value found in record *d*.

    The keys 'imUrl', 'image', 'main_image' and 'imageURLHighRes' are tried
    in that order (image fields commonly seen in SNAP metadata). When the
    value is a list its first element is returned; otherwise the value
    itself. Returns ``''`` if no key holds a truthy value.
    """
    for field in ('imUrl', 'image', 'main_image', 'imageURLHighRes'):
        value = d.get(field)
        if not value:
            # Missing or empty: try the next candidate key.
            continue
        return value[0] if isinstance(value, list) else value
    return ''

def extract_record(d):
    """Build the normalized sample record for raw item *d*.

    Each output field is passed through ``normalize_text``:

    * ``title``: ``d['title']``, falling back to ``d['name']``.
    * ``description``: the first truthy of ``description``, ``feature``,
      ``bullet_points``.
    * ``image``: the result of ``extract_image(d)``.
    * ``categories``: ``d['categories']``.

    Returns:
        A dict with keys ``title``, ``description``, ``image``,
        ``categories``, or ``None`` when both title and description are
        empty after normalization.
    """
    raw_title = d.get('title') or d.get('name')
    raw_desc = d.get('description') or d.get('feature') or d.get('bullet_points')
    record = {
        'title': normalize_text(raw_title),
        'description': normalize_text(raw_desc),
        'image': normalize_text(extract_image(d)),
        'categories': normalize_text(d.get('categories')),
    }
    # A record with neither title nor description carries no usable text.
    if record['title'] or record['description']:
        return record
    return None

## Normalización de campos

In [None]:
# Subset size (tune it to your machine).
N = 5000

# Stream the raw file and write the first N usable records to both the
# working sample and the exported copy, one JSON object per line.
kept = 0
with open(FILE_SAMPLE, 'w', encoding='utf-8') as out, open(FILE_SAMPLE_EXPORT, 'w', encoding='utf-8') as out_export:
    for d in iter_raw_items(RAW_PATH):
        rec = extract_record(d)
        if not rec:
            continue
        # Serialize once and terminate with a newline in BOTH files, so the
        # exported copy is valid JSONL as well (it used to be written
        # without line separators).
        line = json.dumps(rec, ensure_ascii=False) + "\n"
        out.write(line)
        out_export.write(line)
        kept += 1
        if kept >= N:
            break

print(f"✔ Sample creado: {FILE_SAMPLE}  ({kept} filas)")
print(f"✔ Sample exportado: {FILE_SAMPLE_EXPORT}")


✔ Sample creado: /Users/marc/Documents/Projectes/tfm-product-matching/notebooks/data/step_1/meta_Cell_Phones_and_Accessories_sample.jsonl  (500 filas)
✔ Sample exportado: /Users/marc/Documents/Projectes/tfm-product-matching/data/meta_Cell_Phones_and_Accessories_sample.jsonl


## Vista previa rápida

In [9]:
from itertools import islice
# Quick preview: decode and print the first three records of the sample.
print("— Muestras:")
with open(FILE_SAMPLE, 'r', encoding='utf-8') as sample_file:
    first_lines = islice(sample_file, 3)
    # map() is lazy, so each line is parsed right before it is printed.
    for record in map(json.loads, first_lines):
        print(record)


— Muestras:
{'title': 'Pink &amp; White 3d Melt Ice-cream Skin Hard Case Cover for Apple Iphone 4 4s Protect Cell', 'description': 'Pink & White 3D Melt Ice-Cream Skin Hard Case Cover For Apple iPhone 4 4S Protect Cell Description: Compatible with Apple iPhone 4 4G 4S 16/32/64 GB, AT&T;, Verizon, Sprint Protect your phone from scratches, dirt and bumps. Precise openings on the protector case to allow access to all controls and features on the phone. 100% Brand New, high quality and Easy to Remove and install Material: PVC, Hard Plastic Color: Pink & White Package included: * 1x Hard Case Cover For iPhone 4 4G 4S * 1 Belt Clip', 'image': 'http://ecx.images-amazon.com/images/I/31zn6SOL1rL._SY300_.jpg', 'categories': "['Cell Phones & Accessories', 'Cases', 'Basic Cases']"}
{'title': 'Purple Hard Case Cover for Iphone 4 4s 4g with 3d Sculpture Design Blossom Rose Flower', 'description': 'Purple Hard Case Cover for iPhone 4 4S 4G With 3D Sculpture Design Blossom Rose Flower Description: Com