# 4 · Smart Connections (preparación de dataset)

Notebook en castellano para preparar el fichero de *smart connections* a partir del JSONL crudo que vive en `notebooks/raw-data/matchings_dataset.jsonl`.

Objetivos:

1) Cargar el JSONL con pares `productA` (cliente) y `productB` (competidor).
2) Limpiar campos (texto, imágenes, categorías) y asegurar IDs estables.
3) Generar dos salidas en `/data/` (no se borran):
   - `data/matchings_products.jsonl`: productos individuales (cliente/competidor) con `source` y `pair_id`.
   - `data/matchings_pairs.jsonl`: pares client/competitor con campos principales para validación.

Así tendremos un dataset limpio que podemos consumir en la página de *Smart Connections*.

In [5]:
import json
from pathlib import Path
from typing import List, Dict, Any

PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
RAW_PATH = PROJECT_ROOT / "notebooks" / "raw-data" / "matchings_dataset.jsonl"
STEP4_DIR = PROJECT_ROOT / "notebooks" / "data" / "step_4"
STEP4_DIR.mkdir(parents=True, exist_ok=True)

# salidas finales (sí van a git/push si las dejas)
FINAL_PRODUCTS = PROJECT_ROOT / "data" / "matchings_products.jsonl"
FINAL_PAIRS = PROJECT_ROOT / "data" / "matchings_pairs.jsonl"

# recorte para no pasar de 100MB en GitHub; ajusta si hace falta
MAX_PAIRS = 2000

print("Lectura desde:", RAW_PATH)
print("Intermedios en:", STEP4_DIR)
print("Final products:", FINAL_PRODUCTS)
print("Final pairs:", FINAL_PAIRS)


Lectura desde: /Users/marc/Documents/Projectes/tfm-product-matching/notebooks/raw-data/matchings_dataset.jsonl
Intermedios en: /Users/marc/Documents/Projectes/tfm-product-matching/notebooks/data/step_4
Final products: /Users/marc/Documents/Projectes/tfm-product-matching/data/matchings_products.jsonl
Final pairs: /Users/marc/Documents/Projectes/tfm-product-matching/data/matchings_pairs.jsonl


In [6]:
def clean_text(x: Any) -> str:
    if x is None:
        return ""
    return str(x).strip()

def pick_image(val: Any) -> str:
    if not val:
        return ""
    # algunos vienen separados por '|'
    if isinstance(val, str):
        return val.split("|")[0].strip()
    return ""

def as_price(x: Any):
    try:
        return float(x)
    except Exception:
        return None

def load_raw() -> List[Dict[str, Any]]:
    rows = []
    with RAW_PATH.open() as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                pass
    print(f"Total filas crudas: {len(rows)}")
    return rows

raw_rows = load_raw()
raw_rows[:1]

Total filas crudas: 700999


[{'productA': {'product_id': 'IDES-142733150',
   'title': 'Garmin Quatix 7X Solar watch',
   'brand': 'Garmin',
   'price': 999.99,
   'image': 'https://storage.googleapis.com/tradeinn-images/web/products_image/14273/fotos/142733150.webp|https://storage.googleapis.com/tradeinn-images/web/products_image/14273/fotos/142733150_2.webp',
   'categories': 'electrónica > relojes > ',
   'url': 'https://www.tradeinn.com/bikeinn/es/garmin-quatix-7x-solar-watch/142733150/p?utm_source=google_products&utm_medium=merchant&country=es',
   'description': ''},
  'productB': {'product_id': '831121',
   'title': 'Garmin quatix® 7X – Solar Edition con Carga Solar NÚMERO DE Referencia 010-02541-61',
   'brand': None,
   'price': 1193.0,
   'image': 'https://www.electropolis.es/media/catalog/product/cache/319844269946207f374d008884feba39/I/N/IN-ACT-SIAGARZEG0200_41bzeIYtMaL._AC_.jpeg',
   'categories': '',
   'url': 'https://www.electropolis.es/garmin-quatixr-7x-solar-edition-con-carga-solar-numero-de-ref

In [7]:
def normalize_product(data: Dict[str, Any], pair_id: int, source: str) -> Dict[str, Any]:
    # source: "client" o "competitor"
    pid = clean_text(data.get("product_id") or data.get("id") or f"{source}-{pair_id}")
    title = clean_text(data.get("title"))
    desc = clean_text(data.get("description"))
    brand = clean_text(data.get("brand"))
    price = as_price(data.get("price"))
    image_url = pick_image(data.get("image") or data.get("image_url"))
    category_path = clean_text(data.get("categories"))

    return {
        "id": pid,
        "title": title,
        "description": desc,
        "brand": brand,
        "price": price,
        "image_url": image_url,
        "category_path": category_path,
        "source": source,
        "pair_id": pair_id,
    }

products: List[Dict[str, Any]] = []
pairs: List[Dict[str, Any]] = []

for idx, row in enumerate(raw_rows):
    pa = row.get("productA", {})
    pb = row.get("productB", {})
    client = normalize_product(pa, idx, "client")
    competitor = normalize_product(pb, idx, "competitor")
    products.extend([client, competitor])

    pairs.append({
        "pair_id": idx,
        "client_id": client["id"],
        "competitor_id": competitor["id"],
        "client_title": client["title"],
        "competitor_title": competitor["title"],
        "client_brand": client["brand"],
        "competitor_brand": competitor["brand"],
        "client_price": client["price"],
        "competitor_price": competitor["price"],
        "client_image_url": client["image_url"],
        "competitor_image_url": competitor["image_url"],
    })

len(products), len(pairs)

# recorte final de seguridad
products = products[:MAX_PAIRS*2]
pairs = pairs[:MAX_PAIRS]


(1401998, 700999)

In [8]:
# exportar intermedios en step_4 y finales en /data

# intermedios (opcional)
STEP4_PRODUCTS = STEP4_DIR / "matchings_products.jsonl"
STEP4_PAIRS = STEP4_DIR / "matchings_pairs.jsonl"

with STEP4_PRODUCTS.open("w", encoding="utf-8") as f:
    for obj in products:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
with STEP4_PAIRS.open("w", encoding="utf-8") as f:
    for obj in pairs:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
# finales
with FINAL_PRODUCTS.open("w", encoding="utf-8") as f:
    for obj in products:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
with FINAL_PAIRS.open("w", encoding="utf-8") as f:
    for obj in pairs:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print("Guardado finales:")
print(" products ->", FINAL_PRODUCTS)
print(" pairs    ->", FINAL_PAIRS)


Guardado finales:
 products -> /Users/marc/Documents/Projectes/tfm-product-matching/data/matchings_products.jsonl
 pairs    -> /Users/marc/Documents/Projectes/tfm-product-matching/data/matchings_pairs.jsonl
