# 5_rebuild_matchings

Reconstruye el dataset de Smart Connections con etiquetas y scores coherentes.

Pasos:
1. Lee `data/matchings_products.jsonl` y `data/matchings_pairs.jsonl` (productos no se tocan).
2. Genera un negativo (ruido) por cada par positivo.
3. Calcula embeddings de texto con el mismo pipeline del recomendador (MiniLM; fallback hash BOW).
4. Score = coseno; similarity = score*100.
5. Valida medias label=1 vs label=0 y muestra el caso Garmin.

Salida: `data/matchings_pairs.jsonl` (el mismo fichero de pares, reescrito con `label`, `score` y `similarity`).


In [None]:
"""
Rebuild the Smart Connections dataset with consistent labels and scores.

Steps implemented:
1) Read data/matchings_products.jsonl and data/matchings_pairs.jsonl
   (the products file is only read, never modified).
2) Generate one negative (noise) pair for each positive pair.
3) Compute text embeddings with the SAME pipeline as the recommender:
   text = title + brand + category_path + category_path + brand + desc[:300]
   model: all-MiniLM-L6-v2 (sentence-transformers); fallback: 256-D hashed BOW.
4) Score = cosine(emb_client, emb_comp); similarity = score * 100.
5) Validate the mean score of label=1 vs label=0 and print the Garmin case.

Files:
  data/matchings_products.jsonl      (read only, never modified)
  data/matchings_pairs.jsonl         (rewritten in place with label/score/similarity)
"""

from __future__ import annotations

import json
import math
import random
from pathlib import Path
from typing import Dict, List, Any

# Resolve the project root: two levels above this file when running as a
# script; when `__file__` is undefined (e.g. inside a notebook) start from the
# parent of the current working directory instead.
try:
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
except NameError:
    PROJECT_ROOT = Path.cwd().resolve().parents[0]
    # If that guess has no `data/` folder, climb one more level (when possible).
    if not PROJECT_ROOT.joinpath("data").exists():
        if len(PROJECT_ROOT.parents) > 0:
            PROJECT_ROOT = PROJECT_ROOT.parent

DATA_DIR = PROJECT_ROOT.joinpath("data")

# NOTE: PAIR_IN and PAIR_OUT point at the same file, so the pairs file is
# rewritten in place.
PROD_IN = DATA_DIR.joinpath("matchings_products.jsonl")
PAIR_IN = DATA_DIR.joinpath("matchings_pairs.jsonl")
PAIR_OUT = DATA_DIR.joinpath("matchings_pairs.jsonl")

# ---------- Embedding utils ----------
# Pick the embedding backend once at import time: sentence-transformers when
# it can be imported and the model can be loaded, otherwise a dependency-free
# hashed bag-of-words vector.
try:
    from sentence_transformers import SentenceTransformer

    MODEL = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_vec(text: str) -> List[float]:
        """Encode ``text`` with the MiniLM model and return it as a list.

        The model is asked for normalized embeddings
        (``normalize_embeddings=True``).
        """
        encoded_batch = MODEL.encode([text], normalize_embeddings=True)
        first_row = encoded_batch[0]
        return first_row.tolist()

    print("Usando sentence-transformers/all-MiniLM-L6-v2")
except Exception as err:
    print("No sentence-transformers, usando hash BOW 256D. Motivo:", err)
    VECTOR_SIZE = 256

    def _hash_token(t: str) -> int:
        """Return a 32-bit polynomial (base 31) hash of the token ``t``."""
        acc = 0
        for code_point in map(ord, t):
            # (acc << 5) - acc == acc * 31; the mask keeps the value in 32 bits.
            acc = (acc * 31 + code_point) & 0xFFFFFFFF
        return acc

    def embed_vec(text: str) -> List[float]:
        """Return an L2-normalized hashed bag-of-words vector for ``text``.

        Tokens are the whitespace-separated pieces of the lower-cased text;
        each one increments the bucket selected by its hash. A text without
        tokens yields the all-zero vector.
        """
        buckets = [0.0] * VECTOR_SIZE
        for token in text.lower().split():
            buckets[_hash_token(token) % VECTOR_SIZE] += 1.0
        length = math.sqrt(sum(b * b for b in buckets))
        if not length:
            # Avoid dividing by zero for empty input.
            length = 1.0
        return [b / length for b in buckets]


def cosine(a: List[float], b: List[float]) -> float:
    """Return the dot product of ``a`` and ``b``.

    This equals the cosine similarity only when both vectors are already
    unit-length. If the lengths differ, the extra trailing components of the
    longer vector are ignored.
    """
    products = [x * y for x, y in zip(a, b)]
    return float(sum(products))


# ---------- IO helpers ----------
def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSON Lines file and return one parsed value per non-blank line.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The parsed rows in file order. A missing file yields an empty list.
        Blank lines are ignored; lines that are not valid JSON are skipped
        (best effort) and their count is reported on stdout.
    """
    rows: List[Dict[str, Any]] = []
    if not path.exists():
        return rows

    skipped = 0
    # Read as UTF-8 explicitly: the companion writer emits UTF-8 with
    # ensure_ascii=False, so relying on the platform default encoding could
    # corrupt or reject non-ASCII text.
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                # Only malformed JSON is tolerated; other errors propagate.
                skipped += 1

    if skipped:
        print(f"Aviso: se omitieron {skipped} registros JSON mal formados en {path}")
    return rows


def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, one object per line.

    Any existing file is overwritten. Non-ASCII characters are written as-is
    rather than as ``\\uXXXX`` escapes.
    """
    with path.open("w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in rows
        )


# ---------- Main pipeline ----------
def build_text(p: Dict[str, Any]) -> str:
    """Build the text that is fed to the embedder for product ``p``.

    The result is the space-joined sequence
    ``title, brand, category_path, category_path, brand`` followed by the
    first 300 characters of ``description`` when it is non-empty. ``brand``
    and ``category_path`` appear twice on purpose in the original recipe, so
    the repetition is preserved.

    Missing keys and explicit ``None`` values are treated as empty strings,
    and non-string values are converted with ``str`` so the join cannot fail.

    Args:
        p: Product record (a dict parsed from the products JSONL file).

    Returns:
        The concatenated text.
    """

    def _field(key: str) -> str:
        # `p.get(key, "")` would still return None for an explicit null.
        value = p.get(key)
        return "" if value is None else str(value)

    parts = [
        _field("title"),
        _field("brand"),
        _field("category_path"),
        _field("category_path"),
        _field("brand"),
    ]
    desc = p.get("description")
    if desc:
        parts.append(str(desc)[:300])
    return " ".join(parts)


def _top_category(product: Dict[str, Any]) -> str:
    """Return the lower-cased first ``>``-separated segment of ``category_path``.

    A missing or null ``category_path`` yields an empty string.
    """
    return (product.get("category_path") or "").split(">")[0].strip().lower()


def main() -> None:
    """Rebuild the pairs file with labels, one negative per positive, and scores.

    Reads products from ``PROD_IN`` and pairs from ``PAIR_IN``, marks every
    input pair as positive, adds one randomly chosen negative (seeded with 42)
    per positive, scores all pairs with ``cosine`` over the embeddings of
    ``build_text``, prints summary statistics and writes the result to
    ``PAIR_OUT``.

    Nothing is written when there are no products or no positive pairs.
    """
    products = read_jsonl(PROD_IN)
    # PAIR_IN and PAIR_OUT are the same file, so after a first run it already
    # contains the generated distractors. Drop them on input; otherwise a rerun
    # would relabel them as positives and generate negatives for them too.
    pairs = [p for p in read_jsonl(PAIR_IN) if not p.get("is_distractor")]
    if not products or not pairs:
        # Avoid overwriting the pairs file with empty or meaningless data.
        print("Sin productos o sin pares de entrada; no se escribe nada.")
        return

    by_id = {p["id"]: p for p in products}
    clients = [p for p in products if p.get("source") == "client"]
    competitors = [p for p in products if p.get("source") == "competitor"]
    # Top-level category of each competitor, computed once instead of per pair.
    competitor_cats = [(c, _top_category(c)) for c in competitors]

    # Embeddings keyed by product id.
    emb_cache: Dict[str, List[float]] = {
        p["id"]: embed_vec(build_text(p)) for p in products
    }

    # Every competitor that is a known positive for a client; none of them may
    # be drawn as a negative for that client.
    positives_by_client: Dict[Any, set] = {}
    for pair in pairs:
        positives_by_client.setdefault(pair["client_id"], set()).add(pair["competitor_id"])

    # Generate negatives.
    next_pair_id = max((p.get("pair_id", 0) for p in pairs), default=0) + 1
    random.seed(42)
    new_pairs: List[Dict[str, Any]] = []

    for pair in pairs:
        pair["label"] = 1
        pair["is_distractor"] = False
        new_pairs.append(pair)

        client_id = pair["client_id"]
        known_positives = positives_by_client[client_id]
        client_cat = _top_category(by_id.get(client_id, {}))
        pool_all = [(c, cat) for c, cat in competitor_cats if c["id"] not in known_positives]
        # Prefer competitors from a different top-level category; fall back to
        # any remaining competitor when there is none.
        pool_diff = [c for c, cat in pool_all if cat != client_cat]
        pool_use = pool_diff if pool_diff else [c for c, _ in pool_all]
        if not pool_use:
            continue
        neg = random.choice(pool_use)
        new_pairs.append(
            {
                "pair_id": next_pair_id,
                "client_id": client_id,
                "competitor_id": neg["id"],
                "label": 0,
                "is_distractor": True,
            }
        )
        next_pair_id += 1

    # Compute score and similarity.
    scored_pairs: List[Dict[str, Any]] = []
    for pair in new_pairs:
        c = emb_cache.get(pair["client_id"])
        comp = emb_cache.get(pair["competitor_id"])
        # A pair that references an unknown product gets a score of 0.
        if c is None or comp is None:
            score = 0.0
        else:
            score = cosine(c, comp)
        pair["score"] = score
        pair["similarity"] = score * 100
        scored_pairs.append(pair)

    # Simple validation: compare mean scores of positives and negatives.
    positives = [p["score"] for p in scored_pairs if p.get("label") == 1]
    negatives = [p["score"] for p in scored_pairs if p.get("label") == 0]
    mean_pos = sum(positives) / max(len(positives), 1)
    mean_neg = sum(negatives) / max(len(negatives), 1)
    print(f"Mean score (label=1): {mean_pos:.4f}")
    print(f"Mean score (label=0): {mean_neg:.4f}")

    # Garmin case: print every pair of the first matching client product.
    garmin_client = next(
        (p for p in clients if "garmin quatix 7x solar watch" in (p.get("title") or "").lower()),
        None,
    )
    if garmin_client:
        print("\nCaso Garmin Quatix 7X Solar watch")
        for pair in scored_pairs:
            if pair["client_id"] != garmin_client["id"]:
                continue
            # .get(): a missing pair_id must not abort the run before writing.
            print(
                f"pair_id={pair.get('pair_id')} label={pair['label']} score={pair['score']:.4f} comp={pair['competitor_id']}"
            )

    write_jsonl(PAIR_OUT, scored_pairs)
    print(f"Escritos {len(scored_pairs)} pares en {PAIR_OUT}")
    print("Products no se tocan; la UI puede leer score/similarity directamente de matchings_pairs.jsonl")


if __name__ == "__main__":
    main()


SyntaxError: unterminated string literal (detected at line 93) (3665607542.py, line 93)