# 6_embeddings_matchings

Genera embeddings textuales para Smart Connections y recalcula los scores.

Flujo:
1. Lee `data/matchings_products.jsonl` y `data/matchings_pairs.jsonl`.
2. Construye el texto como en el recomendador: `title + brand + category_path + category_path + brand + desc[:300]` (`brand` y `category_path` se incluyen dos veces; la descripción solo se añade si no está vacía).
3. Embeddings con `all-MiniLM-L6-v2` (Sentence-Transformers); fallback hash BOW 256D si el modelo no carga.
4. Guarda embeddings en `data/matchings_text.npy` y el índice en `data/matchings_index.json`.
5. Recalcula `score` (coseno entre los embeddings de cada par) y `similarity` (`score * 100`) y sobrescribe `data/matchings_pairs.jsonl`; si algún producto del par no tiene embedding, el `score` es 0.

Nota: los NPY no se versionan; siguen el mismo criterio que los pasos 2/3 del pipeline.

In [None]:
from __future__ import annotations
import json, math
from pathlib import Path
from typing import Dict, List, Any
import numpy as np

# Locate the project root.
# - Script execution: __file__ is defined, and the root is the parent of the
#   directory that contains this file.
# - Notebook/REPL execution: __file__ is undefined (NameError), so start from
#   the parent of the current working directory and climb one more level when
#   that directory has no "data" folder.
try:
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
except NameError:
    PROJECT_ROOT = Path.cwd().resolve().parents[0]
    if not PROJECT_ROOT.joinpath("data").exists() and len(PROJECT_ROOT.parents) >= 1:
        PROJECT_ROOT = PROJECT_ROOT.parent

# All inputs and outputs live under <PROJECT_ROOT>/data.
DATA_DIR = PROJECT_ROOT.joinpath("data")
PROD_PATH = DATA_DIR.joinpath("matchings_products.jsonl")  # product records, read
PAIR_PATH = DATA_DIR.joinpath("matchings_pairs.jsonl")  # pairs, read and overwritten
EMB_PATH = DATA_DIR.joinpath("matchings_text.npy")  # embeddings matrix, written
INDEX_PATH = DATA_DIR.joinpath("matchings_index.json")  # id order of the matrix rows, written

# --- Embedding helpers ---
try:
    from sentence_transformers import SentenceTransformer

    MODEL = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_vec(text: str) -> List[float]:
        """Encode *text* with the loaded model and return it as a list of floats.

        The model is asked for normalized embeddings.
        """
        encoded = MODEL.encode([text], normalize_embeddings=True)
        return encoded[0].tolist()

    print("Usando sentence-transformers/all-MiniLM-L6-v2")
except Exception as exc:
    # Any failure importing or loading the model switches to a dependency-free
    # hashed bag-of-words embedding.
    print("No sentence-transformers; fallback hash BOW 256D. Motivo:", exc)
    VECTOR_SIZE = 256

    def _hash_token(t: str) -> int:
        """Return a 32-bit polynomial (base 31) hash of the characters in *t*."""
        acc = 0
        for ch in t:
            # (acc << 5) - acc == acc * 31; the mask keeps the value in 32 bits.
            acc = (acc * 31 + ord(ch)) & 0xFFFFFFFF
        return acc

    def embed_vec(text: str) -> List[float]:
        """Return an L2-normalized hashed bag-of-words vector for *text*.

        Tokens are the whitespace-separated pieces of the lower-cased text;
        each token increments the bucket selected by its hash. Text without
        tokens yields the all-zero vector.
        """
        buckets = [0.0] * VECTOR_SIZE
        for token in text.lower().split():
            buckets[_hash_token(token) % VECTOR_SIZE] += 1.0
        length = math.sqrt(sum(count * count for count in buckets))
        if length == 0.0:
            # Avoid dividing by zero for empty input.
            length = 1.0
        return [count / length for count in buckets]

def cosine(a: List[float], b: List[float]) -> float:
    """Return the dot product of *a* and *b* as a float.

    The result equals the cosine similarity only when both vectors already
    have unit length. Components are paired with ``zip``, so trailing
    components of the longer vector are ignored.
    """
    total = 0
    for x, y in zip(a, b):
        total += x * y
    return float(total)

def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSON Lines file and return its parsed rows.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        One parsed value per non-blank line, in file order. A missing file
        yields an empty list. Lines that are not valid JSON are skipped.
    """
    rows: List[Dict[str, Any]] = []
    if not path.exists():
        return rows
    # Decode explicitly as UTF-8: write_jsonl emits UTF-8 with
    # ensure_ascii=False, so relying on the platform default encoding could
    # corrupt or reject non-ASCII text on non-UTF-8 locales.
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                # Best effort: skip malformed lines instead of aborting the load.
                continue
    return rows

def write_jsonl(path: Path, rows: List[Dict[str, Any]]):
    """Overwrite *path* with *rows* serialized as JSON Lines.

    Each row becomes one JSON document followed by a newline. The file is
    written as UTF-8 and non-ASCII characters are kept unescaped.
    """
    with path.open("w", encoding="utf-8") as out:
        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)

def build_text(p: Dict[str, Any]) -> str:
    """Build the text that is embedded for a product record.

    The result is ``title brand category_path category_path brand`` joined by
    single spaces, followed by the first 300 characters of ``description``
    when it is non-empty. Brand and category path appear twice.

    Args:
        p: Product record. Missing or ``None`` fields count as empty strings;
            non-string values are converted with ``str`` so that joining
            cannot raise ``TypeError``.

    Returns:
        The concatenated text. Empty fields still contribute their separator,
        so the output for string-valued records is unchanged.
    """

    def _field(key: str) -> str:
        # JSON null arrives as None, which " ".join would reject.
        value = p.get(key)
        return "" if value is None else str(value)

    parts = [
        _field(key)
        for key in ("title", "brand", "category_path", "category_path", "brand")
    ]
    desc = _field("description")
    if desc:
        parts.append(desc[:300])
    return " ".join(parts)

# --- Load data ---
products, pairs = read_jsonl(PROD_PATH), read_jsonl(PAIR_PATH)
by_id = dict((product["id"], product) for product in products)

# --- Embeddings matrix ---
# Row i of the matrix is the embedding of the product whose id is ids[i].
ids = [product["id"] for product in products]
embs = [embed_vec(build_text(product)) for product in products]
embs_np = np.array(embs, dtype=np.float32)
np.save(EMB_PATH, embs_np)
# Persist the row order next to the matrix so rows can be mapped back to ids.
with INDEX_PATH.open("w", encoding="utf-8") as index_file:
    index_file.write(json.dumps({"ids": ids}, ensure_ascii=False))
print(f"Embeddings guardados en {EMB_PATH}, index en {INDEX_PATH}")

# --- Score pairs ---
# Product id -> row number in embs_np.
idx_map = dict(zip(ids, range(len(ids))))
scored_pairs = []
for pair in pairs:
    ci, cj = (idx_map.get(pair[key]) for key in ("client_id", "competitor_id"))
    both_known = ci is not None and cj is not None
    # Pairs that reference a product without an embedding score 0.
    score = float(np.dot(embs_np[ci], embs_np[cj])) if both_known else 0.0
    pair.update(score=score, similarity=score * 100)
    pair.setdefault("label", 1)  # in case it is missing
    pair.setdefault("is_distractor", pair["label"] == 0)
    scored_pairs.append(pair)

# The pairs file is overwritten in place with the recomputed scores.
write_jsonl(PAIR_PATH, scored_pairs)
print(f"Pares recalculados y escritos en {PAIR_PATH}")

# --- Validation ---
# Compare the average score of pairs labelled 1 against those labelled 0.
positives = [pair["score"] for pair in scored_pairs if pair.get("label") == 1]
negatives = [pair["score"] for pair in scored_pairs if pair.get("label") == 0]
mean_pos = sum(positives) / len(positives) if positives else 0.0
mean_neg = sum(negatives) / len(negatives) if negatives else 0.0
print(f"Mean score label=1: {mean_pos:.4f} | label=0: {mean_neg:.4f}")

# Garmin case: list every scored pair of the first product whose title
# contains the phrase below (case-insensitive).
garmin = None
for product in products:
    if "garmin quatix 7x solar watch".lower() in product.get("title", "").lower():
        garmin = product
        break
if garmin:
    print("\nCaso Garmin:")
    for pair in scored_pairs:
        if pair["client_id"] == garmin["id"]:
            print(f"pair {pair['pair_id']} label={pair['label']} score={pair['score']:.4f} comp={pair['competitor_id']}")

print("Listo. Recarga la API con ?nocache=1 tras ejecutar este notebook.")


SyntaxError: cannot assign to expression here. Maybe you meant '==' instead of '='? (355548542.py, line 14)