# 5 · Smart Connections · Embeddings de texto
Pipeline de texto para Smart Connections:
1) Carga `data/matchings_products.jsonl` y pares gold de `data/matchings_pairs.jsonl` (creados en el paso 4).
2) Calcula embeddings de texto con MiniLM (fallback hash BOW si falta el modelo).
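3) Genera 1 distractor (par negativo, `label=0`) por cada par gold, eligiendo al azar un competidor, preferentemente de otra categoría raíz.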
4) Calcula `score` (similitud coseno entre los embeddings de cliente y competidor) y `similarity` (`score` × 100).
5) Guarda `data/matchings_text.npy`, `data/matchings_index.json` y sobrescribe `data/matchings_pairs.jsonl` con scores.
6) Imprime medias y el caso Garmin para depurar.


In [1]:
from __future__ import annotations
import json, math, random
from pathlib import Path
from typing import Dict, List, Any
import numpy as np

# Resolve the project root. When this runs as a file, __file__ exists and the
# root is the parent of the directory containing it. In a notebook/REPL
# __file__ is undefined (NameError), so the working directory is used instead.
try:
    _this_file = Path(__file__)
except NameError:
    # Start one level above the working directory; if that level has no
    # "data" entry, climb one more level.
    PROJECT_ROOT = Path.cwd().resolve().parents[0]
    _can_climb = len(PROJECT_ROOT.parents) > 0
    if _can_climb and not (PROJECT_ROOT / "data").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent
else:
    PROJECT_ROOT = _this_file.resolve().parents[1]

# Input/output artefacts, all located under <project root>/data.
DATA_DIR = PROJECT_ROOT.joinpath("data")
PROD_PATH = DATA_DIR.joinpath("matchings_products.jsonl")
PAIR_PATH = DATA_DIR.joinpath("matchings_pairs.jsonl")
EMB_PATH = DATA_DIR.joinpath("matchings_text.npy")
INDEX_PATH = DATA_DIR.joinpath("matchings_index.json")

# Text embedding backend. Prefer the MiniLM sentence-transformers model; if the
# import or the model load fails for any reason, report why and fall back to a
# 256-dimensional hashed bag-of-words so the rest of the pipeline still runs.
try:
    from sentence_transformers import SentenceTransformer

    MODEL = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_vec(text: str) -> List[float]:
        """Encode ``text`` with MiniLM (normalized) and return it as a list."""
        batch = MODEL.encode([text], normalize_embeddings=True)
        return batch[0].tolist()

    print("Usando sentence-transformers/all-MiniLM-L6-v2")
except Exception as exc:
    print("Fallback hash BOW 256D. Motivo:", exc)
    VECTOR_SIZE = 256

    def _hash_token(t: str) -> int:
        """Return a 32-bit rolling hash of ``t`` (``h = 31*h + ord(ch)``)."""
        acc = 0
        for code in map(ord, t):
            # (h << 5) - h is 31*h; the mask keeps the value in 32 bits and
            # therefore non-negative.
            acc = (acc * 31 + code) & 0xFFFFFFFF
        return acc

    def embed_vec(text: str) -> List[float]:
        """Return an L2-normalized hashed bag-of-words vector for ``text``.

        Tokens are the whitespace-separated pieces of the lower-cased text;
        each one increments the bucket ``_hash_token(token) % VECTOR_SIZE``.
        Text without tokens yields the all-zero vector.
        """
        counts = [0.0] * VECTOR_SIZE
        for token in text.lower().split():
            counts[_hash_token(token) % VECTOR_SIZE] += 1.0
        length = math.sqrt(sum(c * c for c in counts))
        if not length:
            # Avoid dividing by zero for empty input.
            length = 1.0
        return [c / length for c in counts]

def cosine(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of ``a`` and ``b``.

    The dot product is divided by the product of both norms, so the result is
    a true cosine even when the inputs are not unit-length (for unit-length
    inputs it equals the plain dot product up to rounding).

    Components are paired positionally with ``zip``, so trailing components of
    the longer vector are ignored. If either vector has zero norm (including
    empty input) the similarity is defined as ``0.0``.
    """
    dot = sq_a = sq_b = 0.0
    for x, y in zip(a, b):
        dot += x * y
        sq_a += x * x
        sq_b += y * y
    if sq_a == 0.0 or sq_b == 0.0:
        return 0.0
    value = dot / (math.sqrt(sq_a) * math.sqrt(sq_b))
    # Clamp rounding noise so callers always see a value in [-1, 1].
    return float(max(-1.0, min(1.0, value)))

def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Load a JSON Lines file and return its parsed rows.

    The file is read as UTF-8, matching what ``write_jsonl`` produces, instead
    of relying on the platform default encoding. Blank lines are skipped, and
    lines that are not valid JSON are skipped as well (best-effort loading).
    A missing file yields an empty list.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        One parsed value per valid line, in file order.
    """
    if not path.exists():
        return []
    rows: List[Dict[str, Any]] = []
    with path.open(encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError subclasses ValueError: skip the
                # malformed line and keep loading the rest.
                continue
    return rows

def write_jsonl(path: Path, rows: List[Dict[str, Any]]):
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, replacing the file.

    Each row becomes one line of JSON followed by a newline; non-ASCII
    characters are written as-is rather than escaped.
    """
    serialized = (json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    with path.open("w", encoding="utf-8") as out:
        out.writelines(serialized)

def build_text(p: Dict[str, Any]) -> str:
    """Build the text that is embedded for product ``p``.

    The result is the space-joined sequence title, brand, category_path,
    category_path, brand, followed by the first 300 characters of the
    description when one is present. Brand and category appear twice,
    presumably to give them more weight in the embedding (TODO confirm).

    Missing fields and fields explicitly set to ``None`` both contribute an
    empty string, so a record with a null value no longer makes ``str.join``
    raise ``TypeError``. Other non-string values are converted with ``str``.
    """
    def _field(key: str) -> str:
        value = p.get(key)
        return "" if value is None else str(value)

    title = _field("title")
    brand = _field("brand")
    category = _field("category_path")
    parts = [title, brand, category, category, brand]

    desc = p.get("description")
    if desc:
        # Cap the description so long blurbs do not dominate the text.
        parts.append(str(desc)[:300])
    return " ".join(parts)

# Load base data
a = read_jsonl(PROD_PATH)
# PAIR_PATH is overwritten at the end of this script with the gold pairs AND
# the generated distractors. Drop rows flagged as distractors so that a re-run
# does not treat earlier distractors as gold (they would be relabelled label=1
# and receive distractors of their own).
# NOTE(review): assumes "is_distractor" is only ever set by this script.
pairs_gold = [p for p in read_jsonl(PAIR_PATH) if not p.get("is_distractor")]
# Product lookup by id, plus the two catalogue sides used for pairing.
by_id = {p["id"]: p for p in a}
clients = [p for p in a if p.get("source") == "client"]
competitors = [p for p in a if p.get("source") == "competitor"]

# Embed every product, then persist the float32 matrix together with the list
# of product ids in the same row order.
ids = [product["id"] for product in a]
embs = list(map(embed_vec, map(build_text, a)))
embs_np = np.asarray(embs, dtype=np.float32)
np.save(EMB_PATH, embs_np)
with INDEX_PATH.open("w", encoding="utf-8") as index_file:
    json.dump({"ids": ids}, index_file, ensure_ascii=False)
print(f"Embeddings guardados: {EMB_PATH}")

# Product id -> row number in embs_np (for a repeated id the last row wins).
idx_map = dict(zip(ids, range(len(ids))))

# Generate one distractor (negative pair, label=0) per gold pair.
random.seed(42)  # fixed seed so repeated runs draw the same distractors


def _top_category(product: Dict[str, Any]) -> str:
    """Return the lower-cased first ">"-separated segment of category_path."""
    return (product.get("category_path") or "").split(">")[0].strip().lower()


new_pairs = []
next_pair_id = max((p.get("pair_id", 0) for p in pairs_gold), default=0) + 1

# Competitors that must never be drawn as a distractor for a client: every
# competitor it is gold-matched to (a client may have several gold pairs), plus
# the distractors already drawn for it. Excluding only the current pair's
# competitor could label a real match as negative or repeat a distractor.
used_by_client: Dict[Any, set] = {}
for pair in pairs_gold:
    used_by_client.setdefault(pair["client_id"], set()).add(pair["competitor_id"])

# A competitor's top-level category does not depend on the pair; compute once.
competitor_cats = [(c, _top_category(c)) for c in competitors]

for pair in pairs_gold:
    pair["label"] = 1
    pair["is_distractor"] = False
    new_pairs.append(pair)
    client_id = pair["client_id"]
    used = used_by_client[client_id]
    client_cat = _top_category(by_id.get(client_id, {}))
    pool_all = [c for c, _cat in competitor_cats if c["id"] not in used]
    # Prefer competitors from a different top-level category; fall back to any
    # unused competitor when there is none.
    pool_diff = [
        c for c, cat in competitor_cats
        if cat != client_cat and c["id"] not in used
    ]
    pool_use = pool_diff if pool_diff else pool_all
    if not pool_use:
        continue
    distractor = random.choice(pool_use)
    used.add(distractor["id"])
    new_pairs.append({
        "pair_id": next_pair_id,
        "client_id": client_id,
        "competitor_id": distractor["id"],
        "label": 0,
        "is_distractor": True,
    })
    next_pair_id += 1

# Cosine score: dot product of the two stored embedding rows (embed_vec
# returns normalized vectors). A pair whose client or competitor has no
# embedding row gets a score of 0.0.
scored = []
for pair in new_pairs:
    rows = (idx_map.get(pair["client_id"]), idx_map.get(pair["competitor_id"]))
    if None in rows:
        score = 0.0
    else:
        client_vec, competitor_vec = embs_np[rows[0]], embs_np[rows[1]]
        score = float(np.dot(client_vec, competitor_vec))
    # "similarity" is the same score on a 0-100 scale.
    pair.update(score=score, similarity=score*100)
    scored.append(pair)

# Overwrite the pairs file with the scored gold pairs and distractors.
write_jsonl(PAIR_PATH, scored)
print(f"Pares escritos en {PAIR_PATH}")

# Validation: compare the mean score of gold pairs against distractors.
pos = [row["score"] for row in scored if row.get("label") == 1]
neg = [row["score"] for row in scored if row.get("label") == 0]
# "len(...) or 1" keeps the division defined when a group is empty.
mean_pos = sum(pos) / (len(pos) or 1)
mean_neg = sum(neg) / (len(neg) or 1)
print(f"Mean score label=1: {mean_pos:.4f} | label=0: {mean_neg:.4f}")

# Garmin debug case: find the first client product whose title contains the
# phrase below and print every scored pair that belongs to it.
_garmin_phrase = "garmin quatix 7x solar watch".lower()
cl = None
for candidate in clients:
    if _garmin_phrase in candidate.get("title", "").lower():
        cl = candidate
        break

if cl:
    print("Caso Garmin:")
    for pair in (row for row in scored if row["client_id"] == cl["id"]):
        print(f"pair {pair['pair_id']} label={pair['label']} score={pair['score']:.4f} comp={pair['competitor_id']}")

print("Listo. Recarga la API con ?nocache=1")


Usando sentence-transformers/all-MiniLM-L6-v2
Embeddings guardados: /Users/marc/Documents/Projectes/tfm-product-matching/data/matchings_text.npy
Pares escritos en /Users/marc/Documents/Projectes/tfm-product-matching/data/matchings_pairs.jsonl
Mean score label=1: 0.6095 | label=0: 0.2098
Caso Garmin:
pair 0 label=1 score=0.7750 comp=831121
pair 2000 label=0 score=0.1524 comp=178374
pair 1 label=1 score=0.7145 comp=1143212
pair 2001 label=0 score=0.1943 comp=574333
Listo. Recarga la API con ?nocache=1
