# 4_smart_connections_embed

Calcula embeddings textuales para los productos de Smart Connections y escribe la similitud cliente↔competidor en los ficheros finales.

Flujo:
1. Lee `data/matchings_products.jsonl` y `data/matchings_pairs.jsonl` generados en el paso 4 (gold) o 4b (ruido).
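2. Calcula un embedding textual de `title` + `description` de cada producto (con `sentence-transformers/all-MiniLM-L6-v2` si está disponible; si no, con un bag-of-words hasheado de 256 dimensiones).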
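3. Para cada par cliente–competidor calcula la similitud coseno entre sus embeddings y la guarda en el registro del producto competidor como `score` (0–1) y `similarity` (0–100, es decir, `score` × 100).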
4. Sobrescribe los ficheros de salida con los nuevos campos.

Ejecuta este notebook tras 4 o 4b para mejorar los scores en `/similar` sin cambiar la API.

In [1]:
from __future__ import annotations
import json, math, re
from pathlib import Path
from typing import List, Dict, Any

# Resolve the project root. When __file__ exists (run as a script/module) the
# root is two levels above this file. Without __file__ (e.g. a notebook kernel)
# start from the parent of the current working directory and climb one more
# level if that directory has no "data" folder.
try:
    _script_path = Path(__file__)
except NameError:
    _cwd_parent = Path.cwd().resolve().parents[0]
    if (_cwd_parent / "data").exists() or len(_cwd_parent.parents) == 0:
        PROJECT_ROOT = _cwd_parent
    else:
        PROJECT_ROOT = _cwd_parent.parent
else:
    PROJECT_ROOT = _script_path.resolve().parents[1]

# Input files; both are read and later overwritten in place by this script.
DATA_DIR = PROJECT_ROOT.joinpath("data")
PROD_PATH = DATA_DIR.joinpath("matchings_products.jsonl")
PAIR_PATH = DATA_DIR.joinpath("matchings_pairs.jsonl")

# Prefer sentence-transformers when it can be imported and the model loaded;
# on any failure fall back to a hashed bag-of-words vector.
try:
    from sentence_transformers import SentenceTransformer
    MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def embed(text: str):
        """Return the normalized model embedding for a single text."""
        batch = MODEL.encode([text], normalize_embeddings=True)
        return batch[0]

    print("Usando sentence-transformers/all-MiniLM-L6-v2")
except Exception as exc:
    print("No sentence-transformers; usando hash BOW 256D", exc)
    VECTOR_SIZE = 256

    def hash_token(t: str) -> int:
        """Hash a token with the h = 31*h + ord(ch) recurrence, kept to 32 bits."""
        acc = 0
        for char in t:
            # (acc << 5) - acc == 31 * acc; the mask keeps the value in 32 bits,
            # so the result is never negative.
            acc = (acc * 31 + ord(char)) & 0xFFFFFFFF
        return acc

    def embed(text: str):
        """Return an L2-normalized bag-of-words vector of VECTOR_SIZE buckets.

        The text is lowercased, runs of characters that are neither word
        characters nor whitespace become a space, and each whitespace-separated
        token increments the bucket selected by its hash. Text without tokens
        yields an all-zero vector.
        """
        buckets = [0.0] * VECTOR_SIZE
        cleaned = re.sub(r"[^\w\s]+", " ", text.lower())
        for token in cleaned.split():
            buckets[hash_token(token) % VECTOR_SIZE] += 1.0
        # "or 1.0" avoids dividing by zero when no bucket was incremented.
        length = math.sqrt(sum(b * b for b in buckets)) or 1.0
        return [b / length for b in buckets]

def cosine(a, b):
    """Return the cosine similarity of two equal-length numeric vectors.

    The dot product is divided by both Euclidean norms, so the result is
    correct for vectors that are not unit-length; for unit vectors it equals
    the plain dot product.

    Args:
        a: Sequence of numbers (list, tuple, array, ...).
        b: Sequence of numbers with the same length as ``a``.

    Returns:
        A Python float in [-1.0, 1.0]. Returns 0.0 when either vector has
        zero norm, since the cosine is undefined there.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    if len(a) != len(b):
        # zip() would silently truncate to the shorter vector.
        raise ValueError(
            f"cosine() needs equal-length vectors, got {len(a)} and {len(b)}"
        )
    dot = norm_a = norm_b = 0.0
    for x, y in zip(a, b):
        # Convert to Python floats so accumulation is always double precision,
        # whatever element type the sequences hold.
        x = float(x)
        y = float(y)
        dot += x * y
        norm_a += x * x
        norm_b += y * y
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    value = dot / (math.sqrt(norm_a) * math.sqrt(norm_b))
    # Rounding can push the ratio marginally outside the valid range.
    return float(max(-1.0, min(1.0, value)))

def read_jsonl(path: Path):
    """Read a JSON Lines file into a list of decoded rows.

    The file is decoded as UTF-8 (a leading BOM is tolerated), matching the
    encoding used by ``write_jsonl``. Blank lines are ignored. Lines that are
    not valid JSON are skipped, and the number skipped is reported with a
    single printed notice so the loss is visible.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        A list with one decoded JSON value per valid line; an empty list when
        the file does not exist.
    """
    rows = []
    if not path.exists():
        return rows
    skipped = 0
    # "utf-8-sig" reads plain UTF-8 and also strips a BOM if one is present;
    # with the platform default encoding non-ASCII text could be misread.
    with path.open(encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                skipped += 1
    if skipped:
        print(f"Aviso: {path.name}: {skipped} línea(s) con JSON inválido ignoradas")
    return rows

def write_jsonl(path: Path, rows):
    """Overwrite *path* with *rows*, one JSON document per line, as UTF-8.

    Non-ASCII characters are written as-is rather than as escape sequences.
    """
    with path.open("w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in rows
        )

products = read_jsonl(PROD_PATH)
pairs = read_jsonl(PAIR_PATH)

# Both input files are overwritten at the end of this script. With a missing or
# empty input every pair would be discarded and the files rewritten empty, so
# stop before touching anything.
if not products or not pairs:
    raise RuntimeError(
        f"Sin datos de entrada: {len(products)} productos en {PROD_PATH}, "
        f"{len(pairs)} pares en {PAIR_PATH}; no se sobrescribe nada."
    )

# Index products by id; when an id is repeated the last row wins.
by_id = {p["id"]: p for p in products}


def _text_field(product: Dict[str, Any], key: str) -> str:
    """Return product[key] as text, mapping a missing or null value to ''."""
    value = product.get(key)
    return "" if value is None else str(value)


# Precompute one embedding per unique product id from "title description".
emb_cache: Dict[str, List[float]] = {}
for pid, p in by_id.items():
    text = f"{_text_field(p, 'title')} {_text_field(p, 'description')}"
    emb_cache[pid] = embed(text)

updated_pairs = []
dropped_pairs = 0
for pair in pairs:
    c = by_id.get(pair.get("client_id"))
    comp = by_id.get(pair.get("competitor_id"))
    if c is None or comp is None:
        # The pair references a product that is not in the products file.
        dropped_pairs += 1
        continue
    sim = cosine(emb_cache[c["id"]], emb_cache[comp["id"]])
    # Keep the stored values inside the documented ranges: score 0-1 and
    # similarity 0-100 (a cosine can be negative).
    score = min(1.0, max(0.0, sim))
    # NOTE(review): the score is stored on the competitor product record, so a
    # competitor that appears in several pairs keeps only the value of the last
    # pair processed. Confirm this is what the consumer of these files expects.
    comp["score"] = score
    comp["similarity"] = score * 100
    updated_pairs.append(pair)

# Never replace the pairs file with an empty one (e.g. ids that do not match).
if not updated_pairs:
    raise RuntimeError(
        f"Ninguno de los {len(pairs)} pares referencia productos existentes; "
        "no se sobrescribe nada."
    )

# Rebuild the product list (deduplicated by id) with the updated fields.
updated_products = list(by_id.values())

write_jsonl(PROD_PATH, updated_products)
write_jsonl(PAIR_PATH, updated_pairs)
if dropped_pairs:
    print(f"Aviso: {dropped_pairs} pares descartados por referenciar productos inexistentes.")
print(f"OK. Escritos {len(updated_products)} productos y {len(updated_pairs)} pares con similarity.")


Usando sentence-transformers/all-MiniLM-L6-v2
OK. Escritos 2074 productos y 8000 pares con similarity.
