# 4b_add_noise: generar variante con ruido

Crea una versión "noisy" de los ficheros de Smart Connections añadiendo N distractores por cliente. Útil para pruebas de contraste y evaluación.

- Entrada: `data/matchings_products.jsonl`, `data/matchings_pairs.jsonl`
- Salida: **sobrescribe** `data/matchings_products.jsonl` y `data/matchings_pairs.jsonl` con la variante con ruido. Ejecuta de nuevo el paso 4 si quieres volver a la versión “gold”.
- Similitud: coseno sobre embedding ligero (hash BOW 256D) de título+descripción. Solo se registra en cada distractor; la selección de distractores es aleatoria.
- Marca `is_distractor=True` en los productos añadidos.
- Reproducible: semilla fija 42.

Parámetro principal: `NUM_DISTRACTORS` (por defecto 3).

In [5]:
from __future__ import annotations

import json
import math
import random
import re
from pathlib import Path
from typing import Any, Dict, List

# Resolve the project root: two levels above this file when run as a script.
try:
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
except NameError:
    # __file__ is undefined (e.g. inside a notebook): start from the parent of
    # the current working directory (the original note says the working
    # directory is expected to be notebooks/).
    PROJECT_ROOT = Path.cwd().resolve().parents[0]
    # If that directory has no data/ folder, climb one more level.
    if len(PROJECT_ROOT.parents) > 0 and not (PROJECT_ROOT / "data").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent

DATA_DIR = PROJECT_ROOT.joinpath("data")

# Input files.
PRODUCTS_IN = DATA_DIR.joinpath("matchings_products.jsonl")
PAIRS_IN = DATA_DIR.joinpath("matchings_pairs.jsonl")

# Output files: deliberately the same paths as the inputs, so they are overwritten.
PRODUCTS_OUT = DATA_DIR.joinpath("matchings_products.jsonl")
PAIRS_OUT = DATA_DIR.joinpath("matchings_pairs.jsonl")

# Dimensionality of the hashed bag-of-words embedding.
VECTOR_SIZE = 256
# Number of distractors added per client; raise or lower to change the noise level.
NUM_DISTRACTORS = 3

def hash_token(token: str) -> int:
    """Return a deterministic 32-bit polynomial hash of *token*.

    Each character's code point is folded in as ``acc * 31 + ord(ch)`` and the
    accumulator is masked to 32 bits after every step, so the result is always
    in the range ``0 .. 0xFFFFFFFF``. The empty string hashes to 0.
    """
    acc = 0
    for code_point in map(ord, token):
        # (acc << 5) - acc == acc * 31; the mask keeps the value unsigned 32-bit.
        acc = (acc * 31 + code_point) & 0xFFFFFFFF
    return acc

# Any character that is neither a word character nor whitespace separates tokens.
_NON_WORD_RE = re.compile(r"[^\w\s]")


def embed_text(text: str) -> List[float]:
    """Embed *text* as an L2-normalised hashed bag-of-words vector.

    The text is lower-cased, punctuation is replaced by spaces, and the result
    is split on whitespace. Each token increments the bucket
    ``hash_token(token) % VECTOR_SIZE``. The count vector is then divided by
    its Euclidean norm.

    Args:
        text: Free text to embed (may be empty).

    Returns:
        A list of ``VECTOR_SIZE`` floats. For text without tokens the vector
        is all zeros (the norm falls back to 1.0 to avoid dividing by zero).
    """
    vec = [0.0] * VECTOR_SIZE
    # The pattern must be applied as a regular expression: str.replace() would
    # look for the literal text "[^\w\s]" and leave punctuation attached to
    # tokens, so "cable," and "cable" would land in different buckets.
    # str.split() already treats newlines as whitespace.
    tokens = _NON_WORD_RE.sub(" ", text.lower()).split()
    for t in tokens:
        vec[hash_token(t) % VECTOR_SIZE] += 1.0
    norm = math.sqrt(sum(v * v for v in vec)) or 1.0
    return [v / norm for v in vec]

def cosine(a: List[float], b: List[float]) -> float:
    """Return the dot product of *a* and *b*.

    This equals the cosine similarity only when both vectors are already
    L2-normalised. If the lengths differ, the extra components of the longer
    vector are ignored.
    """
    products = [left * right for left, right in zip(a, b)]
    return sum(products)

def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSON Lines file into a list of parsed rows.

    Reading is best-effort: blank lines and lines that are not valid JSON are
    skipped silently.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The parsed value of every valid line, in file order. An empty list is
        returned when *path* does not exist.
    """
    rows: List[Dict[str, Any]] = []
    if not path.exists():
        return rows
    # Decode explicitly as UTF-8 to match write_jsonl, which writes UTF-8 with
    # ensure_ascii=False; the platform default encoding may differ.
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                # Malformed line: skip it and keep reading.
                continue
    return rows

def write_jsonl(path: Path, rows: List[Dict[str, Any]]):
    """Write *rows* to *path* as JSON Lines, replacing any existing file.

    Each row is serialised on its own line, UTF-8 encoded, with non-ASCII
    characters written as-is rather than escaped.
    """
    with path.open("w", encoding="utf-8") as out:
        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)

# ---
# Load the current products and pairs.
products = read_jsonl(PRODUCTS_IN)
pairs = read_jsonl(PAIRS_IN)
by_id = {p["id"]: p for p in products}
clients = [p for p in products if p.get("source") == "client"]
competitors = [p for p in products if p.get("source") == "competitor"]

# Index of the competitor ids already paired with each client.
by_client = {}
for pair in pairs:
    by_client.setdefault(pair["client_id"], set()).add(pair["competitor_id"])

# Outputs start as copies of the inputs; distractors are appended to them.
new_pairs = list(pairs)
new_products = list(products)
next_pair_id = max((p.get("pair_id", 0) for p in pairs), default=0) + 1

# Fixed seed so the selection of distractors is reproducible.
random.seed(42)

for client in clients:
    cid = client["id"]
    used = by_client.get(cid, set())
    client_vec = embed_text(f"{client.get('title','')} {client.get('description','')}")

    # Candidates are competitors not yet paired with this client. Rows already
    # flagged as distractors are excluded: input and output are the same files,
    # so on a re-run the copies added earlier would otherwise be picked and
    # duplicated again.
    pool = [
        c for c in competitors
        if c["id"] not in used and not c.get("is_distractor")
    ]
    random.shuffle(pool)

    # Take the first NUM_DISTRACTORS shuffled candidates (none if it is <= 0).
    for comp in pool[:max(NUM_DISTRACTORS, 0)]:
        comp_vec = embed_text(f"{comp.get('title','')} {comp.get('description','')}")
        sim = cosine(client_vec, comp_vec)
        # Store the raw similarity and its x100 value on a copy of the
        # competitor row, flagged as a distractor.
        new_products.append({**comp, "is_distractor": True, "score": sim, "similarity": sim * 100})
        new_pairs.append({"pair_id": next_pair_id, "client_id": cid, "competitor_id": comp["id"]})
        next_pair_id += 1

write_jsonl(PRODUCTS_OUT, new_products)
write_jsonl(PAIRS_OUT, new_pairs)

print(f"OK. Escritos {len(new_products)} productos en {PRODUCTS_OUT}")
print(f"OK. Escritos {len(new_pairs)} pares en {PAIRS_OUT}")
print("Nota: has sobrescrito los ficheros por defecto con la variante con ruido. Ejecuta 4_smart_connections de nuevo para recuperar la versión gold.")


OK. Escritos 10000 productos en /Users/marc/Documents/Projectes/tfm-product-matching/data/matchings_products.jsonl
OK. Escritos 8000 pares en /Users/marc/Documents/Projectes/tfm-product-matching/data/matchings_pairs.jsonl
Nota: has sobrescrito los ficheros por defecto con la variante con ruido. Ejecuta 4_smart_connections de nuevo para recuperar la versión gold.
