# 3_compute_embeddings

Genera embeddings de texto e imagen a partir de `products_clean`.
Este notebook tolera dependencias ausentes: recurre a embeddings simulados si faltan los modelos y omite el guardado en Parquet si no hay pyarrow.

In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
try:
    from tqdm.auto import tqdm
except ImportError:
    print("⚠️ tqdm no instalado; sin barra de progreso. `pip install tqdm` para activarla.")

    def tqdm(x, **k):
        """No-op stand-in for tqdm: return the iterable untouched, ignoring options."""
        return x

# Resolve the project root whether the notebook is launched from the
# repository root or from inside its "notebooks" directory.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
STEP2_DIR = PROJECT_ROOT / "notebooks" / "data" / "step_2"
FINAL_PARQUET = PROJECT_ROOT / "data" / "products_with_embeddings.parquet"
FINAL_JSONL = PROJECT_ROOT / "data" / "products_with_embeddings.jsonl"
IMAGES_CACHE = PROJECT_ROOT / "notebooks" / "data" / "images_cache"
IMAGES_CACHE.mkdir(parents=True, exist_ok=True)

# Prefer Parquet inputs; JSONL files are considered only when no Parquet
# file matches.
candidates = sorted(STEP2_DIR.glob("products_clean_*.parquet")) or sorted(STEP2_DIR.glob("products_clean_*.jsonl"))
if not candidates:
    raise FileNotFoundError(f"No hay products_clean en {STEP2_DIR}. Ejecuta 2_preparacion_productos.ipynb.")
# NOTE(review): this takes the lexicographically smallest file name. If the
# "*" part is a date/timestamp that would be the oldest export — confirm
# whether candidates[-1] is what is actually wanted.
INPUT_PATH = candidates[0]
print(f"Usando {INPUT_PATH.name}")

if INPUT_PATH.suffix == ".parquet":
    df = pd.read_parquet(INPUT_PATH)
else:
    # Iterate the file object rather than calling str.splitlines():
    # splitlines() also breaks on U+2028, U+2029 and U+0085, which JSON
    # written with ensure_ascii=False may contain unescaped inside string
    # values. File iteration only splits on real newlines, so one record
    # always stays on one line.
    with INPUT_PATH.open(encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame.from_records(records)
df.head()

## Embeddings de texto (fallback si no hay sentence-transformers)

In [None]:
# Default fallback: a single zero column per row, replaced only when the real
# encoder runs to completion (same convention as the image-embedding cell).
emb_text = np.zeros((len(df), 1), dtype="float32")
try:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    # Missing texts are encoded as empty strings so every row gets a vector.
    texts = df["text_for_embedding"].fillna("").tolist()
    emb_text = model.encode(texts, batch_size=64, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
    print("emb_text shape:", emb_text.shape)
except ImportError as e:
    # The library (or one of its dependencies) is not installed.
    print("⚠️ sentence-transformers no disponible, usando emb_text simulados (ceros)")
    print(e)
except Exception as e:
    # Any other failure (missing column, model loading, encoding) is reported
    # as such instead of being blamed on a missing package. The notebook stays
    # best-effort and keeps the zero fallback.
    print("⚠️ Error al calcular embeddings de texto, usando emb_text simulados (ceros)")
    print(e)

## Embeddings de imagen (opcional; fallback a ceros)

In [None]:
emb_img = np.zeros((len(df), 1), dtype="float32")  # default fallback
try:
    import io
    import requests
    from PIL import Image
    from sentence_transformers import SentenceTransformer
    img_model = SentenceTransformer("clip-ViT-B-32")

    # URLs whose image could not be fetched or decoded; used for the summary
    # printed after loading.
    failed_image_urls = []

    def load_image(url: str):
        """Download *url* and return it as an RGB PIL image.

        Any failure (invalid or empty URL, HTTP error, timeout, undecodable
        payload) is recorded in ``failed_image_urls`` and a black 224x224
        placeholder is returned, so the caller always gets one image per row.
        """
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            return Image.open(io.BytesIO(resp.content)).convert("RGB")
        except Exception:
            # Deliberately broad: one bad URL must not abort the whole run.
            failed_image_urls.append(url)
            return Image.new("RGB", (224, 224), color=(0, 0, 0))

    images = [load_image(url) for url in tqdm(df["image_url"].fillna("").tolist(), desc="img")]
    if failed_image_urls:
        # Placeholder rows are still passed to the encoder below, so surface
        # how many embeddings do not come from a real product image.
        print(f"⚠️ {len(failed_image_urls)} de {len(images)} imágenes no se pudieron cargar; se usa una imagen negra en su lugar.")
    emb_img = img_model.encode(images, batch_size=32, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
    print("emb_img shape:", emb_img.shape)
except Exception as e:
    print("⚠️ Embeddings de imagen omitidos (usa fallback de ceros). Instala pillow/requests/sentence-transformers si quieres usarlos.")
    print(e)

## Guardado (JSONL siempre; parquet si pyarrow está disponible)

In [None]:
# Attach the embeddings to a copy of the products table, one vector per row.
df_out = df.copy()
df_out["emb_text"] = list(emb_text)
df_out["emb_img"] = list(emb_img)

# Create the output directories up front: nothing earlier in the notebook
# creates PROJECT_ROOT / "data", and without it every write below fails.
NPY_DIR = PROJECT_ROOT / "data"
for _out_dir in (FINAL_PARQUET.parent, FINAL_JSONL.parent, NPY_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Parquet is optional: it is written only when pyarrow imports and the write
# succeeds; otherwise the JSONL/NPY outputs are still produced.
parquet_ok = False
try:
    import pyarrow  # noqa: F401
    df_out.to_parquet(FINAL_PARQUET, index=False)
    parquet_ok = True
except Exception as e:
    print("⚠️ Parquet no guardado; se continua con JSONL/NPY.")
    print(e)


def _to_plain_list(vec):
    """Return *vec* as a plain Python list (via ``tolist()`` when available)."""
    return vec.tolist() if hasattr(vec, "tolist") else list(vec)


# Convert the embedding columns to plain lists so json.dumps can handle them.
df_json = df_out.copy()
df_json["emb_text"] = df_json["emb_text"].apply(_to_plain_list)
df_json["emb_img"] = df_json["emb_img"].apply(_to_plain_list)

# One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
with open(FINAL_JSONL, "w", encoding="utf-8") as f:
    for rec in df_json.to_dict(orient="records"):
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# Standalone embedding matrices.
# NOTE(review): rebuilding the arrays from Python lists stores them as
# float64; saving emb_text / emb_img directly would keep their original dtype.
# Left unchanged so existing consumers of the .npy files are unaffected —
# confirm before changing.
np.save(NPY_DIR / "emb_text.npy", np.array(df_json["emb_text"].tolist()))
np.save(NPY_DIR / "emb_img.npy", np.array(df_json["emb_img"].tolist()))

if parquet_ok:
    print("Guardado Parquet:", FINAL_PARQUET)
else:
    print("Parquet omitido")
print("Guardado JSONL:", FINAL_JSONL)
