# 3_compute_embeddings_resnet

Genera embeddings de imagen con ResNet50 (preentrenada en ImageNet) para compararlos con los de CLIP.
Entrada: `notebooks/data/step_2/products_clean_*.jsonl|parquet`
Salida: intermedios en `notebooks/data/step_3/` y fichero final `data/products.jsonl`.

In [7]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
try:
    from tqdm.auto import tqdm
except ImportError:
    tqdm = lambda x, **k: x

# Resolve the repository root: when the kernel runs from inside `notebooks/`,
# the root is one level up; otherwise the current directory is taken as root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd if _cwd.name != "notebooks" else _cwd.parent

# Per-step working directories live under notebooks/data/.
_NOTEBOOK_DATA = PROJECT_ROOT.joinpath("notebooks", "data")
STEP2_DIR = _NOTEBOOK_DATA / "step_2"
STEP3_DIR = _NOTEBOOK_DATA / "step_3"
STEP3_DIR.mkdir(parents=True, exist_ok=True)

# Intermediate copies (not committed to git)
STEP3_JSONL = STEP3_DIR.joinpath("products_with_embeddings_resnet.jsonl")
STEP3_TXT_NPY = STEP3_DIR.joinpath("emb_text.npy")
STEP3_IMG_NPY = STEP3_DIR.joinpath("emb_img_resnet.npy")

# Lightweight final copy for the web (only the required fields, no embeddings)
FINAL_JSONL = PROJECT_ROOT.joinpath("data", "products.jsonl")


## Embeddings de texto
Se cargan de `data/products.jsonl` si el fichero existe y sus registros incluyen el campo `emb_text`; si no, se usa un vector de ceros (dimensión 1) por producto para no mezclar.

In [8]:
# Prepare the text embeddings. If no `df` is in scope yet (e.g. the notebook is
# run on its own), load the cleaned products from step_2 first.


def _read_jsonl(path):
    """Return the JSON records stored one per line in *path*, skipping blank lines.

    The file object is iterated instead of calling ``str.splitlines()``:
    ``splitlines`` also breaks on characters such as U+2028 or U+0085, which
    ``json.dumps(..., ensure_ascii=False)`` writes unescaped inside strings,
    so a record containing one of them would be split in two and fail to parse.
    Text-mode iteration only splits on ``\\n``, ``\\r`` and ``\\r\\n``, all of
    which JSON always escapes inside strings.
    """
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


try:
    df
except NameError:
    # Parquet files take precedence; JSONL is only used when no parquet exists.
    candidates = sorted(STEP2_DIR.glob("products_clean*.parquet")) or sorted(
        STEP2_DIR.glob("products_clean*.jsonl")
    )
    if not candidates:
        raise FileNotFoundError(
            f"No hay products_clean en {STEP2_DIR}. Ejecuta el notebook 2"
        ) from None
    # NOTE(review): this takes the lexicographically first match; if the file
    # names carry a timestamp, confirm whether the latest one is wanted instead.
    INPUT_PATH = candidates[0]
    if INPUT_PATH.suffix == ".parquet":
        df = pd.read_parquet(INPUT_PATH)
    else:
        df = pd.DataFrame(_read_jsonl(INPUT_PATH))
    print(f"df cargado desde {INPUT_PATH}")

# Default: one all-zero, 1-dimensional text embedding per product.
emb_text = np.zeros((len(df), 1), dtype="float32")
clip_path = PROJECT_ROOT / "data" / "products.jsonl"
if clip_path.exists():
    try:
        recs = _read_jsonl(clip_path)
        if recs and "emb_text" in recs[0]:
            # Records lacking `emb_text` get a zero vector of the same length as
            # the first record's embedding, so the resulting array stays
            # rectangular instead of failing on a ragged shape.
            zero_vec = [0.0] * len(recs[0]["emb_text"])
            emb_text = np.array(
                [r.get("emb_text") or zero_vec for r in recs], dtype="float32"
            )
            print(f"emb_text importado desde {clip_path} {emb_text.shape}")
            if len(emb_text) != len(df):
                # Rows are matched by position only, so a different count means
                # the embeddings may not correspond to the rows of `df`.
                print(
                    f"⚠️ emb_text tiene {len(emb_text)} filas y df tiene {len(df)}; "
                    "las filas pueden no estar alineadas"
                )
    except (OSError, ValueError, TypeError, AttributeError) as e:
        # Best-effort import: on unreadable or malformed data keep the zeros.
        print(f"No se pudo importar emb_text desde products.jsonl: {e}")


## Embeddings de imagen (ResNet50 ImageNet)

In [9]:
# Compute one L2-normalised ResNet50 feature vector per product image.
# The whole cell is best-effort: if any dependency is missing or the pipeline
# fails, `emb_img` falls back to a zero array of shape (len(df), 1).
try:
    import io

    import requests
    import torch
    from PIL import Image
    from torchvision import models, transforms

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    # Drop the last child module (the classification layer) so the network
    # outputs the pooled feature vector instead of class scores.
    resnet = torch.nn.Sequential(*(list(resnet.children())[:-1])).to(device).eval()
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # URLs for which the black placeholder image was used instead of the real one.
    failed_urls = []

    def _placeholder_image():
        """Return the black 224x224 RGB image used when no real image is available."""
        return Image.new("RGB", (224, 224), color=(0, 0, 0))

    def load_image(url: str):
        """Download *url* and return it as an RGB PIL image.

        When the URL is empty, or the download or decoding fails, a black
        224x224 placeholder is returned and the URL is appended to
        ``failed_urls`` so the number of placeholder embeddings can be reported.
        """
        if not url:
            # No URL: skip the HTTP call, which could only fail.
            failed_urls.append(url)
            return _placeholder_image()
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            return Image.open(io.BytesIO(r.content)).convert("RGB")
        except Exception:
            # Deliberate per-image best effort: any network, HTTP-status or
            # decoding error must not abort the whole run.
            failed_urls.append(url)
            return _placeholder_image()

    vecs = []
    with torch.no_grad():
        for url in tqdm(df["image_url"].fillna("").tolist(), desc="ResNet50"):
            img = load_image(url)
            t = preprocess(img).unsqueeze(0).to(device)
            v = resnet(t).flatten().cpu().numpy()
            vecs.append(v)
    emb_img = np.stack(vecs)
    # L2-normalise each row; the epsilon avoids division by zero.
    norms = np.linalg.norm(emb_img, axis=1, keepdims=True) + 1e-9
    emb_img = emb_img / norms
    print("emb_img shape:", emb_img.shape)
    if failed_urls:
        print(f"⚠️ {len(failed_urls)} imágenes no disponibles; se usó una imagen negra")
except Exception as e:
    print("⚠️ ResNet no disponible; usando emb_img de ceros", e)
    emb_img = np.zeros((len(df), 1), dtype="float32")


emb_img shape: (2000, 2048)


## Guardar

In [11]:
# Attach the embeddings to the products and write the step_3 intermediates
# (JSONL + .npy) plus the lightweight final JSONL without embeddings.


def _write_jsonl(frame, path):
    """Write *frame* to *path* as UTF-8 JSON Lines, one record per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n"
            for rec in frame.to_dict(orient="records")
        )


def _rows_to_array(rows):
    """Stack *rows* into a NumPy array, moving objects that expose ``.cpu()`` to NumPy first."""
    return np.asarray([e.cpu().numpy() if hasattr(e, "cpu") else e for e in rows])


df_out = df.copy()

# Align lengths to avoid mismatches between the frame and the embeddings.
len_df = len(df_out)
# Fallbacks when an embedding variable is undefined (partial notebook execution).
if 'emb_img' not in locals():
    IMG_DIM = 2048
    emb_img = [np.zeros((IMG_DIM,), dtype="float32") for _ in range(len_df)]
if 'emb_text' not in locals():
    emb_text = np.zeros((len_df, 1), dtype="float32")

len_txt = len(emb_text)
len_img = len(emb_img)
min_len = min(len_df, len_txt, len_img)
if not (len_df == len_txt == len_img):
    # Everything is cut to the shortest length below; say so instead of
    # dropping rows silently.
    print(
        f"⚠️ Longitudes distintas (df={len_df}, emb_text={len_txt}, "
        f"emb_img={len_img}); se recorta a {min_len}"
    )

# Convert to NumPy (the embeddings may arrive as tensors).
try:
    emb_text_arr = _rows_to_array(emb_text[:min_len])
    emb_img_arr = _rows_to_array(emb_img[:min_len])
except Exception as e:
    # Best-effort fallback: keep the raw (truncated) sequences, but report it.
    print(f"⚠️ No se pudieron convertir los embeddings a numpy: {e}")
    emb_text_arr = emb_text[:min_len]
    emb_img_arr = emb_img[:min_len]

if len_df != min_len:
    df_out = df_out.iloc[:min_len].copy()

df_out["emb_text"] = [e.tolist() for e in emb_text_arr]
df_out["emb_img"] = [e.tolist() for e in emb_img_arr]

# Intermediate output in step_3 (with embeddings)
_write_jsonl(df_out, STEP3_JSONL)
np.save(STEP3_TXT_NPY, emb_text_arr)
np.save(STEP3_IMG_NPY, emb_img_arr)

# Lightweight final copy for the web (no embeddings, only the required fields)
FINAL_JSONL.parent.mkdir(parents=True, exist_ok=True)
df_web = df_out.drop(columns=["emb_text", "emb_img"], errors="ignore")
_write_jsonl(df_web, FINAL_JSONL)
print("Guardado JSONL final:", FINAL_JSONL)
print("Intermedios en:", STEP3_JSONL)


Guardado JSONL final: /Users/marc/Documents/Projectes/tfm-product-matching/data/products.jsonl
Intermedios en: /Users/marc/Documents/Projectes/tfm-product-matching/notebooks/data/step_3/products_with_embeddings_resnet.jsonl
