# 3_compute_embeddings_resnet

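Genera embeddings de imagen con ResNet50 (preentrenada en ImageNet) para compararlos con los de CLIP.
Entrada: `notebooks/data/step_2/products_clean_*.parquet` o, si no existe ninguno, `products_clean_*.jsonl`.
Salida: `data/products_with_embeddings_resnet.jsonl`, `data/emb_img_resnet.npy` y `data/emb_text.npy`.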

In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
try:
    from tqdm.auto import tqdm
except ImportError:
    tqdm = lambda x, **k: x

# Resolve the repository root whether the kernel was started from the repo
# root or from the notebooks/ folder.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
STEP2_DIR = PROJECT_ROOT / "notebooks" / "data" / "step_2"
FINAL_JSONL = PROJECT_ROOT / "data" / "products_with_embeddings_resnet.jsonl"
FINAL_TXT_NPY = PROJECT_ROOT / "data" / "emb_text.npy"  # text embeddings are reused from CLIP, or zeros
FINAL_IMG_NPY = PROJECT_ROOT / "data" / "emb_img_resnet.npy"

# Parquet files take precedence; JSONL is only considered when no Parquet file exists.
candidates = sorted(STEP2_DIR.glob("products_clean_*.parquet")) or sorted(STEP2_DIR.glob("products_clean_*.jsonl"))
if not candidates:
    raise FileNotFoundError(f"No hay products_clean en {STEP2_DIR}")
# With several matches, the alphabetically first file is used.
INPUT_PATH = candidates[0]
print(f"Usando {INPUT_PATH.name}")
if INPUT_PATH.suffix == ".parquet":
    df = pd.read_parquet(INPUT_PATH)
else:
    # Iterate the file handle instead of str.splitlines(): splitlines() also
    # breaks on characters such as U+2028, U+0085 or \x0c, which may appear
    # unescaped inside JSON strings and would cut a record in half.
    with INPUT_PATH.open(encoding="utf-8") as fh:
        records = [json.loads(line) for line in fh if line.strip()]
    df = pd.DataFrame.from_records(records)
df.head()


Usando products_clean_Sports_and_Outdoors_sample.jsonl


Unnamed: 0,id,title,description,category_path,image_url,text_for_embedding
0,0,Adult Ballet Tutu Cheetah Pink,,"['Sports & Outdoors', 'Other Sports', 'Dance',...",https://ecx.images-amazon.com/images/I/51EzU6q...,Adult Ballet Tutu Cheetah Pink. Categories: ['...
1,1,Girls Ballet Tutu Neon Pink,High quality 3 layer ballet tutu. 12 inches in...,"['Sports & Outdoors', 'Other Sports', 'Dance']",https://ecx.images-amazon.com/images/I/41xBoP0...,Girls Ballet Tutu Neon Pink. High quality 3 la...
2,2,Adult Ballet Tutu Yellow,,"['Sports & Outdoors', 'Other Sports', 'Dance',...",https://ecx.images-amazon.com/images/I/21GNUNI...,Adult Ballet Tutu Yellow. Categories: ['Sports...
3,3,Girls Ballet Tutu Zebra Hot Pink,TUtu,"['Sports & Outdoors', 'Other Sports', 'Dance']",https://ecx.images-amazon.com/images/I/51fAmVk...,Girls Ballet Tutu Zebra Hot Pink. TUtu. Catego...
4,4,Adult Ballet Tutu Purple,,"['Sports & Outdoors', 'Other Sports', 'Dance',...",https://ecx.images-amazon.com/images/I/41TxNYG...,Adult Ballet Tutu Purple. Categories: ['Sports...


## Embeddings de texto
Se cargan desde `data/products.jsonl` si existe; si no, se rellenan con ceros como marcador de posición, para no mezclar.

In [3]:
# Placeholder text embeddings: a single zero column per product. It is replaced
# below only if usable embeddings can be read from data/products.jsonl.
emb_text = np.zeros((len(df), 1), dtype="float32")
clip_path = PROJECT_ROOT / "data" / "products.jsonl"
if clip_path.exists():
    try:
        # Iterate the file handle rather than str.splitlines(), which would also
        # split on U+2028/U+0085/etc. that can occur unescaped inside JSON strings.
        with clip_path.open(encoding="utf-8") as fh:
            recs = [json.loads(line) for line in fh if line.strip()]
        if recs and "emb_text" in recs[0]:
            loaded = np.array([r.get("emb_text", [0]) for r in recs], dtype="float32")
            # The save step assigns one embedding per row of df, so a matrix with
            # a different row count (or a non 2-D shape) cannot be used.
            # NOTE(review): rows are matched to df by position only; confirm that
            # products.jsonl lists the products in the same order as df.
            if loaded.ndim != 2 or loaded.shape[0] != len(df):
                raise ValueError(
                    f"emb_text tiene forma {loaded.shape}; se esperaban {len(df)} filas"
                )
            emb_text = loaded
            print("emb_text importado desde products.jsonl", emb_text.shape)
    except Exception as e:
        # Best effort: on any problem keep the zero placeholder defined above.
        print("⚠️ No se pudo cargar emb_text existente, se dejan ceros", e)


## Embeddings de imagen (ResNet50 ImageNet)

In [4]:
# Compute one L2-normalised ResNet50 feature vector per product image.
# If anything in this cell fails (e.g. torch is not installed), emb_img falls
# back to a single zero column per product so the notebook can still finish.
try:
    import io, requests, torch
    from PIL import Image
    from torchvision import models, transforms

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    # Keep every child module except the last one (the classifier head in
    # torchvision's ResNet) so the network outputs features, not class scores.
    resnet = torch.nn.Sequential(*(list(resnet.children())[:-1])).to(device).eval()
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # URLs whose image could not be downloaded or decoded; filled by load_image.
    failed_urls = []

    def load_image(url: str):
        """Download ``url`` and return it as an RGB PIL image.

        On any failure (invalid/empty URL, HTTP error, timeout, undecodable
        bytes) the URL is recorded in ``failed_urls`` and a black 224x224
        placeholder image is returned instead of raising.
        """
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            return Image.open(io.BytesIO(r.content)).convert("RGB")
        except Exception:
            failed_urls.append(url)
            return Image.new("RGB", (224, 224), color=(0, 0, 0))

    vecs = []
    # Inference only: no gradients are needed for any image.
    with torch.no_grad():
        for url in tqdm(df["image_url"].fillna("").tolist(), desc="ResNet50"):
            img = load_image(url)
            t = preprocess(img).unsqueeze(0).to(device)
            v = resnet(t).flatten().cpu().numpy()
            vecs.append(v)
    emb_img = np.stack(vecs)
    # L2-normalise each row; the epsilon avoids a division by zero.
    norms = np.linalg.norm(emb_img, axis=1, keepdims=True) + 1e-9
    emb_img = emb_img / norms
    print("emb_img shape:", emb_img.shape)
    if failed_urls:
        # Every failed image was embedded from the same black placeholder, so
        # those rows share an identical vector; report it instead of hiding it.
        print(f"⚠️ {len(failed_urls)} de {len(vecs)} imágenes no se pudieron cargar; se usó una imagen negra")
except Exception as e:
    print("⚠️ ResNet no disponible; usando emb_img de ceros", e)
    emb_img = np.zeros((len(df), 1), dtype="float32")


⚠️ ResNet no disponible; usando emb_img de ceros No module named 'torch'


## Guardar

In [5]:
def _json_default(obj):
    """Convert NumPy arrays/scalars, which ``json`` cannot encode, to plain Python values."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Each product row receives exactly one text and one image embedding, so the
# three lengths must agree; fail early with a readable message otherwise.
if not (len(emb_text) == len(emb_img) == len(df)):
    raise ValueError(
        f"Longitudes inconsistentes: df={len(df)}, emb_text={len(emb_text)}, emb_img={len(emb_img)}"
    )

df_out = df.copy()
df_out["emb_text"] = [e.tolist() for e in emb_text]
df_out["emb_img"] = [e.tolist() for e in emb_img]

# Make sure the output folders exist (e.g. on a fresh checkout).
for out_path in (FINAL_JSONL, FINAL_TXT_NPY, FINAL_IMG_NPY):
    out_path.parent.mkdir(parents=True, exist_ok=True)

# One JSON object per line. `default` handles NumPy values that may remain in
# the records (for instance list-like columns loaded from Parquet).
with open(FINAL_JSONL, "w", encoding="utf-8") as f:
    for rec in df_out.to_dict(orient="records"):
        f.write(json.dumps(rec, ensure_ascii=False, default=_json_default) + "\n")

# NOTE(review): FINAL_TXT_NPY is written even when emb_text is only the zero
# placeholder; confirm no other pipeline step relies on that file's contents.
np.save(FINAL_TXT_NPY, emb_text)
np.save(FINAL_IMG_NPY, emb_img)
print("Guardado JSONL:", FINAL_JSONL)
print("Guardado NPY text:", FINAL_TXT_NPY)
print("Guardado NPY img:", FINAL_IMG_NPY)


Guardado JSONL: /Users/marc/Documents/Projectes/tfm-product-matching/data/products_with_embeddings_resnet.jsonl
Guardado NPY text: /Users/marc/Documents/Projectes/tfm-product-matching/data/emb_text.npy
Guardado NPY img: /Users/marc/Documents/Projectes/tfm-product-matching/data/emb_img_resnet.npy
