In [1]:
"""
一次性生成 128 维图片 embedding 并写入 LMDB。
作者：ChatGPT 2025-06-29
"""

import os, glob, lmdb, numpy as np, torch
from PIL import Image
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA
import clip
from torchvision import transforms

# --------------- Configuration ---------------
# Directory scanned for *.jpg images.
ROOT_DIR = r"D:\VideoRecSystem\MicroLens\DataSet"
# LMDB file the embeddings are written to.
LMDB_PATH = r"D:\VideoRecSystem\MicroLens\cover_emb128.lmdb"
# LMDB map_size: 256 MB upper bound.
MAP_SIZE = 256 * 1024 * 1024
# Images per CLIP forward pass; raise or lower to fit GPU memory.
BATCH_SIZE = 64
# Target embedding dimensionality after PCA.
OUT_DIM = 128
# Number of samples handed to IncrementalPCA per batch.
PCA_BATCH = 2048

# --------------- Preprocessing & model ---------------
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load yields the model plus the `preprocess` callable applied to every image
# before encoding (the original note states the encoder output is 512-d).
model, preprocess = clip.load("ViT-B/32", device=device)

# Make sure the model lives on `device` and is in inference mode.
model = model.to(device)
model.eval()


# ------- Collect and sort jpg files -------
def collect_images(root):
    """Return the numerically named ``*.jpg`` files directly under ``root``.

    Files whose basename contains ``-`` are excluded. Files whose stem cannot be
    parsed by ``int()`` (e.g. ``cover.jpg``) are skipped as well; previously such
    a file made the numeric sort raise ``ValueError`` and aborted the whole scan.

    Args:
        root: Directory to scan (non-recursive).

    Returns:
        List of file paths ordered by the integer value of the file stem
        (``1.jpg`` -> ``2.jpg`` -> ... -> ``10.jpg``).
    """
    numbered = []
    for path in glob.glob(os.path.join(root, "*.jpg")):
        name = os.path.basename(path)
        if "-" in name:
            continue
        try:
            number = int(os.path.splitext(name)[0])
        except ValueError:
            # Not a numeric file name: ignore it instead of crashing the sort.
            continue
        numbered.append((number, path))
    # Sort on the number only so ties keep glob order (the sort is stable).
    numbered.sort(key=lambda pair: pair[0])
    return [path for _, path in numbered]

# Scan ROOT_DIR once; the resulting ordered list is what the PCA-fitting and
# LMDB-writing steps iterate over.
img_paths = collect_images(root=ROOT_DIR)
print(f"Found {len(img_paths)} valid jpg files.")



Found 19220 valid jpg files.


In [2]:
# --------------- Step 1: fit the incremental PCA ---------------
# Each PCA chunk holds up to PCA_BATCH images, but the images are pushed through
# CLIP in BATCH_SIZE slices, so peak GPU memory is governed by BATCH_SIZE rather
# than by the (much larger) PCA chunk size.
if not img_paths:
    # Fail early: with no images nothing would be fitted, yet the cell would still
    # report success and the next step would break on the unfitted PCA.
    raise RuntimeError("No images to fit PCA on; check ROOT_DIR.")

pca = IncrementalPCA(n_components=OUT_DIM)

# partial_fit() can reject a chunk that has fewer samples than n_components
# (depending on the scikit-learn version), so a short tail is folded into the
# preceding chunk instead of being fitted on its own.
chunk_starts = list(range(0, len(img_paths), PCA_BATCH))
if len(chunk_starts) > 1 and len(img_paths) - chunk_starts[-1] < OUT_DIM:
    chunk_starts.pop()
chunk_bounds = chunk_starts + [len(img_paths)]

with torch.no_grad():
    for lo, hi in tqdm(list(zip(chunk_bounds[:-1], chunk_bounds[1:])), desc="Fit PCA"):
        chunk_feats = []
        for s in range(lo, hi, BATCH_SIZE):
            batch = img_paths[s:min(s + BATCH_SIZE, hi)]
            imgs = []
            for p in batch:
                # Context manager releases the file handle as soon as the image is decoded.
                with Image.open(p) as im:
                    imgs.append(preprocess(im.convert("RGB")))
            feats = model.encode_image(torch.stack(imgs).to(device))
            chunk_feats.append(feats.cpu().numpy().astype(np.float32))

        # One (chunk_size, feature_dim) float32 array per PCA update.
        pca.partial_fit(np.concatenate(chunk_feats, axis=0))

print("PCA fitting done.")


Fit PCA: 100%|██████████| 10/10 [13:01<00:00, 78.15s/it]

PCA fitting done.





In [3]:
# --------------- Step 2: extract features -> reduce with PCA -> write to LMDB ---------------
env = lmdb.open(LMDB_PATH, map_size=MAP_SIZE, subdir=False)
try:
    # A single write transaction: it commits when the `with` block exits cleanly
    # and is aborted if anything inside raises.
    with env.begin(write=True) as txn, torch.no_grad():
        # Store the PCA parameters as well so they can be reused later.
        txn.put(b"__pca_mean__", pca.mean_.astype(np.float32).tobytes())
        txn.put(b"__pca_components__", pca.components_.astype(np.float32).tobytes())
        txn.put(b"__dim__", np.array([OUT_DIM], dtype=np.int32).tobytes())

        for s in tqdm(range(0, len(img_paths), BATCH_SIZE), desc="Embed & Store"):
            batch = img_paths[s:s + BATCH_SIZE]
            imgs = []
            for p in batch:
                # Context manager releases the file handle as soon as the image is decoded.
                with Image.open(p) as im:
                    imgs.append(preprocess(im.convert("RGB")))

            feats = model.encode_image(torch.stack(imgs).to(device))
            feats = feats.cpu().numpy().astype(np.float32)

            # The dtype returned by pca.transform is not pinned by this code, so cast
            # explicitly: every stored vector is then OUT_DIM float32 values, matching
            # the float32 PCA parameters written above.
            feats128 = pca.transform(feats).astype(np.float32)

            for path, vec in zip(batch, feats128):
                item_id = int(os.path.splitext(os.path.basename(path))[0])   # 1, 2, ...
                key = f"{item_id:08d}".encode()   # 00000001, 00000002, ...
                txn.put(key, vec.tobytes())
finally:
    # Always release the environment (memory map and file handle), even on failure.
    # Re-open LMDB_PATH to read the embeddings back.
    env.close()

print("✅ 128-dim embeddings saved to", LMDB_PATH)

Embed & Store: 100%|██████████| 301/301 [09:54<00:00,  1.98s/it]

✅ 128-dim embeddings saved to D:\VideoRecSystem\MicroLens\cover_emb128.lmdb



