In [2]:
import os, glob, lmdb, numpy as np, torch
from PIL import Image
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA
import clip
from torchvision import transforms

"""
一次性生成 128 维图片 embedding 并写入 LMDB。
"""

# --------------- 配置 ---------------
ROOT_DIR   = r"D:\VideoRecSystem\MicroLens\DataSet"
LMDB_PATH  = r"D:\VideoRecSystem\MicroLens\cover_emb128.lmdb"
MAP_SIZE   = 256 << 20          # 256 MB upper-bound
BATCH_SIZE = 64               # 调大/调小按显存来
OUT_DIM    = 128              # 目标维度
PCA_BATCH  = 2048             # IncrementalPCA 每批样本数

In [3]:
def load_lmdb_to_dict(lmdb_path, vector_dim=None, dtype=np.float32):
    env = lmdb.open(lmdb_path, readonly=True, subdir=False, lock=False, readahead=False)

    # 如果没有指定维度，尝试从 LMDB 中读取
    if vector_dim is None:
        with env.begin() as txn:
            dim_bytes = txn.get(b"__dim__")
            if dim_bytes:
                vector_dim = np.frombuffer(dim_bytes, dtype=np.int32)[0]
                print(f"Found stored dimension: {vector_dim}")
            else:
                print("No dimension info found in LMDB, will auto-detect from first item")

    raw_data = {}

    with env.begin() as txn:
        cursor = txn.cursor()
        for key_bytes, val_bytes in cursor:
            try:
                key_str = key_bytes.decode()
                if not key_str.isdigit():
                    continue
                key_int = int(key_str)
            except:
                continue

            raw_data[key_int] = bytes(val_bytes)  # 拷贝 buffer

    env.close()

    vectors = {}
    for k, val in raw_data.items():
        vec = np.frombuffer(val, dtype=dtype)

        # 自动检测维度
        if vector_dim is None:
            vector_dim = vec.size
            print(f"Auto-detected vector dimension: {vector_dim}")

        if vec.size != vector_dim:
            raise ValueError(f"Item {k} vector dim {vec.size} != {vector_dim}")
        vectors[k] = vec.copy()  # 拷贝防止潜在引用问题

    return vectors

In [1]:


# --------------- 预处理 & 模型 ---------------
device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess = clip.load("ViT-B/32", device=device)   # 输出 512 维
model = model.to(device).eval()

# ------- 收集并排序 jpg（保持不变） -------
def collect_images(root):
    paths = glob.glob(os.path.join(root, "*.jpg"))
    paths = [p for p in paths if "-" not in os.path.basename(p)]
    # 按文件名里的数字顺序排：1.jpg -> 2.jpg -> ...
    paths.sort(key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
    return paths

img_paths = collect_images(ROOT_DIR)
print(f"Found {len(img_paths)} valid jpg files.")

# --------------- 检查并删除现有 LMDB 文件 ---------------
if os.path.exists(LMDB_PATH):
    print(f"Removing existing LMDB file: {LMDB_PATH}")
    os.remove(LMDB_PATH)

# --------------- 第 1 步：增量 PCA 拟合 ---------------
print("Step 1: Fitting PCA...")
pca = IncrementalPCA(n_components=OUT_DIM)

with torch.no_grad():
    for s in tqdm(range(0, len(img_paths), PCA_BATCH), desc="Fit PCA"):
        batch = img_paths[s:s + PCA_BATCH]
        imgs  = [preprocess(Image.open(p).convert("RGB")) for p in batch]
        feats = torch.stack(imgs).to(device)

        # 提取 512 维特征
        feats = model.encode_image(feats).cpu().numpy().astype(np.float32)  # (B,512)
        pca.partial_fit(feats)

print("PCA fitting done.")

# --------------- 第 2 步：提特征 → 降维 → 写 LMDB ---------------
print("Step 2: Generating embeddings and storing to LMDB...")
env = lmdb.open(LMDB_PATH, map_size=MAP_SIZE, subdir=False)

with env.begin(write=True) as txn, torch.no_grad():
    # 把 PCA 参数也存进去，后续可复用
    txn.put(b"__pca_mean__", pca.mean_.astype(np.float32).tobytes())
    txn.put(b"__pca_components__", pca.components_.astype(np.float32).tobytes())
    txn.put(b"__dim__", np.array([OUT_DIM], dtype=np.int32).tobytes())

    for s in tqdm(range(0, len(img_paths), BATCH_SIZE), desc="Embed & Store"):
        batch = img_paths[s:s + BATCH_SIZE]
        imgs  = [preprocess(Image.open(p).convert("RGB")) for p in batch]

        # 提取 512 维特征
        feats_512 = model.encode_image(torch.stack(imgs).to(device)).cpu().numpy().astype(np.float32)

        # 使用 PCA 降维到 128 维
        feats_128 = pca.transform(feats_512)

        for path, vec in zip(batch, feats_128):
            item_id = int(os.path.splitext(os.path.basename(path))[0])   # 1, 2, ...
            key = f"{item_id:08d}".encode()   # 00000001, 00000002 …
            txn.put(key, vec.astype(np.float32).tobytes())

env.close()
print("✅ 128-dim embeddings saved to", LMDB_PATH)

# --------------- 验证存储的数据 ---------------
def verify_lmdb_data(lmdb_path, expected_dim):
    env = lmdb.open(lmdb_path, readonly=True, subdir=False, lock=False, readahead=False)

    sample_count = 0
    with env.begin() as txn:
        cursor = txn.cursor()
        for key_bytes, val_bytes in cursor:
            try:
                key_str = key_bytes.decode()
                if not key_str.isdigit():
                    continue

                vec = np.frombuffer(val_bytes, dtype=np.float32)
                if sample_count == 0:
                    print(f"First item key: {key_str}, vector dim: {vec.size}")

                if vec.size != expected_dim:
                    print(f"❌ Item {key_str} has wrong dimension: {vec.size} != {expected_dim}")
                    env.close()
                    return False

                sample_count += 1
                if sample_count >= 5:  # 只检查前5个
                    break
            except:
                continue

    env.close()
    print(f"✅ Verified {sample_count} items, all have dimension {expected_dim}")
    return True

# 验证数据
verify_lmdb_data(LMDB_PATH, OUT_DIM)

# --------------- 修复后的加载函数 ---------------


# 测试加载
print("\nTesting load function...")
vectors = load_lmdb_to_dict(LMDB_PATH, vector_dim=128, dtype=np.float32)
print(f"Successfully loaded {len(vectors)} vectors")
if vectors:
    first_key = next(iter(vectors))
    print(f"First vector shape: {vectors[first_key].shape}")

Fit PCA:   0%|          | 0/10 [00:00<?, ?it/s]

Found 19220 valid jpg files.
Step 1: Fitting PCA...


Fit PCA: 100%|██████████| 10/10 [12:08<00:00, 72.81s/it]
Embed & Store:   0%|          | 0/301 [00:00<?, ?it/s]

PCA fitting done.
Step 2: Generating embeddings and storing to LMDB...


Embed & Store: 100%|██████████| 301/301 [10:23<00:00,  2.07s/it]


✅ 128-dim embeddings saved to D:\VideoRecSystem\MicroLens\cover_emb128.lmdb
First item key: 00000001, vector dim: 128
✅ Verified 5 items, all have dimension 128

Testing load function...
Successfully loaded 19220 vectors
First vector shape: (128,)


In [4]:
vectors = load_lmdb_to_dict(LMDB_PATH, vector_dim=128, dtype=np.float32)