In [1]:
import pandas as pd
import numpy as np
import lmdb
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Model selection: Hugging Face identifier of the embedding checkpoint.
MODEL_NAME = "BAAI/bge-large-en-v1.5"

# Load the tokenizer and the model weights for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# eval() returns the module itself, so it can be chained onto the load;
# it switches the model to inference mode (e.g. disables dropout).
model = AutoModel.from_pretrained(MODEL_NAME).eval()

# 句子编码函数
def encode(text: str) -> np.ndarray:
    """Embed a single sentence with the module-level ``tokenizer`` and ``model``.

    The text is prefixed with a fixed retrieval instruction, tokenized
    (truncated to at most 512 tokens) and run through the model with gradient
    tracking disabled. The hidden state at sequence position 0 is used as the
    sentence embedding.

    Args:
        text: The sentence to embed.

    Returns:
        The position-0 hidden state as a NumPy array with singleton
        dimensions squeezed away (a 1-D vector for one input sentence).

    NOTE(review): the BGE model card appears to document a different query
    instruction ("Represent this sentence for searching relevant passages: ")
    and L2-normalised embeddings -- confirm this prompt is intentional.
    """
    encoded = tokenizer(
        "Represent this sentence for retrieval: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Position 0 along the sequence axis; presumably the [CLS] token for this tokenizer.
    first_token = hidden_states[:, 0]
    return first_token.squeeze().cpu().numpy()

# Path configuration
csv_path = r"D:\VideoRecSystem\MicroLens\DataSet\MicroLens-50k_titles.csv"
lmdb_path = r"D:\VideoRecSystem\MicroLens\title_emb1024.lmdb"

# Read the CSV; the "item" and "title" columns are used below.
df = pd.read_csv(csv_path)

# Create the LMDB file (subdir=False: lmdb_path is the data file itself) with
# a 1 GiB map size. Using the environment as a context manager guarantees it
# is closed even if encoding fails part-way through this long-running loop.
with lmdb.open(lmdb_path, map_size=1024 << 20, subdir=False) as env:
    # One write transaction for the whole run: per the py-lmdb documentation it
    # commits on normal exit and aborts if an exception escapes the block.
    with env.begin(write=True) as txn:
        vector_dim = None

        # Iterate the two columns directly instead of df.iterrows(): iterrows
        # builds a Series per row and may coerce dtypes across columns, which
        # could turn an integer id into e.g. "12.0" and break digit-only keys.
        for item, title in tqdm(zip(df["item"], df["title"]), total=len(df)):
            key = str(item).encode("utf-8")
            vec = encode(str(title)).astype(np.float32)

            # Every stored vector must share one dimension; fail fast otherwise.
            if vector_dim is None:
                vector_dim = vec.size
            elif vec.size != vector_dim:
                raise ValueError(f"Item {item} vector dim {vec.size} != {vector_dim}")

            txn.put(key, vec.tobytes())

        # Record the dimension under the metadata key that load_lmdb_to_dict
        # looks up (int32, read back with np.frombuffer). The key is not
        # numeric, so that loader skips it when collecting vectors.
        if vector_dim is not None:
            txn.put(b"__dim__", np.array([vector_dim], dtype=np.int32).tobytes())

print("finished")


100%|██████████| 19220/19220 [1:41:48<00:00,  3.15it/s]  


✅ 完成：向量已写入 LMDB。


In [2]:
def load_lmdb_to_dict(lmdb_path, vector_dim=None, dtype=np.float32):
    """Load every integer-keyed vector stored in an LMDB file into a dict.

    The environment is opened read-only (``subdir=False``, no locking, no
    readahead) and is always closed before returning or raising.

    Args:
        lmdb_path: Path of the LMDB data file.
        vector_dim: Expected number of elements per vector. When ``None`` the
            value stored under the ``b"__dim__"`` key (int32) is used if
            present; otherwise the size of the first vector read is used.
        dtype: NumPy dtype used to decode each stored value.

    Returns:
        dict mapping ``int`` keys to writable 1-D ``np.ndarray`` copies.
        Entries whose key cannot be decoded as text or is not purely digits
        (such as the ``__dim__`` metadata record) are skipped.

    Raises:
        ValueError: If a vector's size differs from ``vector_dim``, or if
            NumPy cannot decode a value with ``dtype`` (e.g. its byte length
            is not a multiple of the item size).
    """
    env = lmdb.open(lmdb_path, readonly=True, subdir=False, lock=False, readahead=False)
    vectors = {}

    try:
        with env.begin() as txn:
            # If no dimension was given, try to read it from the LMDB metadata.
            if vector_dim is None:
                dim_bytes = txn.get(b"__dim__")
                if dim_bytes:
                    vector_dim = int(np.frombuffer(dim_bytes, dtype=np.int32)[0])
                    print(f"Found stored dimension: {vector_dim}")
                else:
                    print("No dimension info found in LMDB, will auto-detect from first item")

            for key_bytes, val_bytes in txn.cursor():
                # Only numeric keys are item vectors. isdigit() also accepts a
                # few characters int() rejects, hence the ValueError guard.
                try:
                    key_str = key_bytes.decode()
                    if not key_str.isdigit():
                        continue
                    key_int = int(key_str)
                except (UnicodeDecodeError, ValueError):
                    continue

                # Copy so the array owns writable memory that does not depend
                # on the transaction's buffer.
                vec = np.frombuffer(val_bytes, dtype=dtype).copy()

                # Auto-detect the dimension from the first vector if still unknown.
                if vector_dim is None:
                    vector_dim = vec.size
                    print(f"Auto-detected vector dimension: {vector_dim}")

                if vec.size != vector_dim:
                    raise ValueError(f"Item {key_int} vector dim {vec.size} != {vector_dim}")
                vectors[key_int] = vec
    finally:
        # Release the environment on success and on any failure while reading.
        env.close()

    return vectors

In [3]:
# Smoke-test the loader against the LMDB file written earlier in the notebook.
print("\nTesting load function...")
vectors = load_lmdb_to_dict(lmdb_path, vector_dim=1024, dtype=np.float32)
print(f"Successfully loaded {len(vectors)} vectors")
# Show the shape of the first vector when anything was loaded.
if len(vectors) > 0:
    first_key = next(iter(vectors.keys()))
    print(f"First vector shape: {vectors[first_key].shape}")


Testing load function...
Successfully loaded 19220 vectors
First vector shape: (1024,)
