In [1]:
%%bash
# Remove any preinstalled pandas/pyarrow first; `|| true` lets the script
# continue even when the uninstall command exits non-zero.
pip uninstall --quiet --yes pandas pyarrow || true
# Install the pinned pandas, a pyarrow release below 20, and tqdm.
pip install --quiet --upgrade \
    "pandas==2.2.2" \
    "pyarrow<20" \
    tqdm


   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.7/12.7 MB 135.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.1/42.1 MB 51.8 MB/s eta 0:00:00


In [4]:
from google.colab import drive

# Attach Google Drive at /content/drive so files under it can be read and written.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Input tables on the mounted Drive.
ITEMS_IN = "/content/drive/MyDrive/MMREC/data/items.parquet"
INTERACTIONS_IN = "/content/drive/MyDrive/MMREC/data/interactions.parquet"
FEATS_IN = "/content/drive/MyDrive/MMREC/data/features/item_features_clip.parquet"

# Output folder and output feature file.
OUT_DIR = "/content/drive/MyDrive/MMREC/data/imgsubset"
FEATS_OUT = "/content/drive/MyDrive/MMREC/data/features/item_features_clip_imgsubset.parquet"

import os, re, numpy as np, pandas as pd
# Make sure the output folder exists (no error if it is already there).
os.makedirs(OUT_DIR, exist_ok=True)

# 1) Load the three input tables, in the order items -> interactions -> features
items, inter, feats = (
    pd.read_parquet(src_path, engine="pyarrow")
    for src_path in (ITEMS_IN, INTERACTIONS_IN, FEATS_IN)
)

# 2) Auto-detect the image-vector column(s) by name
img_cols = [
    col_name
    for col_name in feats.columns
    if re.match(r"^(clip_img|img_emb)_\d+$", col_name) is not None
]
if len(img_cols) == 0:
    # The message says: no image-vector column (named like clip_img_* or
    # img_emb_*) was found; check that FEATS_IN points at the right file.
    raise RuntimeError(
        "在特征文件里找不到图像向量列（期望列名形如 clip_img_* 或 img_emb_*）。请确认 FEATS_IN 指向正确的文件。"
    )
# When several columns match, the first one in column order is used.
img_col = img_cols[0]
print("Using image feature column:", img_col)

# 3) Decide whether an item really has an image feature by checking that its
#    vector is not all zeros (more reliable than relying on a has_image flag).
def has_real_image_vec(v):
    """Return True when ``v`` holds at least one finite, non-zero component.

    Args:
        v: The embedding of one item -- any array-like NumPy can convert to
            float32 -- or None when the embedding is missing.

    Returns:
        bool: False for None, for an empty vector, and for a vector whose
        components are all zero or non-finite (NaN/inf); True otherwise.
    """
    if v is None:
        return False
    a = np.asarray(v, dtype=np.float32).ravel()
    # NaN compares unequal to 0, so a plain `a != 0` test would accept a
    # missing value that was coerced to NaN; only finite components count.
    return bool(np.any(np.isfinite(a) & (a != 0)))

# Flag every feature row whose image vector passes has_real_image_vec.
# The flag is stored as a column on `feats`, so frames sliced from it keep it.
feats["has_real_img"] = feats[img_col].apply(has_real_image_vec)

# 4) Cast ids to str in every table so id comparisons never hit a dtype mismatch
for df in (items, inter, feats):
    if "item_id" not in df.columns:
        continue
    df["item_id"] = df["item_id"].astype(str)
if "user_id" in inter.columns:
    inter["user_id"] = inter["user_id"].astype(str)

# 5) Collect the item_ids that really have an image feature, then filter all three tables
im_ids = set(feats.loc[feats["has_real_img"], "item_id"])
print(f"items with non-zero image embedding: {len(im_ids):,}")

# Items are additionally de-duplicated on item_id; the other two tables are only filtered.
items_img = (
    items.loc[items["item_id"].isin(im_ids)]
    .drop_duplicates(subset=["item_id"])
    .reset_index(drop=True)
)
inter_img = inter.loc[inter["item_id"].isin(im_ids)].reset_index(drop=True)
feats_img = feats.loc[feats["item_id"].isin(im_ids)].reset_index(drop=True)

# 6) Save the filtered tables
items_out = os.path.join(OUT_DIR, "items.parquet")
inter_out = os.path.join(OUT_DIR, "interactions.parquet")
outputs = (
    (items_out, items_img),
    (inter_out, inter_img),
    (FEATS_OUT, feats_img),
)
for out_path, out_frame in outputs:
    out_frame.to_parquet(out_path, index=False, engine="pyarrow")

# 7) Print summary statistics: one "path -> row count" line per written file
print("Saved:")
for out_path, out_frame in outputs:
    print("  ", out_path, "->", len(out_frame))
print(f"Users in interactions: {inter_img['user_id'].nunique():,} | Items in interactions: {inter_img['item_id'].nunique():,}")


Using image feature column: clip_img_512
items with non-zero image embedding: 30,812
Saved:
   /content/drive/MyDrive/MMREC/data/imgsubset/items.parquet -> 30812
   /content/drive/MyDrive/MMREC/data/imgsubset/interactions.parquet -> 213523
   /content/drive/MyDrive/MMREC/data/features/item_features_clip_imgsubset.parquet -> 30812
Users in interactions: 206,474 | Items in interactions: 30,812


In [6]:
# Run clip_build_graph_v1.py on the image-subset tables (imgsubset items and
# interactions plus the matching CLIP feature file); output goes to data/graph.
# `-u` makes Python's stdout/stderr unbuffered so progress shows up as it is produced.
!python -u "/content/drive/MyDrive/MMREC/clip_build_graph_v1.py" \
  --interactions "/content/drive/MyDrive/MMREC/data/imgsubset/interactions.parquet" \
  --items        "/content/drive/MyDrive/MMREC/data/imgsubset/items.parquet" \
  --item_features "/content/drive/MyDrive/MMREC/data/features/item_features_clip_imgsubset.parquet" \
  --out          "/content/drive/MyDrive/MMREC/data/graph"

split by user: 100% 206474/206474 [02:10<00:00, 1580.66it/s]
Saved to /content/drive/MyDrive/MMREC/data/graph {'n_users': 206474, 'n_items': 30812, 'n_train': 6464, 'n_valid': 585, 'n_test': 206474, 'txt_col': 'clip_txt_512', 'img_col': 'clip_img_512', 'txt_dim': 512, 'img_dim': 512}


In [7]:
import json, pandas as pd, os

OUT_DIR = "/content/drive/MyDrive/MMREC/data/graph"
meta_path = os.path.join(OUT_DIR, "meta.json")

# Print the metadata file verbatim.
print("\n== meta.json ==")
with open(meta_path, "r") as meta_file:
    print(meta_file.read())

# List the directory contents, sorted by name, one entry per line.
print("\n== files in OUT_DIR ==")
print("\n".join(sorted(os.listdir(OUT_DIR))))

# Quick look at the first rows of each table
for table_name in ("train_edges", "valid_pairs", "test_pairs"):
    print(f"\n== {table_name} head ==")
    display(pd.read_parquet(os.path.join(OUT_DIR, table_name + ".parquet")).head())


== meta.json ==
{
  "n_users": 206474,
  "n_items": 30812,
  "n_train": 6464,
  "n_valid": 585,
  "n_test": 206474,
  "txt_col": "clip_txt_512",
  "img_col": "clip_img_512",
  "txt_dim": 512,
  "img_dim": 512
}

== files in OUT_DIR ==
item_features_aligned.npz
items_idx.parquet
meta.json
test_pairs.parquet
train_edges.parquet
train_item_deg.parquet
train_user_deg.parquet
users_idx.parquet
valid_pairs.parquet

== train_edges head ==


Unnamed: 0,u,i
0,133428,9558
1,29292,28871
2,65275,17346
3,31102,3810
4,121894,14469



== valid_pairs head ==


Unnamed: 0,u,i
0,67963,9146
1,19563,28871
2,59836,27661
3,205081,16137
4,45314,11500



== test_pairs head ==


Unnamed: 0,u,i
0,67377,8060
1,62182,23923
2,168171,6407
3,85154,3363
4,200359,20730


In [8]:
# Run the same clip_build_graph_v1.py script on the full (unfiltered) items,
# interactions and CLIP feature files; output goes to its own folder, data/graph_full_v1.
!python -u "/content/drive/MyDrive/MMREC/clip_build_graph_v1.py" \
  --interactions "/content/drive/MyDrive/MMREC/data/interactions.parquet" \
  --items        "/content/drive/MyDrive/MMREC/data/items.parquet" \
  --item_features "/content/drive/MyDrive/MMREC/data/features/item_features_clip.parquet" \
  --out          "/content/drive/MyDrive/MMREC/data/graph_full_v1"

split by user: 100% 2035490/2035490 [21:20<00:00, 1589.96it/s]
Saved to /content/drive/MyDrive/MMREC/data/graph_full_v1 {'n_users': 2035490, 'n_items': 825869, 'n_train': 384768, 'n_valid': 80681, 'n_test': 2035490, 'txt_col': 'clip_txt_512', 'img_col': 'clip_img_512', 'txt_dim': 512, 'img_dim': 512}
