In [1]:
import os, pathlib

# Root folder for this experiment's artifacts.
PROJ = "/content/ubp_gemini_clean"
# exist_ok=True makes the cell safe to re-run.
os.makedirs(PROJ, exist_ok=True)
os.makedirs(f"{PROJ}/data", exist_ok=True)
os.makedirs(f"{PROJ}/outputs", exist_ok=True)

print("Project root:", PROJ)
!ls -lah /content | head

Project root: /content/ubp_gemini_clean
total 20K
drwxr-xr-x 1 root root 4.0K Feb 27 05:58 .
drwxr-xr-x 1 root root 4.0K Feb 27 05:57 ..
drwxr-xr-x 4 root root 4.0K Jan 16 14:24 .config
drwxr-xr-x 1 root root 4.0K Jan 16 14:24 sample_data
drwxr-xr-x 4 root root 4.0K Feb 27 05:58 ubp_gemini_clean


In [2]:
!pip -q install "openai>=1.40.0" tqdm

In [2]:
import os
from google.colab import userdata

# Read the secret into a local first and validate it BEFORE writing it to
# os.environ: assigning a non-string (e.g. None) to os.environ raises
# TypeError, which would mask the explanatory assertion message below.
_gemini_key = userdata.get("GEMINI_API_KEY")
assert _gemini_key, "GEMINI_API_KEY missing (Colab Secrets 里没取到)"
os.environ["GEMINI_API_KEY"] = _gemini_key
# Print only a boolean so the key itself never lands in the notebook output.
print("Key loaded:", bool(os.environ.get("GEMINI_API_KEY")))

Key loaded: True


In [5]:
!git clone https://github.com/LivXue/SoMe.git /content/SoMe

Cloning into '/content/SoMe'...
remote: Enumerating objects: 307, done.[K
remote: Counting objects: 100% (307/307), done.[K
remote: Compressing objects: 100% (210/210), done.[K
remote: Total 307 (delta 117), reused 272 (delta 82), pack-reused 0 (from 0)[K
Receiving objects: 100% (307/307), 9.97 MiB | 24.97 MiB/s, done.
Resolving deltas: 100% (117/117), done.


In [6]:
!ls /content/SoMe

agent.py	     test_media_content_recommend.py
config.py	     test_misinformation_detection.py
datasets	     test_realtime_event_detection.py
eval_scripts	     test_social_media_question_answering.py
LICENSE		     test_streaming_event_summary.py
pics		     test_user_behavior_prediction.py
qwen_agent	     test_user_comment_simulation.py
qwen_agent.egg-info  test_user_emotion_analysis.py
README.md	     tools
requirements.txt     vllm
tasks


In [9]:
!pip -q install huggingface_hub

from huggingface_hub import list_repo_files

# List the files of the dataset repo and look for the posts dump by file name.
repo_id = "LivXue/SoMe"
files = list_repo_files(repo_id=repo_id, repo_type="dataset")

# Keep every path ending in all_posts.json; print at most the first 50 matches.
cands = [f for f in files if f.endswith("all_posts.json")]
print("all_posts.json candidates:", len(cands))
print("\n".join(cands[:50]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


all_posts.json candidates: 1
database/post_data/all_posts.json


In [10]:
import os
from huggingface_hub import hf_hub_download

# Download the single posts file found in the listing step into the cloned repo.
repo_id = "LivXue/SoMe"
target = "database/post_data/all_posts.json"

local_path = hf_hub_download(
    repo_id=repo_id,
    repo_type="dataset",
    filename=target,
    local_dir="/content/SoMe",   # keep the SoMe directory layout
)

print("Downloaded to:", local_path)

# Confirm the file really exists at the path later cells read from.
print("Exists:", os.path.exists("/content/SoMe/database/post_data/all_posts.json"))
!ls -lh /content/SoMe/database/post_data/all_posts.json

database/post_data/all_posts.json:   0%|          | 0.00/303M [00:00<?, ?B/s]

Downloaded to: /content/SoMe/database/post_data/all_posts.json
Exists: True
-rw-r--r-- 1 root root 289M Feb 27 06:06 /content/SoMe/database/post_data/all_posts.json


In [11]:
import json, os, random

gt_path = "/content/SoMe/datasets/user_behavior_prediction/ground_truth.json"
post_path = "/content/SoMe/database/post_data/all_posts.json"

# Fail fast if either input file is missing.
assert os.path.exists(gt_path)
assert os.path.exists(post_path)

# Use context managers so both file handles are closed deterministically
# instead of being left to the garbage collector.
with open(gt_path, "r", encoding="utf-8") as fh:
    gt = json.load(fh)
with open(post_path, "r", encoding="utf-8") as fh:
    posts = json.load(fh)

def build_samples(k=5, seed=7):
    """Collect up to ``k`` like-prediction samples in a seeded user order.

    Seeds the global ``random`` module with ``seed``, shuffles the user ids
    found under ``gt["like"]``, then walks each user's entries in order and
    keeps those whose ``weibo_id`` is a key of ``posts``.

    Returns a list of ``(uid, idx, weibo_id, label, post)`` tuples, where
    ``uid`` and ``idx`` are strings and ``idx`` is the entry's position in
    that user's list. Collection stops as soon as ``k`` tuples are gathered.
    """
    random.seed(seed)
    shuffled_uids = list(gt.get("like", {}))
    random.shuffle(shuffled_uids)

    collected = []
    for user_id in shuffled_uids:
        for position, entry in enumerate(gt["like"][user_id]):
            post_id = entry["weibo_id"]
            if post_id not in posts:
                # No post body available for this entry; it cannot be prompted.
                continue
            collected.append(
                (str(user_id), str(position), post_id, entry.get("label"), posts[post_id])
            )
            if len(collected) >= k:
                return collected
    return collected

# Build the 5-sample smoke-test set (default seed) and preview the first tuple:
# user id, weibo id, label, and the first 50 characters of the post content.
samples = build_samples(k=5)
print("samples:", len(samples))
print("first sample:", samples[0][0], samples[0][2], samples[0][3], samples[0][4].get("内容","")[:50])

samples: 5
first sample: 5704097105 5115522819560654 是 年终上新第二弹


In [12]:
!pip -q install "openai>=1.40.0" tqdm

In [13]:
import os, json, re
from tqdm import tqdm
from openai import OpenAI

# The API key must already be in the environment (set by the secrets cell).
assert os.environ.get("GEMINI_API_KEY"), "GEMINI_API_KEY missing (请先从 Colab secrets 读取或手动设置)"

MODEL = "gemini-3-flash-preview"
OUT_DIR = "/content/ubp_gemini_clean/outputs"
os.makedirs(OUT_DIR, exist_ok=True)
# The output file name encodes the model and the number of samples.
OUT_PATH = f"{OUT_DIR}/ubp_{MODEL}_k{len(samples)}.json"

# OpenAI SDK client pointed at the Gemini base URL below.
client = OpenAI(
    api_key=os.environ["GEMINI_API_KEY"],
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

def parse_yesno(text: str):
    """Map a model reply to the label "是" or "否".

    Only the first non-blank line of the reply is inspected. A line starting
    with 是/否 wins; otherwise the line is searched for YES/是, then NO/否
    (case-insensitive).

    Returns:
        "是", "否", or None when the reply is empty, None, whitespace-only,
        or contains no recognisable answer.
    """
    lines = (text or "").strip().splitlines()
    if not lines:
        # "".splitlines() is [], so indexing [0] would raise IndexError and
        # the caller would never reach its own empty-reply handling.
        return None
    t = lines[0].strip()
    if t.startswith("是"):
        return "是"
    if t.startswith("否"):
        return "否"
    if re.search(r"\bYES\b|是", t, re.I):
        return "是"
    if re.search(r"\bNO\b|否", t, re.I):
        return "否"
    return None

# Predictions are stored as pred["点赞"][uid][idx] -> "是" / "否" / raw text / "Error_*".
pred = {"点赞": {}}

for uid, idx, wid, label, post in tqdm(samples):
    pred["点赞"].setdefault(uid, {})
    # The prompt embeds the whole post dict as JSON and asks for a one-character answer.
    prompt = (
        "你是一个严格的二分类器。\n"
        f"判断：用户ID={uid} 是否会对下面微博点赞。\n"
        f"微博JSON：{json.dumps(post, ensure_ascii=False)}\n\n"
        "输出格式：你必须严格只输出一个字：是 或 否。不要解释、不要标点、不要多余内容。"
    )
    try:
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        raw = resp.choices[0].message.content
        yn = parse_yesno(raw)
        # If no label could be parsed, keep the first 200 characters of the raw reply for inspection.
        pred["点赞"][uid][idx] = yn if yn is not None else (raw[:200] if raw else "Error_empty")
    except Exception as e:
        # Any failure is recorded under the exception class name so the loop keeps going.
        pred["点赞"][uid][idx] = f"Error_{type(e).__name__}"

    # Rewrite the whole output file after every sample so partial progress is kept.
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(pred, f, ensure_ascii=False, indent=2)

print("✅ saved:", OUT_PATH)
# Preview only the first 800 characters of the serialized predictions.
print(json.dumps(pred, ensure_ascii=False, indent=2)[:800])

100%|██████████| 5/5 [00:33<00:00,  6.80s/it]

✅ saved: /content/ubp_gemini_clean/outputs/ubp_gemini-3-flash-preview_k5.json
{
  "点赞": {
    "5704097105": {
      "0": "否",
      "1": "是"
    },
    "6876733172": {
      "0": "是",
      "1": "否"
    },
    "6326205588": {
      "0": "是"
    }
  }
}





In [14]:
import json, os

OUT_PATH = "/content/ubp_gemini_clean/outputs/ubp_gemini-3-flash-preview_k5.json"
GT_PATH = "/content/SoMe/datasets/user_behavior_prediction/ground_truth.json"

# Context managers close the file handles as soon as parsing is done.
with open(OUT_PATH, "r", encoding="utf-8") as fh:
    pred = json.load(fh)
with open(GT_PATH, "r", encoding="utf-8") as fh:
    gt = json.load(fh)

def norm(x):
    """Reduce a stored prediction or label to "是"/"否", or None if it is neither.

    ``None`` maps to ``None``. Anything else is converted with ``str`` and
    stripped; the result is the label it starts with, if any.
    """
    if x is None:
        return None
    text = str(x).strip()
    for tag in ("是", "否"):
        if text.startswith(tag):
            return tag
    return None

total = 0    # every stored prediction
known = 0    # predictions that parse to 是 / 否
correct = 0  # parsed predictions equal to the ground-truth label

# Score only the (uid, idx) pairs present in pred (the 5 samples of this run).
for uid, idx_map in pred.get("点赞", {}).items():
    for idx, yhat_raw in idx_map.items():
        yhat = norm(yhat_raw)
        # idx is the string position of the entry in the user's ground-truth list.
        y = norm(gt["like"][uid][int(idx)]["label"])
        total += 1
        if yhat is None:
            continue
        known += 1
        if y == yhat:
            correct += 1

print("Total:", total)
# Guard the ratio like ACC_known below: an empty prediction file would
# otherwise raise ZeroDivisionError.
print("TCR:", round(known/total, 4) if total else 0, f"({known}/{total})")
print("ACC_known:", round(correct/known, 4) if known else 0, f"({correct}/{known})")

Total: 5
TCR: 1.0 (5/5)
ACC_known: 0.6 (3/5)


In [5]:
import json, os, random

gt_path = "/content/SoMe/datasets/user_behavior_prediction/ground_truth.json"
post_path = "/content/SoMe/database/post_data/all_posts.json"

# Context managers close the file handles as soon as parsing is done.
with open(gt_path, "r", encoding="utf-8") as fh:
    gt = json.load(fh)
with open(post_path, "r", encoding="utf-8") as fh:
    posts = json.load(fh)

def build_samples(k=30, seed=7):
    """Return up to ``k`` ``(uid, idx, weibo_id, label, post)`` tuples.

    The global ``random`` module is seeded with ``seed`` and the user ids
    under ``gt["like"]`` are shuffled. Users are then visited in that order
    and their entries in list order; an entry is kept only when its
    ``weibo_id`` is present in ``posts``. ``uid`` and ``idx`` are returned as
    strings.
    """
    random.seed(seed)
    uid_order = list(gt.get("like", {}).keys())
    random.shuffle(uid_order)

    def _candidates():
        # Lazily yield every usable entry in the shuffled user order.
        for uid in uid_order:
            for idx, rec in enumerate(gt["like"][uid]):
                wid = rec["weibo_id"]
                if wid in posts:
                    yield (str(uid), str(idx), wid, rec.get("label"), posts[wid])

    picked = []
    for item in _candidates():
        picked.append(item)
        if len(picked) >= k:
            break
    return picked

# Draw 30 samples; seed=13 is passed explicitly here, overriding the function default.
samples30 = build_samples(k=30, seed=13)
print("samples30:", len(samples30))
# Preview: user id, weibo id, label, first 50 characters of the post content.
print("preview:", samples30[0][0], samples30[0][2], samples30[0][3], samples30[0][4].get("内容","")[:50])

samples30: 30
preview: 2113119595 5165425143581074 是 缅怀逝者，致敬重生，勇毅前行！


In [9]:
import os, json, re, time, signal
from openai import OpenAI

MODEL = "gemini-3-flash-preview"
OUT_PATH30 = f"/content/ubp_gemini_clean/outputs/ubp_{MODEL}_k30.json"

# Preconditions: the API key is set and the sampling cell has been run.
assert os.environ.get("GEMINI_API_KEY"), "GEMINI_API_KEY missing"
assert "samples30" in globals(), "samples30 不存在：请先运行生成 samples30 的 cell"

# Client-level timeout (network layer).
client = OpenAI(
    api_key=os.environ["GEMINI_API_KEY"],
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    timeout=60.0,
)

def parse_yesno(text: str):
    """Map a model reply to the label "是" or "否".

    Only the first non-blank line of the reply is inspected. A line starting
    with 是/否 wins; otherwise the line is searched for YES/是, then NO/否
    (case-insensitive).

    Returns:
        "是", "否", or None when the reply is empty, None, whitespace-only,
        or contains no recognisable answer.
    """
    lines = (text or "").strip().splitlines()
    if not lines:
        # "".splitlines() is [], so indexing [0] would raise IndexError and
        # an empty reply would be logged as an exception, not as empty.
        return None
    t = lines[0].strip()
    if t.startswith("是"):
        return "是"
    if t.startswith("否"):
        return "否"
    if re.search(r"\bYES\b|是", t, re.I):
        return "是"
    if re.search(r"\bNO\b|否", t, re.I):
        return "否"
    return None

# Load the checkpoint if one exists so an interrupted run can resume.
if os.path.exists(OUT_PATH30):
    # Context manager closes the handle as soon as parsing is done.
    with open(OUT_PATH30, "r", encoding="utf-8") as fh:
        pred30 = json.load(fh)
else:
    pred30 = {"点赞": {}}
pred30.setdefault("点赞", {})

def done(uid, idx):
    """Return True when (uid, idx) already has a usable stored result.

    Entries recorded as "Error_*" (timeouts, API failures, empty replies) are
    treated as NOT done, so re-running the cell retries them instead of
    freezing a transient failure into the results.
    """
    stored = pred30["点赞"].get(uid, {}).get(idx)
    return stored is not None and not str(stored).startswith("Error_")

class HardTimeout(Exception):
    """Raised by the SIGALRM handler to abort a request that runs too long."""
    pass

def _alarm_handler(signum, frame):
    """Signal handler: turn the alarm into a HardTimeout exception."""
    raise HardTimeout()

# Install the handler so signal.alarm(...) raises HardTimeout in the running code.
signal.signal(signal.SIGALRM, _alarm_handler)

MAX_SECONDS = 60  # skip a request once it exceeds one minute

for uid, idx, wid, label, post in samples30:
    pred30["点赞"].setdefault(uid, {})
    # Resume support: skip pairs that done() reports as finished.
    if done(uid, idx):
        continue

    # Optional (strongly recommended): send a trimmed set of fields to reduce
    # the chance of the request hanging.
    post_small = {
        "内容": post.get("内容",""),
        "发布时间": post.get("发布时间",""),
        "发布者": post.get("发布者",""),
        "发布地点": post.get("发布地点",""),
        "转发量": post.get("转发量", post.get("转发", "")),
        "评论量": post.get("评论量", post.get("评论", "")),
        "点赞量": post.get("点赞量", post.get("点赞", "")),
    }

    prompt = (
        "你是一个严格的二分类器。\n"
        f"判断：用户ID={uid} 是否会对下面微博点赞。\n"
        f"微博JSON：{json.dumps(post_small, ensure_ascii=False)}\n\n"
        "输出格式：你必须严格只输出一个字：是 或 否。不要解释、不要标点、不要多余内容。"
    )

    print(f"running uid={uid} idx={idx} wid={wid} ...")

    try:
        signal.alarm(MAX_SECONDS)  # hard timeout starts
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        signal.alarm(0)            # cancel the alarm

        raw = resp.choices[0].message.content
        yn = parse_yesno(raw)
        # If no label could be parsed, keep the first 200 characters of the raw reply.
        pred30["点赞"][uid][idx] = yn if yn is not None else (raw[:200] if raw else "Error_empty")
        print(" ->", pred30["点赞"][uid][idx])

    except HardTimeout:
        pred30["点赞"][uid][idx] = "Error_TIMEOUT"
        print(" -> skipped (TIMEOUT)")

    except Exception as e:
        # Any other failure is recorded under the exception class name.
        pred30["点赞"][uid][idx] = f"Error_{type(e).__name__}"
        print(" -> skipped (ERROR)", type(e).__name__)

    finally:
        # Always clear a pending alarm, then checkpoint the full result dict.
        signal.alarm(0)
        with open(OUT_PATH30, "w", encoding="utf-8") as f:
            json.dump(pred30, f, ensure_ascii=False, indent=2)

print("✅ finished / saved:", OUT_PATH30)

running uid=3932572740 idx=1 wid=5022407022871998 ...
 -> skipped (ERROR) APITimeoutError
running uid=3140866273 idx=0 wid=5155670790770157 ...
 -> 是
running uid=3140866273 idx=1 wid=5151640873076561 ...
 -> 是
running uid=7327213053 idx=0 wid=5102152756763391 ...
 -> 否
running uid=7327213053 idx=1 wid=5188672142181463 ...
 -> 是
✅ finished / saved: /content/ubp_gemini_clean/outputs/ubp_gemini-3-flash-preview_k30.json


In [10]:
import json

GT_PATH = "/content/SoMe/datasets/user_behavior_prediction/ground_truth.json"
PRED_PATH = "/content/ubp_gemini_clean/outputs/ubp_gemini-3-flash-preview_k30.json"

# Context managers close the file handles as soon as parsing is done.
with open(GT_PATH, "r", encoding="utf-8") as fh:
    gt = json.load(fh)
with open(PRED_PATH, "r", encoding="utf-8") as fh:
    pred = json.load(fh)

def norm(x):
    """Normalise a value to "是" or "否"; return None for anything else.

    ``None`` stays ``None``. Other values are stringified and stripped, and
    classified by their first character.
    """
    if x is None:
        return None
    head = str(x).strip()[:1]
    return head if head in ("是", "否") else None

total = 0     # every stored prediction
known = 0     # predictions that parse to 是 / 否
correct = 0   # parsed predictions equal to the ground-truth label
unparsed = 0  # predictions that are neither 是 nor 否
errors = 0    # unparsed predictions recorded as "Error..." markers

for uid, idx_map in pred.get("点赞", {}).items():
    for idx, yhat_raw in idx_map.items():
        total += 1
        # idx is the string position of the entry in the user's ground-truth list.
        y = norm(gt["like"][uid][int(idx)]["label"])
        yhat = norm(yhat_raw)

        if yhat is None:
            unparsed += 1
            # "Error" is a prefix of "Error_", so one check covers both spellings.
            if str(yhat_raw).startswith("Error"):
                errors += 1
            continue

        known += 1
        if y == yhat:
            correct += 1

print("Total:", total)
# Guard the ratio like ACC_known below: an empty prediction file would
# otherwise raise ZeroDivisionError.
print("TCR:", round(known/total, 4) if total else 0, f"({known}/{total})")
print("ACC_known:", round(correct/known, 4) if known else 0, f"({correct}/{known})")
print("Unparsed:", unparsed, "| Error-like:", errors)

Total: 30
TCR: 0.9667 (29/30)
ACC_known: 0.5172 (15/29)
Unparsed: 1 | Error-like: 1


In [11]:
import os, json, random

BASE = "/content/SoMe"
GT_PATH = f"{BASE}/datasets/user_behavior_prediction/ground_truth.json"
POST_PATH = f"{BASE}/database/post_data/all_posts.json"

print("GT exists:", os.path.exists(GT_PATH), GT_PATH)
print("POST exists:", os.path.exists(POST_PATH), POST_PATH)

# Context managers close the file handles as soon as parsing is done.
with open(GT_PATH, "r", encoding="utf-8") as fh:
    gt = json.load(fh)
with open(POST_PATH, "r", encoding="utf-8") as fh:
    posts = json.load(fh)

print("GT top keys:", list(gt.keys()))
print("Posts count:", len(posts))

GT exists: True /content/SoMe/datasets/user_behavior_prediction/ground_truth.json
POST exists: True /content/SoMe/database/post_data/all_posts.json
GT top keys: ['like', 'comment', 'repost']
Posts count: 833609


In [12]:
from collections import defaultdict

# user_history[action][uid] = list of (weibo_id, label)
user_history = defaultdict(lambda: defaultdict(list))

# gt is usually shaped like: {"like": {uid: [ {weibo_id,label}, ... ]}, "comment":..., "repost":...}
for action in ["like", "comment", "repost"]:
    # Tolerate ground-truth files that lack one of the actions.
    if action not in gt:
        continue
    for uid, arr in gt[action].items():
        for item in arr:
            wid = item.get("weibo_id")
            lab = item.get("label")
            # Skip incomplete records.
            if wid is None or lab is None:
                continue
            user_history[action][uid].append((wid, lab))

def history_stats(uid: str):
    """Return how many history entries ``uid`` has for each action.

    The result maps "like", "comment" and "repost" to the length of the
    user's list in ``user_history`` (0 when the user has none).
    """
    counts = {}
    for action in ["like", "comment", "repost"]:
        counts[action] = len(user_history[action].get(uid, []))
    return counts

# Quick check: take the first user id under gt["like"] and show its counts
# and the first three like-history entries.
some_uid = next(iter(gt["like"].keys()))
print("example uid:", some_uid, "stats:", history_stats(some_uid))
print("example like history head:", user_history["like"][some_uid][:3])

example uid: 5288580817 stats: {'like': 2, 'comment': 0, 'repost': 0}
example like history head: [('5120482505917816', '是'), ('5163988335461243', '否')]


In [13]:
# Seed the global RNG so the sample below is repeatable.
random.seed(42)

# Flatten the like samples into (uid, idx_str, weibo_id, label) tuples.
flat_like = []
for uid, arr in gt["like"].items():
    for i, item in enumerate(arr):
        wid = item["weibo_id"]
        lab = item["label"]
        if wid in posts:  # the post content must be available
            flat_like.append((uid, str(i), wid, lab))

print("total like pairs with posts:", len(flat_like))

# Sample 30 pairs.
samples30 = random.sample(flat_like, 30)

# Attach the post content as the fifth tuple element.
samples30 = [(uid, idx, wid, lab, posts[wid]) for (uid, idx, wid, lab) in samples30]

print("sample preview:", samples30[0][0], samples30[0][2], samples30[0][3])

total like pairs with posts: 1000
sample preview: 7770704066 5154931837502529 是


In [14]:
import json

def post_small(p: dict):
    """Return a trimmed copy of a post dict holding only the prompt fields.

    Content, publish time, author and location default to "" when absent.
    Each of the three counters (转发量 / 评论量 / 点赞量) falls back to its
    short key (转发 / 评论 / 点赞) and then to "".
    """
    slim = {field: p.get(field, "") for field in ("内容", "发布时间", "发布者", "发布地点")}
    for long_key, short_key in (("转发量", "转发"), ("评论量", "评论"), ("点赞量", "点赞")):
        slim[long_key] = p.get(long_key, p.get(short_key, ""))
    return slim

def build_user_profile(uid: str, current_wid: str, k_like=3, k_other=2):
    """Build a small labelled interaction history for one user.

    For each action, entries are taken from ``user_history`` excluding the
    candidate post ``current_wid`` and any post missing from ``posts``, then
    sampled with the global ``random`` module:

    * like: up to ``k_like`` entries;
    * comment: up to ``max(1, k_other // 2)`` entries;
    * repost: up to ``max(1, k_other - <comments chosen>)`` entries.

    Returns a dict with ``uid`` and the lists ``history_like``,
    ``history_comment`` and ``history_repost``; each element is
    ``{"weibo_id", "label", "post"}`` where ``post`` is ``post_small`` of the
    stored post.
    """
    def _usable(action):
        # Entries other than the candidate whose post body is available.
        return [
            (wid, lab)
            for (wid, lab) in user_history[action].get(uid, [])
            if wid != current_wid and wid in posts
        ]

    like_pool = _usable("like")
    comment_pool = _usable("comment")
    repost_pool = _usable("repost")

    # Sample (to keep the prompt short); an empty pool contributes nothing.
    chosen_like, chosen_comment, chosen_repost = [], [], []
    if like_pool:
        chosen_like = random.sample(like_pool, min(k_like, len(like_pool)))
    if comment_pool:
        comment_quota = max(1, k_other // 2)
        chosen_comment = random.sample(comment_pool, min(comment_quota, len(comment_pool)))
    if repost_pool:
        repost_quota = max(1, k_other - len(chosen_comment))
        chosen_repost = random.sample(repost_pool, min(repost_quota, len(repost_pool)))

    def _expand(chosen):
        # Replace each weibo id with its trimmed post body.
        return [
            {"weibo_id": wid, "label": lab, "post": post_small(posts[wid])}
            for wid, lab in chosen
        ]

    return {
        "uid": uid,
        "history_like": _expand(chosen_like),
        "history_comment": _expand(chosen_comment),
        "history_repost": _expand(chosen_repost),
    }

# Quick check on the first sample: build its profile and show what came back.
u, idx, wid, lab, p = samples30[0]
prof = build_user_profile(u, wid, k_like=3, k_other=2)
print("profile keys:", prof.keys())
print("history_like n:", len(prof["history_like"]))

profile keys: dict_keys(['uid', 'history_like', 'history_comment', 'history_repost'])
history_like n: 1


In [15]:
import os, json, re, time, signal
from openai import OpenAI

MODEL = "gemini-3-flash-preview"
# The history-prompt run writes to its own output folder.
OUT_DIR = "/content/ubp_gemini_history/outputs"
os.makedirs(OUT_DIR, exist_ok=True)
OUT_PATH = f"{OUT_DIR}/ubp_{MODEL}_history_k30.json"

assert os.environ.get("GEMINI_API_KEY"), "GEMINI_API_KEY missing"

# OpenAI SDK client pointed at the Gemini base URL, with a 60 s client timeout.
client = OpenAI(
    api_key=os.environ["GEMINI_API_KEY"],
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    timeout=60.0,
)

def parse_yesno(text: str):
    """Map a model reply to the label "是" or "否".

    Only the first non-blank line of the reply is inspected. A line starting
    with 是/否 wins; otherwise the line is searched for YES/是, then NO/否
    (case-insensitive).

    Returns:
        "是", "否", or None when the reply is empty, None, whitespace-only,
        or contains no recognisable answer.
    """
    lines = (text or "").strip().splitlines()
    if not lines:
        # "".splitlines() is [], so indexing [0] would raise IndexError and
        # an empty reply would be logged as an exception, not as empty.
        return None
    t = lines[0].strip()
    if t.startswith("是"):
        return "是"
    if t.startswith("否"):
        return "否"
    if re.search(r"\bYES\b|是", t, re.I):
        return "是"
    if re.search(r"\bNO\b|否", t, re.I):
        return "否"
    return None

# Load the checkpoint if one exists so an interrupted run can resume.
if os.path.exists(OUT_PATH):
    # Context manager closes the handle as soon as parsing is done.
    with open(OUT_PATH, "r", encoding="utf-8") as fh:
        pred = json.load(fh)
else:
    pred = {"点赞": {}}
pred.setdefault("点赞", {})

def done(uid, idx):
    """Return True when (uid, idx) already has a usable stored result.

    Entries recorded as "Error_*" (timeouts, API failures, empty replies) are
    treated as NOT done, so re-running the cell retries them instead of
    freezing a transient failure into the results.
    """
    stored = pred["点赞"].get(uid, {}).get(idx)
    return stored is not None and not str(stored).startswith("Error_")

class HardTimeout(Exception):
    """Raised by the SIGALRM handler to abort a request that runs too long."""
    pass

def _alarm_handler(signum, frame):
    """Signal handler: turn the alarm into a HardTimeout exception."""
    raise HardTimeout()

# Install the handler so signal.alarm(...) raises HardTimeout in the running code.
signal.signal(signal.SIGALRM, _alarm_handler)
# Per-request limit in seconds, passed to signal.alarm by the loop.
MAX_SECONDS = 60

def make_prompt(uid, current_post_small, profile_dict):
    """Compose the history-aware prompt for one (user, candidate post) pair.

    Args:
        uid: user id, inserted verbatim into the prompt.
        current_post_small: trimmed candidate post, serialised as JSON.
        profile_dict: output of ``build_user_profile``; only its
            ``history_like`` / ``history_comment`` / ``history_repost``
            lists are used.

    Returns:
        The full prompt string, ending with the one-character answer rule.
    """
    # Note: the labels inside the history are ground truth, used as a hint
    # about the user's preferences.
    history_keys = ("history_like", "history_comment", "history_repost")
    profile_json = json.dumps(
        {key: profile_dict[key] for key in history_keys},
        ensure_ascii=False,
    )
    candidate_json = json.dumps(current_post_small, ensure_ascii=False)

    parts = [
        "你是一个基于用户历史行为的预测器。\n",
        "你会得到该用户过去对微博的互动记录（包含标签：是/否），以及一条新的候选微博。\n",
        "请根据用户历史偏好与候选微博的匹配程度，预测该用户是否会对候选微博进行“点赞”。\n\n",
        f"用户ID：{uid}\n",
        f"用户历史行为（带标签）：{profile_json}\n\n",
        f"候选微博：{candidate_json}\n\n",
        "输出要求：你必须严格只输出一个字：是 或 否。不要解释、不要标点、不要多余内容。",
    ]
    return "".join(parts)

for uid, idx, wid, label, post in samples30:
    pred["点赞"].setdefault(uid, {})
    # Resume support: skip pairs that done() reports as finished.
    if done(uid, idx):
        continue

    # Trim the candidate post and build the user's sampled history for the prompt.
    cur_small = post_small(post)
    prof = build_user_profile(uid, wid, k_like=3, k_other=2)
    prompt = make_prompt(uid, cur_small, prof)

    print(f"running uid={uid} idx={idx} wid={wid} (history_like={len(prof['history_like'])}) ...")

    try:
        # Arm the hard timeout around the API call and disarm it on success.
        signal.alarm(MAX_SECONDS)
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        signal.alarm(0)

        raw = resp.choices[0].message.content
        yn = parse_yesno(raw)
        # If no label could be parsed, keep the first 200 characters of the raw reply.
        pred["点赞"][uid][idx] = yn if yn is not None else (raw[:200] if raw else "Error_empty")
        print(" ->", pred["点赞"][uid][idx])

    except HardTimeout:
        pred["点赞"][uid][idx] = "Error_TIMEOUT"
        print(" -> skipped (TIMEOUT)")

    except Exception as e:
        # Any other failure is recorded under the exception class name.
        pred["点赞"][uid][idx] = f"Error_{type(e).__name__}"
        print(" -> skipped (ERROR)", type(e).__name__)

    finally:
        # Always clear a pending alarm, then checkpoint the full result dict.
        signal.alarm(0)
        with open(OUT_PATH, "w", encoding="utf-8") as f:
            json.dump(pred, f, ensure_ascii=False, indent=2)

print("✅ finished / saved:", OUT_PATH)

running uid=7770704066 idx=0 wid=5154931837502529 (history_like=1) ...
 -> 是
running uid=2022191037 idx=0 wid=5123383550808971 (history_like=1) ...
 -> 是
running uid=5223740328 idx=1 wid=5174257718919189 (history_like=1) ...
 -> 是
running uid=2277621081 idx=1 wid=5189084227305618 (history_like=1) ...
 -> 否
running uid=7223738176 idx=1 wid=5189028095988879 (history_like=1) ...
 -> 否
running uid=6736618482 idx=0 wid=5061965133644042 (history_like=1) ...
 -> 否
running uid=6708858433 idx=0 wid=5082132731135468 (history_like=1) ...
 -> 否
running uid=2282740140 idx=0 wid=5165872616571737 (history_like=1) ...
 -> 否
running uid=1899426901 idx=0 wid=5164800138805820 (history_like=1) ...
 -> 是
running uid=5951459431 idx=0 wid=5104546760885593 (history_like=1) ...
 -> 否
running uid=6321728615 idx=0 wid=5119251296028398 (history_like=1) ...
 -> 否
running uid=2277621081 idx=0 wid=5158562093074697 (history_like=1) ...
 -> 否
running uid=7814648672 idx=1 wid=5188975343175452 (history_like=1) ...
 -> 否

In [16]:
import json

PRED_PATH = "/content/ubp_gemini_history/outputs/ubp_gemini-3-flash-preview_history_k30.json"

# Context managers close the file handles as soon as parsing is done.
with open("/content/SoMe/datasets/user_behavior_prediction/ground_truth.json", "r", encoding="utf-8") as fh:
    gt = json.load(fh)
with open(PRED_PATH, "r", encoding="utf-8") as fh:
    pred = json.load(fh)

def norm(x):
    """Collapse a prediction/label to "是" or "否"; None when it is neither.

    ``None`` is passed through. Other values are stringified and stripped
    before the prefix test.
    """
    if x is None:
        return None
    cleaned = str(x).strip()
    return next((tag for tag in ("是", "否") if cleaned.startswith(tag)), None)

total = 0     # every stored prediction
known = 0     # predictions that parse to 是 / 否
correct = 0   # parsed predictions equal to the ground-truth label
unparsed = 0  # predictions that are neither 是 nor 否
errors = 0    # unparsed predictions recorded as "Error..." markers

for uid, idx_map in pred.get("点赞", {}).items():
    for idx, yhat_raw in idx_map.items():
        total += 1
        # idx is the string position of the entry in the user's ground-truth list.
        y = norm(gt["like"][uid][int(idx)]["label"])
        yhat = norm(yhat_raw)

        if yhat is None:
            unparsed += 1
            if str(yhat_raw).startswith("Error"):
                errors += 1
            continue

        known += 1
        if y == yhat:
            correct += 1

print("Total:", total)
# Guard the ratio like ACC_known below: an empty prediction file would
# otherwise raise ZeroDivisionError.
print("TCR:", round(known/total, 4) if total else 0, f"({known}/{total})")
print("ACC_known:", round(correct/known, 4) if known else 0, f"({correct}/{known})")
print("Unparsed:", unparsed, "| Error-like:", errors)
print("Saved:", PRED_PATH)

Total: 30
TCR: 1.0 (30/30)
ACC_known: 0.4333 (13/30)
Unparsed: 0 | Error-like: 0
Saved: /content/ubp_gemini_history/outputs/ubp_gemini-3-flash-preview_history_k30.json


In [18]:
import os, json

BASE="/content/SoMe"
GT_PATH=f"{BASE}/datasets/user_behavior_prediction/ground_truth.json"
POST_PATH=f"{BASE}/database/post_data/all_posts.json"

print("GT:", os.path.exists(GT_PATH), GT_PATH)
print("POST:", os.path.exists(POST_PATH), POST_PATH)

# Context managers close the file handles as soon as parsing is done.
with open(GT_PATH, "r", encoding="utf-8") as fh:
    gt = json.load(fh)
with open(POST_PATH, "r", encoding="utf-8") as fh:
    posts = json.load(fh)
print("like users:", len(gt["like"]), "posts:", len(posts))

GT: True /content/SoMe/datasets/user_behavior_prediction/ground_truth.json
POST: True /content/SoMe/database/post_data/all_posts.json
like users: 500 posts: 833609


In [19]:
from collections import defaultdict

# user_liked[uid] = weibo ids the user is labelled "是" for and that exist in posts.
user_liked = defaultdict(list)
for uid, arr in gt["like"].items():
    for item in arr:
        if item.get("label") == "是":
            wid=item.get("weibo_id")
            if wid in posts:
                user_liked[uid].append(wid)

# Count the users that ended up with at least one liked post.
print("users with >=1 liked:", sum(1 for u in user_liked if len(user_liked[u])>0))

users with >=1 liked: 500


In [20]:
def post_to_text(p):
    """Flatten a post dict into one line of text for embedding.

    The result is ``"作者:<author> 地点:<location> 内容:<content>"``. Author
    and location are omitted when empty or missing, and every field is
    stripped of surrounding whitespace.
    """
    def _field(key):
        # Missing or falsy values count as empty text.
        return (p.get(key) or "").strip()

    body = _field("内容")
    poster = _field("发布者")
    place = _field("发布地点")

    tags = [prefix + value for prefix, value in (("作者:", poster), ("地点:", place)) if value]
    header = " ".join(tags)
    return f"{header} 内容:{body}".strip()

# Quick check: render the first liked post of the first user (first 200 characters).
uid0=next(iter(user_liked.keys()))
wid0=user_liked[uid0][0]
print(post_to_text(posts[wid0])[:200])

作者:李小璐Super璐 地点:发布于 北京 内容:🙏祈祷平安


In [21]:
!pip -q install -U sentence-transformers

import numpy as np
from sentence_transformers import SentenceTransformer
from functools import lru_cache

# Sentence-embedding model used to compare a candidate post with a user's liked posts.
# NOTE(review): the post text embedded in this notebook is Chinese; confirm this
# checkpoint handles Chinese adequately, or consider a multilingual model.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

@lru_cache(maxsize=20000)
def emb_text(text:str):
    """Return the normalised embedding of ``text``, cached by exact string.

    The text is encoded as a one-element batch and the single resulting
    vector is returned.
    """
    batch = embedder.encode([text], normalize_embeddings=True)
    return batch[0]

def cosine_sim_vec(a, B):
    """Return the matrix-vector product ``B @ a``.

    With ``a`` of shape (d,) and ``B`` of shape (n, d) this yields one score
    per row of ``B``; the scores are cosine similarities when the inputs are
    unit-normalised.
    """
    scores = B @ a
    return scores

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [22]:
import random

def retrieve_topk_liked(uid, candidate_wid, k=3, pool_max=50):
    """Return the user's liked posts most similar to the candidate post.

    The pool is the user's liked weibo ids other than ``candidate_wid`` that
    exist in ``posts``; if it holds more than ``pool_max`` ids it is randomly
    down-sampled to ``pool_max``. Posts are rendered with ``post_to_text``,
    embedded with ``emb_text`` and scored against the candidate.

    Returns:
        A list of at most ``k`` dicts, best match first, each with
        ``weibo_id``, ``similarity`` (float) and ``text`` (first 250
        characters). An empty list when the user has no usable history.
    """
    pool = [w for w in user_liked.get(uid, []) if w != candidate_wid and w in posts]
    if not pool:
        return []
    if len(pool) > pool_max:
        pool = random.sample(pool, pool_max)

    query_vec = emb_text(post_to_text(posts[candidate_wid]))

    pool_texts = [post_to_text(posts[w]) for w in pool]
    pool_matrix = np.stack([emb_text(t) for t in pool_texts], axis=0)

    scores = cosine_sim_vec(query_vec, pool_matrix)
    # Negate so that argsort's ascending order ranks the highest score first.
    ranked = np.argsort(-scores)[:min(k, len(pool))]

    return [
        {
            "weibo_id": pool[i],
            "similarity": float(scores[i]),
            "text": pool_texts[i][:250]
        }
        for i in ranked
    ]

# Sanity check. wid0 is itself one of uid0's liked posts, so it is excluded
# from the history and the result may be empty.
print(retrieve_topk_liked(uid0, wid0, k=3))

[]


In [25]:
import os, json, re
from tqdm import tqdm
from openai import OpenAI

# The API key must already be in the environment.
assert os.environ.get("GEMINI_API_KEY"), "GEMINI_API_KEY missing"

# OpenAI SDK client pointed at the Gemini base URL, with a 60 s client timeout.
client = OpenAI(
    api_key=os.environ["GEMINI_API_KEY"],
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    timeout=60.0,
)
MODEL="gemini-3-flash-preview"

def parse_yesno(text):
    """Map a model reply to "是" or "否" using only a strict prefix test.

    Only the first non-blank line of the reply is inspected, and it must
    start with 是 or 否.

    Returns:
        "是", "否", or None when the reply is empty, None, whitespace-only,
        or starts with anything else.
    """
    lines = (text or "").strip().splitlines()
    if not lines:
        # "".splitlines() is [], so indexing [0] would raise IndexError and
        # an empty reply would be logged as an exception, not as empty.
        return None
    t = lines[0].strip()
    if t.startswith("是"):
        return "是"
    if t.startswith("否"):
        return "否"
    return None

def make_prompt(uid, cand_post, retrieved):
    """Compose the retrieval-augmented prompt for one (user, candidate) pair.

    Args:
        uid: user id, inserted verbatim into the prompt.
        cand_post: candidate post dict; seven fixed fields are copied from
            it, each defaulting to "" when absent.
        retrieved: list of evidence items (as returned by
            ``retrieve_topk_liked``), pretty-printed as JSON.

    Returns:
        The full prompt string, ending with the one-character answer rule.
    """
    fields = ("内容", "发布时间", "发布者", "发布地点", "转发量", "评论量", "点赞量")
    cand_block = json.dumps(
        {name: cand_post.get(name, "") for name in fields},
        ensure_ascii=False,
    )
    hist_block = json.dumps(retrieved, ensure_ascii=False, indent=2)

    segments = [
        "你是一个个性化用户行为预测器。\n",
        "给定用户过去点赞过的微博中，与候选微博最相似的Top-K历史微博（作为用户偏好证据），以及候选微博。\n",
        "请预测该用户是否会对候选微博点赞。\n\n",
        f"用户ID：{uid}\n\n",
        f"历史点赞证据Top-K：\n{hist_block}\n\n",
        f"候选微博：\n{cand_block}\n\n",
        "输出要求：严格只输出一个字：是 或 否。不要解释。",
    ]
    return "".join(segments)

In [26]:
import random
# Seed the global RNG so the sample below is repeatable.
random.seed(42)

# Flatten every like entry whose post exists into (uid, idx_str, weibo_id, label, post).
flat=[]
for uid, arr in gt["like"].items():
    for i,item in enumerate(arr):
        wid=item["weibo_id"]
        lab=item["label"]
        if wid in posts:
            flat.append((uid,str(i),wid,lab,posts[wid]))

# Draw 30 tuples without replacement.
samples30 = random.sample(flat, 30)
print("samples30 ready:", len(samples30))

samples30 ready: 30


In [27]:
import os, json

OUT_DIR="/content/ubp_gemini_retrieval_text/outputs"
os.makedirs(OUT_DIR, exist_ok=True)
OUT_PATH=f"{OUT_DIR}/ubp_{MODEL}_retrieval_text_k30.json"

# Resume from an existing checkpoint if there is one; the context manager
# closes the handle as soon as parsing is done.
if os.path.exists(OUT_PATH):
    with open(OUT_PATH,"r",encoding="utf-8") as fh:
        pred=json.load(fh)
else:
    pred={"点赞":{}}
pred.setdefault("点赞",{})

for uid, idx, wid, lab, post in tqdm(samples30):
    pred["点赞"].setdefault(uid,{})
    # Resume support: skip pairs that already have a stored result.
    if idx in pred["点赞"][uid]:
        continue

    # Retrieve up to 3 similar liked posts as evidence, then build the prompt.
    retrieved = retrieve_topk_liked(uid, wid, k=3, pool_max=50)
    prompt = make_prompt(uid, post, retrieved)

    try:
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[{"role":"user","content":prompt}],
            temperature=0
        )
        raw = resp.choices[0].message.content
        yn = parse_yesno(raw)
        # If no label could be parsed, keep the first 200 characters of the raw reply.
        pred["点赞"][uid][idx]= yn if yn is not None else (raw[:200] if raw else "Error_empty")
    except Exception as e:
        # Any failure is recorded under the exception class name so the loop keeps going.
        pred["点赞"][uid][idx]=f"Error_{type(e).__name__}"

    # Rewrite the whole output file after every sample so partial progress is kept.
    with open(OUT_PATH,"w",encoding="utf-8") as f:
        json.dump(pred,f,ensure_ascii=False,indent=2)

print("✅ saved:", OUT_PATH)

100%|██████████| 30/30 [02:58<00:00,  5.97s/it]

✅ saved: /content/ubp_gemini_retrieval_text/outputs/ubp_gemini-3-flash-preview_retrieval_text_k30.json





In [28]:
import json

# Reload the predictions from disk; the context manager closes the handle
# as soon as parsing is done.
with open(OUT_PATH,"r",encoding="utf-8") as fh:
    pred=json.load(fh)

def norm(x):
    """Return "是" or "否" when the value starts with that label, else None.

    ``None`` is passed through. Other values are stringified and stripped
    before the prefix test.
    """
    if x is None:
        return None
    stripped = str(x).strip()
    if stripped.startswith("是"):
        label = "是"
    elif stripped.startswith("否"):
        label = "否"
    else:
        label = None
    return label

# total: every stored prediction; known: parsed to 是/否; correct: parsed and
# equal to the label; unparsed: neither 是 nor 否; errors: "Error..." markers.
total=known=correct=unparsed=errors=0
for uid, idx_map in pred["点赞"].items():
    for idx, yhat_raw in idx_map.items():
        total += 1
        # idx is the string position of the entry in the user's ground-truth list.
        y = norm(gt["like"][uid][int(idx)]["label"])
        yhat = norm(yhat_raw)
        if yhat is None:
            unparsed += 1
            if str(yhat_raw).startswith("Error"):
                errors += 1
            continue
        known += 1
        if y == yhat:
            correct += 1

print("Total:", total)
# Guard the ratio like ACC_known below: an empty prediction file would
# otherwise raise ZeroDivisionError.
print("TCR:", round(known/total,4) if total else 0, f"({known}/{total})")
print("ACC_known:", round(correct/known,4) if known else 0, f"({correct}/{known})")
print("Unparsed:", unparsed, "| Error-like:", errors)
print("Saved:", OUT_PATH)

Total: 30
TCR: 1.0 (30/30)
ACC_known: 0.4667 (14/30)
Unparsed: 0 | Error-like: 0
Saved: /content/ubp_gemini_retrieval_text/outputs/ubp_gemini-3-flash-preview_retrieval_text_k30.json
