### 人工识别结果输入

In [9]:
import pandas as pd
import json
from pathlib import Path
import math

# Resolve the input CSV and output JSON relative to the parent of the working directory.
current_dir = Path.cwd()
data_dir = current_dir
input_file = data_dir.parent / "data_origin/3-2-XJC_YCB_Thinking_TO_json.csv"
output_file = data_dir.parent / "data/3-2-Human_Recognition_Mode_txt.json"

# Load the CSV.
df = pd.read_csv(input_file)

# Nested mapping built below: L1 name -> L2 name -> column ('0'/'1'/'2') -> mode dict.
result = {}

for _, row in df.iterrows():
    l1_name = row.get('L1_name_EN', '')
    l2_name = row.get('L2_name_EN', '')

    # pandas reads empty cells as NaN, and NaN is truthy, so a plain `not name`
    # test lets such rows through. Check for missing values explicitly so rows
    # without an L1/L2 name are skipped instead of being stored under a NaN key.
    if pd.isna(l1_name) or pd.isna(l2_name) or not l1_name or not l2_name:
        continue

    # Each of the columns '0', '1', '2' is expected to hold "Starting+Trend+Ending".
    modes = {}
    for col in ['0', '1', '2']:
        val = row.get(col, '')
        val = '' if pd.isna(val) else str(val)

        parts = val.split('+')
        if len(parts) != 3:
            # Anything that does not split into exactly three parts is stored as blanks.
            parts = ['', '', '']
        modes[col] = {
            "Starting": parts[0],
            "Trend": parts[1],
            "Ending": parts[2]
        }

    # A later row with the same L1/L2 pair replaces the earlier one.
    result.setdefault(l1_name, {})[l2_name] = modes

# Make sure the output directory exists.
output_file.parent.mkdir(parents=True, exist_ok=True)

# Serialise first, then replace every '_' with ',' across the whole JSON text
# (this affects keys as well as values).
json_text = json.dumps(result, ensure_ascii=False, indent=2)
json_text = json_text.replace('_', ',')

# Write the file.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(json_text)

print(f"Input file: {input_file.resolve()}")
print(f"Output file: {output_file.resolve()}")


Input file: F:\Desktop\CAMPF_Supplementary\data_origin\3-2-XJC_YCB_Thinking_TO_json.csv
Output file: F:\Desktop\CAMPF_Supplementary\data\3-2-Human_Recognition_Mode_txt.json


### JSON附加至国家级政策聚类数据

In [3]:
import pandas as pd
import json
from pathlib import Path
import math

# Column names that may hold L1 information in the CSV. Any that exist are
# overwritten with the L1 taken from the JSON; if none exist, L1_name_EN is added.
L1_COLUMNS_CANDIDATES = [
    "L1_name_EN",
    "L1_name_CN",
    "L1",
    "L1分类",
    "L1分类中文名",
]

def build_l2_lookup_from_json(recognition_data: dict):
    """
    Flatten the recognition JSON into a lookup keyed by L2 policy name.

    The input is expected to be nested as: top-level L1 -> L2 -> cluster key.
    The result has the shape
    { L2 name (stripped): {"L1": L1 name (stripped), "clusters": {...}} },
    where "clusters" keeps only the keys "0", "1" and "2" that are present.
    Non-dict sections are ignored, as are L2 entries with none of those keys.
    """
    wanted_ids = ("0", "1", "2")
    table = {}
    for top_name, l2_section in recognition_data.items():
        if not isinstance(l2_section, dict):
            continue
        for policy_name, cluster_map in l2_section.items():
            if not isinstance(cluster_map, dict):
                continue
            # Keep only the recognised cluster keys, in the fixed 0/1/2 order.
            kept = {}
            for cid in wanted_ids:
                if cid in cluster_map:
                    kept[cid] = cluster_map[cid]
            if not kept:
                continue
            table[str(policy_name).strip()] = {
                "L1": str(top_name).strip(),
                "clusters": kept,
            }
    return table

def _normalize_cluster_id(value):
    """Return a cluster ID as the string form used for JSON keys, or None if missing.

    Integer-valued inputs such as 1, 1.0, "1", "01" or "1.0" all become "1".
    Anything that is not an integer value is returned as its stripped string.
    """
    if pd.isna(value):
        return None
    text = str(value).strip()
    try:
        if text.isdigit():
            return str(int(text))
        number = float(text)
    except ValueError:
        # Not numeric at all (or a digit-like character int() rejects).
        return text
    if math.isfinite(number) and number.is_integer():
        # pandas stores an integer column that contains NaN as float, so the
        # ID arrives as e.g. 1.0; collapse it back to "1".
        return str(int(number))
    return text


def process_policy_data(json_path, input_csv_path, output_csv_path):
    """Attach Starting/Trend/Ending labels from the recognition JSON to a policy CSV.

    Args:
        json_path: Path of the recognition JSON (top level L1, second level L2,
            third level cluster keys).
        input_csv_path: Path of the CSV to annotate. Rows are matched on the
            "L2政策中文名" column and the "聚类ID" column.
        output_csv_path: Path the annotated CSV is written to (UTF-8 with BOM);
            parent directories are created if needed.

    Rows are de-duplicated on "L2政策中文名" + "国家" (whichever of the two exist)
    before matching. For every row whose L2 name is found in the JSON, all
    existing L1 candidate columns are overwritten with the JSON's L1 name, and
    the Starting/Trend/Ending columns are filled from the row's cluster.
    Prints the total row count and the number of rows matched to a cluster.
    """
    # Read the JSON (top level L1, second level L2).
    with open(json_path, "r", encoding="utf-8") as f:
        recognition_data = json.load(f)

    # Build the L2 -> {L1, clusters} lookup table.
    l2_lookup = build_l2_lookup_from_json(recognition_data)

    # Read the CSV.
    df = pd.read_csv(input_csv_path)

    # De-duplicate on L2 policy name + country (using whichever columns exist).
    subset_cols = [c for c in ["L2政策中文名", "国家"] if c in df.columns]
    if subset_cols:
        df = df.drop_duplicates(subset=subset_cols, keep="first").reset_index(drop=True)

    # Make sure the result columns exist.
    for c in ["Starting", "Trend", "Ending"]:
        if c not in df.columns:
            df[c] = ""
    # If no L1 column exists at all, add L1_name_EN.
    if not any(col in df.columns for col in L1_COLUMNS_CANDIDATES):
        df["L1_name_EN"] = ""

    # The set of L1 columns to overwrite does not change per row, so compute it
    # once; fall back to L1_name_EN, which is guaranteed to exist at this point
    # whenever no candidate column is present.
    l1_columns = [col for col in L1_COLUMNS_CANDIDATES if col in df.columns] or ["L1_name_EN"]

    matched = 0
    for idx, row in df.iterrows():
        raw_name = row.get("L2政策中文名", "")
        # A missing name would otherwise be looked up as the literal string "nan".
        if pd.isna(raw_name):
            continue
        l2_cn = str(raw_name).strip()
        if not l2_cn:
            continue

        # Look up L1 and clusters in the JSON by the L2 name.
        entry = l2_lookup.get(l2_cn)
        if not entry:
            continue

        # Overwrite/write L1 with the value coming from the JSON.
        l1_from_json = entry.get("L1", "")
        if l1_from_json:
            for col in l1_columns:
                df.at[idx, col] = l1_from_json

        # Starting/Trend/Ending are still selected by the CSV's cluster ID.
        cluster_id = _normalize_cluster_id(row.get("聚类ID", ""))
        if cluster_id is None:
            continue

        clusters = entry["clusters"]
        if cluster_id in clusters:
            info = clusters[cluster_id] or {}
            df.at[idx, "Starting"] = str(info.get("Starting", ""))
            df.at[idx, "Trend"] = str(info.get("Trend", ""))
            df.at[idx, "Ending"] = str(info.get("Ending", ""))
            matched += 1

    # Save (all other columns are left unchanged).
    Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_csv_path, index=False, encoding="utf-8-sig")

    print(f"处理完成：共 {len(df)} 行，匹配成功 {matched} 行")
    print(f"输出文件：{Path(output_csv_path).resolve()}")

# Locate the recognition JSON and the input/output CSVs in the "data" folder
# next to the working directory.
current_dir = Path.cwd()
json_file = current_dir.parent / "data" / "3-2-Human_Recognition_Mode_txt.json"
input_csv = current_dir.parent / "data" / "3-1-L2_Policy_Clustering_countries.csv"
output_csv = current_dir.parent / "data" / "3-2-Human_Recognition_Mode_Countries.csv"

# Run the annotation step.
process_policy_data(
    json_path=json_file,
    input_csv_path=input_csv,
    output_csv_path=output_csv,
)


处理完成：共 735 行，匹配成功 735 行
输出文件：F:\Desktop\CAMPF_Supplementary\data\3-2-Human_Recognition_Mode_Countries.csv
