##### 对X进行处理（各种变量）

In [1]:
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from typing import Dict, Tuple

# ================= Configuration and paths =================
# Input locations, written relative to the directory the script is run from.
JSON_PATH = "../country_list.json"
SHP_PATH = "../../data_origin/global/global_ems.shp"
DATA_ROOT = "../../data"

# Output sub-directories under DATA_ROOT, keyed by the kind of artefact stored.
OUT_DIRS = dict(
    long=os.path.join(DATA_ROOT, "PDF_data_Visual/Long_dataframe"),
    pdf=os.path.join(DATA_ROOT, "PDF_data_Visual/PDF_10_networks"),
    map=os.path.join(DATA_ROOT, "PDF_data_Visual/Gloabal_networks_map"),
)
# Create every output directory up front; directories that exist are left as is.
for d in OUT_DIRS.values():
    os.makedirs(d, exist_ok=True)

# Ten-colour palette used for the plots.
N_PALETTE = [
    "#4E79A7",
    "#F28E2B",
    "#E15759",
    "#76B7B2",
    "#59A14F",
    "#EDC948",
    "#B07AA1",
    "#FF9DA7",
    "#9C755F",
    "#BAB0AC",
]

def prepare_geodata(shp_path: str, json_path: str) -> Tuple[Dict[str, tuple], gpd.GeoDataFrame]:
    """Load the world map and build an ISO -> (x, y) lookup for study countries.

    Args:
        shp_path: Path of the shapefile to read. The code expects it to carry
            a ``Country`` name column and a ``research_c`` flag column.
        json_path: Path of a JSON file whose ``countries`` list holds objects
            with ``country_name`` and ``iso`` keys.

    Returns:
        A tuple ``(coord_map, world)``. ``coord_map`` maps the ISO code of
        every row with ``research_c == 1`` and a resolved ISO code to the
        ``(x, y)`` of its representative point. ``world`` is the full
        GeoDataFrame with an added ``iso`` column (null where the name could
        not be resolved).
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        countries = json.load(f)['countries']
    name_to_iso = {c['country_name']: c['iso'] for c in countries}

    world = gpd.read_file(shp_path)
    # Shapefile spellings that differ from the spellings used in the JSON list.
    alias = {"Russia": "Russian Federation", "South Korea": "Rep. of Korea", "Turkey": "Türkiye"}
    world['iso'] = world['Country'].apply(lambda x: name_to_iso.get(alias.get(x, x)))

    in_study = world['research_c'] == 1
    has_iso = world['iso'].notna()

    # A study country without an ISO code cannot be joined to the matrices.
    # Keeping it would put it under a null key (and merge all such rows into
    # one entry), so report it and leave it out instead.
    unmatched = world.loc[in_study & ~has_iso, 'Country']
    if not unmatched.empty:
        print(f"[Warning] No ISO code for study countries: {sorted(unmatched.astype(str))}")

    study = world[in_study & has_iso]
    points = study.geometry.representative_point()
    coord_map = {iso: (pt.x, pt.y) for iso, pt in zip(study['iso'], points)}
    return coord_map, world

def plot_network_map(long_df: pd.DataFrame, coords: dict, world: gpd.GeoDataFrame, 
                     save_path: str, color: str, is_binary: bool):
    """Draw the network on a world map and save it as an image.

    Edges are drawn in ascending ``Weight`` order and given ``zorder = 10 + w``
    so that stronger links end up on top of weaker ones. Nodes are drawn last
    at ``zorder=100``.

    Args:
        long_df: Edge list with ``Source``, ``Target`` and ``Weight`` columns.
        coords: Mapping from node id to an ``(x, y)`` pair. Edges whose
            endpoints are not both present are skipped. May be empty.
        world: GeoDataFrame used as the base map.
        save_path: Destination of the rendered figure.
        color: Colour of the edges.
        is_binary: When true every edge gets line width 1.0; otherwise the
            width is ``Weight * 4``.
    """
    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        world.plot(ax=ax, color='#f0f0f0', edgecolor='#d0d0d0', linewidth=0.5)

        # Ascending order: later (stronger) edges are painted over earlier ones.
        plot_df = long_df[long_df['Weight'] > 0].sort_values('Weight')
        # For dense non-binary networks keep only the upper half of the weights
        # so the map does not turn into a solid blob.
        if not is_binary and len(plot_df) > 500:
            plot_df = plot_df[plot_df['Weight'] >= plot_df['Weight'].quantile(0.5)]

        for u, v, w in zip(plot_df['Source'], plot_df['Target'], plot_df['Weight']):
            if u in coords and v in coords:
                p1, p2 = coords[u], coords[v]
                # NOTE(review): w * 4 presumably assumes weights scaled to
                # about [0, 1] -- confirm against the input matrices.
                l_width = 1.0 if is_binary else w * 4
                ax.plot([p1[0], p2[0]], [p1[1], p2[1]], color=color,
                        linewidth=l_width, alpha=0.3, zorder=10 + w)

        # Nodes. Unpacking zip() of an empty mapping raises ValueError, so
        # skip the scatter when no coordinates are available.
        if coords:
            lons, lats = zip(*coords.values())
            ax.scatter(lons, lats, color='#2c3e50', s=12, zorder=100)
        ax.axis('off')
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Close this figure explicitly, even if drawing or saving failed.
        plt.close(fig)

def main():
    """Convert every matrix CSV in DATA_ROOT to long form and plot it.

    For each file directly under ``DATA_ROOT`` whose name ends in ``.csv``,
    contains an underscore and does not contain ``long``:

    1. read it as a square matrix and melt it to ``Source, Target, Weight``
       rows, dropping self-pairs and missing values, then save the long table;
    2. save a distribution plot of the weights (count plot for 0/1 data,
       100-bin density histogram of the positive weights otherwise);
    3. save the network map.
    """
    coord_map, world_gdf = prepare_geodata(SHP_PATH, JSON_PATH)
    # os.listdir returns names in arbitrary order; sort so that the colour
    # given to each file is the same on every run and platform.
    files = sorted(
        f for f in os.listdir(DATA_ROOT)
        if f.endswith('.csv') and '_' in f and 'long' not in f
    )

    for i, f_name in enumerate(files):
        color = N_PALETTE[i % len(N_PALETTE)]
        base = os.path.splitext(f_name)[0]

        # 1. Matrix -> long table.
        df_mat = pd.read_csv(os.path.join(DATA_ROOT, f_name), index_col=0)
        # Name the index so reset_index() yields the 'Source' column melt needs.
        df_mat.index.name = 'Source'

        long_df = df_mat.reset_index().melt(id_vars='Source', var_name='Target', value_name='Weight')
        long_df = long_df[long_df['Source'] != long_df['Target']].dropna()
        long_df.to_csv(os.path.join(OUT_DIRS["long"], f"{base}_long.csv"), index=False)

        # 2. Empirical distribution of the weights (no smoothing).
        # 0.0 and 1.0 compare and hash equal to 0 and 1, so {0, 1} covers both.
        is_binary = set(long_df['Weight'].unique()).issubset({0, 1})

        fig, ax = plt.subplots(figsize=(7, 5))
        try:
            if is_binary:
                sns.countplot(data=long_df, x='Weight', color=color, alpha=0.8, ax=ax)
            else:
                # Fine bins with a step outline to stay close to the raw density.
                sns.histplot(long_df[long_df['Weight'] > 0]['Weight'], bins=100,
                             color=color, element="step", alpha=0.5, stat="density", ax=ax)

            ax.set_title(f"Empirical Distribution: {base}")
            fig.savefig(os.path.join(OUT_DIRS["pdf"], f"{base}_PDF.png"), dpi=300)
        finally:
            plt.close(fig)

        # 3. Global network map.
        plot_network_map(long_df, coord_map, world_gdf,
                         os.path.join(OUT_DIRS["map"], f"{base}_Map.png"),
                         color, is_binary)

if __name__ == "__main__":
    main()

##### 对Y进行处理（overlap的表格-分部门-总的）

In [2]:
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from pathlib import Path
from typing import Dict, Tuple

# ================= Configuration and paths =================

# 1. Root directory of the project data.
PROJECT_DATA_ROOT = Path(r"F:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary_V2\data")

# 2. Working directories for this step.
WORK_DIR = PROJECT_DATA_ROOT.joinpath("Y_overlapping_cluster_heatmap")
INPUT_DIR = WORK_DIR.joinpath("Raw")
OUT_DIR_LONG = WORK_DIR.joinpath("Long_dataframe")
OUT_DIR_VISUAL = WORK_DIR.joinpath("Visul")

# 3. Auxiliary files.
JSON_PATH = PROJECT_DATA_ROOT.joinpath("country_list.json")
SHP_PATH = PROJECT_DATA_ROOT.parent.joinpath("data_origin", "global", "global_ems.shp")

# Colour palette.
N_PALETTE = [
    "#4E79A7", "#F28E2B",
    "#E15759", "#76B7B2",
    "#59A14F", "#EDC948",
    "#B07AA1", "#FF9DA7",
    "#9C755F", "#BAB0AC",
]

# Country name -> ISO code, used when the shapefile has no ISO column.
NAME_MAP_FALLBACK = {
    "Argentina": "ARG",
    "Australia": "AUS",
    "Austria": "AUT",
    "Belgium": "BEL",
    "Bulgaria": "BGR",
    "Brazil": "BRA",
    "Canada": "CAN",
    "Switzerland": "CHE",
    "Chile": "CHL",
    "China": "CHN",
    "Colombia": "COL",
    "Costa Rica": "CRI",
    "Czech Republic": "CZE",
    "Germany": "DEU",
    "Denmark": "DNK",
    "Spain": "ESP",
    "Estonia": "EST",
    "Finland": "FIN",
    "France": "FRA",
    "United Kingdom": "GBR",
    "Greece": "GRC",
    "Croatia": "HRV",
    "Hungary": "HUN",
    "Indonesia": "IDN",
    "India": "IND",
    "Ireland": "IRL",
    "Iceland": "ISL",
    "Israel": "ISR",
    "Italy": "ITA",
    "Japan": "JPN",
    "South Korea": "KOR",
    "Lithuania": "LTU",
    "Luxembourg": "LUX",
    "Latvia": "LVA",
    "Mexico": "MEX",
    "Malta": "MLT",
    "Netherlands": "NLD",
    "Norway": "NOR",
    "New Zealand": "NZL",
    "Peru": "PER",
    "Poland": "POL",
    "Portugal": "PRT",
    "Romania": "ROU",
    "Russia": "RUS",
    "Saudi Arabia": "SAU",
    "Slovakia": "SVK",
    "Slovenia": "SVN",
    "Sweden": "SWE",
    "Turkey": "TUR",
    "United States": "USA",
    "South Africa": "ZAF",
}

def prepare_geodata(shp_path: Path, json_path: Path) -> Tuple[Dict[str, tuple], gpd.GeoDataFrame]:
    """Load the world map and build an ISO -> (x, y) coordinate lookup.

    The ISO code of each row is taken from the first of the columns
    ``ISO_A3``, ``ISO``, ``iso``, ``ADM0_A3``, ``Country_Code`` that exists.
    If none exists, the first of ``Country``, ``NAME``, ``Name``, ``ADMIN``
    that exists is translated through ``NAME_MAP_FALLBACK``.

    Args:
        shp_path: Path of the shapefile to read.
        json_path: Accepted for interface compatibility; not read by this
            function.

    Returns:
        A tuple ``(coord_map, world)``. ``coord_map`` maps each resolved ISO
        code to the ``(x, y)`` of the row's representative point. ``world`` is
        the loaded GeoDataFrame, with an ``iso_matched`` column when a source
        column was found. If the shapefile is missing, an empty dict and an
        empty GeoDataFrame are returned. If no usable column exists, an empty
        dict and the unmodified map are returned.
    """
    if not shp_path.exists():
        print(f"[Error] 地图文件未找到: {shp_path}")
        return {}, gpd.GeoDataFrame()

    world = gpd.read_file(shp_path)

    # Prefer a real ISO column when the shapefile has one.
    iso_candidates = ('ISO_A3', 'ISO', 'iso', 'ADM0_A3', 'Country_Code')
    iso_col = next((col for col in iso_candidates if col in world.columns), None)

    if iso_col is not None:
        world['iso_matched'] = world[iso_col]
    else:
        # Otherwise fall back to translating a country-name column.
        name_candidates = ('Country', 'NAME', 'Name', 'ADMIN')
        name_col = next((col for col in name_candidates if col in world.columns), None)
        if name_col is None:
            # Without this guard 'iso_matched' would not exist and the
            # dropna() below would raise KeyError.
            print(f"[Error] No ISO or country-name column found in: {shp_path}")
            return {}, world
        world['iso_matched'] = world[name_col].map(NAME_MAP_FALLBACK)

    # One representative point per row that has both an ISO code and a
    # usable geometry.
    coord_map = {}
    valid_world = world.dropna(subset=['iso_matched'])
    for iso, geom in zip(valid_world['iso_matched'], valid_world.geometry):
        if geom is not None and not geom.is_empty:
            pt = geom.representative_point()
            coord_map[iso] = (pt.x, pt.y)

    return coord_map, world

def plot_network_map(long_df: pd.DataFrame, coords: dict, world: gpd.GeoDataFrame, 
                     save_path: Path, color: str, is_binary: bool):
    """Render the links with Weight > 0 on a world map and save the figure.

    Nothing is drawn or saved when ``world`` is empty or when ``long_df`` has
    no row with a positive ``Weight``. Links whose ``Source`` or ``Target`` is
    missing from ``coords`` are skipped.
    """
    if world.empty:
        return

    fig, ax = plt.subplots(figsize=(14, 8))
    world.plot(ax=ax, color='#f0f0f0', edgecolor='#d0d0d0', linewidth=0.5)

    # Zero-weight pairs are not links, so they are never drawn.
    positive = long_df.loc[long_df['Weight'] > 0].copy()
    if positive.empty:
        plt.close()
        return

    # Ascending order, so heavier links are painted after lighter ones.
    edges = positive.sort_values('Weight')
    strongest = edges['Weight'].max()

    for src, dst, weight in zip(edges['Source'], edges['Target'], edges['Weight']):
        if src not in coords or dst not in coords:
            continue
        start, end = coords[src], coords[dst]

        # Width grows with the weight relative to the heaviest link, unless
        # the network is binary, where every link has the same width.
        if is_binary:
            width = 1.0
        else:
            width = 0.5 + (weight / strongest) * 3.0
        # The lightest third of the range is drawn fainter.
        opacity = 0.3 if weight < strongest / 3 else 0.6

        ax.plot(
            [start[0], end[0]],
            [start[1], end[1]],
            color=color,
            linewidth=width,
            alpha=opacity,
            zorder=10 + int(weight),
        )

    ax.axis('off')
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()

def process_category(category_name: str, coord_map: dict, world_gdf: gpd.GeoDataFrame):
    """Process every ``Y_*.csv`` matrix of one category.

    For each matrix under ``INPUT_DIR / category_name`` this writes a long
    table (zero weights kept, self-pairs removed), a distribution plot that
    includes the zeros, and a network map. A failure on one file is reported
    and the remaining files are still processed. Returns without doing
    anything when the category's input directory does not exist.

    Args:
        category_name: Sub-directory name shared by the input and output trees.
        coord_map: Node id -> ``(x, y)`` mapping passed on to the map plot.
        world_gdf: Base map passed on to the map plot.
    """
    input_subdir = INPUT_DIR / category_name
    out_long_subdir = OUT_DIR_LONG / category_name
    out_visual_subdir = OUT_DIR_VISUAL / category_name

    if not input_subdir.exists():
        return

    out_long_subdir.mkdir(parents=True, exist_ok=True)
    out_visual_subdir.mkdir(parents=True, exist_ok=True)

    # glob() yields paths in no guaranteed order; sort so each file gets the
    # same palette colour on every run.
    files = sorted(input_subdir.glob("Y_*.csv"))
    print(f"\nProcessing [{category_name}]: {len(files)} files")

    for i, f_path in enumerate(files):
        base_name = f_path.stem
        color = N_PALETTE[i % len(N_PALETTE)]
        try:
            # 1. Read the matrix.
            df_mat = pd.read_csv(f_path, index_col=0)
            df_mat.index.name = 'Source'

            # 2. Melt to a long table, keeping zero weights.
            long_df = df_mat.reset_index().melt(id_vars='Source', var_name='Target', value_name='Weight')

            # 3. Remove only the self-pairs; rows with Weight == 0 stay.
            long_df = long_df[long_df['Source'] != long_df['Target']]

            long_df.to_csv(out_long_subdir / f"{base_name}_long.csv", index=False, encoding='utf-8-sig')

            # Binary means every non-missing weight is 0 or 1. Missing values
            # are ignored here so a 0/1 matrix with gaps is still recognised.
            weights = long_df['Weight'].dropna()
            is_binary = (not weights.empty) and set(weights.unique()).issubset({0, 1})

            # 4. Distribution plot, zeros included.
            fig, ax = plt.subplots(figsize=(8, 5))
            try:
                if is_binary:
                    # Count plot: one bar for 0 and one for 1.
                    sns.countplot(data=long_df, x='Weight', color=color, alpha=0.8, ax=ax)
                    ax.set_title(f"Binary Distribution (Including 0s): {base_name}")
                    ax.set_xlabel("Overlap Count (0=No Overlap, 1=Overlap)")
                else:
                    # discrete=True centres one bar on each integer value.
                    sns.histplot(data=long_df, x='Weight', discrete=True, color=color, alpha=0.6, ax=ax)
                    ax.set_title(f"Weight Distribution (Including 0s): {base_name}")
                    ax.set_xlabel("Number of Overlapping Policies")

                ax.set_ylabel("Count of Country Pairs")
                ax.grid(axis='y', linestyle='--', alpha=0.3)
                fig.savefig(out_visual_subdir / f"{base_name}_PDF.png", dpi=300)
            finally:
                # Release the figure even when plotting or saving fails.
                plt.close(fig)

            # 5. Map. The full table is passed; plot_network_map itself keeps
            # only the rows with Weight > 0.
            plot_network_map(long_df, coord_map, world_gdf,
                             out_visual_subdir / f"{base_name}_Map.png", color, is_binary)

            print(f"  -> Done: {base_name}")

        except Exception as e:
            # Best effort per file: report the problem and move on.
            print(f"  [Error] {f_path.name}: {e}")

def main():
    """Load the geography once, then process the two overlap categories."""
    print("Loading Geography...")
    coordinates, basemap = prepare_geodata(SHP_PATH, JSON_PATH)

    # Processing still runs without coordinates; only the maps are affected.
    if not coordinates:
        print("[Warning] No coordinates mapped. Maps will be empty.")

    categories = ("Breadth", "Intensity")
    for category in categories:
        process_category(category, coordinates, basemap)

    print("\nAll tasks completed.")


if __name__ == "__main__":
    main()

Loading Geography...

Processing [Breadth]: 5 files
  -> Done: Y_All
  -> Done: Y_Commitment-based
  -> Done: Y_Incentive-based
  -> Done: Y_Regulatory
  -> Done: Y_Research and Development (R&D)

Processing [Intensity]: 5 files
  -> Done: Y_All
  -> Done: Y_Commitment-based
  -> Done: Y_Incentive-based
  -> Done: Y_Regulatory
  -> Done: Y_Research and Development (R&D)

All tasks completed.
