In [1]:
import pandas as pd
import requests
import os
import json
import time
from typing import List, Dict

# ================= Configuration =================
# CSV path that fetch_wb_data writes the long-format download to.
OUT_FILE = "../../data_origin/WB_RAW/WB_Raw_Data_Full_Expanded.csv"
# JSON file whose 'countries' entries supply the 'iso' codes to query.
JSON_PATH = "../country_list.json"
# Inclusive year range sent to the API and checked by diagnose_gaps.
START_YEAR, END_YEAR = 2005, 2023

# Short label -> World Bank indicator code. The label is stored in the
# 'indicator' column of the output; the code is inserted into the request URL.
INDICATORS = {
    "GDP_PerCap": "NY.GDP.PCAP.KD",
    "Population": "SP.POP.TOTL",
    "GHG_Total": "EN.GHG.ALL.MT.CE.AR5",
    "CO2_Total": "EN.GHG.CO2.MT.CE.AR5",
    "Coal_Rent": "NY.GDP.COAL.RT.ZS",
    "Oil_Rent":  "NY.GDP.PETR.RT.ZS",
    "Gas_Rent":  "NY.GDP.NGAS.RT.ZS",
    "Fuel_Export_Pct": "TX.VAL.FUEL.ZS.UN"
}

def fetch_wb_data(json_path: str, out_file: str, indicators: Dict[str, str]) -> pd.DataFrame:
    """Download World Bank indicator series in batches and persist them as CSV.

    Args:
        json_path: JSON file containing a ``countries`` list whose items carry
            an ``iso`` code.
        out_file: CSV path that receives the long-format result.
        indicators: Mapping of short label -> World Bank indicator code.

    Returns:
        Long-format frame with the columns ``indicator``, ``iso``, ``year`` and
        ``value``. Observations whose value is null are omitted. When nothing
        could be downloaded the frame is empty and no file is written.

    Each (indicator, batch) request is attempted up to three times. A batch
    that still fails is skipped, but a warning is printed so the gap is
    visible instead of being silently lost.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        target_isos = [c['iso'] for c in json.load(f)['countries']]

    batch_size = 20
    batches = [target_isos[i:i + batch_size] for i in range(0, len(target_isos), batch_size)]
    # The query string is the same for every request, so build it once.
    params = {"date": f"{START_YEAR}:{END_YEAR}", "format": "json", "per_page": 20000}
    max_attempts = 3
    records = []

    for name, code in indicators.items():
        for batch in batches:
            url = f"https://api.worldbank.org/v2/country/{';'.join(batch)}/indicator/{code}"
            problem = "no response"

            for attempt in range(1, max_attempts + 1):
                try:
                    resp = requests.get(url, params=params, timeout=30)
                    if resp.status_code == 200:
                        data = resp.json()
                        rows = data[1] if len(data) > 1 and data[1] else []
                        # Parse the whole page before extending `records` so a
                        # parse error followed by a retry cannot leave
                        # duplicated partial rows behind.
                        parsed = [{
                            'indicator': name,
                            'iso': e['countryiso3code'],
                            'year': int(e['date']),
                            'value': float(e['value'])
                        } for e in rows if e['value'] is not None]
                        records.extend(parsed)
                        break
                    problem = f"HTTP {resp.status_code}"
                except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
                    problem = repr(exc)

                # Back off before the next attempt, for HTTP errors as well as
                # exceptions; no pointless sleep after the final attempt.
                if attempt < max_attempts:
                    time.sleep(2)
            else:
                # Loop finished without `break`: every attempt failed.
                print(f"[WARN] {name} ({code}): batch {batch[0]}..{batch[-1]} "
                      f"skipped after {max_attempts} attempts ({problem})")

    df = pd.DataFrame(records)
    if not df.empty:
        # dirname is '' for a bare file name, and os.makedirs('') raises.
        out_dir = os.path.dirname(out_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(out_file, index=False)
    return df

def diagnose_gaps(df: pd.DataFrame, json_path: str) -> None:
    """Print, per year and indicator, how many target countries lack data.

    Args:
        df: Long-format frame with ``indicator``, ``iso`` and ``year`` columns
            (the shape produced by ``fetch_wb_data``). An empty frame is
            accepted and reported as entirely missing.
        json_path: JSON file containing a ``countries`` list whose items carry
            an ``iso`` code; its length is the expected country count.

    Cell legend: ``ok`` = nothing missing, ``!!`` = every country missing,
    otherwise the number of countries without a value.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        target_isos = [c['iso'] for c in json.load(f)['countries']]
    n_targets = len(target_isos)

    all_years = list(range(START_YEAR, END_YEAR + 1))
    all_inds = list(INDICATORS.keys())

    if df.empty:
        # A failed download yields a frame without columns, which pivot_table
        # rejects with KeyError. Use an all-NaN grid so the report still
        # prints and flags every cell as missing.
        pivot = pd.DataFrame(index=all_years, columns=all_inds, dtype=float)
    else:
        # Count rows per (year, indicator), then reindex so indicators or
        # years that are completely absent still appear in the table.
        pivot = df.pivot_table(index='year', columns='indicator', values='iso', aggfunc='count')
        pivot = pivot.reindex(index=all_years, columns=all_inds)

    missing_table = n_targets - pivot.fillna(0)

    # Plain-text report on stdout.
    print(f"\nMissing Data Report (Target: {n_targets} countries)")
    headers = [h[:7] for h in all_inds]
    print(f"{'Year':<6} | " + " | ".join([f"{h:>7}" for h in headers]))
    print("-" * (10 + 10 * len(all_inds)))

    for year in all_years:
        row = [("ok" if m == 0 else ("!!" if m == n_targets else str(int(m))))
               for m in missing_table.loc[year]]
        print(f"{year:<6} | " + " | ".join([f"{s:>7}" for s in row]))

# Entry point: download the configured indicators, then print the coverage
# report for the downloaded frame.
if __name__ == "__main__":
    raw_df = fetch_wb_data(JSON_PATH, OUT_FILE, INDICATORS)
    diagnose_gaps(raw_df, JSON_PATH)


Missing Data Report (Target: 49 countries)
Year   | GDP_Per | Populat | GHG_Tot | CO2_Tot | Coal_Re | Oil_Ren | Gas_Ren | Fuel_Ex
------------------------------------------------------------------------------------------
2005   |      ok |      ok |      ok |      ok |      ok |      ok |      ok |      ok
2006   |      ok |      ok |      ok |      ok |      ok |      ok |      ok |      ok
2007   |      ok |      ok |      ok |      ok |      ok |       1 |      ok |      ok
2008   |      ok |      ok |      ok |      ok |      ok |       1 |      ok |      ok
2009   |      ok |      ok |      ok |      ok |      ok |       1 |      ok |      ok
2010   |      ok |      ok |      ok |      ok |      ok |       1 |      ok |      ok
2011   |      ok |      ok |      ok |      ok |      ok |       1 |      ok |      ok
2012   |      ok |      ok |      ok |      ok |      ok |       1 |      ok |      ok
2013   |      ok |      ok |      ok |      ok |      ok |       1 |      ok |    

#### 对比填充结果

In [4]:
import pandas as pd
import numpy as np
import json
import os
from typing import List, Tuple

# ================= Configuration =================
# Long-format CSV loaded in the __main__ section below.
RAW_FILE = "../../data_origin/WB_RAW/WB_Raw_Data_Full_Expanded.csv"
# JSON file whose 'countries' entries supply the 'iso' codes (matrix axes).
JSON_PATH = "../country_list.json"
# Directory that receives the generated matrix CSV files.
OUTPUT_DIR = "../../data"
# Inclusive range of years iterated when building the yearly matrices.
YEAR_START, YEAR_END = 2005, 2023

def process_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape long-format indicator rows and add the derived attributes.

    Args:
        df: Frame with ``iso``, ``year``, ``indicator`` and ``value`` columns.

    Returns:
        One row per (iso, year) with one column per indicator, plus
        ``Attr_GHG_PC`` (only when both inputs exist) and ``Attr_Rent``.
        ``GDP_PerCap`` and ``Fuel_Export_Pct`` are renamed to ``Attr_GDP_PC``
        and ``Attr_Fuel_Ex`` when present.
    """
    table = df.pivot_table(index=['iso', 'year'], columns='indicator', values='value')
    table = table.reset_index()
    have = set(table.columns)

    # Per-capita emissions: total scaled by 1e6, divided by population.
    # Presumably GHG_Total is in megatonnes, giving tonnes per person - confirm.
    if {'GHG_Total', 'Population'} <= have:
        table['Attr_GHG_PC'] = table['GHG_Total'].mul(1e6).div(table['Population'])

    # Total rent: a missing component counts as 0, so the sum is never NaN.
    present_rents = [name for name in ('Coal_Rent', 'Oil_Rent', 'Gas_Rent') if name in have]
    table['Attr_Rent'] = table[present_rents].fillna(0).sum(axis=1)

    renames = {}
    for old, new in (('GDP_PerCap', 'Attr_GDP_PC'), ('Fuel_Export_Pct', 'Attr_Fuel_Ex')):
        if old in have:
            renames[old] = new
    return table.rename(columns=renames)

def calc_matrix(wide_df: pd.DataFrame, iso_list: List[str], use_nanmean: bool = False) -> np.ndarray:
    """Average the per-year min-max-normalised pairwise difference matrices.

    Args:
        wide_df: Frame indexed by year with one column per country.
        iso_list: Unused by the computation; kept so existing callers keep
            working. The matrix order follows the columns of ``wide_df``.
        use_nanmean: When True, average across years while ignoring NaN
            cells ("skip" mode); otherwise use a plain mean, so any NaN in a
            cell propagates to the result.

    Returns:
        Square array of averaged normalised differences with a zero diagonal.

    Raises:
        ValueError: If none of the years YEAR_START..YEAR_END is in the index.
    """
    yearly_mats = []
    for year in range(YEAR_START, YEAR_END + 1):
        if year not in wide_df.index:
            continue
        v = np.asarray(wide_df.loc[year].values, dtype=float)
        # Absolute pairwise difference; a pair involving NaN stays NaN.
        diff = np.abs(v.reshape(-1, 1) - v)

        if np.isnan(diff).all():
            # No country has a value this year: keep the year as "missing"
            # rather than letting nanmin/nanmax warn and fall back to zeros.
            norm = diff
        else:
            d_min, d_max = np.nanmin(diff), np.nanmax(diff)
            if d_max > d_min:
                norm = (diff - d_min) / (d_max - d_min)
            else:
                # Zero range: observed pairs are identical (0), but missing
                # pairs must stay NaN instead of being counted as distance 0.
                norm = np.where(np.isnan(diff), np.nan, 0.0)
        yearly_mats.append(norm)

    if not yearly_mats:
        raise ValueError(
            f"no rows for years {YEAR_START}-{YEAR_END} in wide_df; cannot build a matrix"
        )

    # Skip mode ignores NaN cells per pair; fill mode takes the plain mean.
    avg_mat = np.nanmean(yearly_mats, axis=0) if use_nanmean else np.mean(yearly_mats, axis=0)
    np.fill_diagonal(avg_mat, 0)
    return avg_mat

def run_comparison_pipeline(df: pd.DataFrame, iso_list: List[str], output_dir: str) -> None:
    """Build the difference matrices and report how much gap-filling alters them.

    For each attribute, one matrix is computed from forward-filled data
    (at most 3 consecutive rows, remaining gaps set to 0) and one from the
    unfilled data averaged with NaN cells ignored. The mean and maximum
    absolute deviation between the two are printed, and the filled matrix is
    written to ``<output_dir>/<name>.csv``.

    Args:
        df: Frame with ``iso``, ``year`` and the ``Attr_*`` columns.
        iso_list: Country codes used as column order and matrix labels.
        output_dir: Existing directory that receives the CSV files.
    """
    print(f"{'Indicator':<20} | {'Mean Diff':<12} | {'Max Diff':<12} | {'Impact %'}")
    print("-" * 65)

    jobs = {
        "1-6-1-GDP_Diff": "Attr_GDP_PC",
        "1-6-2-GHG_Diff": "Attr_GHG_PC",
        "1-6-3-Rent_Diff": "Attr_Rent",
        "1-6-4-Fuel_Ex_Diff": "Attr_Fuel_Ex",
    }

    for filename, col in jobs.items():
        if col not in df.columns:
            continue

        # Year x country table restricted (and ordered) by iso_list.
        unfilled = df.pivot(index='year', columns='iso', values=col)
        unfilled = unfilled.reindex(columns=iso_list)

        # Fill mode: carry the last value forward, then zero what is left.
        patched = unfilled.ffill(limit=3).fillna(0)
        mat_filled = calc_matrix(patched, iso_list, use_nanmean=False)

        # Skip mode: keep NaN and let the cross-year average ignore it.
        mat_skipped = calc_matrix(unfilled, iso_list, use_nanmean=True)

        # Deviation between the two variants.
        deviation = np.abs(mat_filled - mat_skipped)
        mean_err = np.mean(deviation)
        max_err = np.max(deviation)

        # Mean deviation relative to the mean level of the filled matrix.
        baseline = np.mean(mat_filled)
        if baseline > 0:
            impact = mean_err / baseline * 100
        else:
            impact = 0

        print(f"{col:<20} | {mean_err:<12.6f} | {max_err:<12.6f} | {impact:>7.2f}%")

        # The filled variant is the one persisted, with both axes sorted.
        labelled = pd.DataFrame(mat_filled, index=iso_list, columns=iso_list)
        labelled = labelled.sort_index(axis=0).sort_index(axis=1)
        labelled.to_csv(os.path.join(output_dir, f"{filename}.csv"))

# Entry point: make sure the output directory exists, read the target ISO
# codes, derive the attributes from the raw CSV and run the comparison.
if __name__ == "__main__":
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(JSON_PATH, 'r', encoding='utf-8') as f:
        target_isos = [c['iso'] for c in json.load(f)['countries']]
    
    attr_data = process_attributes(pd.read_csv(RAW_FILE))
    run_comparison_pipeline(attr_data, target_isos, OUTPUT_DIR)

Indicator            | Mean Diff    | Max Diff     | Impact %
-----------------------------------------------------------------
Attr_GDP_PC          | 0.000000     | 0.000000     |    0.00%
Attr_GHG_PC          | 0.000000     | 0.000000     |    0.00%
Attr_Rent            | 0.000000     | 0.000000     |    0.00%
Attr_Fuel_Ex         | 0.001020     | 0.042487     |    0.56%


#### 直接计算

##### 新版：相似度 = 1 − 归一化差异度（启用版）

In [1]:
import pandas as pd
import numpy as np
import json
import os
from typing import List

# ================= Configuration =================
# Long-format CSV loaded in the __main__ section below.
RAW_FILE = "../../data_origin/WB_RAW/WB_Raw_Data_Full_Expanded.csv"
# JSON file whose 'countries' entries supply the 'iso' codes (matrix axes).
JSON_PATH = "../country_list.json"
# Directory that receives the generated similarity-matrix CSV files.
OUTPUT_DIR = "../../data"
# Inclusive range of years iterated when building the yearly matrices.
YEAR_START, YEAR_END = 2005, 2023

def process_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """Pivot the long-format indicators and append the derived attributes.

    Args:
        df: Frame with ``iso``, ``year``, ``indicator`` and ``value`` columns.

    Returns:
        One row per (iso, year) holding the indicator columns,
        ``Attr_GHG_PC`` (when ``GHG_Total`` and ``Population`` both exist) and
        ``Attr_Rent``; ``GDP_PerCap`` / ``Fuel_Export_Pct`` are renamed to
        ``Attr_GDP_PC`` / ``Attr_Fuel_Ex`` when present.
    """
    wide = df.pivot_table(values='value', index=['iso', 'year'], columns='indicator')
    wide = wide.reset_index()

    # Per-capita greenhouse gases: total scaled by 1e6 over population.
    # Presumably megatonnes -> tonnes per person; confirm against the source.
    has_ghg_inputs = all(col in wide.columns for col in ('GHG_Total', 'Population'))
    if has_ghg_inputs:
        scaled_total = wide['GHG_Total'] * 1e6
        wide['Attr_GHG_PC'] = scaled_total / wide['Population']

    # Combined fossil rent; absent components contribute 0 to the sum.
    rent_parts = [col for col in ('Coal_Rent', 'Oil_Rent', 'Gas_Rent') if col in wide.columns]
    rent_values = wide[rent_parts].fillna(0)
    wide['Attr_Rent'] = rent_values.sum(axis=1)

    # Harmonise the names of the two pass-through attributes.
    final_names = {}
    if 'GDP_PerCap' in wide.columns:
        final_names['GDP_PerCap'] = 'Attr_GDP_PC'
    if 'Fuel_Export_Pct' in wide.columns:
        final_names['Fuel_Export_Pct'] = 'Attr_Fuel_Ex'
    return wide.rename(columns=final_names)

def generate_average_sim_matrices(df: pd.DataFrame, iso_list: List[str], output_dir: str) -> None:
    """Write the multi-year average similarity matrix for each attribute.

    For every year the pairwise absolute differences are min-max normalised
    and turned into a similarity (``1 - normalised difference``); the yearly
    matrices are averaged, the diagonal is set to 0 and the result is saved
    as ``<output_dir>/<name>.csv``.

    Args:
        df: Frame with ``iso``, ``year`` and the ``Attr_*`` columns.
        iso_list: Country codes used as column order and matrix labels.
        output_dir: Existing directory that receives the CSV files.
    """
    # Output file stem (suffix _Sim = similarity) paired with its attribute.
    outputs = (
        ("1-6-1-GDP_Sim", "Attr_GDP_PC"),
        ("1-6-2-GHG_Sim", "Attr_GHG_PC"),
        ("1-6-3-Rent_Sim", "Attr_Rent"),
        ("1-6-4-Fuel_Ex_Sim", "Attr_Fuel_Ex"),
    )
    year_span = range(YEAR_START, YEAR_END + 1)

    def similarity_for(values):
        # Pairwise |a_i - a_j|, scaled to [0, 1] within the year.
        gaps = np.abs(values[:, None] - values)
        lo, hi = gaps.min(), gaps.max()
        if hi > lo:
            scaled = (gaps - lo) / (hi - lo)
        else:
            scaled = np.zeros_like(gaps)
        # 1 means identical (smallest gap), 0 means most dissimilar.
        return 1 - scaled

    for filename, column in outputs:
        if column not in df.columns:
            continue

        # Year x country table: forward-fill up to 3 rows, zero the rest.
        table = df.pivot(index='year', columns='iso', values=column)
        table = table.reindex(columns=iso_list).ffill(limit=3).fillna(0)

        per_year = [similarity_for(table.loc[y].values) for y in year_span if y in table.index]
        if not per_year:
            continue

        mean_sim = np.mean(per_year, axis=0)
        # Zero diagonal: self-loops are excluded from the network analysis.
        np.fill_diagonal(mean_sim, 0)

        labelled = pd.DataFrame(mean_sim, index=iso_list, columns=iso_list)
        labelled = labelled.sort_index(axis=0).sort_index(axis=1)
        labelled.to_csv(os.path.join(output_dir, f"{filename}.csv"))
        print(f"已生成相似度矩阵: {filename}.csv")

# Entry point: make sure the output directory exists, read the target ISO
# codes, derive the attributes from the raw CSV and write the matrices.
if __name__ == "__main__":
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(JSON_PATH, 'r', encoding='utf-8') as f:
        target_isos = [c['iso'] for c in json.load(f)['countries']]

    attr_data = process_attributes(pd.read_csv(RAW_FILE))
    # Call the similarity-matrix generator.
    generate_average_sim_matrices(attr_data, target_isos, OUTPUT_DIR)

已生成相似度矩阵: 1-6-1-GDP_Sim.csv
已生成相似度矩阵: 1-6-2-GHG_Sim.csv
已生成相似度矩阵: 1-6-3-Rent_Sim.csv
已生成相似度矩阵: 1-6-4-Fuel_Ex_Sim.csv


##### 旧版本的差异度

In [None]:
import pandas as pd
import numpy as np
import json
import os
from typing import List

# ================= Configuration =================
# Long-format CSV loaded in the __main__ section below.
RAW_FILE = "../../data_origin/WB_RAW/WB_Raw_Data_Full_Expanded.csv"
# JSON file whose 'countries' entries supply the 'iso' codes (matrix axes).
JSON_PATH = "../country_list.json"
# Directory that receives the generated difference-matrix CSV files.
OUTPUT_DIR = "../../data"
# Inclusive range of years iterated when building the yearly matrices.
YEAR_START, YEAR_END = 2005, 2023

def process_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """Turn long-format indicator rows into per-(iso, year) attribute columns.

    Args:
        df: Frame with ``iso``, ``year``, ``indicator`` and ``value`` columns.

    Returns:
        The pivoted frame with ``Attr_GHG_PC`` added when ``GHG_Total`` and
        ``Population`` are both available, ``Attr_Rent`` always added, and
        ``GDP_PerCap`` / ``Fuel_Export_Pct`` renamed to ``Attr_GDP_PC`` /
        ``Attr_Fuel_Ex`` when present.
    """
    frame = (
        df.pivot_table(index=['iso', 'year'], columns='indicator', values='value')
        .reset_index()
    )
    available = list(frame.columns)

    # Emissions per head: (GHG_Total * 1e6) / Population. The source note
    # describes this as Mt -> tonnes per person; units not verifiable here.
    if 'GHG_Total' in available and 'Population' in available:
        scaled_total = frame['GHG_Total'] * 1e6
        frame['Attr_GHG_PC'] = scaled_total / frame['Population']

    # Coal + oil + gas rent, with gaps in any component counted as 0.
    rent_sources = []
    for candidate in ('Coal_Rent', 'Oil_Rent', 'Gas_Rent'):
        if candidate in available:
            rent_sources.append(candidate)
    frame['Attr_Rent'] = frame[rent_sources].fillna(0).sum(axis=1)

    # Only rename the columns that actually exist.
    wanted = (('GDP_PerCap', 'Attr_GDP_PC'), ('Fuel_Export_Pct', 'Attr_Fuel_Ex'))
    mapping = dict(pair for pair in wanted if pair[0] in available)
    return frame.rename(columns=mapping)

def generate_average_diff_matrices(df: pd.DataFrame, iso_list: List[str], output_dir: str) -> None:
    """Write the multi-year average normalised difference matrix per attribute.

    Each year's pairwise absolute differences are min-max normalised on their
    own, so that a year with large magnitudes does not dominate the average.
    The yearly matrices are averaged, the diagonal is set to 0 and the result
    is saved as ``<output_dir>/<name>.csv``.

    Args:
        df: Frame with ``iso``, ``year`` and the ``Attr_*`` columns.
        iso_list: Country codes used as column order and matrix labels.
        output_dir: Existing directory that receives the CSV files.
    """
    # Output file stem -> attribute column.
    job_table = {
        "1-6-1-GDP_Diff": "Attr_GDP_PC",
        "1-6-2-GHG_Diff": "Attr_GHG_PC",
        "1-6-3-Rent_Diff": "Attr_Rent",
        "1-6-4-Fuel_Ex_Diff": "Attr_Fuel_Ex",
    }

    for filename, attr_col in job_table.items():
        if attr_col not in df.columns:
            continue

        # Year x country table; forward-fill trailing gaps (max 3 rows) and
        # replace whatever is still missing with 0.
        by_year = df.pivot(index='year', columns='iso', values=attr_col)
        by_year = by_year.reindex(columns=iso_list)
        by_year = by_year.ffill(limit=3).fillna(0)

        stack = []
        year = YEAR_START
        while year <= YEAR_END:
            if year in by_year.index:
                row = by_year.loc[year].values
                # Column vector minus row vector broadcasts to |A_i - A_j|.
                gaps = np.abs(row.reshape(-1, 1) - row.reshape(1, -1))

                low = gaps.min()
                high = gaps.max()
                if high > low:
                    stack.append((gaps - low) / (high - low))
                else:
                    # All pairs equal: the normalised difference is zero.
                    stack.append(np.zeros_like(gaps))
            year += 1

        if not stack:
            continue

        # Average across years; the distance of a country to itself is 0.
        mean_diff = np.mean(stack, axis=0)
        np.fill_diagonal(mean_diff, 0)

        # Label, then sort both axes so rows and columns stay aligned.
        out_frame = pd.DataFrame(mean_diff, index=iso_list, columns=iso_list)
        out_frame = out_frame.sort_index(axis=0).sort_index(axis=1)
        out_frame.to_csv(os.path.join(output_dir, f"{filename}.csv"))

# Entry point: make sure the output directory exists, read the target ISO
# codes, derive the attributes from the raw CSV and write the matrices.
if __name__ == "__main__":
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(JSON_PATH, 'r', encoding='utf-8') as f:
        target_isos = [c['iso'] for c in json.load(f)['countries']]

    attr_data = process_attributes(pd.read_csv(RAW_FILE))
    generate_average_diff_matrices(attr_data, target_isos, OUTPUT_DIR)