### 文件清洗

In [None]:
import pandas as pd 
import json
from pathlib import Path

# Resolve project folders relative to the working directory: the data folders
# are siblings of the directory this script/notebook is run from.
current_dir = Path.cwd()
data_dir = current_dir.parent / "data"
data_origin_dir = current_dir.parent / "data_origin"

json_country_path = data_dir / "country_list.json"
config_mapping_path = data_dir / "config_mappings.json"
wb_csv = data_origin_dir / "GDP_per-API_NY.GDP.PCAP.KD_DS2_en_csv_v2_130141.csv"

out_clean = data_dir / "5-1-GDP_clean.csv"

# -------- Load the set of country codes to keep --------
with open(json_country_path, "r", encoding="utf-8") as f:
    country_json = json.load(f)
if isinstance(country_json, dict) and "countries" in country_json:
    country_codes = set(country_json["countries"])
else:
    raise ValueError("国家列表 JSON 格式不符合要求，应包含 'countries' 字段。")

# -------- Load the mapping file and extract the country-name mapping --------
with open(config_mapping_path, "r", encoding="utf-8") as f:
    config_data = json.load(f)
country_name_map = config_data.get("country_names", {})

# The first four lines of the CSV are skipped; the header is read from the fifth.
df = pd.read_csv(wb_csv, skiprows=4)
df = df[df["Country Code"].isin(country_codes)].copy()

# -------- Keep only the 1990-2023 year columns that exist in the file --------
year_cols_all = [str(y) for y in range(1990, 2024)]
year_cols_exist = [c for c in df.columns if c in year_cols_all]
base_cols = ["Country Name", "Country Code", "Indicator Name"]
# .copy() makes the selection an independent frame, so the column assignment
# below does not write into a slice of the previous frame.
df = df[base_cols + year_cols_exist].copy()

# -------- Coerce the year columns to numeric (unparseable values -> NaN) --------
df[year_cols_exist] = df[year_cols_exist].apply(pd.to_numeric, errors="coerce")

# -------- Insert the Chinese country name as the second column --------
country_name_cn = df["Country Code"].map(country_name_map)
# Codes without a mapping end up as NaN names; report them instead of letting
# them pass silently, because the name is used as a key by later steps.
unmapped_codes = sorted(df.loc[country_name_cn.isna(), "Country Code"].unique())
if unmapped_codes:
    print(f"- 警告：以下国家代码缺少中文名称映射：{unmapped_codes}")
df.insert(1, "Country Name_CN", country_name_cn)

df.to_csv(out_clean, index=False, encoding="utf-8-sig")

print(f"- 清洗后数据（1990-2023）：{out_clean}")
print(f"- 共 {len(df)} 个国家/地区")
print(f"- 年份范围：1990-2023")

- 清洗后数据（1990-2023）：f:\Desktop\CAMPF_Supplementary\data\5-1-GDP_clean.csv
- 共 49 个国家/地区
- 年份范围：1990-2023


### 按簇画图

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.ticker import MaxNLocator
from pathlib import Path
import os


def setup_chinese_fonts():
    """Configure matplotlib for Chinese text and return the selected font.

    Candidate font families are tried in order. For the first family that
    resolves to an existing font file, ``font.sans-serif`` is set to that
    family, ``axes.unicode_minus`` is disabled and ``figure.dpi`` is set to
    300.

    Returns:
        A ``FontProperties`` bound to the matched font file, or a default
        ``FontProperties()`` (rcParams left untouched) when none of the
        candidates is installed.
    """
    candidates = ['SimHei', 'Microsoft YaHei', 'STHeiti', 'Heiti TC', 'Arial Unicode MS']
    for family in candidates:
        try:
            # By default findfont() falls back to the default font when the
            # family is missing, so every candidate would look installed.
            # Disabling the fallback makes a missing family raise ValueError.
            path = fm.findfont(family, fallback_to_default=False)
        except ValueError:
            continue
        if path and os.path.exists(path):
            plt.rcParams['font.sans-serif'] = [family]
            plt.rcParams['axes.unicode_minus'] = False
            plt.rcParams['figure.dpi'] = 300
            return fm.FontProperties(fname=path)
    return fm.FontProperties()


def load_data(data_dir):
    """Read the cluster-mapping CSV and the cleaned GDP CSV from ``data_dir``.

    Both files are read as UTF-8 with BOM ('utf-8-sig').

    Returns:
        A ``(cluster_df, gdp_df)`` tuple of DataFrames.
    """
    file_names = (
        "4-2-Consensus_Policy_Cluster_Mapping.csv",
        "5-1-GDP_clean.csv",
    )
    cluster_df, gdp_df = (
        pd.read_csv(data_dir / file_name, encoding='utf-8-sig')
        for file_name in file_names
    )
    return cluster_df, gdp_df


def prepare_gdp_data(gdp_df):
    """Build per-country GDP time series restricted to the years 2005-2023.

    Each row of ``gdp_df`` contributes one entry keyed by its
    'Country Name_CN' value; the entry is that row's year columns cast to
    float. A later row with the same name replaces an earlier one.

    Returns:
        A ``(country_gdp, year_cols)`` tuple: the name -> Series mapping and
        the list of year column labels ('2005' ... '2023').
    """
    year_cols = [str(y) for y in range(2005, 2024)]

    country_gdp = {
        record['Country Name_CN']: record[year_cols].astype(float)
        for _, record in gdp_df.iterrows()
    }

    return country_gdp, year_cols


def _mean_gdp_series(countries, country_gdp, year_cols):
    """Return the per-year mean GDP of ``countries``, ignoring missing values.

    Countries absent from ``country_gdp`` are skipped. A year for which no
    country has a value yields NaN.
    """
    available = [country_gdp[name] for name in countries if name in country_gdp]
    if not available:
        return pd.Series(np.nan, index=year_cols)
    table = pd.concat(available, axis=1).loc[year_cols].astype(float)
    return table.mean(axis=1, skipna=True)


def plot_k_clusters(k_value, cluster_data, country_gdp, year_cols, output_folder, font_cn):
    """Plot per-cluster GDP trends for one K value and save the figure as PNG.

    One subplot is drawn per cluster: a faint line per member country, the
    cluster mean, and the mean over all countries of this K. The member
    country names are listed in a text panel below each subplot.

    Args:
        k_value: Value matched against the 'K值' column of ``cluster_data``.
        cluster_data: DataFrame with the columns 'K值', '共识聚类ID' and '国家'.
        country_gdp: Mapping from country name to a Series indexed by the
            labels in ``year_cols``.
        year_cols: Year labels; each must be convertible with ``int()``.
        output_folder: Path-like supporting ``/``; the PNG is written there.
        font_cn: Font properties used for all Chinese text.

    Returns:
        None. If ``k_value`` has no rows in ``cluster_data`` a notice is
        printed and nothing is drawn.
    """
    # Select every cluster that belongs to this K value
    k_clusters = cluster_data[cluster_data['K值'] == k_value]
    cluster_ids = sorted(k_clusters['共识聚类ID'].unique())
    n_clusters = len(cluster_ids)

    # Without this guard the grid computation below divides by zero.
    if n_clusters == 0:
        print(f"[跳过] K={k_value} 没有聚类数据")
        return

    # Group the country names by cluster
    cluster_dict = {
        cluster_id: k_clusters.loc[k_clusters['共识聚类ID'] == cluster_id, '国家'].tolist()
        for cluster_id in cluster_ids
    }

    # Mean series within each cluster
    all_cluster_avgs = [
        _mean_gdp_series(cluster_dict[cluster_id], country_gdp, year_cols)
        for cluster_id in cluster_ids
    ]

    # Overall mean: simple average over the countries present for this K only
    all_countries = k_clusters['国家'].unique()
    global_avg = _mean_gdp_series(all_countries, country_gdp, year_cols)

    # Largest observed value, used to give every subplot the same y-axis
    country_maxima = [country_gdp[c].max() for c in all_countries if c in country_gdp]
    global_max = max([0.0] + [m for m in country_maxima if not pd.isna(m)])
    y_max = global_max * 1.1

    # Subplot grid: at most 4 columns
    n_cols = min(4, n_clusters)
    n_rows = (n_clusters + n_cols - 1) // n_cols

    # Extra figure height leaves room for the country-name panels
    fig = plt.figure(figsize=(5 * n_cols, 5.5 * n_rows))

    cluster_colors = plt.cm.tab10(np.linspace(0, 1, n_clusters))
    cluster_markers = ['o', 's', '^', 'D', 'v', 'p', '*', 'h']

    years_numeric = [int(y) for y in year_cols]

    # One chart plus one text panel per cluster; each cluster occupies two
    # grid rows (chart on top, names below).
    for idx, cluster_id in enumerate(cluster_ids):
        countries = cluster_dict[cluster_id]

        row = idx // n_cols
        col = idx % n_cols

        ax = plt.subplot2grid((n_rows * 2, n_cols), (row * 2, col), rowspan=1, fig=fig)

        color = cluster_colors[idx]
        marker = cluster_markers[idx % len(cluster_markers)]

        # Individual country curves
        for country in countries:
            if country in country_gdp:
                series = country_gdp[country]
                ax.plot(years_numeric, series.values,
                        color=color, alpha=0.3, lw=1, zorder=1)

        # Cluster mean
        cluster_avg = all_cluster_avgs[idx]
        ax.plot(years_numeric, cluster_avg.values,
                marker=marker, color=color, lw=3, ms=8,
                label='聚类平均',
                markevery=max(1, len(years_numeric) // 10), zorder=10)

        # Overall mean across all countries
        ax.plot(years_numeric, global_avg.values,
                color='#FF6B35', linestyle='--', lw=2.5,
                label='整体平均', alpha=0.9, zorder=9)

        # Title, axis labels and cosmetics
        ax.set_title(f'簇 {cluster_id} ({len(countries)}国)',
                     fontproperties=font_cn, fontweight='bold', fontsize=11)
        ax.set_xlabel('年份', fontproperties=font_cn, fontsize=9)
        ax.set_ylabel('人均GDP (不变价2015美元)', fontproperties=font_cn, fontsize=9)
        if y_max > 0:
            ax.set_ylim(0, y_max)
        else:
            # No data at all: avoid identical lower/upper limits.
            ax.set_ylim(bottom=0)
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.legend(prop=font_cn, loc='best', fontsize=8)
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='x', rotation=45, labelsize=8)
        ax.tick_params(axis='y', labelsize=8)

        # Text panel below the chart listing the member countries
        ax_text = plt.subplot2grid((n_rows * 2, n_cols), (row * 2 + 1, col), fig=fig)
        ax_text.axis('off')

        # Names are laid out in 6 columns, left to right, then top to bottom
        n_country_cols = 6
        for i, country in enumerate(countries):
            country_row = i // n_country_cols
            country_col = i % n_country_cols

            x = country_col * 0.166  # each column takes 16.6% of the width
            y = 1 - country_row * 0.125  # start at the top, 0.125 per row

            ax_text.text(x, y, country,
                         fontproperties=font_cn,
                         fontsize=8,
                         ha='left',
                         va='top')

    # Blank out the unused grid cells
    for idx in range(n_clusters, n_rows * n_cols):
        row = idx // n_cols
        col = idx % n_cols
        ax_empty = plt.subplot2grid((n_rows * 2, n_cols), (row * 2, col), rowspan=2, fig=fig)
        ax_empty.axis('off')

    # Figure title; the year span is taken from the data actually plotted
    year_span = f'{year_cols[0]}-{year_cols[-1]}' if len(year_cols) else ''
    fig.suptitle(f'K={k_value} 共识聚类GDP趋势分析 ({year_span})',
                 fontproperties=font_cn, fontweight='bold', fontsize=14, y=0.995)

    fig.tight_layout(rect=[0, 0, 1, 0.97])

    # Save, then close this specific figure to release its memory
    out_path = output_folder / f'K{k_value}_GDP_Cluster_Trends.png'
    fig.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

    print(f"[完成] K={k_value} (共{n_clusters}个簇)")


def main():
    """Run the full plotting pipeline: load data, then draw one figure per K.

    Paths are resolved relative to the current working directory: input data
    is read from ``../data`` and figures are written to
    ``../data/GDP_Cluster_Analysis`` (created if missing).
    """
    data_root = Path.cwd().parent / "data"
    figures_dir = data_root / "GDP_Cluster_Analysis"

    # Make sure the output folder exists before anything is saved
    figures_dir.mkdir(parents=True, exist_ok=True)

    print(f"数据文件夹: {data_root.resolve()}")
    print(f"输出文件夹: {figures_dir.resolve()}")

    # Chinese-capable font for all figure text
    cn_font = setup_chinese_fonts()

    print("\n正在加载数据...")
    clusters, gdp_table = load_data(data_root)

    # GDP series limited to 2005-2023
    gdp_by_country, years = prepare_gdp_data(gdp_table)

    # Every distinct K value present in the cluster mapping
    k_values = sorted(clusters['K值'].unique())

    print(f"找到 {len(k_values)} 个K值: {k_values}")
    print("分析时间范围: 2005-2023\n")

    # One figure per K value
    for k_value in k_values:
        plot_k_clusters(k_value, clusters, gdp_by_country, years, figures_dir, cn_font)

    print(f"\n所有图表已保存到: {figures_dir}")


if __name__ == "__main__":
    main()


数据文件夹: F:\Desktop\CAMPF_Supplementary\data
输出文件夹: F:\Desktop\CAMPF_Supplementary\data\GDP_Cluster_Analysis

正在加载数据...
找到 10 个K值: [np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11)]
分析时间范围: 2005-2023

[完成] K=2 (共2个簇)
[完成] K=3 (共3个簇)
[完成] K=4 (共4个簇)
[完成] K=5 (共5个簇)
[完成] K=6 (共6个簇)
[完成] K=7 (共7个簇)
[完成] K=8 (共8个簇)
[完成] K=9 (共9个簇)
[完成] K=10 (共10个簇)
[完成] K=11 (共11个簇)

所有图表已保存到: f:\Desktop\CAMPF_Supplementary\data\GDP_Cluster_Analysis


: 