In [None]:
# ==========================================
# Cell 1: 聚类分析 (Clustering)
# ==========================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import linkage, fcluster
from pathlib import Path
from typing import Tuple, List, Dict
import warnings

# 忽略警告
warnings.filterwarnings('ignore')

# 1. 核心聚类算法
def auto_select_k_and_cluster(X_data: np.ndarray, max_k: int = 10) -> np.ndarray:
    """Cluster the rows of ``X_data`` with Ward linkage and an auto-selected k.

    For every candidate k in ``2 .. min(max_k, n_samples)`` the dendrogram is
    cut with ``fcluster(..., criterion='maxclust')`` and the within-cluster sum
    of squares (WCSS) is computed. The chosen k is the one reached by the
    largest WCSS drop between two consecutive candidates (elbow heuristic).

    Args:
        X_data: 2-D array-like, one row per sample. Anything accepted by
            ``np.asarray(..., dtype=float)`` works, including lists of lists.
        max_k: Upper bound on the number of clusters to try.

    Returns:
        Zero-based integer cluster labels, one per row. All zeros when there
        are fewer than 3 samples or when ``max_k`` allows no split (< 2).
    """
    X_data = np.asarray(X_data, dtype=float)
    n_samples = len(X_data)
    # Nothing to split: too few samples, or the cap forbids more than one cluster.
    # (Previously max_k < 2 produced an empty candidate list and an IndexError.)
    if n_samples < 3 or max_k < 2:
        return np.zeros(n_samples, dtype=int)

    Z = linkage(X_data, method='ward')
    valid_ks = list(range(2, min(max_k, n_samples) + 1))

    wcss_list = []
    for k in valid_ks:
        labels = fcluster(Z, k, criterion='maxclust')
        wcss = 0.0
        # Only labels that actually occur are visited; 'maxclust' may yield
        # fewer than k clusters.
        for label in np.unique(labels):
            cluster_points = X_data[labels == label]
            center = cluster_points.mean(axis=0)
            wcss += np.sum((cluster_points - center) ** 2)
        wcss_list.append(wcss)

    if len(wcss_list) < 2:
        best_k = valid_ks[0]
    else:
        # drops[i] is the WCSS reduction when going from valid_ks[i] to
        # valid_ks[i + 1]; pick the k that the largest drop leads to.
        drops = -np.diff(wcss_list)
        best_k = valid_ks[int(np.argmax(drops)) + 1]

    # fcluster labels start at 1; shift to zero-based IDs.
    return fcluster(Z, best_k, criterion='maxclust') - 1

def run_clustering_single_metric(df_source: pd.DataFrame, l2_name: str, need_scale: bool = False) -> pd.Series:
    """Cluster countries by the 2005-2023 trajectory of a single L2 policy column.

    Args:
        df_source: Long-format table with 'REF_AREA', 'TIME_PERIOD' and the
            column named by ``l2_name``.
        l2_name: Name of the policy column to cluster on.
        need_scale: When True, pass the country-by-year matrix through
            ``MinMaxScaler.fit_transform`` before clustering (skipped if the
            matrix holds a single constant value).

    Returns:
        A Series named 'ClusterID' indexed by REF_AREA, or an empty int Series
        when no rows fall inside the year window.
    """
    # Restrict strictly to the 2005-2023 window.
    in_window = (df_source['TIME_PERIOD'] >= 2005) & (df_source['TIME_PERIOD'] <= 2023)
    windowed = df_source[in_window]

    # One row per REF_AREA, one column per year; missing cells become 0.
    wide = windowed.set_index(['REF_AREA', 'TIME_PERIOD'])[l2_name].unstack().fillna(0)
    if wide.empty:
        return pd.Series(dtype=int)

    features = wide.values
    # Scaling a constant matrix is pointless, so it is left untouched.
    if need_scale and features.max() != features.min():
        features = MinMaxScaler().fit_transform(wide)

    cluster_ids = auto_select_k_and_cluster(features)
    return pd.Series(cluster_ids, index=wide.index, name='ClusterID')

# 2. 数据 IO
def load_data(data_dir: Path):
    """Read the breadth and intensity parquet tables from ``data_dir``.

    Returns:
        ``(breadth_df, intensity_df, l2_columns)`` where ``l2_columns`` lists
        every breadth column other than 'REF_AREA' and 'TIME_PERIOD', in the
        order they appear in the breadth table.
    """
    breadth = pd.read_parquet(data_dir / "2-1-country_breadth.parquet")
    intensity = pd.read_parquet(data_dir / "2-1-country_intensity.parquet")

    # Everything that is not an identifier column is treated as an L2 policy.
    key_columns = {'REF_AREA', 'TIME_PERIOD'}
    policy_columns = [name for name in breadth.columns if name not in key_columns]
    return breadth, intensity, policy_columns

def save_clustered_data(cluster_results: Dict[str, pd.Series], df_source: pd.DataFrame, output_path: Path):
    """Write the long-format source rows, tagged with their cluster ID, to CSV.

    Args:
        cluster_results: Maps each L2 policy column name to a Series of
            cluster IDs indexed by REF_AREA. Empty Series are skipped.
        df_source: Table holding 'REF_AREA', 'TIME_PERIOD' and the policy
            columns referenced by ``cluster_results``.
        output_path: Destination CSV (written as UTF-8 with BOM). Nothing is
            written when no policy produced any rows.
    """
    # Apply the same 2005-2023 window as the clustering step so the export matches.
    year = df_source['TIME_PERIOD']
    windowed = df_source[(year >= 2005) & (year <= 2023)]

    frames = []
    for policy, labels in cluster_results.items():
        if labels.empty:
            continue

        part = windowed[['REF_AREA', 'TIME_PERIOD', policy]].copy()
        part.columns = ['国家', '年份', '占比']  # unified output column names
        part['聚类ID'] = part['国家'].map(labels)
        part['L2政策'] = policy
        part['L2政策中文名'] = policy  # placeholder so the column always exists

        # Countries without a cluster assignment are dropped before the int cast.
        part = part.dropna(subset=['聚类ID']).astype({'聚类ID': int})
        frames.append(part)

    if not frames:
        return

    pd.concat(frames).to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"✅ 已保存: {output_path.name}")

# 3. 执行
# Prefer ../data; fall back to ./data when the parent has no data folder.
base_dir = Path.cwd().parent
data_dir = base_dir / "data"
if not data_dir.exists():
    data_dir = Path.cwd() / "data"
print(f"数据目录: {data_dir}")

df_b, df_i, l2_list = load_data(data_dir)

# Breadth is clustered on the raw values (no scaling).
print("正在执行 Breadth 聚类 (2005-2023)...")
b_results = {
    policy: run_clustering_single_metric(df_b, policy, need_scale=False)
    for policy in l2_list
}
save_clustered_data(b_results, df_b, data_dir / "3-1-L2_Policy_Clustering_Breadth.csv")

# Intensity is min-max scaled before clustering.
print("正在执行 Intensity 聚类 (2005-2023)...")
i_results = {
    policy: run_clustering_single_metric(df_i, policy, need_scale=True)
    for policy in l2_list
}
save_clustered_data(i_results, df_i, data_dir / "3-1-L2_Policy_Clustering_Intensity.csv")

In [None]:
# ==========================================
# 直接画图并排序
# ==========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.ticker import MaxNLocator, MultipleLocator
from pathlib import Path
from typing import Tuple, Any
import matplotlib as mpl

# === 配置与样式 ===
def setup_mpl_single2() -> None:
    """Install the global Matplotlib rcParams shared by the figures in this cell."""
    style = {
        # Base font size; the 'small' entries below are relative to it.
        'font.size': 25,
        'legend.fontsize': 'small',
        'xtick.labelsize': 'small',
        'ytick.labelsize': 'small',
        'axes.titlesize': 'small',
        # Line and frame weights.
        'lines.linewidth': 2,
        'axes.linewidth': 2,
        # Tick geometry: inward ticks, bottom/left sides only, padded labels.
        'xtick.major.pad': '12',
        'ytick.major.pad': '12',
        'xtick.direction': 'in',
        'ytick.direction': 'in',
        'xtick.top': False,
        'ytick.right': False,
        'mathtext.default': 'regular',
    }
    mpl.rcParams.update(style)

# Apply the shared rcParams once at cell level.
setup_mpl_single2()

# Colour list for the cluster subplots; plot_policy_trends picks entry
# `idx % len(NATURE_COLORS)` for the idx-th panel, so colours repeat after 15.
NATURE_COLORS = [
    '#E64B35', "#6917C2", '#00A087', '#3C5488', '#F39B7F', 
    '#8491B4', '#91D1C2', '#DC0000', '#7E6148', '#B09C85', 
    '#E18727', '#20854E', '#0072B5', '#BC3C29', '#6F99AD'
]

def lighten_color(color: Any, amount: float = 0.7) -> Tuple[float, float, float]:
    """Blend ``color`` toward white and return the result as an RGB tuple.

    Each channel ``c`` becomes ``c + (1 - c) * amount``, so ``amount=0``
    leaves the colour unchanged and ``amount=1`` yields white.
    """
    red, green, blue = mcolors.to_rgb(color)
    return (
        red + (1 - red) * amount,
        green + (1 - green) * amount,
        blue + (1 - blue) * amount,
    )

# === 排序与辅助逻辑 ===
def sort_clusters(sort_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the metadata ordered by Starting > Ending > Trend.

    A tuple 'sort_key' column is added to the copy and used for ordering:
    (Starting rank, Ending rank, Trend rank, MeanStart). Labels missing from
    the rank tables get rank 99; a missing 'MeanStart' column counts as 0.
    The input frame is not modified.
    """
    level_rank = {'Low': 0, 'Medium': 1, 'High': 2}
    trend_rank = {'Rise': 0, 'Stable': 1, 'Fluctuate': 2, 'Decline': 3}
    unknown_rank = 99

    def _row_key(row):
        # Starting and Ending share the same Low/Medium/High ranking.
        return (
            level_rank.get(row['Starting'], unknown_rank),
            level_rank.get(row['Ending'], unknown_rank),
            trend_rank.get(row['Trend'], unknown_rank),
            row.get('MeanStart', 0),
        )

    ranked = sort_df.copy()
    ranked['sort_key'] = ranked.apply(_row_key, axis=1)
    return ranked.sort_values('sort_key')

def get_plot_title(sort_df: pd.DataFrame, l2_name: str, cid: int) -> str:
    """Build the subplot title 'Starting+Trend+Ending' for one cluster.

    The first metadata row matching both the policy name and the cluster ID
    is used; ' Share' is stripped from its Ending label. Falls back to
    'Cluster <cid>' when no row matches.
    """
    is_match = (sort_df['L2政策中文名'] == l2_name) & (sort_df['聚类ID'] == cid)
    matches = sort_df[is_match]
    if matches.empty:
        return f"Cluster {cid}"

    first = matches.iloc[0]
    ending = first['Ending'].replace(' Share', '')
    return f"{first['Starting']}+{first['Trend']}+{ending}"

# === 核心绘图逻辑 ===
def plot_policy_trends(l2_name: str, l2_data: pd.DataFrame, sort_df: pd.DataFrame,
                       metric_label: str, output_dir: Path) -> None:
    """Draw one PNG per L2 policy with a subplot for every cluster.

    Each panel shows the individual country series of the cluster, the cluster
    mean and the overall mean across all countries. Panels are laid out three
    per row in the order given by the metadata 'sort_key'.

    Args:
        l2_name: Policy name; matched against 'L2政策中文名' in ``sort_df``.
        l2_data: Long-format rows for this policy with columns '年份', '国家'
            and '占比'.
        sort_df: Cluster metadata with 'L2政策中文名', '聚类ID', '国家' and
            'sort_key', plus the columns read by ``get_plot_title``.
        metric_label: Used as the y-axis label and in the output file name.
        output_dir: Existing directory that receives the PNG.

    Returns:
        None. Nothing is drawn when there is no data in 2005-2023 or no
        metadata for the policy.
    """
    # 1. Strictly keep the 2005-2023 window.
    df_plot = l2_data[(l2_data['年份'] >= 2005) & (l2_data['年份'] <= 2023)]
    if df_plot.empty:
        return

    # 2. Cluster IDs in sorted order (first appearance after sorting by sort_key).
    l2_sort_info = sort_df[sort_df['L2政策中文名'] == l2_name]
    if l2_sort_info.empty:
        return
    sorted_cids = l2_sort_info.sort_values('sort_key')['聚类ID'].unique()

    # 3. Year x country matrix and the shared y-range.
    years = sorted(df_plot['年份'].unique())
    matrix = df_plot.pivot(index='年份', columns='国家', values='占比').reindex(years)
    overall_mean = matrix.mean(axis=1)
    y_max = matrix.max().max() * 1.15
    # An all-missing or all-zero matrix would give NaN / identical limits,
    # which set_ylim rejects or warns about; fall back to a unit range.
    if not np.isfinite(y_max) or y_max <= 0:
        y_max = 1.0
    # The y-axis extends 4% below zero so points at 0 are not clipped.
    y_neg_padding = y_max * 0.04

    # 4. Grid layout: three panels per row.
    n_clusters = len(sorted_cids)
    n_cols, n_rows = 3, (n_clusters + 2) // 3
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3.5 * n_rows), squeeze=False)

    try:
        for idx, cid in enumerate(sorted_cids):
            ax = axes[idx // n_cols, idx % n_cols]
            cluster_members = l2_sort_info[l2_sort_info['聚类ID'] == cid]['国家'].values
            countries = [c for c in cluster_members if c in matrix.columns]
            if not countries:
                # No data for this cluster: hide the panel instead of leaving
                # an empty framed axes in the figure.
                ax.axis('off')
                continue

            # Style parameters.
            color = NATURE_COLORS[idx % len(NATURE_COLORS)]
            fill_color = lighten_color(color, 0.7)
            sub_matrix = matrix[countries]
            c_mean = sub_matrix.mean(axis=1)

            # Individual countries (faint), cluster mean, overall mean.
            ax.plot(sub_matrix.index, sub_matrix.values, color=color, alpha=0.25, lw=1.2, zorder=1)
            ax.plot(c_mean.index, c_mean, marker='o', color=color, lw=2.5, ms=7,
                    mfc=fill_color, mec=color, mew=1.8, label='Cluster Average',
                    markevery=max(1, len(years)//10), zorder=10)
            ax.plot(overall_mean.index, overall_mean, color='#000000', ls='--', lw=2.5,
                    label='Overall Average', alpha=0.85, zorder=9, dashes=(3, 2))

            # Title and axes.
            title = get_plot_title(l2_sort_info, l2_name, cid)
            ax.set_title(f"{title}\n({len(countries)} countries)", pad=15, ha='center')
            ax.set_ylim(bottom=-y_neg_padding, top=y_max)

            ax.xaxis.set_major_locator(MultipleLocator(1))
            ax.yaxis.set_major_locator(MaxNLocator(nbins=6))
            ax.tick_params(axis='x', rotation=90, labelsize=15, pad=5.5)
            ax.tick_params(axis='y')

            # Only the left-most panel of each row carries the y label.
            if idx % n_cols == 0:
                ax.set_ylabel(metric_label)
            else:
                ax.set_ylabel('')

            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)

            # Legend is shown on every panel.
            ax.legend(loc='best', frameon=False, fontsize=12, handlelength=2.0, markerscale=0.8)

        # Hide the unused cells of the grid.
        for j in range(n_clusters, n_rows * n_cols):
            axes[j // n_cols, j % n_cols].axis('off')

        fig.subplots_adjust(wspace=0.25, top=0.85, bottom=0.15, hspace=0.45)

        safe_name = l2_name.replace("/", "_").replace(" ", "_").replace(":", "")
        out_path = output_dir / f"{safe_name}_{metric_label}_Sorted.png"
        fig.savefig(out_path, dpi=300, bbox_inches='tight', pad_inches=0.1)
    finally:
        # Always release this figure, even if drawing or saving failed.
        plt.close(fig)
    print(f"  -> 保存: {out_path.name}")

# === 执行流程 ===
def process_file_plotting(file_name: str, metric_label: str) -> None:
    """Plot every L2 policy found in one clustering CSV.

    Args:
        file_name: Clustering CSV inside the data directory.
        metric_label: Value matched against the 'Type' column of the feature
            file; also names the output sub-folder and labels the plots.

    Prints a message and returns without plotting when either the clustering
    CSV or '3-2-Automated_Recognition_Mode.csv' is missing.
    """
    # Prefer ../data; fall back to ./data when the parent has no data folder.
    data_dir = Path.cwd().parent / "data"
    if not data_dir.exists():
        data_dir = Path.cwd() / "data"
    input_file = data_dir / file_name
    feature_file = data_dir / "3-2-Automated_Recognition_Mode.csv"

    if not (input_file.exists() and feature_file.exists()):
        print(f"❌ 缺少必要文件: {input_file.name} 或特征文件")
        return

    print(f"\n>>> 开始绘图: {metric_label}")
    df_data = pd.read_csv(input_file, encoding='utf-8-sig')
    all_meta = pd.read_csv(feature_file, encoding='utf-8-sig')

    # Keep only the metadata rows for this metric, then attach the sort key.
    metric_meta = sort_clusters(all_meta[all_meta['Type'] == metric_label])

    out_dir = data_dir / "3-1-(3-2)Sorted_L2_Policy_Clustering_pic" / f"Plots_{metric_label}"
    out_dir.mkdir(parents=True, exist_ok=True)

    policy_names = df_data['L2政策中文名'].dropna().unique()
    for policy in policy_names:
        policy_rows = df_data[df_data['L2政策中文名'] == policy]
        plot_policy_trends(policy, policy_rows, metric_meta, metric_label, out_dir)
    print(f"✅ 完成。输出: {out_dir}")

# 运行
# Breadth first, then Intensity; each entry is (clustering CSV, metric label).
for _csv_name, _metric in (
    ("3-1-L2_Policy_Clustering_Breadth.csv", "Breadth"),
    ("3-1-L2_Policy_Clustering_Intensity.csv", "Intensity"),
):
    process_file_plotting(_csv_name, _metric)


>>> 开始绘图: Breadth
  -> 保存: Buildings_–_market-based_instruments_Breadth_Sorted.png
  -> 保存: Buildings_–_non_market-based_instruments_Breadth_Sorted.png
  -> 保存: Climate_governance_Breadth_Sorted.png
  -> 保存: Electricity_–_market-based_instruments_Breadth_Sorted.png
  -> 保存: Electricity_–_non_market-based_instruments_Breadth_Sorted.png
  -> 保存: Fossil_fuel_production_policies_Breadth_Sorted.png
  -> 保存: GHG_emissions_data_and_reporting_Breadth_Sorted.png
  -> 保存: GHG_emissions_targets_Breadth_Sorted.png
  -> 保存: Industry_–_market-based_instruments_Breadth_Sorted.png
  -> 保存: Industry_–_non_market-based_instruments_Breadth_Sorted.png
  -> 保存: International_climate_co-operation_Breadth_Sorted.png
  -> 保存: International_public_finance_Breadth_Sorted.png
  -> 保存: Public_Research,_Development_and_Demonstration_Breadth_Sorted.png
  -> 保存: Transport_–_market-based_instruments_Breadth_Sorted.png
  -> 保存: Transport_–_non_market-based_instruments_Breadth_Sorted.png
✅ 完成。输出: f:\Desktop\科研项目\1.负责科