# In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MaxNLocator
from pathlib import Path

# ==========================================
# Configuration
# ==========================================
# Fixed seed so the Monte Carlo shuffles are reproducible between runs.
SEED = 42
# Number of shuffle iterations used for the random baseline.
SIMULATIONS = 5000
np.random.seed(SEED)

# Use the working directory when it already holds the Breadth CSV;
# otherwise fall back to the sibling "data" directory of its parent.
if (Path.cwd() / "3-1-L2_Policy_Clustering_Breadth.csv").exists():
    DATA_DIR = Path.cwd()
else:
    DATA_DIR = Path.cwd().parent / "data"

def get_font(size: int = 26) -> FontProperties:
    """
    Builds the font property shared by every text element in the figures.

    Args:
        size: Font size passed to FontProperties (defaults to 26).

    Returns:
        FontProperties for the 'Times New Roman' family with weight 'black'.
    """
    font_spec = {'family': 'Times New Roman', 'size': size, 'weight': 'black'}
    return FontProperties(**font_spec)

# Apply the seaborn theme FIRST. sns.set_theme() rewrites matplotlib rcParams
# (its `font` argument defaults to "sans-serif" and is written to
# 'font.family'), so calling it after the update below would silently undo
# the serif / Times New Roman selection.
sns.set_theme(style="whitegrid")

# Project-specific overrides layered on top of the seaborn theme.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': ['Times New Roman'],
    'font.weight': 'black',
    'axes.axisbelow': True,
    'figure.dpi': 300
})

# ==========================================
# Data Processing & Simulation
# ==========================================

def preprocess_data(filepath: Path) -> pd.DataFrame:
    """
    Loads and normalizes the dataset, removing incomplete and duplicate rows.

    The CSV is read as UTF-8 (with optional BOM), the three source columns
    are renamed to short codes, rows with a missing value in any of them are
    discarded, the values are cast to strings and exact duplicate
    (P, C, G) rows are removed.

    Args:
        filepath: Path to the CSV file. It must contain the columns
            'L2政策中文名', '国家' and '聚类ID'.

    Returns:
        Cleaned DataFrame with columns ['P', 'C', 'G'] (all string-valued)
        and a fresh 0..n-1 index.

    Raises:
        KeyError: If one of the expected source columns is absent.
    """
    df = pd.read_csv(filepath, encoding='utf-8-sig')

    col_map = {'L2政策中文名': 'P', '国家': 'C', '聚类ID': 'G'}
    df = df.rename(columns=col_map)[['P', 'C', 'G']]

    # Drop incomplete rows BEFORE the string cast: astype(str) would turn a
    # missing cell into the literal text 'nan', which downstream code would
    # then treat as a genuine policy / country / cluster label.
    df = df.dropna(subset=['P', 'C', 'G'])
    df = df.astype(str)

    return df.drop_duplicates(subset=['P', 'C', 'G']).reset_index(drop=True)

def compute_overlaps_vectorized(df: pd.DataFrame) -> np.ndarray:
    """
    Calculates pairwise overlaps using matrix multiplication.

    A binary incidence matrix is built with one row per distinct C value and
    one column per distinct (P, G) pair. Multiplying it by its transpose
    gives, for every pair of rows, the number of (P, G) pairs they share.

    Args:
        df: DataFrame containing P, C, G columns.

    Returns:
        1D Array of overlap counts (strict upper triangle of the C-by-C
        product), i.e. one value per unordered pair of distinct C values.
    """
    # Key the columns on the (P, G) pair itself rather than on a "P_G"
    # string: joining with '_' makes ('a_b', 'c') and ('a', 'b_c') collapse
    # into the same feature and inflates the overlap counts.
    counts = pd.crosstab(df['C'], [df['P'], df['G']])

    # Presence/absence only: repeated rows must not count more than once.
    incidence = (counts.to_numpy() > 0).astype(int)

    shared = incidence @ incidence.T
    return shared[np.triu_indices_from(shared, k=1)]

def run_simulation(df: pd.DataFrame, n_iter: int = 5000) -> np.ndarray:
    """
    Builds the random baseline by permuting G values within each P group.

    Every iteration draws a permutation of the G column per P group with
    np.random.permutation (global NumPy random state), writes it into a
    working copy of the data and recomputes the pairwise overlaps.

    Args:
        df: Original DataFrame; it is copied, not modified.
        n_iter: Number of shuffle iterations.

    Returns:
        Concatenated array of overlap counts from all simulations.
    """
    shuffled = df.copy()
    clusters_by_policy = shuffled.groupby('P')['G']

    def _draw_once() -> np.ndarray:
        # One Monte Carlo draw: reshuffle G inside each P, then score it.
        shuffled['G'] = clusters_by_policy.transform(np.random.permutation)
        return compute_overlaps_vectorized(shuffled)

    return np.concatenate([_draw_once() for _ in range(n_iter)])

# ==========================================
# Visualization Logic
# ==========================================

def plot_histogram(real_vals: np.ndarray, random_vals: np.ndarray, 
                   filename: str, label: str) -> None:
    """
    Draws overlaid probability histograms of random vs real overlap counts.

    The 90th and 95th percentiles of the random distribution are marked with
    vertical lines (a single line when both truncate to the same integer).
    The x axis is fixed to 0..15 and the figure is written to
    DATA_DIR / filename.

    Args:
        real_vals: Overlap counts computed from the observed data.
        random_vals: Overlap counts pooled over the shuffled simulations.
        filename: Output file name, resolved against DATA_DIR.
        label: Dataset label used as the title prefix.
    """
    plt.figure(figsize=(12, 8))
    ax = plt.gca()

    q90, q95 = (np.percentile(random_vals, pct) for pct in (90, 95))

    # Both histograms share every option except data, colour and legend text.
    hist_kwargs = dict(kde=False, stat='probability', discrete=True,
                       common_norm=False, alpha=0.5, edgecolor=None)
    layers = (
        (random_vals, '#1f77b4', f'Random\n(Shuffle N={SIMULATIONS})'),
        (real_vals, '#d62728', 'Raw Data'),
    )
    for values, colour, legend_text in layers:
        sns.histplot(values, color=colour, label=legend_text, **hist_kwargs)

    # Percentile markers: the dashed 90% line is only drawn when it does not
    # fall on the same integer as the 95% line.
    if int(q90) != int(q95):
        plt.axvline(q90, color='#000080', linestyle='--', linewidth=2,
                    label=f'Random 90%\n(x={int(q90)})')
        q95_text = f'Random 95%\n(x={int(q95)})'
    else:
        q95_text = f'Random 90% & 95%\n(x={int(q95)})'
    plt.axvline(q95, color='black', linestyle='-', linewidth=2.5, label=q95_text)

    # Axis range and ticks
    ax.set_xlim(-0.5, 15.5)
    ax.set_xticks(range(16))

    # Titles and tick fonts
    plt.title(f"{label}: Overlap Distribution", fontproperties=get_font(30), pad=20)
    plt.xlabel("Overlap Count", fontproperties=get_font(24))
    plt.ylabel("Proportion", fontproperties=get_font(24))
    plt.xticks(fontproperties=get_font(20))
    plt.yticks(fontproperties=get_font(20))

    plt.legend(prop=get_font(16), frameon=False, loc='upper left')

    # Heavy black frame around the axes
    for border in ax.spines.values():
        border.set_linewidth(3)
        border.set_color('black')

    out_path = DATA_DIR / filename
    plt.savefig(out_path, bbox_inches='tight')
    plt.close()

def calculate_cdf(data: np.ndarray, x_range: np.ndarray) -> list:
    """
    Computes, for each x in x_range, the percentage of data values <= x.

    Args:
        data: Array of observed values.
        x_range: Thresholds at which the cumulative percentage is evaluated.

    Returns:
        List with one percentage (0-100) per entry of x_range.
    """
    n_obs = len(data)
    ranks = []
    for threshold in x_range:
        at_or_below = np.sum(data <= threshold)
        ranks.append((at_or_below / n_obs) * 100)
    return ranks

def plot_cdf_curve(real_vals: np.ndarray, random_vals: np.ndarray, 
                   filename: str, label: str) -> None:
    """
    Draws the cumulative percentile curves of random vs real overlaps.

    Both curves are evaluated at the integer overlap counts 0..15, every
    point is annotated with its value (two decimals), and the figure is
    written to DATA_DIR / filename.

    Args:
        real_vals: Overlap counts computed from the observed data.
        random_vals: Overlap counts pooled over the shuffled simulations.
        filename: Output file name, resolved against DATA_DIR.
        label: Dataset label used as the title prefix.
    """
    plt.figure(figsize=(14, 9))
    ax = plt.gca()

    x_axis = np.arange(16)
    y_rand = calculate_cdf(random_vals, x_axis)
    y_real = calculate_cdf(real_vals, x_axis)

    rand_colour = '#1f77b4'
    real_colour = '#d62728'

    # Curves (random first so the legend order stays Random, Raw Data)
    curves = (
        (y_rand, rand_colour, 'Random\n(Shuffle)'),
        (y_real, real_colour, 'Raw Data'),
    )
    for y_vals, colour, legend_text in curves:
        plt.plot(x_axis, y_vals, color=colour, marker='o', markersize=9,
                 linewidth=3, label=legend_text, linestyle='-')

    # Point annotations: the higher value is written above its marker and the
    # lower one below; labels are pushed further out when the two values are
    # less than 5 points apart so that they do not collide.
    annot_font = get_font(13)
    for x, pct_real, pct_rand in zip(x_axis, y_real, y_rand):
        gap = 4.5 if abs(pct_real - pct_rand) < 5.0 else 2.5

        real_entry = (pct_real, real_colour)
        rand_entry = (pct_rand, rand_colour)
        if pct_real >= pct_rand:
            upper, lower = real_entry, rand_entry
        else:
            upper, lower = rand_entry, real_entry

        plt.text(x, upper[0] + gap, f"{upper[0]:.2f}", color=upper[1],
                 ha='center', va='bottom', fontproperties=annot_font)
        plt.text(x, lower[0] - gap, f"{lower[0]:.2f}", color=lower[1],
                 ha='center', va='top', fontproperties=annot_font)

    # Axis range and ticks (y range leaves room for the annotations)
    ax.set_xticks(range(16))
    ax.set_xlim(-0.5, 15.5)
    ax.set_ylim(-8, 118)

    # Titles and tick fonts
    plt.title(f"{label}: Cumulative Percentile", fontproperties=get_font(30), pad=20)
    plt.xlabel("Overlap Count", fontproperties=get_font(24))
    plt.ylabel("Percentile Rank (%)", fontproperties=get_font(24))
    plt.xticks(fontproperties=get_font(20))
    plt.yticks(fontproperties=get_font(20))

    plt.legend(prop=get_font(16), frameon=False, loc='upper left')

    # Heavy black frame around the axes
    for border in ax.spines.values():
        border.set_linewidth(3)
        border.set_color('black')

    out_path = DATA_DIR / filename
    plt.savefig(out_path, bbox_inches='tight')
    plt.close()

# ==========================================
# Main Execution
# ==========================================

def main():
    """
    Runs the full pipeline for the Breadth and Intensity datasets.

    For each CSV found in DATA_DIR: load it, compute the observed overlaps,
    simulate the shuffled baseline and save the histogram and cumulative
    percentile figures. A dataset whose CSV is missing is skipped.
    """
    datasets = {
        "Breadth": "3-1-L2_Policy_Clustering_Breadth.csv",
        "Intensity": "3-1-L2_Policy_Clustering_Intensity.csv",
    }

    for label, fname in datasets.items():
        csv_path = DATA_DIR / fname
        if csv_path.exists():
            frame = preprocess_data(csv_path)
            observed = compute_overlaps_vectorized(frame)
            simulated = run_simulation(frame, n_iter=SIMULATIONS)

            plot_histogram(observed, simulated, f"Fig_{label}_Hist.png", label)
            plot_cdf_curve(observed, simulated, f"Fig_{label}_Curve.png", label)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()