In [None]:
import itertools
import os
import warnings
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import symlib
from astropy import units as u
from astropy.constants import G
from scipy.integrate import quad
from scipy.interpolate import interp1d
from scipy.optimize import curve_fit
from scipy.stats import chi2, ks_2samp, spearmanr

# Project root that every path in this notebook is built from.  It can be
# redirected without editing the notebook by setting the SYMPHONY_PPSD_DIR
# environment variable; the original hard-coded location stays the fallback.
base_dir = os.environ.get("SYMPHONY_PPSD_DIR", "/Users/fengbocheng/Projects/Symphony-PPSD")

# Simulation suites, in the order used for every pairwise comparison and plot.
suite_names = ["SymphonyLMC", "SymphonyMilkyWay", "SymphonyGroup", "SymphonyLCluster", "SymphonyCluster"]

# Build the palette once and index into it, instead of rebuilding it per entry.
_colorblind_palette = sns.color_palette("colorblind")

# Plot colour assigned to each suite.
sim_colors = {
        "SymphonyLMC": _colorblind_palette[4],
        "SymphonyMilkyWay": _colorblind_palette[0],
        "SymphonyGroup": _colorblind_palette[2],
        "SymphonyLCluster": _colorblind_palette[1],
        "SymphonyCluster": _colorblind_palette[3],
    }

# Short display label for each suite.
# NOTE(review): the "~" in "Milky~Way" looks like a LaTeX non-breaking space,
# presumably for math-text labels -- confirm against the plotting code.
sim_names = {
        "SymphonyLMC": "LMC",
        "SymphonyMilkyWay": "Milky~Way",
        "SymphonyGroup": "Group",
        "SymphonyLCluster": "L-Cluster",
        "SymphonyCluster": "Cluster",
    }

# One reference concentration value per suite.
# NOTE(review): presumably the mean virial concentration of each suite's
# hosts -- the provenance of these numbers is not visible here; confirm.
mean_cvir = {
    "SymphonyLMC": 12.2,
    "SymphonyMilkyWay": 10.8,
    "SymphonyGroup": 9.0,
    "SymphonyLCluster": 5.0,
    "SymphonyCluster": 5.6,
}

# Directory path for figure output (only the path is built here).
out_dir = os.path.join(base_dir, "output", "FIGURE")

In [None]:
from scipy.stats import chi2

def chi_squared_between_suites(
    base_dir,
    suite_names,
    quantity="Q_r",       # Options: "Q_r", "Q_tot", "slope_Q_r", "slope_Q_tot"
    coord="r",            # "r" = radius; "m" = enclosed mass
    x_range=None,         # Tuple (xmin, xmax) to limit the x-range for comparison
    n_points=100,         # Number of interpolation points on common grid
    plot=True,            # If True, show comparison plots
):
    """
    Compare the suite-averaged profiles of every pair of suites with a
    chi-squared statistic.

    For each suite, all per-halo CSV profiles are read, interpolated onto a
    common logarithmic grid, and reduced to a mean and standard deviation per
    grid point.  For each pair of suites the statistic is

        sum((mean1 - mean2)**2 / (std1**2 + std2**2))

    over the grid points where both means are finite and the summed variance
    is positive.

    NOTE(review): the degrees of freedom are taken to be the number of valid
    grid points, i.e. interpolated points are treated as independent --
    confirm that this is the intended interpretation of the p-value.

    Parameters
    ----------
    base_dir : str
        Project root; profiles are read from ``<base_dir>/output/<suite>/...``.
    suite_names : sequence of str
        Suites to compare pairwise.
    quantity : str
        Column to compare.  Names starting with ``"slope_"`` are read from
        ``ppsd_slope_profiles_<coord>`` (files named ``halo_*.csv``); all
        others from ``ppsd_profiles`` (any ``*.csv``).
    coord : str
        ``"r"`` selects the ``r_scaled`` column; anything else ``m_scaled``.
    x_range : tuple (xmin, xmax) or None
        Extent of the common grid.  ``None`` uses the smallest and largest
        finite, positive x value found in the loaded profiles.
    n_points : int
        Number of logarithmically spaced grid points.
    plot : bool
        If True, show one comparison figure per pair.

    Returns
    -------
    dict
        ``{(suite1, suite2): {"chi2": float, "dof": int, "pval": float}}``.
        Pairs with fewer than 5 valid grid points are reported with a
        warning and omitted.

    Raises
    ------
    RuntimeError
        If a suite yields no usable profile.
    ValueError
        If the grid limits are not finite and positive.
    """
    x_key = "r_scaled" if coord == "r" else "m_scaled"
    is_slope = quantity.startswith("slope_")
    subdir = f"ppsd_slope_profiles_{coord}" if is_slope else "ppsd_profiles"
    warn_tag = "slope " if is_slope else ""

    suite_data = {}
    global_x_min = np.inf
    global_x_max = -np.inf

    # Step 1: load every profile of every suite
    for suite in suite_names:
        data_dir = os.path.join(base_dir, "output", suite, subdir)
        files = sorted(
            f for f in os.listdir(data_dir)
            if f.endswith(".csv") and (not is_slope or f.startswith("halo_"))
        )

        x_stack, y_stack = [], []
        for f in files:
            # Best-effort loading for both file families: one unreadable file
            # is reported and skipped instead of aborting the comparison.
            try:
                df = pd.read_csv(os.path.join(data_dir, f))
                if x_key not in df.columns or quantity not in df.columns:
                    continue
                x = df[x_key].to_numpy(dtype=float)
                y = df[quantity].to_numpy(dtype=float)
            except Exception as e:
                print(f"[Warning] Failed to read {warn_tag}{f}: {e}")
                continue

            # np.interp needs finite, increasing sample positions; drop
            # non-finite x and sort.  NaN y values are kept on purpose so
            # they still mark the affected grid points as missing.
            keep = np.isfinite(x)
            x, y = x[keep], y[keep]
            if x.size == 0:
                continue
            order = np.argsort(x, kind="stable")
            x, y = x[order], y[order]

            x_stack.append(x)
            y_stack.append(y)

            # Only positive x can lie on a logarithmic grid.
            positive_x = x[x > 0]
            if positive_x.size > 0:
                global_x_min = min(global_x_min, positive_x.min())
                global_x_max = max(global_x_max, positive_x.max())

        if len(x_stack) == 0:
            raise RuntimeError(f"No valid profiles found for suite {suite}")

        suite_data[suite] = {"x_stack": x_stack, "y_stack": y_stack}

    # Step 2: Interpolation grid
    if x_range is None:
        xmin, xmax = global_x_min, global_x_max
    else:
        xmin, xmax = x_range
    if not (np.isfinite(xmin) and np.isfinite(xmax) and xmin > 0 and xmax > 0):
        raise ValueError(
            f"Logarithmic grid needs finite, positive limits; got ({xmin}, {xmax})"
        )

    x_common = np.logspace(np.log10(xmin), np.log10(xmax), n_points)

    # Step 3: Interpolate and average
    for suite in suite_names:
        y_interp_all = np.array([
            np.interp(x_common, x_arr, y_arr, left=np.nan, right=np.nan)
            for x_arr, y_arr in zip(suite_data[suite]["x_stack"], suite_data[suite]["y_stack"])
        ])

        suite_data[suite]["x"] = x_common
        # Grid points no profile covers are expected; they become NaN and are
        # masked out below, so silence the "empty slice" RuntimeWarnings.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            suite_data[suite]["mean"] = np.nanmean(y_interp_all, axis=0)
            suite_data[suite]["std"] = np.nanstd(y_interp_all, axis=0)

    # Step 4: χ² test
    results = {}
    for s1, s2 in itertools.combinations(suite_names, 2):
        d1, d2 = suite_data[s1], suite_data[s2]
        x = d1["x"]
        mu1, sd1 = d1["mean"], d1["std"]
        mu2, sd2 = d2["mean"], d2["std"]

        # The grid is built inside the requested range, so no extra range
        # mask is applied: comparing against xmin/xmax again could drop the
        # end points through floating-point round-off in logspace.
        var_sum = sd1**2 + sd2**2
        valid = np.isfinite(mu1) & np.isfinite(mu2) & (var_sum > 0)

        dof = int(np.sum(valid))
        if dof < 5:
            print(f"[Warning] Too few valid points for {s1} vs {s2}")
            continue

        chi2_val = float(np.sum((mu1[valid] - mu2[valid])**2 / var_sum[valid]))
        # Survival function instead of 1 - cdf: keeps precision for tiny p.
        p_val = float(chi2.sf(chi2_val, dof))

        results[(s1, s2)] = {"chi2": chi2_val, "dof": dof, "pval": p_val}

        if plot:
            plt.figure(figsize=(6, 4), dpi=400)
            plt.errorbar(x[valid], mu1[valid], yerr=sd1[valid], fmt="-o", label=s1, markersize=3, capsize=2)
            plt.errorbar(x[valid], mu2[valid], yerr=sd2[valid], fmt="-s", label=s2, markersize=3, capsize=2)
            plt.xscale("log")
            plt.xlabel(f"{coord.upper()} (scaled)")
            plt.ylabel(quantity)
            plt.title(f"{s1} vs {s2}  |  χ²={chi2_val:.1f}, ν={dof}, p={p_val:.3g}")
            plt.grid(True, which="both", ls=":")
            plt.legend()
            plt.tight_layout()
            plt.show()

    return results

In [54]:
def ks_test_profilewise_between_suites_with_significance(
    base_dir,
    suite_names,
    quantity="Q_r",
    coord="r",
    x_range=None,
    n_points=100,
    plot=True,
    significance_levels=(0.05, 0.01)  # Mark thresholds for statistical significance
):
    """
    Run a two-sample Kolmogorov-Smirnov test between every pair of suites at
    each point of a common logarithmic grid.

    All per-halo CSV profiles of a suite are interpolated onto the grid; at
    each grid point the finite interpolated values of the two suites form the
    two samples passed to ``ks_2samp``.

    Parameters
    ----------
    base_dir : str
        Project root; profiles are read from ``<base_dir>/output/<suite>/...``.
    suite_names : sequence of str
        Suites to compare pairwise.
    quantity : str
        Column to compare.  Names starting with ``"slope_"`` are read from
        ``ppsd_slope_profiles_<coord>`` (files named ``halo_*.csv``); all
        others from ``ppsd_profiles`` (any ``*.csv``).
    coord : str
        ``"r"`` selects the ``r_scaled`` column; anything else ``m_scaled``.
    x_range : tuple (xmin, xmax) or None
        Extent of the common grid.  ``None`` uses the smallest and largest
        finite, positive x value found in the loaded profiles.
    n_points : int
        Number of logarithmically spaced grid points.
    plot : bool
        If True, show the KS statistic and p-value versus x for each pair.
    significance_levels : iterable of float
        p-value levels drawn as horizontal reference lines in the plot.

    Returns
    -------
    dict
        ``{(suite1, suite2): {"x": grid, "ks_stat": array, "pval": array}}``.
        Grid points where either suite has fewer than 5 finite values hold
        NaN in both arrays.

    Raises
    ------
    RuntimeError
        If a suite yields no usable profile.
    ValueError
        If the grid limits are not finite and positive.
    """
    x_key = "r_scaled" if coord == "r" else "m_scaled"
    is_slope = quantity.startswith("slope_")
    subdir = f"ppsd_slope_profiles_{coord}" if is_slope else "ppsd_profiles"

    suite_data = {}
    global_x_min = np.inf
    global_x_max = -np.inf

    for suite in suite_names:
        data_dir = os.path.join(base_dir, "output", suite, subdir)
        files = sorted(
            f for f in os.listdir(data_dir)
            if f.endswith(".csv") and (not is_slope or f.startswith("halo_"))
        )

        x_stack, y_stack = [], []
        for f in files:
            try:
                df = pd.read_csv(os.path.join(data_dir, f))
                if x_key not in df.columns or quantity not in df.columns:
                    continue
                x = df[x_key].to_numpy(dtype=float)
                y = df[quantity].to_numpy(dtype=float)
            except Exception as e:
                print(f"[Warning] Failed to read {f}: {e}")
                continue

            # np.interp needs finite, increasing sample positions; drop
            # non-finite x and sort.  NaN y values are kept so they still
            # mark the affected grid points as missing.
            keep = np.isfinite(x)
            x, y = x[keep], y[keep]
            if x.size == 0:
                continue
            order = np.argsort(x, kind="stable")
            x, y = x[order], y[order]

            x_stack.append(x)
            y_stack.append(y)

            # Only positive x can lie on a logarithmic grid; including a zero
            # here would turn log10(xmin) into -inf and ruin the whole grid.
            positive_x = x[x > 0]
            if positive_x.size > 0:
                global_x_min = min(global_x_min, positive_x.min())
                global_x_max = max(global_x_max, positive_x.max())

        if len(x_stack) == 0:
            raise RuntimeError(f"No valid profiles found for suite {suite}")

        suite_data[suite] = {"x_stack": x_stack, "y_stack": y_stack}

    if x_range is None:
        xmin, xmax = global_x_min, global_x_max
    else:
        xmin, xmax = x_range
    if not (np.isfinite(xmin) and np.isfinite(xmax) and xmin > 0 and xmax > 0):
        raise ValueError(
            f"Logarithmic grid needs finite, positive limits; got ({xmin}, {xmax})"
        )

    x_common = np.logspace(np.log10(xmin), np.log10(xmax), n_points)

    # One row per halo, one column per grid point; NaN outside a halo's range.
    for suite in suite_names:
        suite_data[suite]["y_interp_all"] = np.array([
            np.interp(x_common, x_arr, y_arr, left=np.nan, right=np.nan)
            for x_arr, y_arr in zip(suite_data[suite]["x_stack"], suite_data[suite]["y_stack"])
        ])

    results = {}
    for s1, s2 in itertools.combinations(suite_names, 2):
        y1_all = suite_data[s1]["y_interp_all"]
        y2_all = suite_data[s2]["y_interp_all"]

        ks_stats = np.full(n_points, np.nan)
        p_values = np.full(n_points, np.nan)
        for i in range(n_points):
            v1 = y1_all[:, i]
            v2 = y2_all[:, i]
            v1 = v1[np.isfinite(v1)]
            v2 = v2[np.isfinite(v2)]

            # Too few halos at this grid point: leave the NaN placeholders.
            if len(v1) < 5 or len(v2) < 5:
                continue

            ks_stats[i], p_values[i] = ks_2samp(v1, v2)

        results[(s1, s2)] = {
            "x": x_common,
            "ks_stat": ks_stats,
            "pval": p_values,
        }

        if plot:
            fig, ax1 = plt.subplots(figsize=(8, 4), dpi=150)
            ax1.plot(x_common, ks_stats, label="KS statistic", color="tab:blue", lw=2)
            ax1.set_xscale("log")
            ax1.set_xlabel(f"{coord.upper()} (scaled)")
            ax1.set_ylabel("KS statistic", color="tab:blue")
            ax1.tick_params(axis='y', labelcolor='tab:blue')
            ax1.grid(True, which="both", ls=":")

            # Second y-axis (log scale) for the p-values and their thresholds.
            ax2 = ax1.twinx()
            ax2.plot(x_common, p_values, label="p-value", color="tab:red", lw=1.8)
            for sig in significance_levels:
                ax2.axhline(sig, color="gray", ls="--", lw=0.8, label=f"p = {sig}")
            ax2.set_ylabel("p-value", color="tab:red")
            ax2.tick_params(axis='y', labelcolor='tab:red')
            ax2.set_yscale("log")

            # Merge the handles of both axes into a single legend.
            lines, labels = ax1.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines + lines2, labels + labels2, loc="upper right")

            plt.title(f"KS Test: {s1} vs {s2}")
            fig.tight_layout()
            plt.show()

    return results

In [None]:
def load_delta_c_norm(suite):
    """
    Return the standardized concentrations of one suite as a NumPy array.

    Reads the "cvir" column of
    <base_dir>/output/<suite>/halo_concentrations.csv, coerces it to numeric
    (entries that cannot be parsed become NaN and are dropped), then subtracts
    the median and divides by the standard deviation of the remaining values.
    """
    csv_path = os.path.join(base_dir, "output", suite, "halo_concentrations.csv")
    table = pd.read_csv(csv_path)
    cvir = pd.to_numeric(table["cvir"], errors='coerce').dropna()
    centered = cvir - cvir.median()
    return (centered / cvir.std()).values

def load_delta_gamma_norm(suite):
    """
    Return the standardized log accretion rates of one suite as a NumPy array.

    Reads the "gamma" column of <base_dir>/output/<suite>/accretion_rates.csv,
    coerces it to numeric (unparseable entries are dropped), keeps only the
    strictly positive values, takes log10, then subtracts the median and
    divides by the standard deviation of those log values.
    """
    csv_path = os.path.join(base_dir, "output", suite, "accretion_rates.csv")
    gamma = pd.to_numeric(pd.read_csv(csv_path)["gamma"], errors='coerce').dropna()
    # log10 is only defined for positive rates, so filter before transforming.
    log_gamma = np.log10(gamma[gamma > 0])
    center = np.median(log_gamma)
    spread = np.std(log_gamma)
    standardized = (log_gamma - center) / spread
    return standardized.values

# === Compute normalized distributions ===
# One standardized array per suite, keyed by suite name (suite_names order).
delta_c = dict(zip(suite_names, map(load_delta_c_norm, suite_names)))
delta_gamma = dict(zip(suite_names, map(load_delta_gamma_norm, suite_names)))

# === Pairwise K-S tests ===
def ks_test_across_suites(delta_dict, metric_name):
    """
    Perform pairwise two-sample Kolmogorov–Smirnov tests.

    Parameters
    ----------
    delta_dict : dict
        Maps a suite name to its 1-D sample.  Every pair of keys is tested,
        in the dict's insertion order, so the function depends only on its
        own argument rather than on a notebook-level suite list.
    metric_name : str
        Tag inserted into the names of the statistic and p-value columns.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns "Suite 1", "Suite 2",
        "KS_<metric_name>_stat" and "p_<metric_name>".  A pair in which
        either sample is empty gets NaN for both values.  The columns are
        present even when there are fewer than two suites.
    """
    stat_col = f"KS_{metric_name}_stat"
    p_col = f"p_{metric_name}"

    rows = []
    for s1, s2 in itertools.combinations(delta_dict, 2):
        data1 = np.asarray(delta_dict[s1])
        data2 = np.asarray(delta_dict[s2])
        if data1.size == 0 or data2.size == 0:
            # ks_2samp rejects empty input; report the pair as undetermined
            # instead of aborting the whole table.
            ks_stat, p_value = np.nan, np.nan
        else:
            ks_stat, p_value = ks_2samp(data1, data2)
        rows.append({
            "Suite 1": s1,
            "Suite 2": s2,
            stat_col: ks_stat,
            p_col: p_value
        })
    return pd.DataFrame(rows, columns=["Suite 1", "Suite 2", stat_col, p_col])

# Pairwise K-S tables for the two normalized metrics.
ks_c_df = ks_test_across_suites(delta_c, "c")
ks_g_df = ks_test_across_suites(delta_gamma, "gamma")

# Emit both tables as plain text, each under its heading, with a blank line
# separating the two sections.
print(
    "Pairwise K-S test results for delta_c_norm:",
    ks_c_df.to_string(index=False),
    "\nPairwise K-S test results for delta_gamma_norm:",
    ks_g_df.to_string(index=False),
    sep="\n",
)

# === Visualization functions ===
def plot_metric_distributions(delta_dict, metric_label, bins=30):
    """
    Show two figures comparing a normalized metric across suites.

    The first figure overlays one density-normalized histogram per entry of
    ``delta_dict``; the second is a box plot (outliers hidden) with one box
    per suite in ``suite_names``.  Colours come from ``sim_colors``.

    Parameters
    ----------
    delta_dict : dict
        Maps a suite name to its sample.
    metric_label : str
        Text used for the metric axis label and inside both titles.
    bins : int
        Histogram bin specification passed to ``plt.hist``.
    """
    figure_opts = {"figsize": (8, 4), "dpi": 400}

    # --- Figure 1: overlaid histograms ---
    plt.figure(**figure_opts)
    for suite in delta_dict:
        plt.hist(
            delta_dict[suite],
            bins=bins,
            density=True,
            alpha=0.5,
            color=sim_colors[suite],
            label=suite,
        )
    plt.title(f"Overlaid Histograms of {metric_label}")
    plt.xlabel(metric_label)
    plt.ylabel("Density")
    plt.legend()
    plt.tight_layout()
    plt.show()

    # --- Figure 2: box plot, one filled box per suite ---
    plt.figure(**figure_opts)
    samples = [delta_dict[name] for name in suite_names]
    boxes = plt.boxplot(
        samples,
        labels=suite_names,
        patch_artist=True,
        showfliers=False,
    )["boxes"]
    for box, name in zip(boxes, suite_names):
        box.set_facecolor(sim_colors[name])
    plt.title(f"Boxplot of {metric_label} Across Suites")
    plt.ylabel(metric_label)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

# === Generate visualizations ===
# Histogram + box plot for the concentration metric, then the accretion metric.
plot_metric_distributions(delta_dict=delta_c, metric_label="Delta c_norm")
plot_metric_distributions(delta_dict=delta_gamma, metric_label="Delta gamma_norm")

In [None]:
import itertools
from scipy.stats import ks_2samp

# === Load and normalize Jeans deviation ===
def load_delta_jeans_norm(suite):
    """
    Return the standardized Jeans deviations of one suite as a NumPy array.

    Reads the "delta_J_tot" column of
    <base_dir>/output/<suite>/jeans_deviation_total.csv, coerces it to numeric
    (unparseable entries are dropped), then subtracts the median and divides
    by the standard deviation of the remaining values.
    """
    csv_path = os.path.join(base_dir, "output", suite, "jeans_deviation_total.csv")
    table = pd.read_csv(csv_path)
    delta_j = pd.to_numeric(table["delta_J_tot"], errors='coerce').dropna()
    centered = delta_j - delta_j.median()
    return (centered / delta_j.std()).values

# One standardized Jeans-deviation array per suite, keyed by suite name.
delta_jeans = dict(zip(suite_names, map(load_delta_jeans_norm, suite_names)))

# === Pairwise K-S tests ===
def ks_test_across_suites(delta_dict, metric_name):
    """
    Perform pairwise two-sample Kolmogorov–Smirnov tests.

    Parameters
    ----------
    delta_dict : dict
        Maps a suite name to its 1-D sample.  Every pair of keys is tested,
        in the dict's insertion order, so the function depends only on its
        own argument rather than on a notebook-level suite list.
    metric_name : str
        Tag inserted into the names of the statistic and p-value columns.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns "Suite 1", "Suite 2",
        "KS_<metric_name>_stat" and "p_<metric_name>".  A pair in which
        either sample is empty gets NaN for both values.  The columns are
        present even when there are fewer than two suites.
    """
    stat_col = f"KS_{metric_name}_stat"
    p_col = f"p_{metric_name}"

    rows = []
    for s1, s2 in itertools.combinations(delta_dict, 2):
        data1 = np.asarray(delta_dict[s1])
        data2 = np.asarray(delta_dict[s2])
        if data1.size == 0 or data2.size == 0:
            # ks_2samp rejects empty input; report the pair as undetermined
            # instead of aborting the whole table.
            ks_stat, p_value = np.nan, np.nan
        else:
            ks_stat, p_value = ks_2samp(data1, data2)
        rows.append({
            "Suite 1": s1,
            "Suite 2": s2,
            stat_col: ks_stat,
            p_col: p_value
        })
    return pd.DataFrame(rows, columns=["Suite 1", "Suite 2", stat_col, p_col])

# Pairwise K-S table for the normalized Jeans deviation.
ks_j_df = ks_test_across_suites(delta_dict=delta_jeans, metric_name="jeans_norm")

# === Display K-S test results ===
# Heading followed by one blank line, then the table via rich display.
print("Pairwise K-S test results for normalized Jeans deviation:", end="\n\n")
display(ks_j_df)

# === Visualization ===
# Figure 1: overlaid, density-normalized histograms, one per suite.
plt.figure(figsize=(8, 4), dpi=400)
for suite in delta_jeans:
    data = delta_jeans[suite]
    plt.hist(
        data,
        bins=30,
        alpha=0.5,
        density=True,
        label=suite,
        color=sim_colors[suite],
    )
plt.title("Overlaid Histograms of Normalized Jeans Deviation")
plt.xlabel("Normalized Jeans Deviation")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.show()

# Figure 2: box plot (outliers hidden), boxes filled with each suite's colour.
plt.figure(figsize=(8, 4), dpi=400)
bp = plt.boxplot(
    [delta_jeans[s] for s in suite_names],
    labels=suite_names,
    showfliers=False,
    patch_artist=True,
)
for patch, suite in zip(bp['boxes'], suite_names):
    patch.set_facecolor(sim_colors[suite])
plt.title("Boxplot of Normalized Jeans Deviation Across Suites")
plt.ylabel("Normalized Jeans Deviation")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
