In [17]:
import pandas as pd

def prepare_for_id_reps_ood_performance_correlation(id_results_file, ood_results_file, output_file):
    """
    Write a copy of the ID results in which selected metrics carry OOD values.

    For every row of the ID results whose ``metric`` is one of
    ``AbsoluteAccDiff``, ``JSD`` or ``Disagreement``, ``metric_value`` is
    replaced by the ``metric_value`` of the OOD row with the matching id.
    OOD ids are matched after rewriting every ``"_ood"`` substring in them to
    ``"_id"``. Rows with any other metric keep their ID value.

    Parameters
    ----------
    id_results_file
        Path of the ID results parquet. Must provide the columns ``id``,
        ``metric`` and ``metric_value``.
    ood_results_file
        Path of the OOD results parquet, with the same three columns.
    output_file
        Path the resulting parquet is written to.

    Warns
    -----
    UserWarning
        * if several selected OOD rows share the same normalized id (the
          value of the last such row is the one used), or
        * if a selected ID row has no OOD counterpart (its ``metric_value``
          becomes NaN).

    Notes
    -----
    The frame is written with ``id`` as both its index and a regular column.
    """
    import warnings  # local import: the notebook's import cell only brings in pandas

    METRICS = {"AbsoluteAccDiff", "JSD", "Disagreement"}

    # Load parquets; the ID frame keeps 'id' as index AND column (this is what gets saved).
    id_df = pd.read_parquet(id_results_file).set_index("id", drop=False)
    ood_df = pd.read_parquet(ood_results_file)

    # Keep only the selected metrics from OOD
    ood_keep = ood_df.loc[ood_df["metric"].isin(METRICS), ["id", "metric_value"]].copy()

    # Normalize ONLY the OOD ids: _ood -> _id
    ood_keep["id"] = ood_keep["id"].str.replace("_ood", "_id", regex=False)

    # The lookup below assumes one OOD row per id; say so loudly when that does not hold,
    # because dict() silently keeps only the last value of a repeated key.
    duplicated_ids = ood_keep.loc[ood_keep["id"].duplicated(keep=False), "id"]
    if not duplicated_ids.empty:
        warnings.warn(
            f"{duplicated_ids.nunique()} duplicate OOD id(s) after normalization "
            f"(e.g. {duplicated_ids.iloc[0]!r}); the last occurrence is used.",
            stacklevel=2,
        )

    # Build id -> value map
    val_map = dict(zip(ood_keep["id"], ood_keep["metric_value"]))

    # Rows of the ID frame whose value is going to be overwritten
    mask = id_df["metric"].isin(METRICS)

    # Report rows that are about to lose their value because OOD has no matching id.
    unmatched = mask & ~id_df["id"].isin(ood_keep["id"])
    n_unmatched = int(unmatched.sum())
    if n_unmatched:
        warnings.warn(
            f"{n_unmatched} ID row(s) have no OOD counterpart "
            f"(e.g. {id_df.loc[unmatched, 'id'].iloc[0]!r}); their metric_value is set to NaN.",
            stacklevel=2,
        )

    # Overwrite ID metric_value for the selected metrics using the map
    id_df.loc[mask, "metric_value"] = id_df.loc[mask, "id"].map(val_map)

    # Save merged parquet
    id_df.to_parquet(output_file)
    print(f"Saved: {output_file}")

In [None]:
# ID / OOD result parquets to combine, and where the combined parquet goes.
id_results_file = "/home/nikolabulat/resi_reproduction/resi/experiments/results/graphs_output_correlation_test_good_cora_deg_cov_val_id_test_id.parquet"
ood_results_file = "/home/nikolabulat/resi_reproduction/resi/experiments/results/graphs_output_correlation_test_good_cora_deg_cov_val_id_test_ood.parquet"
output_file = "/home/nikolabulat/resi_reproduction/resi/experiments/results/sample.parquet"

# Keyword arguments make it explicit which path plays which role.
prepare_for_id_reps_ood_performance_correlation(
    id_results_file=id_results_file,
    ood_results_file=ood_results_file,
    output_file=output_file,
)

Saved: /home/nikolabulat/resi_reproduction/resi/experiments/results/sample.parquet


In [1]:
import pandas as pd

def make_abs_shift_parquet(
    id_results_file: str,
    ood_results_file: str,
    output_file: str,
) -> None:
    """
    Save the absolute ID-vs-OOD gap of ``metric_value`` as a parquet file.

    Rows of the two inputs are paired through their ``id`` string; before
    pairing, every ``"_ood"`` in an OOD id is rewritten to ``"_id"``. The
    output consists of the ID rows whose id also occurs in the OOD file, with
    ``metric_value`` replaced by ``|ID value - OOD value|``. All other columns
    are taken from the ID file unchanged.

    Parameters
    ----------
    id_results_file : str
        Parquet with the ID results (needs ``id`` and ``metric_value``).
    ood_results_file : str
        Parquet with the OOD results (needs ``id`` and ``metric_value``).
    output_file : str
        Destination of the resulting parquet.
    """

    # ID side: index by id but keep the column too.
    id_frame = pd.read_parquet(id_results_file)
    id_frame = id_frame.set_index("id", drop=False)

    # OOD side: rewrite the ids first so both frames live in the same key space,
    # then index by the rewritten id (again keeping the column).
    ood_frame = pd.read_parquet(ood_results_file)
    normalized_ids = ood_frame["id"].str.replace("_ood", "_id", regex=False)
    ood_frame = ood_frame.assign(id=normalized_ids).set_index("id", drop=False)

    # Ids available on both sides.
    shared_ids = id_frame.index.intersection(ood_frame.index)

    # Matching rows / values from each side, selected by the shared ids.
    id_rows = id_frame.loc[shared_ids]
    ood_values = ood_frame.loc[shared_ids, "metric_value"]

    # The result is the ID rows with metric_value swapped for the absolute gap.
    result = id_rows.copy()
    result["metric_value"] = (id_rows["metric_value"] - ood_values).abs()

    result.to_parquet(output_file)
    print(f"Saved: {output_file}")


In [2]:
# ID / OOD result parquets to compare, and where the absolute-difference parquet goes.
id_results_file = "/home/nikolabulat/resi_reproduction/resi/experiments/results/graphs_output_correlation_test_good_cora_deg_cov_val_id_test_id.parquet"
ood_results_file = "/home/nikolabulat/resi_reproduction/resi/experiments/results/graphs_output_correlation_test_good_cora_deg_cov_val_id_test_ood.parquet"
output_file = "/home/nikolabulat/resi_reproduction/resi/experiments/results/sample_abs.parquet"

# Keyword arguments make it explicit which path plays which role.
make_abs_shift_parquet(
    id_results_file=id_results_file,
    ood_results_file=ood_results_file,
    output_file=output_file,
)

Saved: /home/nikolabulat/resi_reproduction/resi/experiments/results/sample_abs.parquet
