In [25]:
import pandas as pd

# Raw column name in the performance CSVs -> column name used in this notebook.
# The keys select the columns to keep; the values are what they get renamed to.
COLUMNS_TO_TAKE = dict(
    [
        ("organ", "dimension"),
        ("view", "subdimension"),
        ("R-Squared_all", "r2"),
        ("R-Squared_sd_all", "r2_std"),
    ]
)

# Sample-definition key -> descriptive name. "instances" and "eids" are the values
# the processing loop iterates over; "*" names the combined table.
# NOTE(review): only referenced from a commented-out output path in this notebook.
DATA_TYPE_NAMING = dict(
    instances="all_samples_per_participant",
    eids="average_per_participant",
    **{"*": "all_samples_when_possible_otherwise_average"},
)

# Value substitution mapping.
# NOTE(review): only referenced from a commented-out `.replace(...)` call in this notebook.
DICT_TO_CHANGE_DIMENSIONS = dict(ImmuneSystem="BloodCells")


# Columns copied from the ensemble table onto the best-model table.
_ENSEMBLE_COLUMNS = ["subdimension", "r2", "r2_std"]
# Score columns attached to each side ("_1" / "_2") of a correlation pair.
_SCORE_COLUMNS = ["dimension", "subdimension", "r2", "r2_std"]
# Index levels identifying a pair of squeezed dimensions.
_PAIR_KEYS = ["squeezed_dimensions_1", "squeezed_dimensions_2"]


def select_and_rename(frame, column_mapping):
    """Keep only the columns named by the keys of ``column_mapping`` and rename them.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing (at least) every key of ``column_mapping`` as a column.
    column_mapping : dict
        Original column name -> new column name.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns, in mapping order, under their new names.

    Raises
    ------
    KeyError
        If a key of ``column_mapping`` is not a column of ``frame``.
    """
    # Index with a list of the keys: passing the dict itself as an indexer is
    # rejected by recent pandas versions.
    return frame[list(column_mapping)].rename(columns=column_mapping)


def first_ensemble_row(ensembles_scores, dimension, columns):
    """Return the first ``(dimension, "*")`` row of ``ensembles_scores`` as an array.

    Parameters
    ----------
    ensembles_scores : pandas.DataFrame
        Frame indexed by a MultiIndex with levels named "dimension" and "subdimension".
    dimension : str
        Value to match on the "dimension" level.
    columns : list of str
        Columns to return, in order.

    Returns
    -------
    numpy.ndarray
        One value per entry of ``columns``, taken from the first matching row.

    Raises
    ------
    KeyError
        If no row matches ``(dimension, "*")``.
    """
    index = ensembles_scores.index
    is_match = (index.get_level_values("dimension") == dimension) & (
        index.get_level_values("subdimension") == "*"
    )
    if not is_match.any():
        raise KeyError((dimension, "*"))
    # A boolean mask always yields a DataFrame, so `[0]` is always a full row.
    # With `.loc[(dimension, "*"), columns].values[0]` a unique index returns a
    # Series instead, and `[0]` would then be the single scalar "*".
    return ensembles_scores.loc[is_match, columns].to_numpy()[0]


def melt_correlation_matrix(matrix, value_name):
    """Turn a square matrix read from CSV into a long table, one row per pair.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Matrix whose row labels are stored in the column "Unnamed: 0"; every other
        column is one column of the matrix.
    value_name : str
        Name given to the column holding the matrix values.

    Returns
    -------
    pandas.DataFrame
        Columns "squeezed_dimensions_1" (row label), "squeezed_dimensions_2"
        (column label) and ``value_name``.
    """
    return matrix.melt(id_vars=["Unnamed: 0"]).rename(
        columns={
            "Unnamed: 0": "squeezed_dimensions_1",
            "variable": "squeezed_dimensions_2",
            "value": value_name,
        }
    )


def assemble_correlations(scores, correlation, correlation_std):
    """Build the pairwise table: scores of both members of a pair plus their correlation.

    Parameters
    ----------
    scores : pandas.DataFrame
        Indexed by squeezed dimension name, with the columns "dimension",
        "subdimension", "r2" and "r2_std". The index must be unique.
    correlation : pandas.Series
        Correlations indexed by a MultiIndex with levels "squeezed_dimensions_1"
        and "squeezed_dimensions_2"; its index becomes the index of the result.
    correlation_std : pandas.Series
        Same layout as ``correlation``; aligned on the index.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``correlation`` with columns ``dimension_1, subdimension_1,
        r2_1, r2_std_1, dimension_2, subdimension_2, r2_2, r2_std_2, correlation,
        correlation_std``. Pairs whose name is absent from ``scores`` get NaN scores.
    """
    correlations = pd.DataFrame(index=correlation.index)
    for idx_dimensions in ("1", "2"):
        names = correlations.index.get_level_values(f"squeezed_dimensions_{idx_dimensions}")
        for column in _SCORE_COLUMNS:
            # Look each pair member up in `scores`; names repeat across pairs.
            correlations[f"{column}_{idx_dimensions}"] = scores[column].reindex(names).to_numpy()
    correlations["correlation"] = correlation
    correlations["correlation_std"] = correlation_std
    return correlations


if __name__ == "__main__":
    for sample_definition in ["eids"]:  # ["instances", "eids"]:
        # Best-model performances, indexed by dimension.
        scores_raw = select_and_rename(
            pd.read_csv(
                f"../../all_data/page2_predictions/Performances/PERFORMANCES_bestmodels_alphabetical_{sample_definition}_Age_test.csv"
            ),
            COLUMNS_TO_TAKE,
        ).set_index("dimension")

        # Performances including ensembles, indexed by (dimension, subdimension).
        ensembles_scores_raw = select_and_rename(
            pd.read_csv(
                f"../../all_data/page2_predictions/Performances/PERFORMANCES_withEnsembles_alphabetical_{sample_definition}_Age_test.csv"
            ),
            COLUMNS_TO_TAKE,
        ).set_index(["dimension", "subdimension"])
        # Keep the subdimension available as a regular column as well.
        ensembles_scores_raw["subdimension"] = ensembles_scores_raw.index.get_level_values("subdimension")

        # For these dimensions the best-model row is overwritten with the "*" row
        # of the ensemble table.
        if sample_definition == "instances":
            dimensions_to_correct = ["Hearing", "Lungs"]
        else:  # sample_definition == "eids"
            dimensions_to_correct = ["ImmuneSystem"]
        for dimension_to_correct in dimensions_to_correct:
            scores_raw.loc[dimension_to_correct, _ENSEMBLE_COLUMNS] = first_ensemble_row(
                ensembles_scores_raw, dimension_to_correct, _ENSEMBLE_COLUMNS
            )

        # "squeezed" name = dimension + subdimension, with the "*" subdimension
        # contributing nothing; this is the key the correlation matrices are joined on.
        scores = scores_raw.reset_index()
        scores["squeezed_dimensions"] = scores["dimension"] + scores["subdimension"].replace("*", "")
        scores = scores.set_index("squeezed_dimensions")

        correlations_raw_ = pd.read_csv(
            f"../../all_data/page4_correlations/ResidualsCorrelations/ResidualsCorrelations_bestmodels_{sample_definition}_Age_test.csv"
        )
        correlations_raw = melt_correlation_matrix(correlations_raw_, "correlation").set_index(_PAIR_KEYS)

        correlations_std_raw_ = pd.read_csv(
            f"../../all_data/page4_correlations/ResidualsCorrelations/ResidualsCorrelations_bestmodels_sd_{sample_definition}_Age_test.csv"
        )
        correlations_std_raw = melt_correlation_matrix(correlations_std_raw_, "correlation_std").set_index(
            _PAIR_KEYS
        )

        correlations = assemble_correlations(
            scores, correlations_raw["correlation"], correlations_std_raw["correlation_std"]
        )

        # NOTE(review): this cell used to end with a bare
        # `correlations.reset_index(drop=True)` followed by a commented-out
        # `.replace(DICT_TO_CHANGE_DIMENSIONS)`. Its result was discarded (an
        # expression inside a loop is never displayed), so it has been removed.
        # `correlations` stays indexed by the squeezed-dimension pair for inspection.


In [42]:
import pandas as pd 

def _load_pairwise_correlations(feather_path):
    """Read a correlation table from feather and index it by its four pair-identifying columns."""
    pair_identifiers = ["dimension_1", "subdimension_1", "dimension_2", "subdimension_2"]
    return pd.read_feather(feather_path).set_index(pair_identifiers)


correlation_all_samples_per_participant = _load_pairwise_correlations(
    "../../all_data/correlation_between_accelerated_aging_dimensions/custom_dimensions_all_samples_per_participant.feather"
)
correlation_average_per_participant = _load_pairwise_correlations(
    "../../all_data/correlation_between_accelerated_aging_dimensions/custom_dimensions_average_per_participant.feather"
)

# Pairs for which the all-samples table has no correlation value.
index_to_replace = correlation_all_samples_per_participant.loc[
    correlation_all_samples_per_participant["correlation"].isna()
].index

# Start from the all-samples table and overwrite those pairs (whole rows) with
# the corresponding rows of the per-participant-average table.
all_samples_when_possible_otherwise_average = correlation_all_samples_per_participant.copy()
all_samples_when_possible_otherwise_average.loc[index_to_replace] = correlation_average_per_participant.loc[
    index_to_replace
]
all_samples_when_possible_otherwise_average.reset_index()  # .to_feather(
#     f"all_data/correlation_between_accelerated_aging_dimensions/all_dimensions_{DATA_TYPE_NAMING['*']}.feather"
# )


Unnamed: 0,dimension_1,subdimension_1,dimension_2,subdimension_2,r2_1,r2_std_1,r2_2,r2_std_2,correlation,correlation_std
0,*,*,*,*,0.660751,0.001829,0.660751,0.001829,1.000000,0.000000
1,*instances01,*,*,*,0.869851,0.003602,0.660751,0.001829,0.838413,0.004623
2,*instances1.5x,*,*,*,0.635053,0.001931,0.660751,0.001829,0.998501,0.000013
3,*instances23,*,*,*,0.900849,0.003993,0.660751,0.001829,0.774242,0.009939
4,Abdomen,*,*,*,0.763341,0.002007,0.660751,0.001829,0.375136,0.023708
...,...,...,...,...,...,...,...,...,...,...
1084,Musculoskeletal,Hips,PhysicalActivity,*,0.683108,0.002588,0.625308,0.002016,0.137794,0.007962
1085,Musculoskeletal,Knees,PhysicalActivity,*,0.682830,0.002667,0.625308,0.002016,0.110722,0.008152
1086,Musculoskeletal,FullBody,PhysicalActivity,*,0.853533,0.001274,0.625308,0.002016,0.159431,0.005429
1087,Musculoskeletal,Scalars,PhysicalActivity,*,0.246785,0.001106,0.625308,0.002016,0.125744,0.003902


In [50]:
# Every pair whose first dimension is "*instances1.5x"; a plain string key selects
# on the first index level.
all_samples_when_possible_otherwise_average.loc["*instances1.5x"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,r2_1,r2_std_1,r2_2,r2_std_2,correlation,correlation_std
subdimension_1,dimension_2,subdimension_2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
*,*,*,0.635053,0.001931,0.660751,0.001829,0.998501,1.3e-05
*,*instances01,*,0.627445,0.002007,0.804912,0.020429,0.008787,0.071341
*,*instances1.5x,*,0.635053,0.001931,0.635053,0.001931,1.0,0.0
*,*instances23,*,0.627445,0.002007,0.904079,0.003515,0.180503,0.033673
*,Abdomen,*,0.627445,0.002007,0.758434,0.002114,0.15509,0.007985
*,Abdomen,Liver,0.627445,0.002007,0.70776,0.002221,0.134956,0.004764
*,Abdomen,Pancreas,0.627445,0.002007,0.698286,0.002643,0.143712,0.009962
*,Arterial,*,0.627445,0.002007,0.679971,0.00538,0.109654,0.012735
*,Arterial,PulseWaveAnalysis,0.627445,0.002007,0.389648,0.001714,0.12803,0.003337
*,Arterial,Carotids,0.627445,0.002007,0.644509,0.005954,0.084547,0.009884
