In [1]:
import glob
from pathlib import Path

import pandas as pd

In [4]:
# Summarise the imputation results of the T1D datasets: pool the rows of every
# per-dataset result CSV, then report mean/std of each bias and MSE column for
# every (kernel, percentile) combination.
datasets_to_include = ["ohio", "dubosson", "weinstock", "T1DEXI_adults"]
base_path = "results"

# The four value columns that are aggregated below
numeric_cols = ["Bias Before Imputation", "Bias After Imputation",
                "MSE Before Imputation", "MSE After Imputation"]

# One record (dict) per CSV row, tagged with the kernel/percentile of its file
data_list = []

for dataset in datasets_to_include:
    # glob returns files in filesystem-dependent order; sort so that re-runs
    # process the files in the same sequence.
    csv_files = sorted(glob.glob(f"{base_path}/{dataset}/*.csv"))

    for file in csv_files:
        # File names end in "..._<kernel>_<percentile>.csv".
        # Path(...).stem drops the directory and the extension and copes with
        # both "/" and "\" separators, which a plain split("/") does not.
        filename_parts = Path(file).stem.split("_")
        if len(filename_parts) < 2:
            raise ValueError(
                f"Cannot extract kernel and percentile from file name: {file}"
            )
        kernel = filename_parts[-2]      # Example: "matern32"
        percentile = filename_parts[-1]  # Example: "0.7" (kept as a string)

        # Copy the relevant columns in one vectorised step instead of
        # iterating over the rows one by one.
        results_df = pd.read_csv(file)
        data_list.extend(
            results_df[["Metric", *numeric_cols]]
            .assign(Kernel=kernel, Percentile=percentile)
            .to_dict("records")
        )

# Fail with a clear message rather than an opaque KeyError in groupby below
if not data_list:
    raise FileNotFoundError(
        f"No result rows found under '{base_path}' for datasets {datasets_to_include}"
    )

# Convert list of dictionaries to a DataFrame
combined_df = pd.DataFrame(data_list)

# Ensure columns are numeric for aggregation (raises on unparsable values)
combined_df[numeric_cols] = combined_df[numeric_cols].apply(pd.to_numeric)

# Group by Kernel, Percentile, and Metric, and calculate mean and std
grouped = (
    combined_df
    .groupby(["Kernel", "Percentile", "Metric"])[numeric_cols]
    .agg(["mean", "std"])
    .reset_index()
)

# Row order used when displaying each summary table
metric_order = ["Mean", "SD", "CV"]

# One summary DataFrame per (Kernel, Percentile) combination, keyed by
# "<kernel>_<percentile>", with rows sorted according to metric_order.
dfs_by_group = {}
for (kernel, percentile), group_df in grouped.groupby(["Kernel", "Percentile"]):
    # level=0 is required because the aggregated columns form a MultiIndex
    summary_df = group_df.drop(["Kernel", "Percentile"], axis=1, level=0)
    summary_df["Metric"] = pd.Categorical(
        summary_df["Metric"], categories=metric_order, ordered=True
    )
    dfs_by_group[f"{kernel}_{percentile}"] = (
        summary_df.sort_values(by="Metric").reset_index(drop=True)
    )

# Display all sorted DataFrames
for key, summary_df in dfs_by_group.items():
    print(f"DataFrame for {key}:")
    display(summary_df)
    print("\n")


DataFrame for exponential_0.7:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-1.578238,0.276501,-2.004084,0.352596,2.75962,0.968744,4.614218,1.709339
1,SD,-1.111066,0.137877,-0.007758,0.207696,1.395625,0.389534,0.210747,0.118999
2,CV,-0.067645,0.011344,0.111659,0.022274,0.005631,0.001511,0.014775,0.005784




DataFrame for exponential_0.8:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-1.128987,0.200559,-1.213753,0.283908,1.422541,0.511983,1.832245,0.927311
1,SD,-0.861198,0.114393,-0.130551,0.130878,0.843078,0.258753,0.130636,0.102309
2,CV,-0.05417,0.008351,0.044577,0.01609,0.003544,0.001028,0.003159,0.002176




DataFrame for matern32_0.7:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-1.578238,0.276501,-0.536044,0.21869,2.75962,0.968744,0.435385,0.329782
1,SD,-1.111066,0.137877,-0.469073,0.157982,1.395625,0.389534,0.34299,0.224704
2,CV,-0.067645,0.011344,-0.030571,0.006059,0.005631,0.001511,0.001745,0.000784




DataFrame for matern32_0.8:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-1.128987,0.200559,-0.231324,0.125525,1.422541,0.511983,0.096376,0.107143
1,SD,-0.861198,0.114393,-0.252758,0.118299,0.843078,0.258753,0.113173,0.115602
2,CV,-0.05417,0.008351,-0.018925,0.007184,0.003544,0.001028,0.000706,0.000579




DataFrame for matern52_0.7:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-1.578238,0.276501,-0.996138,0.30317,2.75962,0.968744,1.331883,0.701279
1,SD,-1.111066,0.137877,-0.431092,0.15352,1.395625,0.389534,0.318343,0.210261
2,CV,-0.067645,0.011344,-0.004447,0.008418,0.005631,0.001511,0.001153,0.000576




DataFrame for matern52_0.8:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-1.128987,0.200559,-0.466757,0.19685,1.422541,0.511983,0.345656,0.26702
1,SD,-0.861198,0.114393,-0.336613,0.102517,0.843078,0.258753,0.199329,0.136491
2,CV,-0.05417,0.008351,-0.018201,0.003443,0.003544,0.001028,0.000838,0.000437






In [5]:
# Summarise the imputation results of the T2D datasets: pool the rows of every
# per-dataset result CSV, then report mean/std of each bias and MSE column for
# every (kernel, percentile) combination.
datasets_to_include = ["colas", "iglu", "hall"]
base_path = "results"

# The four value columns that are aggregated below
numeric_cols = ["Bias Before Imputation", "Bias After Imputation",
                "MSE Before Imputation", "MSE After Imputation"]

# One record (dict) per CSV row, tagged with the kernel/percentile of its file
data_list = []

for dataset in datasets_to_include:
    # glob returns files in filesystem-dependent order; sort so that re-runs
    # process the files in the same sequence.
    csv_files = sorted(glob.glob(f"{base_path}/{dataset}/*.csv"))

    for file in csv_files:
        # File names end in "..._<kernel>_<percentile>.csv".
        # Path(...).stem drops the directory and the extension and copes with
        # both "/" and "\" separators, which a plain split("/") does not.
        filename_parts = Path(file).stem.split("_")
        if len(filename_parts) < 2:
            raise ValueError(
                f"Cannot extract kernel and percentile from file name: {file}"
            )
        kernel = filename_parts[-2]      # Example: "matern32"
        percentile = filename_parts[-1]  # Example: "0.7" (kept as a string)

        # Copy the relevant columns in one vectorised step instead of
        # iterating over the rows one by one.
        results_df = pd.read_csv(file)
        data_list.extend(
            results_df[["Metric", *numeric_cols]]
            .assign(Kernel=kernel, Percentile=percentile)
            .to_dict("records")
        )

# Fail with a clear message rather than an opaque KeyError in groupby below
if not data_list:
    raise FileNotFoundError(
        f"No result rows found under '{base_path}' for datasets {datasets_to_include}"
    )

# Convert list of dictionaries to a DataFrame
combined_df = pd.DataFrame(data_list)

# Ensure columns are numeric for aggregation (raises on unparsable values)
combined_df[numeric_cols] = combined_df[numeric_cols].apply(pd.to_numeric)

# Group by Kernel, Percentile, and Metric, and calculate mean and std
grouped = (
    combined_df
    .groupby(["Kernel", "Percentile", "Metric"])[numeric_cols]
    .agg(["mean", "std"])
    .reset_index()
)

# Row order used when displaying each summary table
metric_order = ["Mean", "SD", "CV"]

# One summary DataFrame per (Kernel, Percentile) combination, keyed by
# "<kernel>_<percentile>", with rows sorted according to metric_order.
dfs_by_group = {}
for (kernel, percentile), group_df in grouped.groupby(["Kernel", "Percentile"]):
    # level=0 is required because the aggregated columns form a MultiIndex
    summary_df = group_df.drop(["Kernel", "Percentile"], axis=1, level=0)
    summary_df["Metric"] = pd.Categorical(
        summary_df["Metric"], categories=metric_order, ordered=True
    )
    dfs_by_group[f"{kernel}_{percentile}"] = (
        summary_df.sort_values(by="Metric").reset_index(drop=True)
    )

# Display all sorted DataFrames
for key, summary_df in dfs_by_group.items():
    print(f"DataFrame for {key}:")
    display(summary_df)
    print("\n")


DataFrame for exponential_0.7:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-0.756863,0.378641,-0.858203,0.614725,0.74213,0.677695,1.181785,1.395794
1,SD,-0.679978,0.306464,0.246925,0.174325,0.635888,0.437537,0.194269,0.101065
2,CV,-0.07826,0.026883,0.072809,0.033855,0.008259,0.005537,0.008623,0.004405




DataFrame for exponential_0.8:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-0.564011,0.272487,-0.459941,0.365105,0.409267,0.349144,0.395997,0.477253
1,SD,-0.55103,0.24671,0.081139,0.097594,0.421449,0.288565,0.049402,0.048212
2,CV,-0.063907,0.024465,0.027323,0.018171,0.005621,0.004219,0.001717,0.00136




DataFrame for matern32_0.7:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-0.756863,0.378641,-0.181932,0.138712,0.74213,0.677695,0.061972,0.055332
1,SD,-0.679978,0.306464,-0.150685,0.149632,0.635888,0.437537,0.090606,0.101431
2,CV,-0.07826,0.026883,-0.01577,0.018939,0.008259,0.005537,0.001431,0.001836




DataFrame for matern32_0.8:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-0.564011,0.272487,-0.107689,0.076614,0.409267,0.349144,0.021894,0.016701
1,SD,-0.55103,0.24671,-0.150722,0.116934,0.421449,0.288565,0.054053,0.034769
2,CV,-0.063907,0.024465,-0.018713,0.01469,0.005621,0.004219,0.000838,0.000622




DataFrame for matern52_0.7:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-0.756863,0.378641,-0.24297,0.205079,0.74213,0.677695,0.125611,0.103513
1,SD,-0.679978,0.306464,-0.070293,0.131796,0.635888,0.437537,0.085795,0.066159
2,CV,-0.07826,0.026883,-0.001831,0.0129,0.008259,0.005537,0.001479,0.001555




DataFrame for matern52_0.8:


Unnamed: 0_level_0,Metric,Bias Before Imputation,Bias Before Imputation,Bias After Imputation,Bias After Imputation,MSE Before Imputation,MSE Before Imputation,MSE After Imputation,MSE After Imputation
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
0,Mean,-0.564011,0.272487,-0.108258,0.095575,0.409267,0.349144,0.032726,0.02632
1,SD,-0.55103,0.24671,-0.022023,0.109943,0.421449,0.288565,0.077468,0.092872
2,CV,-0.063907,0.024465,-0.001184,0.013863,0.005621,0.004219,0.000956,0.000914




