In [None]:
import pandas as pd
import config

Investigate cluster center features

In [None]:
# cluster centers for div=0, average over 5 runs for each p
#
# For every p: load the centers file of each run, append a 'ratio' row
# (row 0 divided by row 1, column by column), then aggregate all runs into a
# "mean (std)" table that is printed and written to CSV.
# NOTE(review): rows 0 and 1 are presumably the two cluster centers — confirm
# against the code that writes centers_{p}.parquet.
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]

for p in ["5000", "12000", "25000", "50000", "100000"]:
    dataframes = []
    for run in runs:
        df = pd.read_parquet(f"{config.path_to_data}/{run}/cluster_centers/centers_{p}.parquet")

        # Ratio of feature means (row 0 / row 1); a zero denominator is reported as inf.
        # The ratio is kept numeric at full precision: rounding it per run (as a
        # 2-decimal string) before averaging would distort the mean and std below.
        ratio_row = {
            col: df.loc[0, col] / df.loc[1, col] if df.loc[1, col] != 0 else float("inf")
            for col in df.columns
        }

        # Append the ratios as a new row labelled 'ratio', aligned on column names.
        df.loc['ratio'] = pd.Series(ratio_row)
        # Defensive: make sure every column is numeric before aggregating.
        df = df.apply(pd.to_numeric, errors='coerce')
        dataframes.append(df)

    # Outer index level = run number, inner level = original row label (0, 1, 'ratio').
    concatenated = pd.concat(dataframes, keys=range(len(dataframes)))

    # calculate mean and standard deviation of the cluster feature mean over the runs
    # (grouping on the inner level collapses the run dimension)
    per_row = concatenated.groupby(level=1)
    mean_df = per_row.mean()
    std_df = per_row.std()

    # Format every cell as "mean (std)"; this is the only place values are rounded.
    result_df = mean_df.copy()
    for col in mean_df.columns:
        result_df[col] = mean_df[col].round(2).astype(str) + " (" + std_df[col].round(2).astype(str) + ")"

    print(f"p:{p}", result_df)
    result_df.to_csv(f"{config.path_to_evaluation}/cluster_centers_avg/center_avg_{p}.csv")

    