In [1]:
# Mount Google Drive at /content/drive (this import only exists inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

import os
# Make the shared-drive folder the working directory; the relative `%cd` below
# is resolved against it.
shared_path = '/content/drive/Shareddrives/Baiying'
os.chdir(shared_path)
print("Working directory:", os.getcwd())

# IPython magic: move into the CALF subdirectory of shared_path and echo the new path.
%cd CALF

Mounted at /content/drive
Working directory: /content/drive/Shareddrives/Baiying
/content/drive/Shareddrives/Baiying/CALF


# Helper function: map subjects to datasets and summarize per-dataset metrics

In [2]:
import os
import pandas as pd

def _collect_file_stems(dataset_dir):
    """Return the lower-cased, extension-less names of every file under ``dataset_dir`` (recursive)."""
    return {
        os.path.splitext(file_name)[0].lower()
        for _root, _dirs, file_names in os.walk(dataset_dir)
        for file_name in file_names
    }


def _candidate_file_stems(subject_id):
    """Return the lower-cased file-stem spellings under which ``subject_id`` may be stored."""
    normalized = str(subject_id).strip().lower()
    # File stems are lower-cased before comparison, so only lower-cased
    # candidates can ever match; original-case variants are not needed.
    return {
        normalized,
        normalized.replace(' ', '_'),
        normalized.replace('-', '_'),
        normalized.replace('.', ''),
    }


def process_results_csv(csv_file_path, base_dataset_path, dataset_subfolders):
    """
    Attach a dataset name to every row of a per-subject results CSV and average
    the MAE/RMSE columns per dataset.

    A subject belongs to a dataset when a file under that dataset's folder
    (searched recursively) has a name that, without extension and lower-cased,
    equals the subject ID or one of its variants (spaces -> '_', '-' -> '_',
    dots removed). When a subject ID matches files in several datasets it is
    assigned to the FIRST matching entry of ``dataset_subfolders`` and a
    warning listing the competing datasets is printed, because such an
    assignment depends purely on list order and may be wrong.

    Args:
        csv_file_path (str): Path to the input CSV. It must contain a
            'subject_id' column and the columns '<MAE|RMSE>_<15|30|60|90>min_mgdl'.
        base_dataset_path (str): Directory containing the dataset subfolders.
        dataset_subfolders (list): Dataset subfolder names, in priority order.
            Entries that are not directories are reported and skipped.

    Returns:
        tuple: Two pandas DataFrames:
               - result_df_processed: the CSV rows plus a 'dataset_name' column
                 (NaN for unmapped subjects), sorted by dataset_name, subject_id.
               - summary_df: one row per dataset with the unweighted mean over
                 subjects of each metric column. Unmapped subjects are excluded.

    Raises:
        KeyError: If the CSV lacks 'subject_id' or one of the metric columns.
    """
    print(f"\nProcessing file: {csv_file_path}")

    # 1. Load the CSV file
    result_df = pd.read_csv(csv_file_path)
    if 'subject_id' not in result_df.columns:
        raise KeyError(f"'subject_id' column not found in {csv_file_path}")

    # Subject IDs are compared as stripped strings so numeric IDs match file names.
    subject_keys = result_df['subject_id'].astype(str).str.strip()
    unique_subject_ids = subject_keys.unique()
    candidates_by_subject = {
        sub_id: _candidate_file_stems(sub_id) for sub_id in unique_subject_ids
    }

    # 2. Record, for every subject, ALL datasets that contain a matching file
    #    (in dataset_subfolders order) so that ambiguous IDs can be reported.
    print(f"Attempting to map {len(unique_subject_ids)} unique subject IDs...")
    datasets_by_subject = {}
    # dict.fromkeys removes duplicate folder names while preserving order.
    for dataset_folder in dict.fromkeys(dataset_subfolders):
        dataset_dir = os.path.join(base_dataset_path, dataset_folder)
        if not os.path.isdir(dataset_dir):
            print(f"Warning: dataset folder not found, skipping: {dataset_dir}")
            continue
        file_stems = _collect_file_stems(dataset_dir)
        for sub_id, candidates in candidates_by_subject.items():
            if not candidates.isdisjoint(file_stems):
                datasets_by_subject.setdefault(sub_id, []).append(dataset_folder)

    # 3. Map Subject ID to Dataset Name (first matching dataset wins).
    subject_to_dataset_map = {
        sub_id: matched[0] for sub_id, matched in datasets_by_subject.items()
    }
    ambiguous = {
        sub_id: matched for sub_id, matched in datasets_by_subject.items() if len(matched) > 1
    }
    if ambiguous:
        examples = dict(list(ambiguous.items())[:5])
        print(
            f"Warning: {len(ambiguous)} subject IDs match files in more than one dataset; "
            f"each is assigned to the first matching dataset in dataset_subfolders. "
            f"Examples: {examples}"
        )

    result_df['dataset_name'] = subject_keys.map(subject_to_dataset_map)

    unmapped_subjects = result_df[result_df['dataset_name'].isnull()]['subject_id'].unique()
    if len(unmapped_subjects) > 0:
        print(f"Warning: {len(unmapped_subjects)} subjects could not be mapped. Examples: {unmapped_subjects[:5]}...")
    else:
        print("All subjects successfully mapped.")

    # Sort the dataframe (rows with a NaN dataset_name end up last)
    result_df_processed = result_df.sort_values(by=['dataset_name', 'subject_id']).reset_index(drop=True)

    # 4. Calculate summary_df: avg_<metric>_<horizon>min = mean of <metric>_<horizon>min_mgdl
    summary_spec = {
        f'avg_{metric}_{horizon}min': (f'{metric}_{horizon}min_mgdl', 'mean')
        for horizon in (15, 30, 60, 90)
        for metric in ('MAE', 'RMSE')
    }
    # groupby drops NaN keys, so unmapped subjects do not contribute to the summary.
    summary_df = result_df_processed.groupby('dataset_name').agg(**summary_spec).reset_index()

    return result_df_processed, summary_df

In [None]:
# Map subjects to datasets and summarize the per-subject metrics stored at main_csv_path.
# NOTE(review): the `_llama` suffix on the variables below looks like a leftover (the
# results path refers to gpt4); the names are kept in case other cells rely on them.
base_dataset_path = '/content/drive/Shareddrives/Baiying/preprocessed_dataset/test_dataset'
main_csv_path = '/content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/per_subject_metrics/per_subject_horizons.csv'
# Every subfolder except 'mixed' is a candidate dataset. The names are sorted because
# os.listdir() returns entries in arbitrary order and process_results_csv assigns a
# subject to the first listed dataset containing a matching file; sorting keeps the
# mapping reproducible between runs.
dataset_subfolders = sorted(
    f for f in os.listdir(base_dataset_path)
    if os.path.isdir(os.path.join(base_dataset_path, f)) and f != 'mixed'
)

processed_df_llama, summary_table_llama = process_results_csv(main_csv_path, base_dataset_path, dataset_subfolders)

print("\nProcessed DataFrame (first 5 rows):")
display(processed_df_llama.head())

print("\nSummary Table:")
display(summary_table_llama)

# Write the outputs next to the input CSV
output_metrics_dir = os.path.dirname(main_csv_path)

# Create the directory if it doesn't exist
os.makedirs(output_metrics_dir, exist_ok=True)

# Save the processed per-subject DataFrame
processed_df_llama_path = os.path.join(output_metrics_dir, 'processed_per_subject_horizons.csv')
processed_df_llama.to_csv(processed_df_llama_path, index=False)
print(f"Processed DataFrame saved to {processed_df_llama_path}")

# Save the per-dataset summary table
summary_table_llama_path = os.path.join(output_metrics_dir, 'summary_per_dataset.csv')
summary_table_llama.to_csv(summary_table_llama_path, index=False)
print(f"Summary table saved to {summary_table_llama_path}")


Processing file: /content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/per_subject_metrics/per_subject_horizons.csv
Attempting to map 284 unique subject IDs...
All subjects mapped. Exiting dataset iteration early.
All subjects successfully mapped.

Processed DataFrame (first 5 rows):


Unnamed: 0,subject_id,num_windows,MAE_15min_mgdl,RMSE_15min_mgdl,MAE_30min_mgdl,RMSE_30min_mgdl,MAE_60min_mgdl,RMSE_60min_mgdl,MAE_90min_mgdl,RMSE_90min_mgdl,dataset_name
0,HUPA0001P,659,12.385268,17.647318,21.471703,30.519186,35.228333,49.114651,47.173588,60.874084,14_HUPA-UCM
1,HUPA0002P,476,6.897544,9.970762,13.643682,20.234842,27.205746,40.350872,37.659134,55.403927,14_HUPA-UCM
2,HUPA0003P,593,7.101452,9.532248,13.120835,17.627996,27.513363,35.237194,40.146778,50.051323,14_HUPA-UCM
3,HUPA0004P,476,6.326643,9.666673,11.809371,18.304626,22.101694,35.223488,32.03492,49.864372,14_HUPA-UCM
4,HUPA0005P,611,6.216848,8.662485,11.417621,15.142756,18.497841,24.963411,23.591219,32.118805,14_HUPA-UCM



Summary Table:


Unnamed: 0,dataset_name,avg_MAE_15min,avg_RMSE_15min,avg_MAE_30min,avg_RMSE_30min,avg_MAE_60min,avg_RMSE_60min,avg_MAE_90min,avg_RMSE_90min
0,14_HUPA-UCM,6.67869,9.808318,13.012587,18.540296,24.255386,33.714194,32.915735,44.743445
1,17_T1DM-UOM,8.821324,12.907113,16.176344,22.956511,26.895341,36.848159,33.366103,44.571361
2,18_Bris-T1D Open,6.776587,11.034874,12.146538,19.540505,20.742827,32.359205,26.397392,40.097548
3,19_AZT1D,9.38313,13.490962,15.48727,21.685922,24.846587,33.447367,30.773599,40.646705
4,2_D1NAMO,7.878683,10.326079,18.456646,23.267696,37.740431,45.051935,50.739002,60.144854
5,BIG_IDEA_LAB,6.040646,9.046199,10.643577,16.015852,14.993863,21.716487,16.164401,22.695709
6,CGMacros,7.205828,10.51845,11.871918,17.517686,16.272056,23.310713,18.058792,25.310268
7,ShanghaiT1DM,4.937987,6.538781,10.014857,13.389542,19.272681,25.779891,27.243459,35.855841
8,ShanghaiT2DM,4.119302,5.696915,8.593183,12.129706,15.51555,22.04341,20.324378,28.272699
9,UCHTT1DM,8.327146,12.247926,13.295625,18.47583,17.204038,23.161446,20.375842,26.287313


Processed DataFrame saved to /content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/per_subject_metrics/processed_per_subject_horizons.csv
Summary table saved to /content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/per_subject_metrics/summary_per_dataset.csv


## Summarize few-shot results

In [3]:
# Map subjects to datasets and summarize the few-shot per-subject metrics stored at main_csv_path.
# NOTE(review): the `_llama` suffix on the variables below looks like a leftover (the
# results path refers to gpt4); the names are kept in case other cells rely on them.
base_dataset_path = '/content/drive/Shareddrives/Baiying/preprocessed_dataset/test_dataset'
main_csv_path = '/content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_few_shot_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/result_open/per_subject_metrics/per_subject_horizons.csv'
# Every subfolder except 'mixed' is a candidate dataset. The names are sorted because
# os.listdir() returns entries in arbitrary order and process_results_csv assigns a
# subject to the first listed dataset containing a matching file; sorting keeps the
# mapping reproducible between runs.
dataset_subfolders = sorted(
    f for f in os.listdir(base_dataset_path)
    if os.path.isdir(os.path.join(base_dataset_path, f)) and f != 'mixed'
)

processed_df_llama, summary_table_llama = process_results_csv(main_csv_path, base_dataset_path, dataset_subfolders)

print("\nProcessed DataFrame (first 5 rows):")
display(processed_df_llama.head())

print("\nSummary Table:")
display(summary_table_llama)

# Write the outputs next to the input CSV
output_metrics_dir = os.path.dirname(main_csv_path)

# Create the directory if it doesn't exist
os.makedirs(output_metrics_dir, exist_ok=True)

# Save the processed per-subject DataFrame
processed_df_llama_path = os.path.join(output_metrics_dir, 'processed_per_subject_horizons.csv')
processed_df_llama.to_csv(processed_df_llama_path, index=False)
print(f"Processed DataFrame saved to {processed_df_llama_path}")

# Save the per-dataset summary table
summary_table_llama_path = os.path.join(output_metrics_dir, 'summary_per_dataset.csv')
summary_table_llama.to_csv(summary_table_llama_path, index=False)
print(f"Summary table saved to {summary_table_llama_path}")


Processing file: /content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_few_shot_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/result_open/per_subject_metrics/per_subject_horizons.csv
Attempting to map 336 unique subject IDs...
All subjects mapped. Exiting dataset iteration early.
All subjects successfully mapped.

Processed DataFrame (first 5 rows):


Unnamed: 0,subject_id,num_windows,MAE_15min_mgdl,RMSE_15min_mgdl,MAE_30min_mgdl,RMSE_30min_mgdl,MAE_60min_mgdl,RMSE_60min_mgdl,MAE_90min_mgdl,RMSE_90min_mgdl,dataset_name
0,HUPA0001P,3,8.736226,9.608066,16.585403,19.949974,27.719339,28.799709,37.966507,40.820339,14_HUPA-UCM
1,HUPA0002P,2,4.972548,5.278456,10.169172,11.828938,5.273632,5.765187,15.720192,15.720755,14_HUPA-UCM
2,HUPA0003P,3,9.903267,11.185617,15.692164,17.657625,26.754242,28.339235,39.25053,48.453327,14_HUPA-UCM
3,HUPA0004P,2,16.617813,20.718451,33.628357,35.004868,72.399391,72.408974,55.316803,55.341549,14_HUPA-UCM
4,HUPA0005P,3,9.838844,10.261489,17.4823,17.807419,15.230771,16.755941,18.219721,18.623251,14_HUPA-UCM



Summary Table:


Unnamed: 0,dataset_name,avg_MAE_15min,avg_RMSE_15min,avg_MAE_30min,avg_RMSE_30min,avg_MAE_60min,avg_RMSE_60min,avg_MAE_90min,avg_RMSE_90min
0,14_HUPA-UCM,8.773687,10.10733,16.009385,18.663835,27.004264,30.089,38.912586,43.904102
1,17_T1DM-UOM,9.711777,13.733022,16.305904,22.468582,26.413167,34.373133,34.335954,44.873547
2,18_Bris-T1D Open,7.219621,10.974843,12.544899,19.156615,20.322716,30.851544,26.202996,38.794907
3,19_AZT1D,8.859347,11.403116,14.439301,19.04183,25.430103,31.66271,30.716393,37.811812
4,1_Hall2018,6.864434,6.885959,10.986948,11.084384,13.268959,13.506791,10.323117,10.354622
5,2_D1NAMO,6.646793,6.646793,7.206289,7.206289,29.617039,29.617039,46.050928,46.050928
6,BIG_IDEA_LAB,6.775805,8.010902,9.638583,10.79981,9.71198,10.584506,12.409044,13.927084
7,CGMacros,8.900211,10.144743,13.378612,14.84343,18.107393,21.06929,21.636505,24.663101
8,ShanghaiT1DM,5.677082,6.76336,10.804326,12.588875,16.7704,20.179342,26.35366,32.106229
9,ShanghaiT2DM,5.982291,6.756472,10.228772,11.658742,17.12695,19.906382,21.946671,25.207816


Processed DataFrame saved to /content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_few_shot_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/result_open/per_subject_metrics/processed_per_subject_horizons.csv
Summary table saved to /content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_few_shot_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/result_open/per_subject_metrics/summary_per_dataset.csv


In [4]:
# Map subjects to datasets and summarize the controlled-dataset per-subject metrics stored at main_csv_path.
# NOTE(review): the `_llama` suffix on the variables below looks like a leftover (the
# results path refers to gpt4); the names are kept in case other cells rely on them.
base_dataset_path = '/content/drive/Shareddrives/Baiying/preprocessed_dataset/test_dataset/controlled_datasets'
main_csv_path = '/content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/result_controlled/per_subject_metrics/per_subject_horizons.csv'
# Every subfolder except 'mixed' is a candidate dataset. The names are sorted because
# os.listdir() returns entries in arbitrary order and process_results_csv assigns a
# subject to the first listed dataset containing a matching file; sorting keeps the
# mapping reproducible between runs.
# NOTE(review): the subject IDs in this run are bare numbers, which may exist as file
# names in more than one of these datasets - verify the resulting dataset_name column.
dataset_subfolders = sorted(
    f for f in os.listdir(base_dataset_path)
    if os.path.isdir(os.path.join(base_dataset_path, f)) and f != 'mixed'
)

processed_df_llama, summary_table_llama = process_results_csv(main_csv_path, base_dataset_path, dataset_subfolders)

print("\nProcessed DataFrame (first 5 rows):")
display(processed_df_llama.head())

print("\nSummary Table:")
display(summary_table_llama)

# Write the outputs next to the input CSV
output_metrics_dir = os.path.dirname(main_csv_path)

# Create the directory if it doesn't exist
os.makedirs(output_metrics_dir, exist_ok=True)

# Save the processed per-subject DataFrame
processed_df_llama_path = os.path.join(output_metrics_dir, 'processed_per_subject_horizons.csv')
processed_df_llama.to_csv(processed_df_llama_path, index=False)
print(f"Processed DataFrame saved to {processed_df_llama_path}")

# Save the per-dataset summary table
summary_table_llama_path = os.path.join(output_metrics_dir, 'summary_per_dataset.csv')
summary_table_llama.to_csv(summary_table_llama_path, index=False)
print(f"Summary table saved to {summary_table_llama_path}")


Processing file: /content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/result_controlled/per_subject_metrics/per_subject_horizons.csv
Attempting to map 12 unique subject IDs...
All subjects mapped. Exiting dataset iteration early.
All subjects successfully mapped.

Processed DataFrame (first 5 rows):


Unnamed: 0,subject_id,num_windows,MAE_15min_mgdl,RMSE_15min_mgdl,MAE_30min_mgdl,RMSE_30min_mgdl,MAE_60min_mgdl,RMSE_60min_mgdl,MAE_90min_mgdl,RMSE_90min_mgdl,dataset_name
0,567,2237,9.600617,16.959597,17.612631,27.627792,30.745319,44.018257,38.535282,52.729832,5_T1DEXI
1,591,2604,9.642523,13.866681,16.51284,22.471451,27.224564,35.187458,34.591732,43.732948,5_T1DEXI
2,540,2762,9.352543,12.88951,17.978848,24.256367,32.875713,43.201038,41.89967,54.658386,6_T1DEXIP
3,544,2565,7.526588,10.520686,13.611608,19.023478,24.682219,33.689686,33.899635,44.675861,OhioT1DM
4,552,2218,6.987555,9.536574,13.011893,17.974052,23.025379,31.52956,29.634933,39.443062,OhioT1DM



Summary Table:


Unnamed: 0,dataset_name,avg_MAE_15min,avg_RMSE_15min,avg_MAE_30min,avg_RMSE_30min,avg_MAE_60min,avg_RMSE_60min,avg_MAE_90min,avg_RMSE_90min
0,5_T1DEXI,9.62157,15.413139,17.062736,25.049622,28.984941,39.602858,36.563507,48.23139
1,6_T1DEXIP,9.352543,12.88951,17.978848,24.256367,32.875713,43.201038,41.89967,54.658386
2,OhioT1DM,8.256033,12.54385,14.268109,20.607892,24.519088,33.751562,32.324561,43.293003


Processed DataFrame saved to /content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/result_controlled/per_subject_metrics/processed_per_subject_horizons.csv
Summary table saved to /content/drive/Shareddrives/Baiying/CALF/results/long_term_forecast_glucose_train_CALF_Glucose_ftS_sl144_ll72_pl18_dm768_nh12_el2_dl1_df2048_fc1_ebtimeF_dtTrue_test_gpt4_0/result_controlled/per_subject_metrics/summary_per_dataset.csv
