In [1]:
import pandas as pd
import numpy as np

import sys
import os

# Get the parent directory path
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
    
from src.data_processing import (
    get_daily_segments_loc,
    check_and_filter_nan_segments,
    find_intervals_more_than_15_and_fill
)
from src.data_loaders import (
    load_data_iso,
    load_data_cap,
    load_data_cap1,
    load_data_ohio,
    load_data_glucobench
)

In [4]:
def raw_analysis(all_data_dict):
    
    # Placeholder for all BG measurements, TIR, TBR, TAR percentages, and total days
    all_bg_values = []
    tir_percentages = []  # Store TIR percentages for each patient
    tbr_percentages = []  # Store TBR percentages for each patient
    tar_percentages = []  # Store TAR percentages for each patient
    total_measurements = 0
    total_days_list = []

    # Iterate over each patient's all_data DataFrame
    for patient_id, data in all_data_dict.items():
        if 'cbg' in data.columns:
            # Extract blood glucose values
            bg_values = data['cbg'].dropna().values
            all_bg_values.extend(bg_values)
            
            # Calculate TIR, TBR, TAR for the current patient (in mmol/L)
            tir_count = sum((3.9 <= bg_values) & (bg_values <= 10.0))  # TIR count for this patient
            tbr_count = sum(bg_values < 3.9)  # TBR count for this patient
            tar_count = sum(bg_values > 10.0)  # TAR count for this patient
            
            # Calculate TIR, TBR, and TAR percentages for the current patient
            tir_percentage = (tir_count / len(bg_values)) * 100  # TIR percentage for this patient
            tbr_percentage = (tbr_count / len(bg_values)) * 100  # TBR percentage for this patient
            tar_percentage = (tar_count / len(bg_values)) * 100  # TAR percentage for this patient
            
            # Append the percentages to their respective lists
            tir_percentages.append(tir_percentage)
            tbr_percentages.append(tbr_percentage)
            tar_percentages.append(tar_percentage)
            
            # Update total measurements
            total_measurements += len(bg_values)
            
            # Calculate total number of rows (5-minute intervals)
            total_intervals = len(data)
            
            # Convert intervals to days (since 288 intervals = 1 day)
            total_days = total_intervals / 288

            if patient_id == "1636-69-001":
                total_days = 10
            # Append to the list of total days
            total_days_list.append(total_days)

    # Convert BG values to pandas Series for easier calculation
    all_bg_series = pd.Series(all_bg_values)

    # Calculate mean and standard deviation for BG
    mean_bg = np.nanmean(all_bg_series)
    std_bg = np.nanstd(all_bg_series)

    # Calculate mean and standard deviation for TIR, TBR, and TAR percentages
    mean_tir = pd.Series(tir_percentages).mean()
    std_tir = pd.Series(tir_percentages).std()

    mean_tbr = pd.Series(tbr_percentages).mean()
    std_tbr = pd.Series(tbr_percentages).std()

    mean_tar = pd.Series(tar_percentages).mean()
    std_tar = pd.Series(tar_percentages).std()

    # Convert the total days list to a pandas Series for calculations
    total_days_series = pd.Series(total_days_list)

    # Calculate mean and standard deviation for days
    mean_days = total_days_series.mean()
    std_days = total_days_series.std()

    # Output the results
    print(f"Mean Blood Glucose: {mean_bg:.2f} mmol/L")
    print(f"Standard Deviation of Blood Glucose: {std_bg:.2f} mmol/L")
    print(f"Mean TIR (3.9–10.0 mmol/L): {mean_tir:.2f}%")
    print(f"Standard Deviation of TIR: {std_tir:.2f}%")
    print(f"Mean TBR (< 3.9 mmol/L): {mean_tbr:.2f}%")
    print(f"Standard Deviation of TBR: {std_tbr:.2f}%")
    print(f"Mean TAR (> 10.0 mmol/L): {mean_tar:.2f}%")
    print(f"Standard Deviation of TAR: {std_tar:.2f}%")
    print(f"Mean Days: {mean_days:.2f}")
    print(f"Standard Deviation of Days: {std_days:.2f}")


In [5]:
def preprocessing(all_data_dict, patient_ids, percentile=0.8):
    # 1. create continues
    cont_segments_dict = {}
    filtered_segments_dict = {}

    for patient_id in patient_ids:
        patient_data = all_data_dict
        cont_segments_loc = get_daily_segments_loc(
            {patient_id: patient_data[patient_id]}, "cbg"
        )
        cont_segments_dict[patient_id] = cont_segments_loc[patient_id]
        filtered_segments = check_and_filter_nan_segments(
            patient_data[patient_id], cont_segments_dict[patient_id], "cbg"
        )
        filtered_segments_dict[patient_id] = filtered_segments

    # Initialize a new dictionary to store the segmented data per patient
    segmented_data_dict = {}
    segment_stats_dict = {}
    # Loop over each patient and extract their segments based on the filtered indices
    for patient_id, filtered_segments in filtered_segments_dict.items():
        # Get the start indices for the patient's filtered segments
        start_indices = filtered_segments["start_indices"]
        end_indices = filtered_segments["end_indices"]

        # Extract the corresponding data for each segment
        patient_data = all_data_dict[
            patient_id
        ]  # Assuming this is the full data for the patient

        # Initialize a list to store the segmented data for the patient
        patient_segments = []
        patient_segment_stats = []

        # Loop over the start indices to slice the data for each segment
        for i in range(
            len(start_indices)
        ):  # Loop until the second-to-last index (since we're using pairs)
            start_idx = start_indices[i]
            end_idx = end_indices[i]

            # Extract the segment for this pair of indices
            segment_data = patient_data[start_idx : end_idx + 1]["cbg"]

            # Calculate the length of the segment
            segment_length = len(segment_data)

            # Count the number of NaN values in the segment
            num_nan_values = np.sum(np.isnan(segment_data))

            if segment_length == 288:
                # print(f"The segment has {num_nan_values} NaN values.")
                # Append the segment to the patient's list of segments
                patient_segments.append(np.array(segment_data))

                # Store the statistics (length and NaN count) for this segment
                patient_segment_stats.append(
                    {"segment_length": segment_length, "num_nan_values": num_nan_values}
                )
        if len(patient_segments) != 0:
            # Store the segmented data in the dictionary, using the patient_id as the key
            segmented_data_dict[patient_id] = patient_segments
            segment_stats_dict[patient_id] = patient_segment_stats

    original_segments = segmented_data_dict

    # 2. censored segments
    # Dictionary for storing the interpolated data for each patient
    censored_segments = {}
    censored_thresh = {}
    censored_segment_indices = {}
    thresh_data = {}

    # Assuming 'original_data' is a dictionary with patient data, and each entry is a 1D array of continuous data points
    for patient_id, segments in original_segments.items():
        # Store interpolated segments for the current patient
        censored_segments[patient_id] = []
        censored_thresh[patient_id] = []
        censored_segment_indices[patient_id] = []
        thresh_data[patient_id] = []

        for segment in segments:
            if len(segment) != 288:
                print("segment len = ", len(segment))
            thresh = np.quantile(segment[~np.isnan(segment)], percentile)
            if thresh == np.max(segment[~np.isnan(segment)]):
                print("maximum thresh")
                filtered_segment = segment[~np.isnan(segment)]  # Remove NaN values
                filtered_segment = filtered_segment[
                    filtered_segment < np.max(filtered_segment)
                ]  # Exclude the max value
                # Compute the quantile on the filtered dataset
                thresh = np.quantile(filtered_segment, percentile)
                print("new thresh = ", thresh)
            # pint("thresh = ", thresh)
            censored_thresh[patient_id].append(thresh)
            censored_segment = np.where(segment > thresh, np.nan, segment)
            censored_segment_index = np.where(segment > thresh)

            censored_segments[patient_id].append(censored_segment)
            censored_segment_indices[patient_id].append(censored_segment_index)
            thresh_data[patient_id].append(thresh)

    return original_segments

In [6]:
def preprocessing_analysis(segments):
    # Placeholder for overall values
    all_patient_mean_bg = []
    all_patient_std_bg = []
    all_patient_mean_tir = []
    all_patient_std_tir = []
    all_patient_mean_tbr = []
    all_patient_std_tbr = []
    all_patient_mean_tar = []
    all_patient_std_tar = []
    all_patient_mean_days = []
    all_patient_std_days = []

    # Iterate over each patient's data (which is a list of daily arrays)
    for patient_id, daily_data_list in segments.items():
        daily_bg_values = []
        daily_tir_percentages = []
        daily_tbr_percentages = []
        daily_tar_percentages = []
        total_days = len(daily_data_list)
        
        # Process each day's data for the patient
        for daily_data in daily_data_list:
            daily_data = np.array(daily_data)  # Ensure it's a NumPy array
            
            # Blood glucose values
            bg_values = daily_data
            daily_bg_values.extend(bg_values)
            
            # Calculate TIR, TBR, TAR for the current day's data (in mmol/L)
            tir_count = np.sum((bg_values >= 3.9) & (bg_values <= 10.0))  # TIR count for this day
            tbr_count = np.sum(bg_values < 3.9)  # TBR count for this day
            tar_count = np.sum(bg_values > 10.0)  # TAR count for this day
            
            # Calculate TIR, TBR, and TAR percentages for the current day
            tir_percentage = (tir_count / len(bg_values)) * 100  # TIR percentage for this day
            tbr_percentage = (tbr_count / len(bg_values)) * 100  # TBR percentage for this day
            tar_percentage = (tar_count / len(bg_values)) * 100  # TAR percentage for this day
            
            # Store daily TIR, TBR, TAR percentages
            daily_tir_percentages.append(tir_percentage)
            daily_tbr_percentages.append(tbr_percentage)
            daily_tar_percentages.append(tar_percentage)
        
        # Calculate mean and standard deviation for BG, TIR, TBR, TAR for this patient
        mean_bg = np.nanmean(daily_bg_values)
        std_bg = np.nanstd(daily_bg_values)
        
        mean_tir = np.mean(daily_tir_percentages)
        std_tir = np.std(daily_tir_percentages)
        
        mean_tbr = np.mean(daily_tbr_percentages)
        std_tbr = np.std(daily_tbr_percentages)
        
        mean_tar = np.mean(daily_tar_percentages)
        std_tar = np.std(daily_tar_percentages)
        
        # Store the results for the patient
        all_patient_mean_bg.append(mean_bg)
        all_patient_std_bg.append(std_bg)
        all_patient_mean_tir.append(mean_tir)
        all_patient_std_tir.append(std_tir)
        all_patient_mean_tbr.append(mean_tbr)
        all_patient_std_tbr.append(std_tbr)
        all_patient_mean_tar.append(mean_tar)
        all_patient_std_tar.append(std_tar)
        all_patient_mean_days.append(total_days)
        
    # Convert lists to pandas Series for easier calculation
    mean_bg_series = pd.Series(all_patient_mean_bg)
    std_bg_series = pd.Series(all_patient_std_bg)
    mean_tir_series = pd.Series(all_patient_mean_tir)
    std_tir_series = pd.Series(all_patient_std_tir)
    mean_tbr_series = pd.Series(all_patient_mean_tbr)
    std_tbr_series = pd.Series(all_patient_std_tbr)
    mean_tar_series = pd.Series(all_patient_mean_tar)
    std_tar_series = pd.Series(all_patient_std_tar)
    mean_days_series = pd.Series(all_patient_mean_days)

    # Calculate overall mean and standard deviation across all patients
    overall_mean_bg = np.nanmean(mean_bg_series)
    overall_std_bg = np.nanstd(mean_bg_series)

    overall_mean_tir = mean_tir_series.mean()
    overall_std_tir = std_tir_series.mean()

    overall_mean_tbr = mean_tbr_series.mean()
    overall_std_tbr = std_tbr_series.mean()

    overall_mean_tar = mean_tar_series.mean()
    overall_std_tar = std_tar_series.mean()

    overall_mean_days = mean_days_series.mean()
    overall_std_days = mean_days_series.std()

    # Output results per patient and across all patients
    # print("Per Patient Metrics:")
    # for i, patient_id in enumerate(segments.keys()):
    #     print(f"Patient {patient_id}:")
    #     print(f"  Mean BG: {all_patient_mean_bg[i]:.2f} mmol/L")
    #     print(f"  Std BG: {all_patient_std_bg[i]:.2f} mmol/L")
    #     print(f"  Mean TIR: {all_patient_mean_tir[i]:.2f}%")
    #     print(f"  Std TIR: {all_patient_std_tir[i]:.2f}%")
    #     print(f"  Mean TBR: {all_patient_mean_tbr[i]:.2f}%")
    #     print(f"  Std TBR: {all_patient_std_tbr[i]:.2f}%")
    #     print(f"  Mean TAR: {all_patient_mean_tar[i]:.2f}%")
    #     print(f"  Std TAR: {all_patient_std_tar[i]:.2f}%")
    #     print(f"  Total Days: {all_patient_mean_days[i]:.2f} days")
    #     print("-" * 40)

    print("\nOverall Metrics Across All Patients:")
    print(f"Mean BG: {overall_mean_bg:.2f} mmol/L")
    print(f"Standard Deviation of BG: {overall_std_bg:.2f} mmol/L")
    print(f"Mean TIR: {overall_mean_tir:.2f}%")
    print(f"Standard Deviation of TIR: {overall_std_tir:.2f}%")
    print(f"Mean TBR: {overall_mean_tbr:.2f}%")
    print(f"Standard Deviation of TBR: {overall_std_tbr:.2f}%")
    print(f"Mean TAR: {overall_mean_tar:.2f}%")
    print(f"Standard Deviation of TAR: {overall_std_tar:.2f}%")
    print(f"Mean Days: {overall_mean_days:.2f}")
    print(f"Standard Deviation of Days: {overall_std_days:.2f}")


Ohio

In [5]:
# Create the DataFrame
data = {
    "ID": [540, 544, 552, 567, 584, 596, 559, 563, 570, 575, 588, 591],
    "Gender": ["male", "male", "male", "female", "male", "male", "female", "male", "male", "female", "female", "female"],
    "Age": ["20–40", "40–60", "20–40", "20–40", "40–60", "60–80", "40–60", "40–60", "40–60", "40–60", "40–60", "40–60"],
    "Pump Model": ["630G", "530G", "630G", "630G", "530G", "530G", "530G", "530G", "530G", "530G", "530G", "530G"],
    "Sensor Band": ["Empatica", "Empatica", "Empatica", "Empatica", "Empatica", "Empatica", "Basis", "Basis", "Basis", "Basis", "Basis", "Basis"],
    "Cohort": [2020, 2020, 2020, 2020, 2020, 2020, 2018, 2018, 2018, 2018, 2018, 2018]
}

df = pd.DataFrame(data)

# Extract lower and upper bounds from the 'Age' column
df['Lower Bound'] = df['Age'].apply(lambda x: int(x.split('–')[0]))
df['Upper Bound'] = df['Age'].apply(lambda x: int(x.split('–')[1]))

# Calculate mean lower and upper bounds
mean_lower_bound = np.mean(df['Lower Bound'])
mean_upper_bound = np.mean(df['Upper Bound'])

print("Mean Lower Bound:", mean_lower_bound)
print("Mean Upper Bound:", mean_upper_bound)
print("Mean Age Range: [{:.2f}, {:.2f}]".format(mean_lower_bound, mean_upper_bound))


Mean Lower Bound: 36.666666666666664
Mean Upper Bound: 56.666666666666664
Mean Age Range: [36.67, 56.67]


In [6]:
patient_ids = [
            "540",
            "544",
            "552",
            "559",
            "563",
            "567",
            "570",
            "575",
            "584",
            "588",
            "591",
            "596",
        ]
train_data_dict = {}
test_data_dict = {}
all_data_dict = {}
for patient_id in patient_ids:
    _, train_data, test_data, all_data = load_data_ohio(
        patient_id, include_test=True
    )
    train_data_dict[patient_id] = train_data
    test_data_dict[patient_id] = test_data
    all_data_dict[patient_id] = all_data

In [7]:
all_data_dict['540']

Unnamed: 0,5minute_intervals_timestamp,missing_cbg,cbg,finger,basal,hr,gsr,carbInput,bolus,timestamp,minutes_elapsed,days_elapsed
0,6.035755e+06,0.0,4.2180,80.0,,,0.337214,,,11/03/1970 20:35,0,0.000000
1,6.035756e+06,0.0,3.9960,,,,0.317671,,,11/03/1970 20:35,5,0.003472
2,6.035757e+06,0.0,3.7740,,,,0.342821,,,11/03/1970 20:35,10,0.006944
3,6.035758e+06,0.0,3.6075,,,,0.359941,,,11/03/1970 20:35,15,0.010417
4,6.035759e+06,0.0,3.4965,,,,0.365833,,,11/03/1970 20:35,20,0.013889
...,...,...,...,...,...,...,...,...,...,...,...,...
16169,6.051926e+06,0.0,13.3200,,0.8,,,,,12/03/1970 01:05,80845,56.142361
16170,6.051927e+06,0.0,13.8195,,0.8,,,,,12/03/1970 01:05,80850,56.145833
16171,6.051928e+06,0.0,14.0970,,0.8,,,,,12/03/1970 01:05,80855,56.149306
16172,6.051929e+06,0.0,13.7640,,0.8,,,,,12/03/1970 01:05,80860,56.152778


In [8]:
len(all_data_dict)

12

In [9]:
raw_analysis(all_data_dict)

total day =  56.15972222222222 540
total day =  54.885416666666664 544
total day =  52.24652777777778 552
total day =  51.93055555555556 559
total day =  54.81944444444444 563
total day =  56.96180555555556 567
total day =  50.3125 570
total day =  54.93402777777778 575
total day =  56.395833333333336 584
total day =  55.50347222222222 588
total day =  54.173611111111114 591
total day =  57.74652777777778 596
Mean Blood Glucose: 8.86 mmol/L
Standard Deviation of Blood Glucose: 3.37 mmol/L
Mean TIR (3.9–10.0 mmol/L): 63.28%
Standard Deviation of TIR: 10.05%
Mean TBR (< 3.9 mmol/L): 3.56%
Standard Deviation of TBR: 2.49%
Mean TAR (> 10.0 mmol/L): 33.16%
Standard Deviation of TAR: 11.19%
Mean Days: 54.67
Standard Deviation of Days: 2.20


In [10]:
segments = preprocessing(all_data_dict, patient_ids)

maximum thresh
new thresh =  16.206


In [11]:
len(segments)

12

In [12]:
preprocessing_analysis(segments)


Overall Metrics Across All Patients:
Mean BG: 8.80 mmol/L
Standard Deviation of BG: 0.88 mmol/L
Mean TIR: 63.14%
Standard Deviation of TIR: 15.77%
Mean TBR: 3.18%
Standard Deviation of TBR: 3.82%
Mean TAR: 32.07%
Standard Deviation of TAR: 15.82%
Mean Days: 35.75
Standard Deviation of Days: 8.18


Broll

In [13]:
all_data_dict, patient_ids = load_data_glucobench('iglu')

In [14]:
print("Raw # Subjects​", len(all_data_dict))
print("Raw Analysis")
raw_analysis(all_data_dict)
segments = preprocessing(all_data_dict, patient_ids)
print("Processed # Subjects​", len(segments))
print("\nProcessed Analysis")
preprocessing_analysis(segments)

Raw # Subjects​ 5
Raw Analysis
total day =  12.072916666666666 Subject 1
total day =  16.65972222222222 Subject 2
total day =  5.663194444444445 Subject 3
total day =  12.836805555555555 Subject 4
total day =  10.565972222222221 Subject 5
Mean Blood Glucose: 8.81 mmol/L
Standard Deviation of Blood Glucose: 3.15 mmol/L
Mean TIR (3.9–10.0 mmol/L): 71.31%
Standard Deviation of TIR: 28.17%
Mean TBR (< 3.9 mmol/L): 0.20%
Standard Deviation of TBR: 0.17%
Mean TAR (> 10.0 mmol/L): 28.50%
Standard Deviation of TAR: 28.30%
Mean Days: 11.56
Standard Deviation of Days: 3.99
Processed # Subjects​ 5

Processed Analysis

Overall Metrics Across All Patients:
Mean BG: 8.94 mmol/L
Standard Deviation of BG: 1.86 mmol/L
Mean TIR: 68.90%
Standard Deviation of TIR: 10.05%
Mean TBR: 0.25%
Standard Deviation of TBR: 0.41%
Mean TAR: 29.28%
Standard Deviation of TAR: 9.83%
Mean Days: 7.20
Standard Deviation of Days: 2.95


Colas

In [15]:
all_data_dict, patient_ids = load_data_glucobench('colas')

In [16]:
all_data_dict

{'121':      Unnamed: 0   id                 Tid     cbg  gender   age   BMI  \
 0         14032  121 2012-01-01 00:00:00  3.9405     1.0  42.0  37.3   
 1         14033  121 2012-01-01 00:05:00  4.2735     1.0  42.0  37.3   
 2         14034  121 2012-01-01 00:10:00  4.6620     1.0  42.0  37.3   
 3         14035  121 2012-01-01 00:15:00  4.8840     1.0  42.0  37.3   
 4         14036  121 2012-01-01 00:20:00  4.9395     1.0  42.0  37.3   
 ..          ...  ...                 ...     ...     ...   ...   ...   
 283       14315  121 2012-01-01 23:35:00  6.2715     1.0  42.0  37.3   
 284       14316  121 2012-01-01 23:40:00  6.2715     1.0  42.0  37.3   
 285       14317  121 2012-01-01 23:45:00  6.2715     1.0  42.0  37.3   
 286       14318  121 2012-01-01 23:50:00  6.2715     1.0  42.0  37.3   
 287       14319  121 2012-01-01 23:55:00  6.2715     1.0  42.0  37.3   
 
      glycaemia  HbA1c  follow.up  T2DM  minutes_elapsed  days_elapsed  
 0         93.0    6.1      554.0  True   

In [17]:
# Initialize counters for male and female patients
male_count = 0
female_count = 0

# Dictionary to store gender per patient
gender_per_patient = {}

# Iterate over each patient's data in the dictionary
for patient_id, data in all_data_dict.items():
    # Extract the gender value for the patient from the first row (assuming it's the same for the entire patient)
    gender = data['gender'].iloc[0]  # Using the first row to determine gender
    
    # Store the gender of the patient
    gender_per_patient[patient_id] = "Male" if gender == 0.0 else "Female"
    
    # Count the gender
    if gender == 0.0:
        male_count += 1
    elif gender == 1.0:
        female_count += 1

# Output the gender of each patient and the counts
print("Gender of each patient:")
for patient_id, gender in gender_per_patient.items():
    print(f"Patient {patient_id}: {gender}")

print(f"\nTotal number of male patients: {male_count}")
print(f"Total number of female patients: {female_count}")

Gender of each patient:
Patient 121: Female
Patient 153: Male
Patient 155: Male
Patient 157: Male
Patient 161: Male
Patient 172: Male
Patient 185: Male
Patient 191: Male
Patient 195: Female
Patient 22: Female
Patient 23: Male
Patient 25: Male
Patient 28: Female
Patient 31: Female
Patient 56: Male
Patient 86: Female
Patient 93: Male

Total number of male patients: 11
Total number of female patients: 6


In [18]:
print("Raw # Subjects​", len(all_data_dict))
print("Raw Analysis")
raw_analysis(all_data_dict)
segments = preprocessing(all_data_dict, patient_ids)
print("Processed # Subjects​", len(segments))
print("\nProcessed Analysis")
preprocessing_analysis(segments)

Raw # Subjects​ 17
Raw Analysis
total day =  1.0 121
total day =  2.0 153
total day =  2.0 155
total day =  2.0 157
total day =  2.0 161
total day =  2.0 172
total day =  2.0 185
total day =  1.9965277777777777 191
total day =  2.0 195
total day =  2.0 22
total day =  2.0 23
total day =  2.0 25
total day =  2.0 28
total day =  2.0 31
total day =  2.0 56
total day =  2.0 86
total day =  2.0 93
Mean Blood Glucose: 5.82 mmol/L
Standard Deviation of Blood Glucose: 1.13 mmol/L
Mean TIR (3.9–10.0 mmol/L): 95.97%
Standard Deviation of TIR: 7.10%
Mean TBR (< 3.9 mmol/L): 3.60%
Standard Deviation of TBR: 7.14%
Mean TAR (> 10.0 mmol/L): 0.43%
Standard Deviation of TAR: 1.52%
Mean Days: 1.94
Standard Deviation of Days: 0.24
Processed # Subjects​ 17

Processed Analysis

Overall Metrics Across All Patients:
Mean BG: 5.77 mmol/L
Standard Deviation of BG: 0.70 mmol/L
Mean TIR: 95.82%
Standard Deviation of TIR: 1.75%
Mean TBR: 3.75%
Standard Deviation of TBR: 1.77%
Mean TAR: 0.43%
Standard Deviation o

Dubosson

In [35]:
def load_rawdata_dubosson(dataset_name):
    """Loads training data from the 2018 and 2020 datasets.
    Optionally includes test data, but only for final evaluation.
    Returns both raw (unmodified) and processed (with elapsed time columns) data."""

    path = "data/" + str(dataset_name) + ".csv"

    data = pd.read_csv(path)
    data.rename(columns={"time": "Tid", "gl": "cbg"}, inplace=True)
    data["cbg"] = data["cbg"] * 0.0555
    data["id"] = data["id"].astype(str)

    if dataset_name == "colas":
        # Keep only rows where the T2DM column is True
        data = data[data["T2DM"] == True].copy()
    elif dataset_name == "hall":
        data = data[data["diagnosis"] == 2].copy()

    # Initialize an empty dictionary to store DataFrames for each patient
    all_data_dict = {}

    # Iterate over the unique patient IDs in the DataFrame
    for patient_id in data["id"].unique():
        # Filter the DataFrame for the current patient_id
        patient_data = data[data["id"] == patient_id].copy()

        # Ensure "Tid" is in datetime format for proper processing
        patient_data["Tid"] = pd.to_datetime(patient_data["Tid"])

        # Sort the DataFrame by "Tid"
        patient_data.sort_values(by="Tid", inplace=True)
        patient_data.reset_index(drop=True, inplace=True)  # Reset the index

        # Format "Tid" back to the desired string format if needed
        patient_data["Tid"] = patient_data["Tid"].dt.strftime("%d/%m/%Y %H:%M")

        patient_data = find_intervals_more_than_15_and_fill(patient_data)
        patient_data["minutes_elapsed"] = np.arange(0, len(patient_data) * 5, 5)
        patient_data["days_elapsed"] = patient_data["minutes_elapsed"] / 1440

        all_data_dict[patient_id] = patient_data

    return all_data_dict, all_data_dict.keys()

In [19]:
# Create the DataFrame
data = {
    "Age": ["NA", "20–29", "20–29", "20–29", "30–39", "30–39", "30–39", "60–69", "70–79"],
    "Gender": ["Man", "Man", "Man", "Man", "Man", "Man", "Woman", "Woman", "Woman"],
    "Height (cm)": ["180–189", "170–179", "180–189", "180–189", "180–189", "190–199", "160–169", "150–159", "160–169"],
    "Weight (kg)": ["80–89", "60–69", "70–79", "80–89", "80–89", "70–79", "70–79", "50–59", "50–59"]
}


df = pd.DataFrame(data)

# Extract lower and upper bounds from the 'Age' column
df['Lower Bound'] = df['Age'].apply(lambda x: int(x.split('–')[0]))
df['Upper Bound'] = df['Age'].apply(lambda x: int(x.split('–')[1]))

# Calculate mean lower and upper bounds
mean_lower_bound = np.mean(df['Lower Bound'])
mean_upper_bound = np.mean(df['Upper Bound'])

print("Mean Lower Bound:", mean_lower_bound)
print("Mean Upper Bound:", mean_upper_bound)
print("Mean Age Range: [{:.2f}, {:.2f}]".format(mean_lower_bound, mean_upper_bound))

ValueError: invalid literal for int() with base 10: 'NA'

In [39]:
all_data_dict, patient_ids = load_rawdata_dubosson('dubosson')

In [40]:
print("Raw # Subjects​", len(all_data_dict))
print("Raw Analysis")
raw_analysis(all_data_dict)
all_data_dict, patient_ids = load_data_glucobench('dubosson')
segments = preprocessing(all_data_dict, patient_ids)
print("Processed # Subjects​", len(segments))
print("\nProcessed Analysis")
preprocessing_analysis(segments)

Raw # Subjects​ 9
Raw Analysis
Mean Blood Glucose: 9.20 mmol/L
Standard Deviation of Blood Glucose: 4.22 mmol/L
Mean TIR (3.9–10.0 mmol/L): 58.60%
Standard Deviation of TIR: 20.19%
Mean TBR (< 3.9 mmol/L): 11.90%
Standard Deviation of TBR: 17.94%
Mean TAR (> 10.0 mmol/L): 29.49%
Standard Deviation of TAR: 19.48%
Mean Days: 3.23
Standard Deviation of Days: 1.40
Processed # Subjects​ 7

Processed Analysis

Overall Metrics Across All Patients:
Mean BG: 9.12 mmol/L
Standard Deviation of BG: 1.08 mmol/L
Mean TIR: 57.34%
Standard Deviation of TIR: 12.90%
Mean TBR: 7.60%
Standard Deviation of TBR: 5.59%
Mean TAR: 35.06%
Standard Deviation of TAR: 13.24%
Mean Days: 3.14
Standard Deviation of Days: 0.69


Hall

In [21]:
all_data_dict, patient_ids = load_data_glucobench('hall')
print("Raw # Subjects​", len(all_data_dict))
print("Raw Analysis")
raw_analysis(all_data_dict)
segments = preprocessing(all_data_dict, patient_ids)
print("Processed # Subjects​", len(segments))
print("\nProcessed Analysis")
preprocessing_analysis(segments)

Raw # Subjects​ 5
Raw Analysis
Mean Blood Glucose: 6.29 mmol/L
Standard Deviation of Blood Glucose: 1.65 mmol/L
Mean TIR (3.9–10.0 mmol/L): 94.60%
Standard Deviation of TIR: 4.21%
Mean TBR (< 3.9 mmol/L): 1.42%
Standard Deviation of TBR: 1.96%
Mean TAR (> 10.0 mmol/L): 3.98%
Standard Deviation of TAR: 4.71%
Mean Days: 7.46
Standard Deviation of Days: 1.76
Processed # Subjects​ 5

Processed Analysis

Overall Metrics Across All Patients:
Mean BG: 6.27 mmol/L
Standard Deviation of BG: 0.63 mmol/L
Mean TIR: 92.69%
Standard Deviation of TIR: 4.76%
Mean TBR: 1.49%
Standard Deviation of TBR: 1.33%
Mean TAR: 3.87%
Standard Deviation of TAR: 3.32%
Mean Days: 5.00
Standard Deviation of Days: 1.41


In [23]:
# Placeholder lists to store age values for each patient
all_ages = []

# Iterate through each patient ID and calculate average age and std for each
for patient_id, patient_data in all_data_dict.items():
    # Convert patient data to DataFrame
    df = pd.DataFrame(patient_data)
    
    # Append the 'Age' column to all_ages list
    all_ages.extend(df['Age'].dropna())  # dropna to ignore NaN values

# Calculate the mean and std for all ages across all patients
mean_age = sum(all_ages) / len(all_ages) if all_ages else float('nan')  # Mean calculation
std_age = pd.Series(all_ages).std()  # Standard deviation using pandas

# Output the results
print(f"Average Age: {mean_age:.2f}")
print(f"Standard Deviation of Age: {std_age:.2f}")

Average Age: 56.75
Standard Deviation of Age: 6.58


Weinstock

In [24]:
all_data_dict, patient_ids = load_data_glucobench('weinstock')
print("Raw # Subjects​", len(all_data_dict))
print("Raw Analysis")
raw_analysis(all_data_dict)
segments = preprocessing(all_data_dict, patient_ids)
print("Processed # Subjects​", len(segments))
print("\nProcessed Analysis")
preprocessing_analysis(segments)

Raw # Subjects​ 200
Raw Analysis
Mean Blood Glucose: 9.65 mmol/L
Standard Deviation of Blood Glucose: 4.58 mmol/L
Mean TIR (3.9–10.0 mmol/L): 50.62%
Standard Deviation of TIR: 14.00%
Mean TBR (< 3.9 mmol/L): 7.69%
Standard Deviation of TBR: 6.05%
Mean TAR (> 10.0 mmol/L): 41.69%
Standard Deviation of TAR: 16.81%
Mean Days: 15.04
Standard Deviation of Days: 5.91
maximum thresh
new thresh =  18.4815
maximum thresh
new thresh =  19.569300000000002
maximum thresh
new thresh =  20.9235
maximum thresh
new thresh =  14.541
maximum thresh
new thresh =  18.1596
maximum thresh
new thresh =  14.5077
maximum thresh
new thresh =  19.3029
maximum thresh
new thresh =  15.9285
maximum thresh
new thresh =  18.204
maximum thresh
new thresh =  13.886100000000003
maximum thresh
new thresh =  12.476400000000005
maximum thresh
new thresh =  19.5915
maximum thresh
new thresh =  16.2726
maximum thresh
new thresh =  19.3695
maximum thresh
new thresh =  20.535
maximum thresh
new thresh =  16.416900000000002
max

In [None]:
# Initialize counters for male and female patients
male_count = 0
female_count = 0

# Dictionary to store gender per patient
gender_per_patient = {}

# Iterate over each patient's data in the dictionary
for patient_id, data in all_data_dict.items():
    # Extract the gender value for the patient from the first row (assuming it's the same for the entire patient)
    gender = data['Gender'].iloc[0]  # Using the first row to determine gender
        
    # Count the gender
    if gender == 'M':
        male_count += 1
    elif gender == 'F':
        female_count += 1

# Output the gender of each patient and the counts
print("Gender of each patient:")
for patient_id, gender in gender_per_patient.items():
    print(f"Patient {patient_id}: {gender}")

print(f"\nTotal number of male patients: {male_count}")
print(f"Total number of female patients: {female_count}")

Gender of each patient:

Total number of male patients: 106
Total number of female patients: 94


In [26]:
# Initialize counters for male and female patients
male_count = 0
female_count = 0

# Dictionary to store gender per patient
gender_per_patient = {}

# Iterate over each patient's data in the dictionary
for patient_id, data in segments.items():
    # Extract the gender value for the patient from the first row (assuming it's the same for the entire patient)
    gender = all_data_dict[patient_id]['Gender'].iloc[0]  # Using the first row to determine gender
        
    # Count the gender
    if gender == 'M':
        male_count += 1
    elif gender == 'F':
        female_count += 1

# Output the gender of each patient and the counts
print("Gender of each patient:")
for patient_id, gender in gender_per_patient.items():
    print(f"Patient {patient_id}: {gender}")

print(f"\nTotal number of male patients: {male_count}")
print(f"Total number of female patients: {female_count}")

Gender of each patient:

Total number of male patients: 100
Total number of female patients: 89


ISO

In [7]:
patient_ids = [str(i) for i in range(102, 224)]
to_remove = [
    "103",
    "108",
    "110",
    "113",
    "115",
    "116",  # Not sure yet
    "122",
    "126",
    "127",
    "128",
    "130",
    "134",
    "139",
    "143",
    "144",
    "146",
    "149",
    "152",
    "155",
    "156",
    "164",
    "174",
    "175",
    "176",
    "177",
    "180",
    "185",
    "186",
    "190",
    "193",
    "194",
    "197",
    "203",
    "206",
    "211",
    "212",
    "220",
]
patient_ids = list(set(patient_ids) - set(to_remove))
all_data_dict = {}

for patient_id in patient_ids:
    all_data = load_data_iso(patient_id)
    all_data_dict[patient_id] = all_data

In [8]:
print("Raw # Subjects​", len(all_data_dict))
print("Raw Analysis")
raw_analysis(all_data_dict)
segments = preprocessing(all_data_dict, patient_ids)
print("Processed # Subjects​", len(segments))
print("\nProcessed Analysis")
preprocessing_analysis(segments)

Raw # Subjects​ 85
Raw Analysis
Mean Blood Glucose: 9.99 mmol/L
Standard Deviation of Blood Glucose: 3.67 mmol/L
Mean TIR (3.9–10.0 mmol/L): 50.75%
Standard Deviation of TIR: 33.00%
Mean TBR (< 3.9 mmol/L): 0.80%
Standard Deviation of TBR: 2.78%
Mean TAR (> 10.0 mmol/L): 48.45%
Standard Deviation of TAR: 33.45%
Mean Days: 4.05
Standard Deviation of Days: 2.38
Processed # Subjects​ 85

Processed Analysis

Overall Metrics Across All Patients:
Mean BG: 10.51 mmol/L
Standard Deviation of BG: 2.85 mmol/L
Mean TIR: 49.85%
Standard Deviation of TIR: 6.86%
Mean TBR: 0.83%
Standard Deviation of TBR: 0.69%
Mean TAR: 49.22%
Standard Deviation of TAR: 7.00%
Mean Days: 3.46
Standard Deviation of Days: 2.36


T1DEXI

In [30]:
all_data_dict, patient_ids = load_data_glucobench('T1DEXI_adults')
print("Raw # Subjects​", len(all_data_dict))
print("Raw Analysis")
raw_analysis(all_data_dict)
segments = preprocessing(all_data_dict, patient_ids)
print("Processed # Subjects​", len(segments))
print("\nProcessed Analysis")
preprocessing_analysis(segments)

Raw # Subjects​ 497
Raw Analysis
Mean Blood Glucose: 8.12 mmol/L
Standard Deviation of Blood Glucose: 3.13 mmol/L
Mean TIR (3.9–10.0 mmol/L): 73.91%
Standard Deviation of TIR: 15.27%
Mean TBR (< 3.9 mmol/L): 3.38%
Standard Deviation of TBR: 3.18%
Mean TAR (> 10.0 mmol/L): 22.71%
Standard Deviation of TAR: 15.77%
Mean Days: 27.39
Standard Deviation of Days: 2.99
maximum thresh
new thresh =  21.3564
maximum thresh
new thresh =  18.648
maximum thresh
new thresh =  14.7741
maximum thresh
new thresh =  20.4795
maximum thresh
new thresh =  18.7257
maximum thresh
new thresh =  19.4805
maximum thresh
new thresh =  20.279700000000002
maximum thresh
new thresh =  20.3796
maximum thresh
new thresh =  21.3675
maximum thresh
new thresh =  21.2565
maximum thresh
new thresh =  14.2857
maximum thresh
new thresh =  18.381600000000006
maximum thresh
new thresh =  13.542
maximum thresh
new thresh =  14.097
maximum thresh
new thresh =  14.3745
maximum thresh
new thresh =  14.1414
maximum thresh
new thresh