# PEDAP
Process the PEDAP dataset into one JSON file per subject.

Source: [Jaeb diabetes datasets](https://public.jaeb.org/datasets/diabetes) - PEDAP Public Dataset - Release 5 - 2025-05-12.zip

In [None]:
import pandas as pd
import numpy as np
import os
import json
import datetime

# Location of the raw PEDAP release tables and the folder that receives
# the per-subject JSON files (both relative to this notebook).
data_source = '../../data_raw/Pedap_public/Data Files'
output_dir = '../../diax/PEDAP/'

# exist_ok=True creates the folder (and any missing parents) without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Pipe-delimited device exports, one row per event, keyed by 'PtID'.
basal_all = pd.read_csv(f'{data_source}/PEDAPTandemBASALDELIVERY.txt', sep='|')
bolus_all = pd.read_csv(f'{data_source}/PEDAPTandemBolusDelivered.txt', sep='|')  # also has manual carb announcements
cgm_all = pd.read_csv(f'{data_source}/PEDAPTandemCGMDATAGXB.txt', sep='|')
phys_all = pd.read_csv(f'{data_source}/PEDAPDiabPhysExam.txt', sep='|')

# Only pump-delivered insulin is relevant here. The explicit .copy() makes
# the filtered table an independent frame, so the column assignments below
# are not chained assignments on a slice (no SettingWithCopyWarning).
insulin_all = pd.read_csv(f'{data_source}/PEDAPInsulin.txt', sep='|')
insulin_all = insulin_all[insulin_all['InsRoute'] == 'Pump'].copy()
# errors='coerce' turns unparseable or missing dates into NaT instead of raising.
insulin_all['InsTypeStartDt'] = pd.to_datetime(insulin_all['InsTypeStartDt'], errors='coerce')
insulin_all['InsTypeStopDt'] = pd.to_datetime(insulin_all['InsTypeStopDt'], errors='coerce')

# pd.read_csv(f'{data_source}/PEDAPTandemUSERMODECHANGE.txt', sep='|')  # has info about exercise mode, but for now not used

In [None]:
# A subject is usable only when it shows up in the CGM, basal and bolus
# tables alike, so keep the IDs common to all three.
pt_ids = set(cgm_all['PtID']) & set(basal_all['PtID']) & set(bolus_all['PtID'])
print(f"Found {len(pt_ids)} subjects with all required datasets.")
subjects = [*pt_ids]

In [None]:
# Pull out the rows of a single subject (the first one in the list) from
# every table; each slice is copied so it is independent of the full tables.
subject_id = subjects[0]
cgm = cgm_all.loc[cgm_all['PtID'].eq(subject_id)].copy()
basal = basal_all.loc[basal_all['PtID'].eq(subject_id)].copy()
bolus = bolus_all.loc[bolus_all['PtID'].eq(subject_id)].copy()
phys = phys_all.loc[phys_all['PtID'].eq(subject_id)].copy()
insulin = insulin_all.loc[insulin_all['PtID'].eq(subject_id)].copy()

In [None]:
def parse_mixed(x):
    """Parse one device timestamp string, returning ``pd.NaT`` on failure.

    Entries recorded at exactly midnight contain only the date (no time
    part), so the full 12-hour format is tried first and the date-only
    format second.
    """
    try:
        return pd.to_datetime(x, format="%m/%d/%Y %I:%M:%S %p")
    except (ValueError, TypeError):
        pass  # not a full timestamp; fall through to the date-only form

    try:
        return pd.to_datetime(x, format="%m/%d/%Y")
    except (ValueError, TypeError):
        return pd.NaT

In [None]:
def _parse_device_times(raw):
    """Parse a column of ``DeviceDtTm`` strings into timestamps in one pass.

    Vectorised equivalent of applying ``parse_mixed`` row by row: entries
    logged at exactly midnight carry only the date, so the date-only format
    fills whatever the full 12-hour format could not parse. Values matching
    neither format become ``NaT``.
    """
    with_clock = pd.to_datetime(raw, format="%m/%d/%Y %I:%M:%S %p", errors="coerce")
    date_only = pd.to_datetime(raw, format="%m/%d/%Y", errors="coerce")
    return with_clock.fillna(date_only)


def _prepare_events(df):
    """Return a time-sorted copy of *df* with a parsed ``time`` column.

    Rows whose ``DeviceDtTm`` cannot be parsed are dropped. The input frame
    is left untouched.
    """
    parsed = df.assign(time=_parse_device_times(df['DeviceDtTm']))
    parsed = parsed.dropna(subset=['time'])
    return parsed.sort_values('time', kind='stable').reset_index(drop=True)


def _time_strings(df):
    """Format the ``time`` column of *df* as a list of 'Y-m-d H:M:S' strings."""
    return df['time'].dt.strftime('%Y-%m-%d %H:%M:%S').tolist()


def _describe_insulin(insulin, start_time):
    """Summarise the insulin entries of one subject for the metadata block.

    Returns ``'UNKNOWN'`` without entries, the insulin name when there is
    effectively one insulin, otherwise ``{'date': [...], 'insulin': [...]}``
    with one element per change of insulin.
    """
    if len(insulin) == 0:
        return 'UNKNOWN'

    names = insulin['InsulinName'].tolist()
    if len(names) == 1:
        return names[0]

    # A missing start date is replaced by the first day of recorded data.
    fallback = start_time.strftime('%Y-%m-%d') if pd.notna(start_time) else 'UNKNOWN'
    dates = insulin['InsTypeStartDt'].dt.strftime('%Y-%m-%d').fillna(fallback).tolist()

    # Skip an entry when it repeats the insulin of the entry right before it.
    kept = [
        (date, name)
        for pos, (date, name) in enumerate(zip(dates, names))
        if pos == 0 or name != names[pos - 1]
    ]
    if len(kept) == 1:
        return kept[0][1]
    return {'date': [date for date, _ in kept], 'insulin': [name for _, name in kept]}


def _exam_value_metric(phys, value_col, units_col, label, imperial, factor, metric, metric_name):
    """Return the first exam value of *value_col* converted to metric units.

    Returns ``nan`` when *phys* has no rows. Unrecognised units are reported
    on stdout and the value is returned unchanged.
    """
    if phys.empty:
        return np.nan

    value = phys[value_col].values[0]
    units = phys[units_col].values[0]
    units = units.lower().strip() if pd.notna(units) else ""

    if units in imperial:
        return value * factor
    if units not in metric:
        print(f"Unknown {label} units: {units}, assuming {metric_name}.")
    return value


def parse_subject(cgm, basal, bolus, insulin, phys, subject_id, output_file):
    """Convert the tables of one subject into a JSON file.

    Args:
        cgm: CGM rows with 'DeviceDtTm' and 'CGMValue' columns.
        basal: Basal rows with 'DeviceDtTm' and 'BasalRate' columns.
        bolus: Bolus rows with 'DeviceDtTm', 'BolusAmount' and 'CarbAmount'
            columns; rows with 'CarbAmount' > 0 also form the carbs series.
        insulin: Insulin rows with 'InsulinName' and a datetime
            'InsTypeStartDt' column.
        phys: Physical exam rows with 'Weight', 'WeightUnits', 'Height' and
            'HeightUnits' columns; may be empty.
        subject_id: Identifier written to the 'unique_id' field.
        output_file: Path of the JSON file to write.

    The input frames are not modified. Every series in the output is sorted
    by time and rows with an unparseable timestamp are dropped.
    """
    cgm_data = _prepare_events(cgm)
    basal_data = _prepare_events(basal)
    bolus_data = _prepare_events(bolus)

    # Taken from the already sorted bolus table so that the carb
    # announcements line up chronologically with the bolus series.
    meal_data = bolus_data[bolus_data['CarbAmount'] > 0]

    # Earliest timestamp over all series; NaT (from an empty series) is
    # excluded because min() over a list containing NaT is order-dependent.
    starts = [df['time'].min() for df in (cgm_data, basal_data, bolus_data)]
    starts = [t for t in starts if pd.notna(t)]
    start_time = min(starts) if starts else pd.NaT

    ins_string = _describe_insulin(insulin, start_time)

    # NOTE(review): weight and height are normalised to kg / cm here but are
    # not written to the output yet; the calls are kept so that unknown
    # units are still reported.
    weight = _exam_value_metric(
        phys, 'Weight', 'WeightUnits', 'weight',
        imperial=('lbs', 'pounds', 'lb'), factor=0.453592,
        metric=('kg', 'kilograms', 'kg.'), metric_name='kg',
    )
    height = _exam_value_metric(
        phys, 'Height', 'HeightUnits', 'height',
        imperial=('in', 'inches', 'inch'), factor=2.54,
        metric=('cm', 'centimeters', 'cms'), metric_name='cm',
    )

    output = {
        "metadata": {
            "unique_id": "id number of the subject",
            "time": {
                "unit": "Y-m-d H:M:S",
                "description": "Timestamps for each measurement, assumed to be in local time",
            },
            "cgm": {
                "unit": "mg/dL",
                "description": "Continuous Glucose Monitor readings",
                "device": "UNKNOWN",
                "precision": 1,
            },
            "basal_rate": {
                "unit": "U/hr",
                "description": "The rate of insulin delivery from the pump",
                "device": "Tandem pump",
                "insulin": ins_string,
            },
            "bolus": {
                "unit": "U",
                "description": "The amount of insulin delivered in a bolus, meal and correction, in units",
                "device": "Tandem pump",
                "insulin": ins_string,
            },
            "carbs": {
                "unit": "grams",
                "description": "User announced carbohydrate intake associated with bolus",
            },
        },
        "unique_id": subject_id,
        "cgm": {
            "time": _time_strings(cgm_data),
            "value": cgm_data["CGMValue"].tolist(),
        },
        "basal_rate": {
            "time": _time_strings(basal_data),
            "value": basal_data["BasalRate"].tolist(),
        },
        "bolus": {
            "time": _time_strings(bolus_data),
            "value": bolus_data["BolusAmount"].tolist(),
        },
        "carbs": {
            "time": _time_strings(meal_data),
            "value": meal_data["CarbAmount"].tolist(),
        },
    }

    with open(output_file, "w") as f:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(output, f, indent=2, default=str)

In [None]:
import tqdm

# Export every subject: slice each source table down to the current
# subject, then hand the slices to parse_subject, which writes the JSON.
for subject_id in tqdm.tqdm(subjects):
    cgm, basal, bolus, insulin, phys = (
        table[table['PtID'] == subject_id].copy()
        for table in (cgm_all, basal_all, bolus_all, insulin_all, phys_all)
    )

    output_file = f'{output_dir}/PEDAP_subject_{subject_id}.json'
    parse_subject(cgm, basal, bolus, insulin, phys, subject_id, output_file)
    