# IOBP2 Dataset


Source: [Jaeb diabetes datasets](https://public.jaeb.org/datasets/diabetes) - IOBP2 RCT Public Dataset.zip

In [None]:
import pandas as pd
import numpy as np
import os
import json
import datetime

# Directory holding the raw IOBP2 tables, and the directory that receives
# the generated per-subject JSON files.
data_source = '../../data_raw/IOBP2_Public/Data Tables'
output_dir = '../../diax/IOBP2/'

# exist_ok=True makes this a no-op when the directory already exists and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# All tables are pipe-delimited text files.
dset_all = pd.read_csv(f'{data_source}/IOBP2DeviceiLet.txt', sep='|')

insulin_all = pd.read_csv(f'{data_source}/IOBP2Insulin.txt', sep='|')
# Keep pump insulin only.
# subjects [445, 315, 366] have data in iLet but no InsRoute=Pump
# .copy() detaches the filtered rows from the original frame so the column
# assignments below act on an independent DataFrame (no SettingWithCopyWarning).
insulin_all = insulin_all[insulin_all['InsRoute'] == 'Pump'].copy()
# errors='coerce' turns unparseable or missing dates into NaT instead of raising.
insulin_all['InsTypeStartDt'] = pd.to_datetime(insulin_all['InsTypeStartDt'], errors='coerce')
insulin_all['InsTypeStopDt'] = pd.to_datetime(insulin_all['InsTypeStopDt'], errors='coerce')

height_weight = pd.read_csv(f'{data_source}/IOBP2HeightWeight.txt', sep='|')

smbg = pd.read_csv(f'{data_source}/IOBP2DeviceBGM.txt', sep='|')

In [None]:
# Every participant ID that occurs in the iLet device table, once each.
pt_ids = {*dset_all['PtID'].unique()}
# List form of the same IDs, used for indexing and iteration.
subjects = [*pt_ids]

In [None]:
def parse_mixed(x):
    """Parse a device timestamp that may or may not carry a time part.

    Values recorded at exactly midnight contain only the date, so the full
    12-hour-clock layout is tried first and the date-only layout second.

    Returns the parsed timestamp, or ``pd.NaT`` when neither layout matches.
    """
    for layout in ("%m/%d/%Y %I:%M:%S %p", "%m/%d/%Y"):
        try:
            parsed = pd.to_datetime(x, format=layout)
        except (ValueError, TypeError):
            # Wrong layout for this value; fall through to the next one.
            pass
        else:
            return parsed
    return pd.NaT

In [None]:
def parse_subject(dset, insulin, smbg, hw, subject_id, output_file):
    """Convert one subject's IOBP2 tables into a single JSON file.

    Args:
        dset: iLet device rows for this subject. Reads ``DeviceDtTm``,
            ``CGMVal``, ``BasalDelivPrev``, ``BolusDelivPrev``, ``MealBolus``
            and ``MealSize``. The columns ``time``, ``time_delta``,
            ``BasalDelivNow`` and ``BolusDelivNow`` are added in place, so
            pass a copy if the caller's frame must stay untouched.
        insulin: Insulin rows for this subject. Reads ``InsulinName`` and
            ``InsTypeStartDt`` (must already be a datetime column).
        smbg: Blood-glucose-meter rows for this subject. Reads ``DeviceDtTm``
            and ``BGMVal``; a ``time`` column is added in place.
        hw: Height/weight rows for this subject. Reads ``Weight``,
            ``WeightUnits``, ``WeightAssessDt``, ``Height``, ``HeightUnits``
            and ``HeightAssessDt``.
        subject_id: Identifier written under ``unique_id``.
        output_file: Path of the JSON file to create (overwritten if present).

    Raises:
        ValueError: If neither the device nor the SMBG data contains a
            parseable timestamp.
    """
    # get datetime
    dset.loc[:, 'time'] = dset['DeviceDtTm'].apply(parse_mixed)
    smbg.loc[:, 'time'] = smbg['DeviceDtTm'].apply(parse_mixed)

    # get time delta (used for computing basal rate in u/hr)
    dset.loc[:, 'time_delta'] = dset.loc[:, 'time'].diff()
    dset.iloc[0, dset.columns.get_loc('time_delta')] = pd.Timedelta(minutes=5)  # first row has no previous time

    # Compute current delivery from previous delivery: the "Prev" value stored
    # on the next row is moved up to the current row.
    dset.loc[:, 'BasalDelivNow'] = dset.loc[:, 'BasalDelivPrev'].shift(-1)
    dset.loc[:, 'BolusDelivNow'] = dset.loc[:, 'BolusDelivPrev'].shift(-1)
    # The last row has no successor; treat its delivery as 0.
    dset.loc[:, 'BasalDelivNow'] = dset.loc[:, 'BasalDelivNow'].fillna(0)
    dset.loc[:, 'BolusDelivNow'] = dset.loc[:, 'BolusDelivNow'].fillna(0)

    # Extract relevant columns
    cgm_data = dset[['time', 'CGMVal']]

    basal_data = dset[['time', 'BasalDelivNow', 'time_delta']].copy()
    # NOTE(review): BasalDelivNow comes from the *next* row while time_delta is
    # the gap to the *previous* row; the two only describe the same interval
    # when sampling is uniform -- confirm this is intended.
    basal_data.loc[:, 'BasalRate'] = basal_data['BasalDelivNow'] / basal_data['time_delta'].dt.total_seconds() * 3600  # convert to units per hour

    bolus_data = dset[['time', 'BolusDelivNow', 'MealBolus']].copy()
    bolus_data.loc[:, 'BolusAmount'] = bolus_data['BolusDelivNow'] + bolus_data['MealBolus']  # total bolus amount
    bolus_data = bolus_data[bolus_data['BolusAmount'] > 0]  # only keep rows with bolus delivery

    carb_data = dset[['time', 'MealBolus', 'MealSize']]
    carb_data = carb_data[carb_data['MealBolus'] > 0]  # only keep rows with meal bolus

    smbg_data = smbg[['time', 'BGMVal']]

    # Earliest timestamp over all series; used as the fallback start date for
    # insulin entries whose own start date is missing.
    start_times = [
        cgm_data["time"].min(),
        basal_data["time"].min(),
        bolus_data["time"].min(),
        smbg_data["time"].min(),
        carb_data["time"].min(),
    ]
    # Filter out any NaT values before taking the min
    start_times = [t for t in start_times if pd.notna(t)]
    if not start_times:
        raise ValueError(
            f"Subject {subject_id}: no parseable timestamps in device or SMBG data"
        )
    start_time = min(start_times)

    def normalize(df, col):
        """Return a copy of *df* with *col* formatted as text and sorted by it."""
        if len(df) == 0:
            return df  # return empty DataFrame if no data

        df = df.copy()
        # Replace the column instead of assigning through .loc[:, col]: an
        # in-place .loc write may coerce the strings back to datetime64 on
        # recent pandas versions, leaving the formatting to json's default=str.
        df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S')
        return df.sort_values(col).reset_index(drop=True)

    cgm_data = normalize(cgm_data, "time")
    basal_data = normalize(basal_data, "time")
    bolus_data = normalize(bolus_data, "time")
    carb_data = normalize(carb_data, "time")
    smbg_data = normalize(smbg_data, "time")

    # Insulin description: a plain name when only one insulin was used,
    # otherwise parallel lists of start dates and names.
    ins_string = 'UNKNOWN'
    if len(insulin) > 1:  # multiple insulin entries
        start_date = start_time.strftime('%Y-%m-%d')
        dates = insulin['InsTypeStartDt'].dt.strftime('%Y-%m-%d').fillna(start_date).tolist()
        ins = insulin['InsulinName'].tolist()

        # Keep an entry only when its insulin differs from the entry right
        # before it, i.e. collapse consecutive repeats of the same insulin.
        idx_keep = [
            i for i, name in enumerate(ins)
            if i == 0 or name != ins[i - 1]
        ]
        dates = [dates[i] for i in idx_keep]
        ins = [ins[i] for i in idx_keep]

        if len(ins) == 1:
            ins_string = ins[0]
        else:
            ins_string = {'date': dates, 'insulin': ins}
    elif len(insulin) == 1:
        ins_string = insulin['InsulinName'].iloc[0]

    # Get height and weight from phys exam data, converted to kg / cm
    weight_data = {'value': [], 'date': []}
    height_data = {'value': [], 'date': []}
    for _, row in hw.iterrows():
        if pd.notna(row['Weight']):
            w = row['Weight']
            unit = row['WeightUnits']
            if unit in ('lbs', 'pounds', 'lb'):
                w = w * 0.453592  # convert to kg
            elif unit not in ('kg', 'kilograms', 'kg.'):
                # Unrecognized unit: value is stored unchanged.
                print(f"Unknown weight units: {unit}, assuming kg.")
            weight_data['value'].append(w)
            weight_data['date'].append(row['WeightAssessDt'])
        if pd.notna(row['Height']):
            h = row['Height']
            unit = row['HeightUnits']
            if unit in ('inches', 'in', 'inch'):
                h = h * 2.54  # convert to cm
            elif unit not in ('cm', 'centimeters', 'cm.'):
                # Unrecognized unit: value is stored unchanged.
                print(f"Unknown height units: {unit}, assuming cm.")
            height_data['value'].append(h)
            height_data['date'].append(row['HeightAssessDt'])

    output = {
        "metadata": {
            "unique_id": "id number of the subject",
            "time": {
                "unit": "Y-m-d H:M:S",
                "description": "Timestamps for each measurement, assumed to be in local time",
            },
            "cgm": {
                "unit": "mg/dL",
                "description": "Continuous Glucose Monitor readings",
                "device": "UNKNOWN",
                "precision": 1,
            },
            "basal_rate": {
                "unit": "U/hr",
                "description": "The rate of insulin delivery from the pump",
                "device": "iLet pump",
                "insulin": ins_string,
            },
            "bolus": {
                "unit": "U",
                "description": "The amount of insulin delivered in a bolus, meal and correction, in units",
                "device": "iLet pump",
                "insulin": ins_string,
            },
            "carb_category": {
                "unit": "String",
                "description": "User announced carbohydrate intake category associated with bolus. Options are: Less, Typical, or More",
            },
            "smbg": {
                "unit": "mg/dL",
                "description": "Self-Monitoring Blood Glucose readings",
                "device": "Unknown",
                "precision": 1,
            },
            "height": {
                "unit": "cm",
                "description": "Height of the subject on the specific date",
            },
            "weight": {
                "unit": "kg",
                "description": "Weight of the subject on the specific date",
            },
        },
        "unique_id": subject_id,
        "cgm": {
            "time": cgm_data["time"].tolist(),
            "value": cgm_data["CGMVal"].tolist(),
        },
        "basal_rate": {
            "time": basal_data["time"].tolist(),
            "value": basal_data["BasalRate"].tolist(),
        },
        "bolus": {
            "time": bolus_data["time"].tolist(),
            "value": bolus_data["BolusAmount"].tolist(),
        },
        "carb_category": {
            "time": carb_data["time"].tolist(),
            "value": carb_data["MealSize"].tolist(),
        },
        "smbg": {
            "time": smbg_data["time"].tolist(),
            "value": smbg_data["BGMVal"].tolist(),
        },
        "height": height_data,
        "weight": weight_data,
    }

    # default=str stringifies any value json cannot encode natively.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, default=str)

In [None]:
# Select a single subject for trying out the conversion interactively.
subject_id = subjects[0]

# .copy() gives independent frames, so adding columns to them later does not
# trigger chained-assignment warnings.
dset = dset_all[dset_all['PtID'] == subject_id].copy()
insulin = insulin_all[insulin_all['PtID'] == subject_id].copy()
# Bind the per-subject readings to their own name: rebinding `smbg` here would
# replace the full SMBG table with this one subject's rows for all later cells.
smbg_subject = smbg[smbg['PtID'] == subject_id].copy()
hw = height_weight[height_weight['PtID'] == subject_id].copy()

In [None]:
import tqdm

# Write one JSON file per subject.
for subject_id in tqdm.tqdm(subjects):
    dset = dset_all[dset_all['PtID'] == subject_id].copy()
    insulin = insulin_all[insulin_all['PtID'] == subject_id].copy()
    # Keep the full `smbg` table intact. Reassigning `smbg` to the filtered
    # rows would leave only the first subject's readings after one iteration,
    # so every following subject would be exported with empty SMBG data.
    subject_smbg = smbg[smbg['PtID'] == subject_id].copy()
    hw = height_weight[height_weight['PtID'] == subject_id].copy()

    # os.path.join avoids the doubled separator when output_dir ends in '/'.
    output_file = os.path.join(output_dir, f'IOBP2_subject_{subject_id}.json')
    parse_subject(dset, insulin, subject_smbg, hw, subject_id, output_file)