# DCLP3
Convert the DCLP3 dataset into one JSON file per subject (CGM, basal rate, bolus, SMBG, height and weight).

Source: [Jaeb diabetes datasets](https://public.jaeb.org/datasets/diabetes) - DCLP3 Public Dataset - Release 3 - 2022-08-04.zip

In [None]:
import pandas as pd
import numpy as np
import os
import json
import datetime

# Folder holding the extracted DCLP3 "Data Files" and the folder the
# per-subject JSON files are written to.
data_source = '../../data_raw/DCLP3_public/Data Files'
output_dir = '../../diax/DCLP3/'

# exist_ok=True replaces the check-then-create pair: it is a no-op when the
# folder is already there and cannot race with another process creating it.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Pipe-delimited study tables. The three device exports are read with the
# default encoding; the remaining tables are read as UTF-16.
cgm_all = pd.read_csv(f'{data_source}/cgm.txt', sep='|')
basal_all = pd.read_csv(f'{data_source}/Pump_BasalRateChange.txt', sep='|')
bolus_all = pd.read_csv(f'{data_source}/Pump_BolusDelivered.txt', sep='|')
smbg_all = pd.read_csv(f'{data_source}/RocheMeter_a.txt', sep='|', encoding='utf-16')
phys_all = pd.read_csv(f'{data_source}/DiabPhysExam_a.txt', sep='|', encoding='utf-16')

insulin_all = pd.read_csv(f'{data_source}/Insulin_a.txt', sep='|', encoding='utf-16')

# Keep only insulin delivered by pump. The explicit copy makes the filtered
# frame independent of the one read from disk, so the column assignments
# below are not chained assignments on a slice (SettingWithCopyWarning).
insulin_all = insulin_all[insulin_all['InsRoute'] == 'Pump'].copy()
# Unparseable or missing dates become NaT instead of raising.
insulin_all['InsTypeStartDt'] = pd.to_datetime(insulin_all['InsTypeStartDt'], errors='coerce')
insulin_all['InsTypeStopDt'] = pd.to_datetime(insulin_all['InsTypeStopDt'], errors='coerce')

In [None]:
# A subject is usable only when it has rows in the CGM, basal and bolus tables.
pt_ids = set(cgm_all['PtID']) & set(basal_all['PtID']) & set(bolus_all['PtID'])
subjects = [*pt_ids]
print(f"Found {len(pt_ids)} subjects with all required datasets.")

In [None]:
# Timestamp / date layouts used for everything written to the JSON file.
_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
_DATE_FORMAT = '%Y-%m-%d'

# Recognised unit spellings (lower-cased, stripped) -> factor to the output unit.
_WEIGHT_TO_KG = {
    'lbs': 0.453592, 'pounds': 0.453592, 'lb': 0.453592,
    'kg': 1.0, 'kilograms': 1.0, 'kg.': 1.0,
}
_HEIGHT_TO_CM = {
    'in': 2.54, 'inches': 2.54, 'inch': 2.54,
    'cm': 1.0, 'centimeters': 1.0, 'cms': 1.0,
}


def _with_parsed_time(df, fmt):
    """Return a copy of ``df`` with a datetime ``time`` column, oldest row first."""
    df = df.copy()
    df["time"] = pd.to_datetime(df["DataDtTm"], format=fmt)
    # Sort on the real timestamps; the stable sort keeps rows that share a
    # timestamp in their original order.
    return df.sort_values("time", kind="stable").reset_index(drop=True)


def _insulin_metadata(insulin, start_time):
    """Summarise the insulin rows as 'UNKNOWN', a single value, or a dated list."""
    # tolist() yields native Python values on every path, so json never has
    # to fall back to ``default=str`` for a numpy scalar.
    names = insulin['ParentInsulinListID'].tolist()
    if not names:
        return 'UNKNOWN'
    if len(names) == 1:
        return names[0]

    # Entries without a start date are dated to the first recorded measurement.
    fallback = start_time.strftime(_DATE_FORMAT) if start_time is not None else None
    formatted = insulin['InsTypeStartDt'].dt.strftime(_DATE_FORMAT).tolist()
    dates = [d if pd.notna(d) else fallback for d in formatted]

    # Drop an entry when it repeats the insulin of the row just before it.
    keep = [i for i, name in enumerate(names) if i == 0 or name != names[i - 1]]
    if len(keep) == 1:
        return names[keep[0]]
    return {'date': [dates[i] for i in keep], 'insulin': [names[i] for i in keep]}


def _first_measurement(phys, value_col, units_col, label, target_unit, factors):
    """Return the first exam value converted to ``target_unit`` as a float.

    NaN is returned when there is no exam row or the value is not numeric,
    which is how a blank cell in the source table is written as well.
    """
    if phys.empty:
        return float('nan')
    try:
        # float() also turns numpy integers into plain floats; left as numpy
        # ints they would be written as strings by ``default=str``.
        value = float(phys[value_col].iloc[0])
    except (TypeError, ValueError):
        value = float('nan')

    raw_units = phys[units_col].iloc[0]
    # A blank units cell is read as NaN, which has no string methods.
    units = raw_units.lower().strip() if isinstance(raw_units, str) else raw_units
    factor = factors.get(units)
    if factor is None:
        print(f"Unknown {label} units: {units}, assuming {target_unit}.")
        return value
    return value * factor


def parse_subject(cgm, basal, bolus, insulin, smbg, phys, subject_id, output_file):
    """Write one subject's records to ``output_file`` as JSON.

    Args:
        cgm: CGM rows with ``DataDtTm`` (``%d%b%y:%H:%M:%S``) and ``CGM``.
        basal: Basal rows with ``DataDtTm`` (``%Y-%m-%d %H:%M:%S``) and
            ``CommandedBasalRate``.
        bolus: Bolus rows with ``DataDtTm`` and ``BolusAmount``.
        insulin: Insulin rows with ``ParentInsulinListID`` and a datetime
            ``InsTypeStartDt`` column; may be empty.
        smbg: Meter rows with ``DataDtTm`` and ``BG``; may be empty.
        phys: Physical-exam rows with ``Weight``, ``WeightUnits``, ``Height``
            and ``HeightUnits``; only the first row is used, may be empty.
        subject_id: Identifier stored under ``unique_id``.
        output_file: Path of the JSON file to create or overwrite.

    The input frames are left unmodified. Every series is written in
    chronological order with timestamps formatted as ``%Y-%m-%d %H:%M:%S``.
    Height and weight are stamped with the earliest timestamp found in any
    of the four series.
    """
    cgm_data = _with_parsed_time(cgm, "%d%b%y:%H:%M:%S")
    basal_data = _with_parsed_time(basal, _TIME_FORMAT)
    bolus_data = _with_parsed_time(bolus, _TIME_FORMAT)
    smbg_data = _with_parsed_time(smbg, _TIME_FORMAT)
    series = (cgm_data, basal_data, bolus_data, smbg_data)

    # An empty table has NaT as its minimum; NaT compares False with
    # everything, so it must be removed before taking the overall minimum.
    first_seen = [frame["time"].min() for frame in series]
    first_seen = [t for t in first_seen if pd.notna(t)]
    start_time = min(first_seen) if first_seen else None
    start_label = start_time.strftime(_TIME_FORMAT) if start_time is not None else None

    # Replace the datetime column with its text form only after sorting.
    for frame in series:
        frame["time"] = frame["time"].dt.strftime(_TIME_FORMAT)

    ins_string = _insulin_metadata(insulin, start_time)
    weight = _first_measurement(phys, 'Weight', 'WeightUnits', 'weight', 'kg', _WEIGHT_TO_KG)
    height = _first_measurement(phys, 'Height', 'HeightUnits', 'height', 'cm', _HEIGHT_TO_CM)

    output = {
        "metadata": {
            "unique_id": "id number of the subject",
            "time": {
                "unit": "Y-m-d H:M:S",
                "description": "Timestamps for each measurement, assumed to be in local time",
            },
            "cgm": {
                "unit": "mg/dL",
                "description": "Continuous Glucose Monitor readings",
                "device": "UNKNOWN",
                "precision": 1,
            },
            "basal_rate": {
                "unit": "U/hr",
                "description": "The rate of insulin delivery from the pump",
                "device": "Tandem pump",
                "insulin": ins_string,
            },
            "bolus": {
                "unit": "U",
                "description": "The amount of insulin delivered in a bolus, meal and correction, in units",
                "device": "Tandem pump",
                "insulin": ins_string,
            },
            "smbg": {
                "unit": "mg/dL",
                "description": "Self-Monitored Blood Glucose readings",
                "device": "Roche Meter",
                "precision": 1,
            },
            "height": {
                "unit": "cm",
                "description": "Height of the subject at the start of the study",
            },
            "weight": {
                "unit": "kg",
                "description": "Weight of the subject at the start of the study",
            },
        },
        "unique_id": subject_id,
        "height": {"time": start_label, "value": height},
        "weight": {"time": start_label, "value": weight},
        "cgm": {
            "time": cgm_data["time"].tolist(),
            "value": cgm_data["CGM"].tolist(),
        },
        "basal_rate": {
            "time": basal_data["time"].tolist(),
            "value": basal_data["CommandedBasalRate"].tolist(),
        },
        "bolus": {
            "time": bolus_data["time"].tolist(),
            "value": bolus_data["BolusAmount"].tolist(),
        },
        "smbg": {
            "time": smbg_data["time"].tolist(),
            "value": smbg_data["BG"].tolist(),
        },
    }

    with open(output_file, "w", encoding="utf-8") as f:
        # default=str is kept as a last-resort fallback for any value json
        # cannot encode natively.
        json.dump(output, f, indent=2, default=str)

In [None]:
import tqdm

# Export every selected subject: slice each study table down to that
# subject's rows (as independent copies) and hand them to parse_subject.
for subject_id in tqdm.tqdm(subjects):
    cgm, basal, bolus, insulin, smbg, phys = (
        table[table['PtID'] == subject_id].copy()
        for table in (cgm_all, basal_all, bolus_all, insulin_all, smbg_all, phys_all)
    )

    output_file = f'{output_dir}/DCLP3_subject_{subject_id}.json'
    parse_subject(cgm, basal, bolus, insulin, smbg, phys, subject_id, output_file)
    