In [5]:
# This is the script for processing Aleppo2017 data into the common format.
# Author: David Buchanan
# Date: Febuary 4th, 2020, edited June 14th, by Elizabeth Chun

import datetime
# For working with dates and times

# This study downloads as a zipped folder containing data tables and forms
# First download the entire dataset. Do not rename the downloaded folder
# Place the downloaded folder into a folder of your creation specific for this dataset
# You may name your created folder however you like
# Here we have named the created folder by first author last name and date of the original paper
dataset = "../data_downloads/aleppo"
# If you have a different naming method, you will need to adjust this, eg.
# dataset = "insert_your_name"

# using the original file path exactly as downloaded
file = dataset + "/Data Tables/HDeviceCGM.txt"
# alternatively, if file paths have been changed simply set file = the CGM file

basedate = datetime.date(2015, 5, 22)
# The data has days as days since study start
# Establishing a base date to work from

newfile = dataset + "_processed.csv"
with open(file) as file:  # Open the data file
    with open(newfile, "w") as export:  # Open the file for the processed data
        isheader = True
        # Flag to mark when the header is being read

        for line in file:
            # Work line by line within the file

            if isheader:
                # Executes on the first iteration i.e. when reading the header

                isheader = False
                # All future lines will not be header lines

                export.write('"id","time","gl"\n')
                # Write the export file's header

                continue
                # Move to the next iteration without executing the rest of the code

            line = line.split("|")
            # Split the data by delimiting character so it can be worked with

            day = datetime.timedelta(days=int(line[4]))
            # obtain the "number of days since enrolled in study" field in a way useful for doing date arithmetic

            thisdate = basedate + day
            # Create the date that will get written to the file

            thistime = datetime.datetime.strptime(line[5], "%H:%M:%S").time()
            # Read the reading time as a time object

            thedatetime = datetime.datetime.combine(thisdate, thistime)
            # Combine the read date and time objects

            val = line[9]

            export.write(
                str(line[2]) + "," + str(thedatetime) + "," + line[9][0:3] + "\n"
            )
            # Write all the data to the export file in the working directory

# this is a large dataset, so do not be surprised is the runtime is somewhat long

In [11]:
# import polars as pl
# my_file = "../data_downloads/aleppo_processed.csv"

# df = pl.read_csv(my_file)
# Load each data table
import pandas as pd
import os

dataset_path = os.path.abspath("../data_downloads/aleppo/Data Tables")
cgm_df = pd.read_csv(os.path.join(dataset_path, "HDeviceCGM.txt"), sep="|")
bgm_df = pd.read_csv(os.path.join(dataset_path, "HDeviceBGM.txt"), sep="|")
# a1c_df = pd.read_csv(os.path.join(dataset_path, 'HA1c.txt'), sep='|')
insulin_df = pd.read_csv(os.path.join(dataset_path, "HInsulin.txt"), sep="|")
medications_df = pd.read_csv(os.path.join(dataset_path, "HMedication.txt"), sep="|")
# patient_df = pd.read_csv(os.path.join(dataset_path, 'HPatient.txt'), sep='|')

In [12]:
print(cgm_df.columns)
print(bgm_df.columns)
# print(a1c_df.columns)
print(insulin_df.columns)
print(medications_df.columns)
# print(patient_df.columns)

Index(['RecID', 'ParentHDeviceUploadsID', 'PtID', 'SiteID',
       'DeviceDtTmDaysFromEnroll', 'DeviceTm', 'DexInternalDtTmDaysFromEnroll',
       'DexInternalTm', 'RecordType', 'GlucoseValue'],
      dtype='object')
Index(['RecID', 'ParentHDeviceUploadsID', 'PtID', 'SiteID',
       'DeviceDtTmDaysFromEnroll', 'DeviceTm', 'RecordType', 'RecordSubType',
       'GlucoseValue'],
      dtype='object')
Index(['RecID', 'PtID', 'SiteID', 'InsName', 'InsRoute', 'InsInjectionFreq',
       'InsTypeStart', 'InsTypeStartDtDaysFromEnroll', 'InsTypeStartUnknown',
       'InsTypeStopDtDaysFromEnroll', 'InsTypeStopUnknown',
       'InsTypeStartEstimate', 'InsTypeStopEstimate'],
      dtype='object')
Index(['RecID', 'PtID', 'SiteID', 'DrugName', 'MedDose', 'MedUnit',
       'MedDoseUnk', 'MedRoute', 'MedLocSide', 'MedFreqType', 'MedFreqNum',
       'MedFreqPer', 'MedFreqUnk', 'MedInd', 'MedicalCondition1',
       'MedicalCondition2', 'AdverseEvent1', 'AdverseEvent2',
       'PreExistingCondition1', 'Pr

In [19]:
cgm_df.shape

(14950661, 10)

In [None]:
import pandas as pd
from src.data.aleppo.aleppo import AleppoDataLoader

# file_path = "../data_downloads/aleppo_processed.csv"
# aleppo = AleppoDataLoader(file_path)
test = pd.read_csv("../data_downloads/aleppo_processed.csv")
test = test.rename(columns={"id": "p_num", "gl": "bgl", "time": "datetime"})
test.to_csv("../data_downloads/aleppo_processed.csv")

In [None]:
import pandas as pd

file_path = "../data_downloads/aleppo_processed.csv"
keep_columns = ["p_num", "date", "bgl"]
aleppo = AleppoDataLoader(file_path=file_path, keep_columns=keep_columns)

# Anderson Dataset

In [4]:
# This is the script for processing Anderson2016 data into the common format.
# Author: David Buchanan
# Adapted to Python by ChatGPT
# Original Date: January 31st, 2020, edited June 13th, 2020 by Elizabeth Chun

import os
import pandas as pd

# Set the dataset name (name of the created folder)
# dataset = "../data_downloads/anderson"

# Change the working directory to the dataset folder
# os.chdir(dataset)

# Define the file path
file_path = os.path.join("Data Tables", "CGM.txt")

# Alternatively, if the file structure has been changed, place CGM.txt directly in the folder and use:
# file_path = "CGM.txt"

# Read the raw data
curr = pd.read_csv(file_path, sep="|")

# Reorder and keep only the columns we want
curr = curr.iloc[
    :, [0, 4, 3]
]  # Columns 1, 5, and 4 in R are 0, 4, and 3 in Python (0-indexed)

# Rename columns to standard format
curr.columns = ["id", "time", "gl"]

# Ensure glucose values are numeric
curr["gl"] = pd.to_numeric(curr["gl"], errors="coerce")

# Standardize date and time
curr["time"] = pd.to_datetime(curr["time"], format="%Y-%m-%d %H:%M:%S", errors="coerce")

# Define the output file name
output_file = f"./{dataset}_processed.csv"

# Check if the file exists to determine header inclusion
write_header = not os.path.exists(output_file)

# Save the cleaned data
curr.to_csv(output_file, index=False, header=write_header, mode="a")

# Note: 'DisplayTime' is used because it is user-configurable.
# "The time displayed to the user on the receiver or phone. This time is assumed to be user-configurable."
# Source: https://developer.dexcom.com/glossary

OSError: Cannot save file into a non-existent directory: '..\data_downloads'

In [5]:
curr.to_csv("./test.csv", index=False, header=write_header, mode="a")

In [15]:
ketones_df = pd.read_csv(
    os.path.join(os.getcwd(), "Data Tables", "Ketone.txt"), sep="|"
)
ketones_df = ketones_df.rename(columns={"DeidentID": "id", "DataDtTm": "time"})[
    ["id", "time", "Ketone", "Units", "DeviceModel"]
]

In [17]:
curr

Unnamed: 0,id,time,gl
0,1,2013-09-29 17:36:48,194
1,1,2013-09-29 17:41:48,204
2,1,2013-09-29 17:46:48,201
3,1,2013-09-29 17:51:48,204
4,1,2013-09-29 17:56:48,200
...,...,...,...
1302631,20,2014-06-01 12:53:08,144
1302632,20,2014-06-01 12:58:09,143
1302633,20,2014-06-01 13:03:08,143
1302634,20,2014-06-01 13:08:08,140


In [16]:
ketones_df

Unnamed: 0,id,time,Ketone,Units,DeviceModel
0,1,2013-11-17 17:34:00,2.4,US,Abbott BG Meter
1,1,2013-11-14 19:31:00,0.2,US,Abbott BG Meter
2,1,2013-11-14 18:38:00,0.2,US,Abbott BG Meter
3,1,2013-11-04 12:43:00,4.1,US,Abbott BG Meter
4,1,2013-11-04 12:42:00,2.3,US,Abbott BG Meter
...,...,...,...,...,...
697,8,2014-12-19 18:09:00,0.7,US,Abbott BG Meter
698,8,2014-11-14 18:59:00,4.6,US,Abbott BG Meter
699,8,2014-11-14 18:58:00,0.7,US,Abbott BG Meter
700,10,2013-01-14 17:24:00,4.5,US,Abbott BG Meter
