In [5]:
# This is the script for processing Aleppo2017 data into the common format.
# Author: David Buchanan
# Date: February 4th, 2020, edited June 14th, by Elizabeth Chun

import datetime
# For working with dates and times

# This study downloads as a zipped folder containing data tables and forms
# First download the entire dataset. Do not rename the downloaded folder
# Place the downloaded folder into a folder of your creation specific for this dataset
# You may name your created folder however you like
# Here we have named the created folder by first author last name and date of the original paper
# Folder that holds the downloaded study data.
# If you have a different naming method, you will need to adjust this, e.g.
# dataset = "insert_your_name"
dataset = "../data_downloads/aleppo"

# Raw CGM table, using the original file path exactly as downloaded.
# Alternatively, if file paths have been changed, simply set file = the CGM file.
file = dataset + "/Data Tables/HDeviceCGM.txt"

# The data stores each reading's date as a day offset rather than a calendar
# date, so a base date is established to work from.
basedate = datetime.date(2015, 5, 22)

# Destination of the processed, common-format CSV.
newfile = dataset + "_processed.csv"

# Separate handle names are used so that `file` keeps referring to the input
# path instead of being rebound to a (closed) file object.
with open(file) as source, open(newfile, "w") as export:
    # Write the export file's header.
    export.write('"id","time","gl"\n')

    # Discard the header row of the raw table (no-op if the file is empty).
    next(source, None)

    # Work line by line within the file.
    for raw in source:
        if not raw.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on a missing column.
            continue

        # Drop the line terminator first so the last column is clean, then
        # split on the table's delimiter.
        fields = raw.rstrip("\r\n").split("|")

        # Column 4 (DeviceDtTmDaysFromEnroll) is a day offset; convert it to a
        # timedelta so it can be added to the base date.
        offset = datetime.timedelta(days=int(fields[4]))
        thisdate = basedate + offset

        # Column 5 (DeviceTm) is the time of day of the reading.
        thistime = datetime.datetime.strptime(fields[5], "%H:%M:%S").time()

        # Combine the date and time into a single timestamp.
        thedatetime = datetime.datetime.combine(thisdate, thistime)

        # Column 9 (GlucoseValue) is written in full. Slicing it to three
        # characters would keep the newline of two-digit values and truncate
        # anything longer than three characters.
        glucose = fields[9].strip()

        # Column 2 (PtID) is the subject id; write id, timestamp and glucose.
        export.write(fields[2] + "," + str(thedatetime) + "," + glucose + "\n")

# this is a large dataset, so do not be surprised if the runtime is somewhat long

In [11]:
# import polars as pl
# my_file = "../data_downloads/aleppo_processed.csv"

# df = pl.read_csv(my_file)
import os

import pandas as pd

# Absolute location of the folder containing the study's data tables.
dataset_path = os.path.abspath("../data_downloads/aleppo/Data Tables")


def _read_table(filename):
    """Read one pipe-delimited table from the dataset folder into a DataFrame."""
    return pd.read_csv(os.path.join(dataset_path, filename), sep="|")


# Load each data table that is used below.
cgm_df = _read_table("HDeviceCGM.txt")
bgm_df = _read_table("HDeviceBGM.txt")
insulin_df = _read_table("HInsulin.txt")
medications_df = _read_table("HMedication.txt")

# Tables that are currently not loaded:
# a1c_df = _read_table("HA1c.txt")
# patient_df = _read_table("HPatient.txt")

In [12]:
# Show the column index of every loaded table, one table per print call.
# a1c_df and patient_df are left out because they are not loaded.
for frame in (cgm_df, bgm_df, insulin_df, medications_df):
    print(frame.columns)

Index(['RecID', 'ParentHDeviceUploadsID', 'PtID', 'SiteID',
       'DeviceDtTmDaysFromEnroll', 'DeviceTm', 'DexInternalDtTmDaysFromEnroll',
       'DexInternalTm', 'RecordType', 'GlucoseValue'],
      dtype='object')
Index(['RecID', 'ParentHDeviceUploadsID', 'PtID', 'SiteID',
       'DeviceDtTmDaysFromEnroll', 'DeviceTm', 'RecordType', 'RecordSubType',
       'GlucoseValue'],
      dtype='object')
Index(['RecID', 'PtID', 'SiteID', 'InsName', 'InsRoute', 'InsInjectionFreq',
       'InsTypeStart', 'InsTypeStartDtDaysFromEnroll', 'InsTypeStartUnknown',
       'InsTypeStopDtDaysFromEnroll', 'InsTypeStopUnknown',
       'InsTypeStartEstimate', 'InsTypeStopEstimate'],
      dtype='object')
Index(['RecID', 'PtID', 'SiteID', 'DrugName', 'MedDose', 'MedUnit',
       'MedDoseUnk', 'MedRoute', 'MedLocSide', 'MedFreqType', 'MedFreqNum',
       'MedFreqPer', 'MedFreqUnk', 'MedInd', 'MedicalCondition1',
       'MedicalCondition2', 'AdverseEvent1', 'AdverseEvent2',
       'PreExistingCondition1', 'Pr

In [19]:
# Dimensions of the raw CGM table as (rows, columns).
cgm_df.shape

(14950661, 10)

In [18]:
# Read the processed CSV back in and display it as the cell output.
processed_csv_path = "../data_downloads/aleppo_processed.csv"
test = pd.read_csv(processed_csv_path)
test

Unnamed: 0,id,time,gl
0,183,2015-05-16 05:35:41,162.0
1,183,2015-05-16 05:30:41,164.0
2,183,2015-05-16 05:25:41,168.0
3,183,2015-05-16 05:20:41,169.0
4,183,2015-05-16 05:15:41,170.0
...,...,...,...
14950656,293,2015-09-04 08:47:46,210.0
14950657,293,2015-09-04 08:42:46,211.0
14950658,293,2015-09-04 08:37:46,210.0
14950659,293,2015-09-04 08:32:46,207.0
