In [1]:
import pandas as pd
import numpy as np
import xml.sax

In [2]:
#creating ContentHandler class to parse xml file
class CGMHandler(xml.sax.ContentHandler):    
    label= ""
    gl = { "time" : [] , "value" : []}
    bas = { "time" : [] , "value" :[]}
    bol = { "time" : [], "doze" : [], "carb" : []}
    meal = { "type" : [], "time" : [], "carb" : []}
    bs = { "beg" : [], "end" : [], "q" : []}
    slp = { "beg" : [], "end" : [], "q" : []}
    wrk = { "beg" : [], "end" : [], "int" : []} 
    ex  = { "time" : [], "dur" : [], "int" : []}
    
    def __init__(self):
        self.CurrentData = ""
       
    def startElement(self, tag, attributes):
        if tag != "event": 
            self.label = tag
        self.CurrentData = tag
        if tag == "event":
            if self.label == "glucose_level":
                ts = attributes["ts"]
                val =  attributes["value"]
                self.gl["time"].append(ts)
                self.gl["value"].append(val)
                
            
            elif self.label == "basal":  
                ts = attributes["ts"]
                val =  attributes["value"]
                self.bas["time"].append(ts)
                self.bas["value"].append(val)
                
            elif self.label == "bolus":  
                ts = attributes["ts_begin"]
                doze =  attributes["dose"]
                car = attributes["bwz_carb_input"]
                self.bol["time"].append(ts)
                self.bol["doze"].append(doze)
                self.bol["carb"].append(car)
            
            elif self.label == "meal":  
                ts = attributes["ts"]
                typ =  attributes["type"]
                car = attributes["carbs"]
                self.meal["time"].append(ts)
                self.meal["type"].append(typ)
                self.meal["carb"].append(car)
            
            elif self.label == "sleep":  
                b = attributes["ts_begin"]
                e =  attributes["ts_end"]
                q = attributes["quality"]
                self.slp["beg"].append(b)
                self.slp["end"].append(e)
                self.slp["q"].append(q)
                
            elif self.label == "work":  
                b = attributes["ts_begin"]
                e =  attributes["ts_end"]
                i = attributes["intensity"]
                self.wrk["beg"].append(b)
                self.wrk["end"].append(e)
                self.wrk["int"].append(i)
            
            elif self.label == "exercise":  
                ts = attributes["ts"]
                d =  attributes["duration"]
                i = attributes["intensity"]
                self.ex["time"].append(ts)
                self.ex["dur"].append(d)
                self.ex["int"].append(i)
            
            elif self.label == "basis_sleep":  
                b = attributes["tbegin"]
                e =  attributes["tend"]
                q = attributes["quality"]
                self.bs["beg"].append(b)
                self.bs["end"].append(e)
                self.bs["q"].append(q)
            

In [3]:
#parsing the input xml file for one patient's record
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
Handler = CGMHandler()
parser.setContentHandler(Handler)   
parser.parse("/Users/shreyaananth/Desktop/College/CIP/Code/OhioT1DM/2018/train/563-ws-training.xml")

In [4]:
#creating a dataframe for each type of data given in xml file
gl = pd.DataFrame(Handler.gl)
bas = pd.DataFrame(Handler.bas)
bol = pd.DataFrame(Handler.bol)
meal = pd.DataFrame(Handler.meal)
bs = pd.DataFrame(Handler.bs)
slp = pd.DataFrame(Handler.slp)
wrk = pd.DataFrame(Handler.wrk)
ex = pd.DataFrame(Handler.ex)

In [5]:
#previous version of dataframe with incorrect date format
gl

Unnamed: 0,time,value
0,13-09-2021 12:33:00,219
1,13-09-2021 12:38:00,229
2,13-09-2021 12:43:00,224
3,13-09-2021 12:48:00,221
4,13-09-2021 12:53:00,215
...,...,...
12119,28-10-2021 23:36:00,252
12120,28-10-2021 23:41:00,254
12121,28-10-2021 23:46:00,250
12122,28-10-2021 23:51:00,246


In [6]:
#saving the dataframe into csv files to modify certain aspects manually 
gl.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/gl.csv")
bas.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/bas.csv")
bol.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/bol.csv")
meal.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/meal.csv")
bs.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/bs.csv")
slp.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/slp.csv")
wrk.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/wrk.csv")
ex.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/ex.csv")

In [7]:
#changing the format of all dates into %yyyy-%mm-%dd

In [8]:
gl = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/gl.csv',index_col=0)
bas = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/bas.csv',index_col=0)
bol = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/bol.csv',index_col=0)
meal = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/meal.csv',index_col=0)

In [9]:
ex = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/ex.csv', index_col=0)

In [10]:
#processing the time column: changing from string to datetime object
gl['time'] = pd.to_datetime(gl['time'],infer_datetime_format=True)
bas['time'] = pd.to_datetime(bas['time'],infer_datetime_format=True)
bol['time'] = pd.to_datetime(bol['time'],infer_datetime_format=True)
meal['time'] = pd.to_datetime(meal['time'],infer_datetime_format=True)

In [11]:
ex['time'] = pd.to_datetime(ex['time'],infer_datetime_format=True)

In [12]:
#data after manual correction
gl

Unnamed: 0,time,value
0,2021-09-13 12:33:00,219
1,2021-09-13 12:38:00,229
2,2021-09-13 12:43:00,224
3,2021-09-13 12:48:00,221
4,2021-09-13 12:53:00,215
...,...,...
12119,2021-10-28 23:36:00,252
12120,2021-10-28 23:41:00,254
12121,2021-10-28 23:46:00,250
12122,2021-10-28 23:51:00,246


In [13]:
#extracting date from time column to facilitate grouping of data according to date
gl['date'] = gl['time'].dt.date
bas['date'] = bas['time'].dt.date
bol['date'] = bol['time'].dt.date
meal['date'] = meal['time'].dt.date

In [14]:
ex['date'] = ex['time'].dt.date

In [15]:
#creating a new dataframe that acts as a template for combining all the patient data into one table
data = pd.DataFrame()
data['date'] = gl['date']
data['time'] = gl['time']
data['cgm'] = gl['value']
data['bas'] = [0]*len(data)
data['bol'] = [0]*len(data)
data['meal_carb'] = [0]*len(data)
data['meal_type'] = ["None"]*len(data)
data['ex'] = [0]*len(data)

In [16]:
#creating dataframes for each day's data from the template dataframe
grouped_df = data.groupby(data.date)
keys = grouped_df.indices.keys()
keys = list(keys)

date_df = []
for i in range(len(keys)):
    df = grouped_df.get_group(keys[i])
    date_df.append(df)

In [17]:
#populating the dataframes with data 
for i in range(len(bas)):
    for k in range(len(date_df)):
        f = 0
        for j in range(len(date_df[k])-1):
            if bas.iloc[i,0]> date_df[k].iloc[j,1] and bas.iloc[i,0] <= date_df[k].iloc[j+1,1]:
                date_df[k].iloc[j+1,3] = bas.iloc[i,1] 
                f = 1
                break
        if f==1:
            break

for i in range(len(bol)):
    for k in range(len(date_df)):
        f = 0
        if f==1:
            break
        for j in range(len(date_df[k])-1):
            if bol.iloc[i,0]> date_df[k].iloc[j,1] and bol.iloc[i,0] <= date_df[k].iloc[j+1,1]:
                date_df[k].iloc[j+1,4] = bol.iloc[i,1] 
                f = 1
                break
        if f==1:
            break 
            
for i in range(len(meal)):
    for k in range(len(date_df)):
        f = 0
        if f==1:
            break
        for j in range(len(date_df[k])-1):
            if meal.iloc[i,1]> date_df[k].iloc[j,1] and meal.iloc[i,1] <= date_df[k].iloc[j+1,1]:
                date_df[k].iloc[j+1,5] = meal.iloc[i,2] 
                date_df[k].iloc[j+1,6] = meal.iloc[i,0] 
                f = 1
                break               
        if f==1:
            break

for i in range(len(ex)):
    for k in range(len(date_df)):
        f = 0
        if f==1:
            break
        for j in range(len(date_df[k])-1):
            if ex.iloc[i,0]> date_df[k].iloc[j,1] and ex.iloc[i,0] <= date_df[k].iloc[j+1,1]:
                for l in range(int(ex.iloc[i,1]/5)):
                    date_df[k].iloc[j+l+1,7] = ex.iloc[i,2] 
                f = 1
                break               
        if f==1:
            break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_df[k].iloc[j+1,3] = bas.iloc[i,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_df[k].iloc[j+1,4] = bol.iloc[i,1]


In [18]:
#creating csv file for each day's record
for i in range(len(keys)):
    string = "date-" + str(keys[i])
    date_df[i].to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/Date/"+string+".csv")

In [19]:
#sample dataframe from the created csv file
string = "/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Train/Date/date-" + str(keys[0]) + ".csv"
df = pd.read_csv(string, index_col=0)
df

Unnamed: 0,date,time,cgm,bas,bol,meal_carb,meal_type,ex
0,2021-09-13,2021-09-13 12:33:00,219,0.0,0.0,0,,0
1,2021-09-13,2021-09-13 12:38:00,229,0.0,0.0,0,,0
2,2021-09-13,2021-09-13 12:43:00,224,0.0,0.0,0,,0
3,2021-09-13,2021-09-13 12:48:00,221,0.0,0.0,0,,0
4,2021-09-13,2021-09-13 12:53:00,215,0.0,0.0,0,,0
...,...,...,...,...,...,...,...,...
130,2021-09-13,2021-09-13 23:38:00,132,0.0,0.0,0,,0
131,2021-09-13,2021-09-13 23:43:00,128,0.0,0.0,0,,0
132,2021-09-13,2021-09-13 23:48:00,126,0.0,0.0,0,,0
133,2021-09-13,2021-09-13 23:53:00,125,0.0,0.0,0,,0


In [20]:
#parsing the input xml file for one patient's record
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
Handler = CGMHandler()
parser.setContentHandler(Handler)   
parser.parse("/Users/shreyaananth/Desktop/College/CIP/Code/OhioT1DM/2018/test/563-ws-testing.xml")

In [21]:
#creating a dataframe for each type of data given in xml file
gl = pd.DataFrame(Handler.gl)
bas = pd.DataFrame(Handler.bas)
bol = pd.DataFrame(Handler.bol)
meal = pd.DataFrame(Handler.meal)
bs = pd.DataFrame(Handler.bs)
slp = pd.DataFrame(Handler.slp)
wrk = pd.DataFrame(Handler.wrk)
ex = pd.DataFrame(Handler.ex)

In [22]:
#saving the dataframe into csv files to modify certain aspects manually 
gl.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/gl.csv")
bas.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/bas.csv")
bol.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/bol.csv")
meal.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/meal.csv")
bs.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/bs.csv")
slp.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/slp.csv")
wrk.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/wrk.csv")
ex.to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/ex.csv")

In [23]:
#changing the format of all dates into %yyyy-%mm-%dd

In [24]:
gl = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/gl.csv',index_col=0)
bas = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/bas.csv',index_col=0)
bol = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/bol.csv',index_col=0)
meal = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/meal.csv',index_col=0)

In [25]:
ex = pd.read_csv('/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/ex.csv', index_col=0)

In [26]:
#processing the time column: changing from string to datetime object
gl['time'] = pd.to_datetime(gl['time'],infer_datetime_format=True)
bas['time'] = pd.to_datetime(bas['time'],infer_datetime_format=True)
bol['time'] = pd.to_datetime(bol['time'],infer_datetime_format=True)
meal['time'] = pd.to_datetime(meal['time'],infer_datetime_format=True)

In [27]:
ex['time'] = pd.to_datetime(ex['time'],infer_datetime_format=True)

In [28]:
#extracting date from time column to facilitate grouping of data according to date
gl['date'] = gl['time'].dt.date
bas['date'] = bas['time'].dt.date
bol['date'] = bol['time'].dt.date
meal['date'] = meal['time'].dt.date

In [29]:
ex['date'] = ex['time'].dt.date

In [30]:
#creating a new dataframe that acts as a template for combining all the patient data into one table
data = pd.DataFrame()
data['date'] = gl['date']
data['time'] = gl['time']
data['cgm'] = gl['value']
data['bas'] = [0]*len(data)
data['bol'] = [0]*len(data)
data['meal_carb'] = [0]*len(data)
data['meal_type'] = ["None"]*len(data)
data['ex'] = [0]*len(data)

In [31]:
#creating dataframes for each day's data from the template dataframe
grouped_df = data.groupby(data.date)
keys = grouped_df.indices.keys()
keys = list(keys)

date_df = []
for i in range(len(keys)):
    df = grouped_df.get_group(keys[i])
    date_df.append(df)

In [32]:
#populating the dataframes with data 
for i in range(len(bas)):
    for k in range(len(date_df)):
        f = 0
        for j in range(len(date_df[k])-1):
            if bas.iloc[i,0]> date_df[k].iloc[j,1] and bas.iloc[i,0] <= date_df[k].iloc[j+1,1]:
                date_df[k].iloc[j+1,3] = bas.iloc[i,1] 
                f = 1
                break
        if f==1:
            break

for i in range(len(bol)):
    for k in range(len(date_df)):
        f = 0
        if f==1:
            break
        for j in range(len(date_df[k])-1):
            if bol.iloc[i,0]> date_df[k].iloc[j,1] and bol.iloc[i,0] <= date_df[k].iloc[j+1,1]:
                date_df[k].iloc[j+1,4] = bol.iloc[i,1] 
                f = 1
                break
        if f==1:
            break 
            
for i in range(len(meal)):
    for k in range(len(date_df)):
        f = 0
        if f==1:
            break
        for j in range(len(date_df[k])-1):
            if meal.iloc[i,1]> date_df[k].iloc[j,1] and meal.iloc[i,1] <= date_df[k].iloc[j+1,1]:
                date_df[k].iloc[j+1,5] = meal.iloc[i,2] 
                date_df[k].iloc[j+1,6] = meal.iloc[i,0] 
                f = 1
                break               
        if f==1:
            break

for i in range(len(ex)):
    for k in range(len(date_df)):
        f = 0
        if f==1:
            break
        for j in range(len(date_df[k])-1):
            if ex.iloc[i,0]> date_df[k].iloc[j,1] and ex.iloc[i,0] <= date_df[k].iloc[j+1,1]:
                for l in range(int(ex.iloc[i,1]/5)):
                    date_df[k].iloc[j+l+1,7] = ex.iloc[i,2] 
                f = 1
                break               
        if f==1:
            break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_df[k].iloc[j+1,3] = bas.iloc[i,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_df[k].iloc[j+1,4] = bol.iloc[i,1]


In [33]:
#creating csv file for each day's record
for i in range(len(keys)):
    string = "date-" + str(keys[i])
    date_df[i].to_csv("/Users/shreyaananth/Desktop/College/CIP/Code/Data/563/Test/Date/"+string+".csv")