In [1]:
import numpy as np
import pandas as pd
import os


# Locations of the raw tables and of the processed output, relative to this notebook.
data_path = '../../../code_data/ctr3_cgm_only/raw_data/DataTables'
data_save_path = '../../../code_data/ctr3_cgm_only/'
process_file = 'coldstart_fl'

# Load the pipe-separated CGM table and reduce it to the three columns used downstream:
#   ts            - DisplayTime parsed to datetime
#   glucose_level - the CGM value, with 0.0 readings turned into NaN
#   pid           - the DeidentID column
data_cgm = (
    pd.read_csv(os.path.join(data_path, 'CGM.txt'), sep='|')
    .assign(ts=lambda frame: pd.to_datetime(frame['DisplayTime']))
    .rename(columns={'CGM': 'glucose_level', 'DeidentID': 'pid'})
    .assign(glucose_level=lambda frame: frame['glucose_level'].replace(0.0, np.nan))
    .loc[:, ['glucose_level', 'ts', 'pid']]
)
data_cgm.head()


Unnamed: 0,glucose_level,ts,pid
0,194,2013-09-29 17:36:48,1
1,204,2013-09-29 17:41:48,1
2,201,2013-09-29 17:46:48,1
3,204,2013-09-29 17:51:48,1
4,200,2013-09-29 17:56:48,1


In [2]:
import copy
import datetime
# One frame per patient: that patient's rows, indexed by timestamp in ascending
# order, with the now-redundant 'pid' column removed.
pid_list = list(set(data_cgm['pid']))
data_merge = data_cgm
pid2data = {}
for pid in pid_list:
    patient_rows = data_merge.loc[data_merge['pid'] == pid].copy(deep=True)
    pid2data[pid] = (
        patient_rows
        .set_index('ts')
        .sort_index()
        .drop(columns=['pid'])
    )

In [3]:
def gen_5_mins_indicate_column(pid2data):
    """Build a 5-minute timestamp grid for every patient.

    For each patient the grid starts at the first reading and advances in
    5-minute steps. Whenever the next reading after the current grid point is
    more than 2 hours away, the grid jumps straight to that reading instead of
    stepping through the gap. One extra grid point is appended after the loop
    ends, i.e. the first candidate that lies past the last reading.

    Parameters
    ----------
    pid2data : dict
        Maps pid -> DataFrame with a 'glucose_level' column and a timestamp
        index. The loop walks the index forwards, so it is expected to be
        sorted ascending.

    Returns
    -------
    dict
        Maps pid -> DataFrame indexed by 'ts' with a single column
        '5_mins_indicate' that is 1 on every row.
    """
    jump_threshold = datetime.timedelta(hours=2)
    grid_step = datetime.timedelta(minutes=5)

    pid2fivemins_indicat = {}
    for pid, frame in pid2data.items():
        print(pid)
        stamps = frame.loc[:, 'glucose_level'].index
        last_pos = len(stamps) - 1

        grid = []
        cursor = stamps[0]
        pos = 0
        while cursor <= stamps[-1]:
            # Move to the first reading strictly after the cursor (or stay on
            # the last reading when there is none).
            while stamps[pos] <= cursor and pos != last_pos:
                pos += 1

            grid.append(cursor)

            # Skip long recording gaps instead of filling them with grid points.
            gap_to_next = stamps[pos] - cursor
            cursor = stamps[pos] if gap_to_next > jump_threshold else cursor + grid_step

        # Closing grid point: the first candidate past the last reading.
        grid.append(cursor)

        records = [{'ts': stamp, '5_mins_indicate': 1} for stamp in grid]
        pid2fivemins_indicat[pid] = pd.DataFrame(records).set_index('ts')
    return pid2fivemins_indicat

            

In [4]:
# Build the per-patient 5-minute timestamp grid (each pid is printed as it is processed).
pid2fivemins_indicat = gen_5_mins_indicate_column(pid2data)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [5]:
# Inspect the raw, not yet resampled, series of patient 1.
pid2data[1]

Unnamed: 0_level_0,glucose_level
ts,Unnamed: 1_level_1
2013-09-29 17:36:48,194
2013-09-29 17:41:48,204
2013-09-29 17:46:48,201
2013-09-29 17:51:48,204
2013-09-29 17:56:48,200
...,...
2014-02-26 07:35:00,168
2014-02-26 07:40:00,169
2014-02-26 07:45:00,172
2014-02-26 07:50:00,179


In [6]:
def regularize_data(pid2data, pid2grid=None):
    """Resample every patient's readings onto that patient's timestamp grid.

    For each grid timestamp ``ed`` the value of every column is the mean
    (NaNs skipped) of the readings whose timestamp lies in the closed label
    range ``[ed - 299 s, ed]``. A grid point without any reading in its window
    gets NaN. The same window rule is used for the first grid point, so
    duplicated timestamps in the raw data are averaged instead of raising.

    Parameters
    ----------
    pid2data : dict
        Maps pid -> DataFrame with a 'glucose_level' column and a timestamp
        index. The index is sliced by label, so it must be sorted.
    pid2grid : dict, optional
        Maps pid -> DataFrame whose index holds the grid timestamps. When
        omitted, the notebook-level ``pid2fivemins_indicat`` is used, which
        keeps existing one-argument calls working.

    Returns
    -------
    dict
        Maps pid -> DataFrame indexed by 'ts' holding the resampled columns
        plus 'glucose_level_indicate', which is 1 where the resampled
        glucose value is missing and 0 otherwise.
    """
    if pid2grid is None:
        # Backward-compatible fallback to the grid built earlier in the notebook.
        pid2grid = pid2fivemins_indicat

    window = datetime.timedelta(seconds=299)
    new_pid2data = {}

    for pid in pid2data:
        print(pid)
        readings = pid2data[pid]

        new_data = []
        for ed in pid2grid[pid].index:
            # Label slicing is inclusive on both ends, so consecutive 5-minute
            # windows do not share readings at one-second resolution.
            temp = readings.loc[ed - window:ed].mean(skipna=True)

            temp_dict = {'ts': ed}
            for name in readings.columns:
                temp_dict[name] = temp[name]
            temp_dict['glucose_level_indicate'] = 0 if pd.notna(temp_dict['glucose_level']) else 1
            new_data.append(temp_dict)

        new_pid2data[pid] = pd.DataFrame(new_data).set_index('ts')

    return new_pid2data

In [7]:
# Resample every patient's readings onto the 5-minute grid; this also adds the
# 'glucose_level_indicate' missing-value flag column.
new_pid2data = regularize_data(pid2data)

1


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [8]:
# Summary statistics of the regularized frame of patient 2.
new_pid2data[2].describe()

Unnamed: 0,glucose_level,glucose_level_indicate
count,66517.0,67688.0
mean,152.962122,0.0173
std,47.413571,0.130388
min,39.0,0.0
25%,119.0,0.0
50%,146.0,0.0
75%,182.0,0.0
max,401.0,1.0


In [9]:
def add_time_attributes(pid2data):
    """Add calendar and time-of-day feature columns derived from the index.

    Every frame in ``pid2data`` is replaced by a new frame that carries the
    original columns followed by these float64 columns:

    - day_of_week, week (ISO week number), year, hour, minute
    - timestamp: seconds since midnight (hour*3600 + minute*60 + second)
    - sin_time, cos_time: sine and cosine of the time of day mapped onto one
      full turn per 24 hours

    The input frames themselves are not modified; only the dictionary entries
    are rebound.

    Parameters
    ----------
    pid2data : dict
        Maps pid -> DataFrame with a datetime index.

    Returns
    -------
    dict
        The same dictionary object that was passed in.
    """
    seconds_in_day = 24 * 60 * 60

    for pid in pid2data:
        data = pid2data[pid]
        # Series view of the index so the .dt accessor can be used; it shares
        # the frame's index, so the assigned columns line up row for row.
        stamps = data.index.to_series()

        # there is an error previously 360 -> 3600
        seconds = (stamps.dt.hour * 3600
                   + stamps.dt.minute * 60
                   + stamps.dt.second).astype(float)
        angle = 2 * np.pi * seconds / seconds_in_day

        pid2data[pid] = data.assign(
            day_of_week=stamps.dt.dayofweek.astype(float),
            week=stamps.dt.isocalendar().week.astype(float),
            year=stamps.dt.year.astype(float),
            hour=stamps.dt.hour.astype(float),
            minute=stamps.dt.minute.astype(float),
            timestamp=seconds,
            sin_time=np.sin(angle),
            cos_time=np.cos(angle),
        )

    return pid2data

In [10]:
# Append the calendar / time-of-day feature columns to every regularized patient frame.
new_pid2data = add_time_attributes(new_pid2data)

In [11]:
# add discrete y

def discrete(y):
    """Map a glucose value to a three-level class label.

    Returns 0 for values below 70, 1 for values from 70 to 180 inclusive and
    2 for values above 180. A value that satisfies none of the comparisons
    (NaN) yields ``np.nan``.
    """
    if y > 180:
        return 2
    if y < 70:
        return 0
    return 1 if 70 <= y <= 180 else np.nan


def add_discrete_y(pid2data):
    """Add the class-label column 'discrete_y' to every patient frame.

    The label is ``discrete`` applied element-wise to 'glucose_level'. The
    frames are modified in place and the same dictionary is returned.
    """
    for frame in pid2data.values():
        frame['discrete_y'] = frame['glucose_level'].apply(discrete)

    return pid2data

In [12]:
# Append the three-level glucose class label (column 'discrete_y') to every patient frame.
new_pid2data = add_discrete_y(new_pid2data)

In [13]:
# Chronological 60 / 20 / 20 split per patient.
# The split positions are computed on the timestamps that carry an observed
# glucose value; each split is then cut from the full regular frame by label,
# so rows with a missing value inside a split's time range stay in that split.
pid2train = {}
pid2valid = {}
pid2test = {}
pid2regular_df = new_pid2data

for pid in pid2regular_df:
    frame = pid2regular_df[pid]
    # Index the timestamp Index directly; positional integer lookups on a
    # label-indexed Series (the former `.iloc[k][0]`) are deprecated in pandas.
    valid_ts = frame.index[frame['glucose_level'].notna()]

    total_len = len(valid_ts)
    train_end = int(total_len * 0.6)
    valid_end = int(total_len * 0.8)

    # Label slices include both end points.
    pid2train[pid] = frame.loc[:valid_ts[train_end]]
    pid2valid[pid] = frame.loc[valid_ts[train_end + 1]:valid_ts[valid_end]]
    pid2test[pid] = frame.loc[valid_ts[valid_end + 1]:]

    print(pid, total_len, len(pid2train[pid]), len(pid2valid[pid]), len(pid2test[pid]))

1 23970 14814 4887 4870
2 66517 40727 13498 13463
3 65526 39670 13318 13267
4 74093 44815 14988 14869
5 49093 29847 9977 9939
6 48044 29233 9792 9804
7 56856 37020 11900 11980
8 54236 32939 10923 10983
9 30563 19567 6401 6525
10 48412 29696 9938 9897
11 66780 40340 13408 13392
12 28733 17683 5969 5995
13 58534 35920 11873 11933
14 76575 46849 15575 15593
15 66695 40185 13409 13381
16 34642 21151 6979 6963
17 62302 38023 12688 12582
18 30491 18449 6125 6116
19 30145 18339 6112 6234
20 67903 41194 13728 13730
21 22125 13689 4509 4623
22 22718 14064 4600 4666
23 22524 14036 4662 4631
24 23334 14323 4726 4842
25 22423 13626 4555 4590
26 28714 17581 5820 5794
27 31168 18843 6263 6293
28 31708 19550 6557 6559
29 30283 18603 6168 6135
30 26035 15824 5252 5261


In [14]:
# Per-patient normalisation statistics, computed on the training split only and
# keyed by (pid, attribute).
pid_attri2mean_std = {}
attris_should_norm = ['glucose_level', 'timestamp']
for pid in pid_list:
    for attri in attris_should_norm:
        mean = pid2train[pid][attri].mean()
        std = pid2train[pid][attri].std()
        # A computed NaN is never the same object as np.nan, so an identity
        # check cannot detect it; pd.isna also covers NaT. Undefined or zero
        # spread falls back to a tiny sentinel value.
        if pd.isna(std) or std == 0:
            std = 1e-6
        pid_attri2mean_std[(pid, attri)] = (mean, std)

# Persist the statistics next to the generated datasets (stored as a pickled dict).
os.makedirs(os.path.join(data_save_path, process_file), exist_ok=True)
np.save(os.path.join(data_save_path, process_file, 'pid_attri2mean_std.npy'), pid_attri2mean_std)

In [15]:
def norm_data(pid2data, pid_attri2mean_std, attris_should_norm):
    """Return normalised deep copies of the patient frames.

    Each attribute in ``attris_should_norm`` is standardised with the
    ``(mean, std)`` pair stored under ``(pid, attribute)``. When the stored
    std is at most 1e-6 the column is divided by the mean instead. The input
    frames are left unchanged.

    Parameters
    ----------
    pid2data : dict
        Maps pid -> DataFrame containing the attributes to normalise.
    pid_attri2mean_std : dict
        Maps (pid, attribute) -> (mean, std).
    attris_should_norm : list
        Names of the columns to normalise.

    Returns
    -------
    dict
        Maps pid -> normalised copy of the frame.
    """
    normalised = {}
    for pid, frame in pid2data.items():
        scaled = frame.copy(deep=True)
        normalised[pid] = scaled

        for attri in attris_should_norm:
            mean, std = pid_attri2mean_std[(pid, attri)]
            column = scaled[attri]
            scaled[attri] = column / mean if std <= 1e-6 else (column - mean) / std

    return normalised

# Normalise all three splits with the statistics of the training split.
pid2train_norm, pid2valid_norm, pid2test_norm = (
    norm_data(split, pid_attri2mean_std, attris_should_norm)
    for split in (pid2train, pid2valid, pid2test)
)

In [16]:
# Auxiliary feature columns (everything except glucose itself); the position in
# this list is the feature index.
attri_list_wo_glucose = [
    'day_of_week', 'hour', 'minute', 'timestamp',
    'sin_time', 'cos_time', 'glucose_level_indicate',
]

# Two lookup tables built from the same records: name -> index and index -> name.
temp = [{'attri': attri, 'idx': idx} for idx, attri in enumerate(attri_list_wo_glucose)]
lookup = pd.DataFrame(temp)
attri2idx = lookup.set_index('attri')
idx2attri = lookup.set_index('idx')
print(attri2idx)
print(idx2attri)

# Store both tables alongside the generated datasets.
save_path = os.path.join(data_save_path, process_file)
for table, file_name in ((attri2idx, 'attri2idx.pkl'), (idx2attri, 'idx2attri.pkl')):
    table.to_pickle(os.path.join(save_path, file_name))


                        idx
attri                      
day_of_week               0
hour                      1
minute                    2
timestamp                 3
sin_time                  4
cos_time                  5
glucose_level_indicate    6
                      attri
idx                        
0               day_of_week
1                      hour
2                    minute
3                 timestamp
4                  sin_time
5                  cos_time
6    glucose_level_indicate


In [17]:
# Replace every remaining NaN in the normalised splits by 0.0.
for split in (pid2train_norm, pid2valid_norm, pid2test_norm):
    for pid in pid_list:
        split[pid] = split[pid].fillna(0.0)

In [18]:
# generate training data


def gen_final(pid2data, attri2idx,  n_prev, pred_window, name, valid_points = 24):
    """Cut sliding windows out of every patient frame and save them as one .npy file.

    A sample starting at row ``idx`` uses rows ``idx .. idx + n_prev - 1`` as
    history and row ``idx + n_prev + pred_window - 1`` as target. A start
    position is skipped when

    * the target row is flagged as missing ('glucose_level_indicate' == 1),
    * all of the last ``valid_points`` history rows are flagged as missing, or
    * the time between the first history row and the target row differs from
      ``(n_prev + pred_window - 1) * 5`` minutes, i.e. the window crosses a
      jump in the timestamp grid.

    The last two cases also close the current run of samples. Runs whose end
    and start sample positions differ by at least 2 are recorded in
    'seq_st_ed_list' as ``[first_sample, last_sample]`` positions into the
    sample arrays.
    NOTE(review): a skipped missing target does not close the current run, so
    a recorded run may contain samples that are not adjacent in time; confirm
    that this is intended.

    Besides its arguments the function reads the notebook-level names
    ``pid_attri2mean_std``, ``data_save_path`` and ``process_file``.

    Parameters
    ----------
    pid2data : dict
        Maps pid -> DataFrame on the regular grid with the columns
        'glucose_level', 'glucose_level_indicate', 'discrete_y', 'cos_time',
        'sin_time', 'timestamp' and every attribute named in ``attri2idx``.
    attri2idx : pandas.DataFrame
        Its index lists the attribute columns, in order, that form 'attri_X'.
    n_prev : int
        Number of history rows per sample.
    pred_window : int
        Distance in rows from the last history row to the target row.
    name : str
        Prefix of the output file ``{name}_{n_prev}_{pred_window}.npy``.
    valid_points : int, default 24
        Size of the trailing part of the history that must hold at least one
        observed reading. Presumably meant to be <= n_prev; a larger value
        makes the slice start negative for small ``idx`` -- TODO confirm.

    Returns
    -------
    None
        The per-patient dictionary of arrays is written with ``np.save``
        (as a pickled dict) under ``data_save_path/process_file``.
    """
    attri_columns = list(attri2idx.index)
    target_offset = n_prev + pred_window - 1
    # Time from the first history row to the target row on an unbroken 5-minute grid.
    expected_span = datetime.timedelta(minutes=pred_window*5+n_prev*5 - 5)

    pid2data_npy = {}
    for pid in pid2data:
        data = pid2data[pid]

        # Extract the columns once; per-row DataFrame lookups inside the loop
        # below are far slower and yield the same values.
        stamps = data.index
        glucose = data['glucose_level'].to_numpy()
        missing_flag = data['glucose_level_indicate'].to_numpy()
        discrete_y = data['discrete_y'].to_numpy()
        cos_time = data['cos_time'].to_numpy()
        sin_time = data['sin_time'].to_numpy()
        seconds = data['timestamp'].to_numpy()
        attri_matrix = data[attri_columns].to_numpy()

        y_list = []
        y_dis_list = []
        target_cos_time_list = []
        target_sin_time_list = []
        target_total_seconds_list = []
        attri_X_list = []
        glucose_level_X_list = []

        seq_st_ed_list = []
        seq_st, seq_ed = None, None

        for idx in range(len(data) - n_prev - pred_window + 1):
            target = idx + target_offset

            # No ground truth at the target row: skip this start position.
            if abs(missing_flag[target] - 1) < 1e-6:
                continue

            not_consecutive = (stamps[target] - stamps[idx]) != expected_span

            # Observed readings among the last `valid_points` history rows.
            num_valids = int(valid_points - missing_flag[idx + n_prev - valid_points : idx + n_prev].sum())
            if num_valids == 0 or not_consecutive:
                # Unusable window: close the current run of samples.
                if seq_st is not None and seq_ed - seq_st >= 2:
                    seq_st_ed_list.append([seq_st, seq_ed])
                seq_st, seq_ed = None, None
                continue

            y_list.append(glucose[target])
            y_dis_list.append(discrete_y[target])
            target_cos_time_list.append(cos_time[target])
            target_sin_time_list.append(sin_time[target])
            target_total_seconds_list.append(seconds[target])

            glucose_level_X = glucose[idx: idx + n_prev].astype(np.float32)
            attri_X = attri_matrix[idx: idx + n_prev].astype(np.float32)
            glucose_level_X_list.append(np.expand_dims(glucose_level_X, axis=0))
            attri_X_list.append(np.expand_dims(attri_X, axis=0))

            # Extend the current run, or open a new one at this sample.
            seq_ed = len(y_list) - 1
            if seq_st is None:
                seq_st = seq_ed

        if seq_st is not None and seq_ed - seq_st >= 2:
            seq_st_ed_list.append([seq_st, seq_ed])

        if y_list:
            stacked_glucose_X = np.concatenate(glucose_level_X_list, axis=0)
            stacked_attri_X = np.concatenate(attri_X_list, axis=0)
        else:
            # np.concatenate rejects an empty list; keep a patient without any
            # usable window as correctly shaped empty arrays instead.
            stacked_glucose_X = np.empty((0, n_prev), dtype=np.float32)
            stacked_attri_X = np.empty((0, n_prev, len(attri_columns)), dtype=np.float32)

        pid2data_npy[pid] = {}
        pid2data_npy[pid]['glucose_level_X'] = stacked_glucose_X
        pid2data_npy[pid]['attri_X'] = stacked_attri_X
        pid2data_npy[pid]['y'] = np.array(y_list, dtype=np.float32)
        pid2data_npy[pid]['y_dis'] = np.array(y_dis_list, dtype=np.int32)
        pid2data_npy[pid]['target_cos_time'] = np.array(target_cos_time_list, dtype=np.float32)
        pid2data_npy[pid]['target_sin_time'] = np.array(target_sin_time_list, dtype=np.float32)
        pid2data_npy[pid]['target_timestamp'] = np.array(target_total_seconds_list, dtype=np.float32)
        # Statistics stored with the samples so glucose values can be de-normalised.
        pid2data_npy[pid]['mean'] = pid_attri2mean_std[(pid, 'glucose_level')][0]
        pid2data_npy[pid]['std'] = pid_attri2mean_std[(pid, 'glucose_level')][1]

        pid2data_npy[pid]['seq_st_ed_list'] = np.array(seq_st_ed_list, dtype=np.int32)

        print(pid, pid2data_npy[pid]['glucose_level_X'].shape, pid2data_npy[pid]['attri_X'].shape)
        print('    ',len(y_list), seq_st_ed_list)
    save_path = os.path.join(data_save_path, process_file)
    np.save(os.path.join(save_path, f'{name}_{n_prev}_{pred_window}.npy'), pid2data_npy)




In [19]:
# Training windows: 24 history steps, target 6 steps after the last history step
# (30 minutes on the 5-minute grid); at least one observed reading is required
# among the last 12 history steps. Written to train_pid2data_npy_24_6.npy.
gen_final(pid2train_norm, attri2idx,  n_prev=24, pred_window=6, name='train_pid2data_npy', valid_points = 12)

1 (14161, 24) (14161, 24, 7)
     14161 [[0, 1907], [1908, 3830], [3831, 5721], [5722, 7059], [7060, 8937], [8938, 10800], [10801, 12746], [12747, 14160]]
2 (39135, 24) (39135, 24, 7)
     39135 [[0, 1582], [1583, 1860], [1861, 1863], [1864, 2007], [2008, 3876], [3877, 5782], [5783, 7710], [7711, 8505], [8506, 9583], [9584, 10343], [10344, 12179], [12180, 13811], [13812, 14025], [14026, 15954], [15955, 16485], [16486, 18420], [18421, 18425], [18426, 18587], [18588, 20504], [20505, 20601], [20602, 22421], [22422, 24338], [24339, 26287], [26288, 28177], [28178, 30120], [30121, 30384], [30385, 31942], [31943, 33882], [33883, 35819], [35820, 37760], [37761, 39134]]
3 (38625, 24) (38625, 24, 7)
     38625 [[0, 1925], [1926, 3874], [3875, 5818], [5819, 7744], [7745, 9702], [9703, 11638], [11639, 13574], [13575, 15510], [15511, 16699], [16700, 16917], [16918, 18795], [18796, 20739], [20740, 22658], [22659, 24349], [24350, 24858], [24859, 26803], [26804, 28753], [28754, 30691], [30692, 32376],

In [20]:
# Training windows with the target 12 steps after the last history step
# (60 minutes on the 5-minute grid). Written to train_pid2data_npy_24_12.npy.
gen_final(pid2train_norm, attri2idx,  n_prev=24, pred_window=12, name='train_pid2data_npy', valid_points = 12)

1 (14117, 24) (14117, 24, 7)
     14117 [[0, 1901], [1902, 3819], [3820, 5704], [5705, 7037], [7038, 8910], [8911, 10767], [10768, 12707], [12708, 14116]]
2 (38984, 24) (38984, 24, 7)
     38984 [[0, 1582], [1583, 1860], [1861, 1998], [1999, 3861], [3862, 5761], [5762, 7683], [7684, 8472], [8473, 9544], [9545, 10299], [10300, 12130], [12131, 13758], [13759, 13967], [13968, 15893], [15894, 16419], [16420, 18348], [18349, 18504], [18505, 20417], [20418, 20514], [20515, 22328], [22329, 24239], [24240, 26184], [26185, 28068], [28069, 30005], [30006, 30263], [30264, 31815], [31816, 33749], [33750, 35680], [35681, 37615], [37616, 38983]]
3 (38482, 24) (38482, 24, 7)
     38482 [[0, 1919], [1920, 3862], [3863, 5800], [5801, 7720], [7721, 9672], [9673, 11602], [11603, 13532], [13533, 15462], [15463, 16651], [16652, 16863], [16864, 18736], [18737, 20674], [20675, 22587], [22588, 24272], [24273, 24775], [24776, 26714], [26715, 28658], [28659, 30590], [30591, 32269], [32270, 33693], [33694, 35336

In [21]:
# Validation windows, target 6 steps (30 minutes) ahead. Written to valid_pid2data_npy_24_6.npy.
gen_final(pid2valid_norm, attri2idx,  n_prev=24, pred_window=6, name='valid_pid2data_npy', valid_points = 12)

1 (4675, 24) (4675, 24, 7)
     4675 [[0, 480], [481, 2063], [2064, 2388], [2389, 2714], [2715, 4311], [4312, 4674]]
2 (12903, 24) (12903, 24, 7)
     12903 [[0, 498], [499, 550], [551, 2425], [2426, 4371], [4372, 4420], [4421, 6141], [6142, 7498], [7499, 9407], [9408, 9599], [9600, 9749], [9750, 9791], [9792, 11238], [11239, 11918], [11919, 12902]]
3 (12794, 24) (12794, 24, 7)
     12794 [[0, 1042], [1043, 2448], [2449, 4369], [4370, 4850], [4851, 6792], [6793, 7528], [7529, 7933], [7934, 9854], [9855, 11781], [11782, 12685], [12686, 12793]]
4 (14483, 24) (14483, 24, 7)
     14483 [[0, 1782], [1783, 3706], [3707, 5587], [5588, 7530], [7531, 8878], [8879, 10808], [10809, 10828], [10829, 12736], [12737, 12802], [12803, 13945], [13946, 13995], [13996, 14482]]
5 (9645, 24) (9645, 24, 7)
     9645 [[0, 1380], [1381, 3313], [3314, 5225], [5226, 5466], [5467, 7158], [7159, 8675], [8676, 9644]]
6 (9382, 24) (9382, 24, 7)
     9382 [[0, 728], [729, 2669], [2670, 3001], [3002, 3116], [3117, 441

In [22]:
# Validation windows, target 12 steps (60 minutes) ahead. Written to valid_pid2data_npy_24_12.npy.
gen_final(pid2valid_norm, attri2idx,  n_prev=24, pred_window=12, name='valid_pid2data_npy', valid_points = 12)

1 (4646, 24) (4646, 24, 7)
     4646 [[0, 474], [475, 2051], [2052, 2370], [2371, 2696], [2697, 4287], [4288, 4645]]
2 (12820, 24) (12820, 24, 7)
     12820 [[0, 492], [493, 538], [539, 2407], [2408, 4347], [4348, 4390], [4391, 6105], [6106, 7456], [7457, 9359], [9360, 9545], [9546, 9689], [9690, 9725], [9726, 11167], [11168, 11841], [11842, 12819]]
3 (12728, 24) (12728, 24, 7)
     12728 [[0, 1036], [1037, 2436], [2437, 4351], [4352, 4826], [4827, 6762], [6763, 7492], [7493, 7891], [7892, 9806], [9807, 11727], [11728, 12625], [12626, 12727]]
4 (14413, 24) (14413, 24, 7)
     14413 [[0, 1776], [1777, 3695], [3696, 5570], [5571, 7508], [7509, 8850], [8851, 10774], [10775, 10788], [10789, 12690], [12691, 12750], [12751, 13887], [13888, 13931], [13932, 14412]]
5 (9609, 24) (9609, 24, 7)
     9609 [[0, 1374], [1375, 3301], [3302, 5207], [5208, 5448], [5449, 7134], [7135, 8645], [8646, 9608]]
6 (9336, 24) (9336, 24, 7)
     9336 [[0, 722], [723, 2657], [2658, 2983], [2984, 3093], [3094, 438

In [23]:
# Test windows, target 6 steps (30 minutes) ahead. Written to test_pid2data_npy_24_6.npy.
gen_final(pid2test_norm, attri2idx,  n_prev=24, pred_window=6, name='test_pid2data_npy', valid_points = 12)

1 (4678, 24) (4678, 24, 7)
     4678 [[0, 1534], [1535, 2577], [2578, 3731], [3732, 4677]]
2 (13065, 24) (13065, 24, 7)
     13065 [[0, 644], [645, 2160], [2161, 4049], [4050, 5095], [5096, 5810], [5811, 7746], [7747, 9676], [9677, 11596], [11597, 13064]]
3 (12848, 24) (12848, 24, 7)
     12848 [[0, 514], [515, 597], [598, 2532], [2533, 4472], [4473, 6421], [6422, 8348], [8349, 10293], [10294, 11925], [11926, 12207], [12208, 12847]]
4 (14532, 24) (14532, 24, 7)
     14532 [[0, 1435], [1436, 1976], [1977, 3883], [3884, 5799], [5800, 6593], [6594, 7550], [7551, 9489], [9490, 11391], [11392, 13296], [13297, 14531]]
5 (9597, 24) (9597, 24, 7)
     9597 [[0, 80], [81, 2029], [2030, 3959], [3960, 3987], [3988, 4190], [4191, 5904], [5905, 7835], [7836, 9596]]
6 (9375, 24) (9375, 24, 7)
     9375 [[0, 683], [684, 1178], [1179, 1272], [1273, 3046], [3047, 4948], [4949, 6823], [6824, 8766], [8767, 9374]]
7 (11021, 24) (11021, 24, 7)
     11021 [[0, 377], [378, 1249], [1250, 1439], [1440, 1928], 

In [24]:
# Test windows, target 12 steps (60 minutes) ahead. Written to test_pid2data_npy_24_12.npy.
gen_final(pid2test_norm, attri2idx,  n_prev=24, pred_window=12, name='test_pid2data_npy', valid_points = 12)

1 (4654, 24) (4654, 24, 7)
     4654 [[0, 1528], [1529, 2565], [2566, 3713], [3714, 4653]]
2 (13011, 24) (13011, 24, 7)
     13011 [[0, 638], [639, 2148], [2149, 4031], [4032, 5071], [5072, 5780], [5781, 7710], [7711, 9634], [9635, 11548], [11549, 13010]]
3 (12797, 24) (12797, 24, 7)
     12797 [[0, 508], [509, 585], [586, 2514], [2515, 4448], [4449, 6391], [6392, 8312], [8313, 10251], [10252, 12162], [12163, 12796]]
4 (14473, 24) (14473, 24, 7)
     14473 [[0, 1429], [1430, 1964], [1965, 3865], [3866, 5775], [5776, 6563], [6564, 7514], [7515, 9447], [9448, 11343], [11344, 13242], [13243, 14472]]
5 (9550, 24) (9550, 24, 7)
     9550 [[0, 74], [75, 2017], [2018, 3941], [3942, 3963], [3964, 4160], [4161, 5869], [5870, 7794], [7795, 9549]]
6 (9333, 24) (9333, 24, 7)
     9333 [[0, 677], [678, 1167], [1168, 1255], [1256, 3023], [3024, 4919], [4920, 6790], [6791, 8727], [8728, 9332]]
7 (10947, 24) (10947, 24, 7)
     10947 [[0, 371], [372, 1242], [1243, 1426], [1427, 1909], [1910, 3823], [3