In [1]:
import numpy as np
import os
import pandas as pd
import copy 
import datetime 
data_path = '../../../code_data/abc4d/'
process_file = 'coldstart_fl'

pid_list = [4000 + d for d in range(1, 27)]
pid_list.remove(4019)
# print(pid_list, len(pid_list))

In [2]:
pid2CGM = {}
for pid in pid_list:
    try:
        load_path = os.path.join(data_path, 'raw_data', f'ABC{pid}_CGM_6m_I.xls')
        df = pd.read_excel(load_path, sheet_name='CGM',  names=['ts', 'glucose_level'], skiprows=1, usecols=[0,1])
    except:
        load_path = os.path.join(data_path, 'raw_data', f'ABC{pid}_CGM_6m_C.xls')
        df = pd.read_excel(load_path, sheet_name='CGM',  names=['ts', 'glucose_level'], skiprows=1, usecols=[0,1])

    df['ts'] = pd.to_datetime(df['ts'], dayfirst=True)
    df['glucose_level'] = df['glucose_level'] * 18.0 # change mmol/l to mg/dL

    df = df.groupby('ts').apply(lambda x: x[pd.notna(x['glucose_level'])].iloc[-1])['glucose_level'].to_frame()
    df = df.sort_index()
    df['glucose_level'].replace(0.0, np.nan)
    pid2CGM[pid] = df
    print(f'{pid}, total_points:{len(df)}, start:{df.index[0]}, end:{df.index[-1]}')

4001, total_points:45949, start:2015-11-26 13:06:00, end:2016-05-12 23:57:00
4002, total_points:43317, start:2015-12-11 00:00:00, end:2016-05-27 23:56:00
4003, total_points:37518, start:2015-12-11 00:00:00, end:2016-05-27 23:57:00
4004, total_points:41137, start:2015-12-18 12:19:00, end:2016-06-03 23:53:00
4005, total_points:43836, start:2016-01-18 00:04:00, end:2016-07-04 23:56:00
4006, total_points:46421, start:2016-01-14 00:02:00, end:2016-06-30 23:58:00
4007, total_points:44501, start:2016-02-12 00:01:00, end:2016-07-29 23:55:00
4008, total_points:44691, start:2016-02-12 00:02:00, end:2016-07-29 23:58:00
4009, total_points:47472, start:2016-02-12 00:03:00, end:2016-07-29 23:58:00
4010, total_points:43188, start:2016-02-23 00:00:00, end:2016-08-09 23:55:00
4011, total_points:32893, start:2016-02-23 00:01:00, end:2016-08-09 23:55:00
4012, total_points:44833, start:2016-02-23 00:01:00, end:2016-08-09 23:31:00
4013, total_points:40524, start:2016-02-29 00:01:00, end:2016-08-15 23:58:00

In [3]:
pid2meal_bolus_part = {}
names = ['ts', 'meal', 'bolus']
usecols = [0,7,3]
for pid in pid_list:
    try:
        load_path = os.path.join(data_path, 'raw_data', f'ABC{pid}_CGM_6m_I.xls')
        df = pd.read_excel(load_path, sheet_name='Insulin use and carbs',  names=names, usecols=usecols)
    except:
        load_path = os.path.join(data_path, 'raw_data', f'ABC{pid}_CGM_6m_C.xls')
        df = pd.read_excel(load_path, sheet_name='Insulin use and carbs',  names=names, usecols=usecols)

    if len(df) == 0:
        continue

    df['ts'] = pd.to_datetime(df['ts'], dayfirst=True)
    df = df.groupby('ts').sum()
    df = df.sort_index()
    df = df.astype(np.float64)
    df = df.replace(0.0, np.nan)
    pid2meal_bolus_part[pid] = df
    print(f'{pid}, total_points:{len(df)}, start:{df.index[0]}, end:{df.index[-1]}')

4001, total_points:1, start:2015-11-27 15:33:00, end:2015-11-27 15:33:00
4002, total_points:710, start:2015-12-11 07:34:00, end:2016-05-23 21:10:00
4004, total_points:48, start:2015-12-18 07:28:00, end:2016-04-09 17:33:00
4005, total_points:838, start:2016-01-18 08:15:00, end:2016-07-04 19:52:00
4006, total_points:71, start:2016-01-14 08:24:00, end:2016-02-12 09:07:00
4007, total_points:31, start:2016-02-12 06:06:00, end:2016-02-24 14:11:00
4008, total_points:6, start:2016-02-12 00:14:00, end:2016-06-29 12:07:00
4009, total_points:1278, start:2016-02-12 04:11:00, end:2016-07-29 18:35:00
4010, total_points:14, start:2016-02-23 08:12:00, end:2016-02-24 14:02:00
4011, total_points:9, start:2016-02-23 09:29:00, end:2016-02-26 13:53:00
4012, total_points:3, start:2016-02-23 10:08:00, end:2016-02-23 12:27:00
4013, total_points:102, start:2016-02-29 07:58:00, end:2016-03-31 12:40:00
4014, total_points:474, start:2016-03-01 21:03:00, end:2016-08-16 18:24:00
4015, total_points:392, start:2016-0

In [4]:
pid2other_attris = {}
names = ['ts', 'finger_stick', 'meal', 'bolus', 'case_icr', 'absorption', 'exercise', 'alcohol', 'iob', 'correction_bolus']
usecols = [2, 3, 5, 6, 9, 10, 11,  13, 14, 15]
for pid in pid_list:
    
    try:
        load_path = os.path.join(data_path, 'raw_data', f'ABC{pid}_log_final_I.csv')
        df = pd.read_csv(load_path, skiprows=11, names=names, usecols=usecols)
    except:
        load_path = os.path.join(data_path, 'raw_data', f'ABC{pid}_log_final_C.csv')
        df = pd.read_csv(load_path, skiprows=11, names=names, usecols=usecols)

    df['ts'] = pd.to_datetime(df['ts'], dayfirst=True, errors='coerce')
    df = df.groupby('ts').sum()
    df = df.sort_index()
    df = df.astype(np.float64)
    df = df.replace(0.0, np.nan)
    pid2other_attris[pid] = df
    print(f'{pid}, total_points:{len(df)}, start:{df.index[0]}, end:{df.index[-1]}')


4001, total_points:1055, start:2015-11-26 20:11:00, end:2016-05-13 12:15:00
4002, total_points:1168, start:2015-12-11 20:31:00, end:2016-04-20 19:25:00
4003, total_points:1071, start:2015-12-11 18:53:00, end:2016-06-16 19:20:00
4004, total_points:898, start:2015-12-18 10:52:00, end:2016-06-17 08:54:00
4005, total_points:53, start:2016-01-18 10:16:00, end:2016-02-12 09:21:00
4006, total_points:1281, start:2016-01-14 10:58:00, end:2016-07-11 22:52:00
4007, total_points:912, start:2016-02-23 19:02:00, end:2016-08-30 14:35:00
4008, total_points:1120, start:2016-02-12 13:53:00, end:2016-08-08 09:15:00
4009, total_points:2556, start:2016-02-12 14:11:00, end:2016-08-16 11:45:00
4010, total_points:1439, start:2016-02-23 11:45:00, end:2016-08-25 22:01:00
4011, total_points:614, start:2016-02-23 15:33:00, end:2016-08-09 12:21:00
4012, total_points:2164, start:2016-02-23 14:51:00, end:2016-09-03 16:28:00
4013, total_points:467, start:2016-02-29 12:37:00, end:2016-08-16 14:13:00
4014, total_points

In [5]:
pid2attris = {}
for pid in pid_list:
    if pid in pid2meal_bolus_part:
        pid2other_attris[pid].merge(pid2meal_bolus_part[pid], how='outer')
    pid2attris[pid] = pid2CGM[pid].join(pid2other_attris[pid], how='outer')
    print(f'{pid}, total_points:{len(pid2attris[pid])}, start:{pid2attris[pid].index[0]}, end:{pid2attris[pid].index[-1]}')


4001, total_points:46758, start:2015-11-26 13:06:00, end:2016-05-13 12:15:00
4002, total_points:44224, start:2015-12-11 00:00:00, end:2016-05-27 23:56:00
4003, total_points:38359, start:2015-12-11 00:00:00, end:2016-06-16 19:20:00
4004, total_points:41838, start:2015-12-18 10:52:00, end:2016-06-17 08:54:00
4005, total_points:43876, start:2016-01-18 00:04:00, end:2016-07-04 23:56:00
4006, total_points:47459, start:2016-01-14 00:02:00, end:2016-07-11 22:52:00
4007, total_points:45248, start:2016-02-12 00:01:00, end:2016-08-30 14:35:00
4008, total_points:45587, start:2016-02-12 00:02:00, end:2016-08-08 09:15:00
4009, total_points:49559, start:2016-02-12 00:03:00, end:2016-08-16 11:45:00
4010, total_points:44331, start:2016-02-23 00:00:00, end:2016-08-25 22:01:00
4011, total_points:33343, start:2016-02-23 00:01:00, end:2016-08-09 23:55:00
4012, total_points:46438, start:2016-02-23 00:01:00, end:2016-09-03 16:28:00
4013, total_points:40859, start:2016-02-29 00:01:00, end:2016-08-16 14:13:00

In [6]:
pid2regular_df = {}

saving_file = os.path.join(data_path, process_file, 'temp_data')

for pid in pid_list:
    data_dict = {}
    df = pid2attris[pid]
    ts_begin = df.index[0]
    ts_end = df.index[-1]

    data_dict[ts_begin] = list(df.loc[ts_begin])

    while ts_begin + datetime.timedelta(minutes=5) <= ts_end:
        st = ts_begin + datetime.timedelta(seconds=1)
        ed = ts_begin + datetime.timedelta(minutes=5)

        glucose_level = df.loc[st:ed, 'glucose_level']
        glucose_level = glucose_level[pd.notna(glucose_level)]
        if len(glucose_level) == 0:
            glucose_level = np.nan
        else:
            glucose_level = glucose_level.to_numpy()[-1]


        meal = df.loc[st:ed, 'meal'].sum()
        bolus = df.loc[st:ed, 'bolus'].sum()

        finger_stick = df.loc[st:ed, 'finger_stick'].mean()
        case_icr = df.loc[st:ed, 'case_icr'].mean()
        absorption = df.loc[st:ed, 'absorption'].mean()
        exercise = df.loc[st:ed, 'exercise'].mean()
        alcohol = df.loc[st:ed, 'alcohol'].mean()
        iob = df.loc[st:ed, 'iob'].mean()
        
        correction_bolus = df.loc[st:ed, 'correction_bolus'].sum()

        data_dict[ed] = [glucose_level, meal, bolus, finger_stick, case_icr, absorption, exercise, alcohol, iob, correction_bolus]

        ts_begin = ts_begin + datetime.timedelta(minutes=5)

    pid2regular_df[pid] = pd.DataFrame.from_dict(data_dict, orient='index', columns=df.columns)

    pid2regular_df[pid] = pid2regular_df[pid].replace(0.0, np.nan)

    if not os.path.exists(saving_file):
        os.makedirs(saving_file)

    pid2regular_df[pid].to_pickle(os.path.join(saving_file, f'{pid}_regular_df.pkl'))
    print(f'{pid}, total_points:{len(pid2regular_df[pid])}, start:{pid2regular_df[pid].index[0]}, end:{pid2regular_df[pid].index[-1]}')

4001, total_points:48662, start:2015-11-26 13:06:00, end:2016-05-13 12:11:00
4002, total_points:48672, start:2015-12-11 00:00:00, end:2016-05-27 23:55:00
4003, total_points:54377, start:2015-12-11 00:00:00, end:2016-06-16 19:20:00
4004, total_points:52393, start:2015-12-18 10:52:00, end:2016-06-17 08:52:00
4005, total_points:48671, start:2016-01-18 00:04:00, end:2016-07-04 23:54:00
4006, total_points:51827, start:2016-01-14 00:02:00, end:2016-07-11 22:52:00
4007, total_points:57775, start:2016-02-12 00:01:00, end:2016-08-30 14:31:00
4008, total_points:51375, start:2016-02-12 00:02:00, end:2016-08-08 09:12:00
4009, total_points:53709, start:2016-02-12 00:03:00, end:2016-08-16 11:43:00
4010, total_points:53257, start:2016-02-23 00:00:00, end:2016-08-25 22:00:00
4011, total_points:48671, start:2016-02-23 00:01:00, end:2016-08-09 23:51:00
4012, total_points:55782, start:2016-02-23 00:01:00, end:2016-09-03 16:26:00
4013, total_points:48843, start:2016-02-29 00:01:00, end:2016-08-16 14:11:00

In [7]:
# load pid2regular_df

pid2regular_df = {}
saving_file = os.path.join(data_path, process_file, 'temp_data')
for pid in pid_list:
    pid2regular_df[pid] = pd.read_pickle(os.path.join(saving_file, f'{pid}_regular_df.pkl'))

In [8]:
# add time features

def add_time_attributes(pid2data):
    for pid in pid2data:
        data = pid2data[pid]

        temp = data.index.to_frame().loc[:, 0].dt
        
        data['day_of_week'] = temp.dayofweek
        data['day_of_week'] = data['day_of_week'].astype(np.float64)

        data['hour'] = temp.hour
        data['hour'] = data['hour'].astype(np.float64)

        data['minute'] = temp.minute
        data['minute'] = data['minute'].astype(np.float64)

        
        data['total_seconds'] = temp.hour * 3600 +\
                                temp.minute * 60 +\
                                temp.second
        data['total_seconds'] = data['total_seconds'].astype(np.float64)
        
        # new ————————————————————————
        seconds_in_day = 24*60*60

        data['sin_time'] = np.sin(2 * np.pi * data.total_seconds / seconds_in_day)
        data['cos_time'] = np.cos(2 * np.pi * data.total_seconds / seconds_in_day)
        data['sin_time'].astype(np.float64)
        data['cos_time'].astype(np.float64)
        # end ______________________
        
        
    return pid2data

In [9]:
pid2regular_df = add_time_attributes(pid2regular_df)


In [10]:
pid2regular_df[4002].sin_time.describe()

count    4.867200e+04
mean    -3.492828e-21
std      7.071140e-01
min     -1.000000e+00
25%     -7.071068e-01
50%     -1.608123e-16
75%      7.071068e-01
max      1.000000e+00
Name: sin_time, dtype: float64

In [11]:
# add discrete y

def discrete(y):
    if y < 70:
        return 0
    elif 70<= y <=180:
        return 1
    elif y > 180:
        return 2
    else:
        return np.nan


def add_discrete_y(pid2data):
    for pid in pid2data:
        data = pid2data[pid]
        data['discrete_y'] = data['glucose_level']

        data['discrete_y'] = data['discrete_y'].apply(discrete)
        
    return pid2data

In [12]:
pid2regular_df = add_discrete_y(pid2regular_df)

In [13]:
pid2regular_df[4001].columns

Index(['glucose_level', 'finger_stick', 'meal', 'bolus', 'case_icr',
       'absorption', 'exercise', 'alcohol', 'iob', 'correction_bolus',
       'day_of_week', 'hour', 'minute', 'total_seconds', 'sin_time',
       'cos_time', 'discrete_y'],
      dtype='object')

In [14]:
def add_indicate_for_nan(pid2data, ignore_indicate_set):
    for pid in pid2data:
        data = pid2data[pid]
        for attri in data:
            if attri in ignore_indicate_set:
                continue
            data[attri + '_indicate'] = 1
            data.loc[pd.notna(data[attri]), attri + '_indicate'] = 0
            if sum(data[attri + '_indicate']) != len(data[attri]) - sum(pd.notna(data[attri])):
                print('error')
    return pid2data

In [15]:
attris_indicate_set = ['day_of_week', 'hour', 'minute',
       'total_seconds', 'sin_time', 'cos_time',]
pid2regular_df = add_indicate_for_nan(pid2regular_df, attris_indicate_set)

In [16]:
pid2train = {}
pid2valid = {}
pid2test = {}

for pid in pid_list:
    idxs = pid2regular_df[pid].index[pd.notna(pid2regular_df[pid]['glucose_level'])]

    idxs = idxs.to_frame()

    total_len = len(idxs)
    train_end = int(total_len * 0.6)
    valid_end = int(total_len * 0.8) 

    pid2train[pid] = pid2regular_df[pid].loc[:idxs.iloc[train_end][0]]

    pid2valid[pid] = pid2regular_df[pid].loc[idxs.iloc[train_end + 1][0]:idxs.iloc[valid_end][0]]

    pid2test[pid] = pid2regular_df[pid].loc[idxs.iloc[valid_end + 1][0]:]

    print(pid, total_len, len(pid2train[pid]), len(pid2valid[pid]), len(pid2test[pid]))

4001 45494 29441 9583 9638
4002 43016 30287 9310 9075
4003 37316 27870 10118 16389
4004 40718 29331 9083 13979
4005 43466 29368 9636 9667
4006 46036 29433 9558 12836
4007 44251 27955 10175 19645
4008 44279 29316 9535 12524
4009 46931 28987 9570 15152
4010 42980 28243 9139 15875
4011 32788 27154 12208 9309
4012 44545 28453 9311 18018
4013 40443 27850 9976 11017
4014 41032 29611 9457 17078
4015 40053 27153 10114 11404
4016 26854 21489 6473 20710
4017 43132 29208 9737 13344
4018 45524 28939 9860 17349
4020 38120 28840 9968 12243
4021 43270 29377 9579 16852
4022 44066 28232 10948 15359
4023 44741 28336 10750 15795
4024 50236 33656 11029 10417
4025 53883 34494 11512 13610
4026 51200 34209 10621 16914


In [17]:
pid2train[pid].columns

Index(['glucose_level', 'finger_stick', 'meal', 'bolus', 'case_icr',
       'absorption', 'exercise', 'alcohol', 'iob', 'correction_bolus',
       'day_of_week', 'hour', 'minute', 'total_seconds', 'sin_time',
       'cos_time', 'discrete_y', 'glucose_level_indicate',
       'finger_stick_indicate', 'meal_indicate', 'bolus_indicate',
       'case_icr_indicate', 'absorption_indicate', 'exercise_indicate',
       'alcohol_indicate', 'iob_indicate', 'correction_bolus_indicate',
       'discrete_y_indicate'],
      dtype='object')

In [18]:
pid_attri2mean_std = {}
attris_should_norm = ['glucose_level', 'meal', 'bolus', 'finger_stick', 'case_icr', 'absorption', 'exercise',
       'alcohol', 'iob', 'correction_bolus', 'total_seconds']
for pid in pid_list:
    for attri in attris_should_norm:
        mean = pid2train[pid][attri].mean()
        std = pid2train[pid][attri].std()
        if std is np.NaN or std is pd.NaT or std == 0:
            std = 1e-6
        pid_attri2mean_std[(pid, attri)] = (mean, std)

np.save(os.path.join(data_path, process_file, 'pid_attri2mean_std.npy'), pid_attri2mean_std)

In [19]:
def norm_data(pid2data, pid_attri2mean_std, attris_should_norm):

    new_pid2data = {}
    for pid in pid2data:

        new_pid2data[pid] = copy.deepcopy(pid2data[pid]) 

        for attri in attris_should_norm:

            mean, std = pid_attri2mean_std[(pid, attri)]

            if std <= 1e-6:
                new_pid2data[pid][attri] = new_pid2data[pid][attri] / mean
            else:
                new_pid2data[pid][attri] = (new_pid2data[pid][attri] - mean) / std          
        
    return new_pid2data

In [20]:
pid2train_norm = norm_data(pid2train, pid_attri2mean_std, attris_should_norm)
pid2valid_norm = norm_data(pid2valid, pid_attri2mean_std, attris_should_norm)
pid2test_norm = norm_data(pid2test, pid_attri2mean_std, attris_should_norm)

In [21]:
attri_list_wo_glucose = [
    'meal', 'bolus', 'finger_stick', 'case_icr', 'absorption', 'exercise',
    'alcohol', 'iob', 'correction_bolus', 'day_of_week', 'hour', 'minute',
    'total_seconds', 'sin_time', 'cos_time', 
    'glucose_level_indicate', 'meal_indicate', 'bolus_indicate',
    'finger_stick_indicate', 'absorption_indicate', 'exercise_indicate',
    'alcohol_indicate', 'iob_indicate', 'correction_bolus_indicate',
]

temp = []

for idx, attri in enumerate(attri_list_wo_glucose):
    temp.append({'attri':attri, 'idx':idx}, )
attri2idx = pd.DataFrame(temp).set_index('attri')
idx2attri = pd.DataFrame(temp).set_index('idx')
print(attri2idx)
print(idx2attri)

save_path = os.path.join(data_path, process_file)

attri2idx.to_pickle(os.path.join(save_path,'attri2idx.pkl'))
idx2attri.to_pickle(os.path.join(save_path,'idx2attri.pkl'))


                           idx
attri                         
meal                         0
bolus                        1
finger_stick                 2
case_icr                     3
absorption                   4
exercise                     5
alcohol                      6
iob                          7
correction_bolus             8
day_of_week                  9
hour                        10
minute                      11
total_seconds               12
sin_time                    13
cos_time                    14
glucose_level_indicate      15
meal_indicate               16
bolus_indicate              17
finger_stick_indicate       18
absorption_indicate         19
exercise_indicate           20
alcohol_indicate            21
iob_indicate                22
correction_bolus_indicate   23
                         attri
idx                           
0                         meal
1                        bolus
2                 finger_stick
3                     case_icr
4       

In [22]:
for pid in pid_list:
    pid2train_norm[pid] = pid2train_norm[pid].fillna(0.0)
    pid2valid_norm[pid] = pid2valid_norm[pid].fillna(0.0)
    pid2test_norm[pid] = pid2test_norm[pid].fillna(0.0)

In [23]:
pid2train_norm[4001]

Unnamed: 0,glucose_level,finger_stick,meal,bolus,case_icr,absorption,exercise,alcohol,iob,correction_bolus,...,finger_stick_indicate,meal_indicate,bolus_indicate,case_icr_indicate,absorption_indicate,exercise_indicate,alcohol_indicate,iob_indicate,correction_bolus_indicate,discrete_y_indicate
2015-11-26 13:06:00,0.239605,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.00000,...,1,1,1,1,1,1,1,1,1,0
2015-11-26 13:11:00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.00000,...,1,1,1,1,1,1,1,1,1,1
2015-11-26 13:16:00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.00000,...,1,1,1,1,1,1,1,1,1,1
2015-11-26 13:21:00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.00000,...,1,1,1,1,1,1,1,1,1,1
2015-11-26 13:26:00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.00000,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-03-07 18:06:00,1.279618,0.0,-1.295807,0.917043,0.655861,-0.050618,0.0,0.0,0.756425,0.50199,...,1,0,0,0,0,1,1,0,0,0
2016-03-07 18:11:00,1.342649,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.00000,...,1,1,1,1,1,1,1,1,1,0
2016-03-07 18:16:00,1.374164,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.00000,...,1,1,1,1,1,1,1,1,1,0
2016-03-07 18:21:00,1.374164,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.00000,...,1,1,1,1,1,1,1,1,1,0


In [24]:
# generate training data


def gen_final(pid2data, attri2idx,  n_prev, pred_window, name, valid_points = 24): 
    pid2data_npy = {}
    for pid in pid_list:
        data = pid2data[pid]
        
        y_list = []
        y_dis_list = []
        target_cos_time_list = []
        target_sin_time_list = []
        target_total_seconds_list = []

        attri_X_list = []

        glucose_level_X_list = []
        
        seq_st_ed_list = []
        
        seq_st, seq_ed = None, None


        for idx in range(len(data) - n_prev - pred_window + 1):
            glucose_level_y = data.iloc[idx + n_prev + pred_window - 1]['glucose_level']
            glucose_level_y_indicate = data.iloc[idx + n_prev + pred_window - 1]['glucose_level_indicate']
            if abs(glucose_level_y_indicate - 1) < 1e-6:
                continue

            # Resonable
            num_valids = int(valid_points - sum(data.iloc[idx + n_prev - valid_points : idx + n_prev]['glucose_level_indicate']))
            if num_valids == 0:
                if seq_st is not None and seq_ed - seq_st >= 2:
                    seq_st_ed_list.append([seq_st, seq_ed])
                seq_st, seq_ed = None, None
                continue
            # end

            
            y_list.append(glucose_level_y)
            y_dis_list.append(data.iloc[idx + n_prev + pred_window - 1]['discrete_y'])
            target_cos_time_list.append(data.iloc[idx + n_prev + pred_window - 1]['cos_time'])
            target_sin_time_list.append(data.iloc[idx + n_prev + pred_window - 1]['sin_time'])
            target_total_seconds_list.append(data.iloc[idx + n_prev + pred_window - 1]['total_seconds'])
            glucose_level_X = np.array(data.iloc[idx: idx + n_prev]['glucose_level'].to_numpy(), dtype=np.float32)


            attri_X = np.array(data.iloc[idx: idx + n_prev][list(attri2idx.index)].to_numpy(), dtype=np.float32)          

            glucose_level_X_list.append(np.expand_dims(glucose_level_X, axis=0))

            attri_X_list.append(np.expand_dims(attri_X, axis=0))
        
            if seq_st is None:
                seq_st = len(y_list) - 1
                seq_ed = len(y_list) - 1
            else:
                seq_ed = len(y_list) - 1

        if seq_st is not None and seq_ed - seq_st >= 2:
            seq_st_ed_list.append([seq_st, seq_ed])

        pid2data_npy[pid] = {}
        pid2data_npy[pid]['glucose_level_X'] = np.concatenate(glucose_level_X_list, axis=0)

        pid2data_npy[pid]['attri_X'] = np.concatenate(attri_X_list, axis=0)
        pid2data_npy[pid]['y'] = np.array(y_list, dtype=np.float32)
        pid2data_npy[pid]['y_dis'] = np.array(y_dis_list, dtype=np.int32)
        pid2data_npy[pid]['target_cos_time'] = np.array(target_cos_time_list, dtype=np.float32)
        pid2data_npy[pid]['target_sin_time'] = np.array(target_sin_time_list, dtype=np.float32)
        pid2data_npy[pid]['target_total_seconds'] = np.array(target_total_seconds_list, dtype=np.float32)
        pid2data_npy[pid]['mean'] = pid_attri2mean_std[(pid, 'glucose_level')][0]
        pid2data_npy[pid]['std'] = pid_attri2mean_std[(pid, 'glucose_level')][1]

        pid2data_npy[pid]['seq_st_ed_list'] = np.array(seq_st_ed_list, dtype=np.int32)

        print(pid, pid2data_npy[pid]['glucose_level_X'].shape, pid2data_npy[pid]['attri_X'].shape)
        print('    ',len(y_list), seq_st_ed_list)
    save_path = os.path.join(data_path, process_file)
    np.save(os.path.join(save_path, f'{name}_{n_prev}_{pred_window}.npy'), pid2data_npy)




In [25]:
gen_final(pid2train_norm, attri2idx,  n_prev=24, pred_window=6, name='train_pid2data_npy', valid_points = 12)

4001 (27099, 24) (27099, 24, 24)
     27099 [[0, 141], [142, 2074], [2075, 2270], [2271, 3004], [3005, 3956], [3957, 5901], [5902, 7799], [7800, 8039], [8040, 8048], [8049, 9681], [9682, 9845], [9846, 10547], [10548, 12492], [12493, 12688], [12689, 13830], [13831, 14366], [14367, 15260], [15261, 15458], [15459, 15992], [15993, 16034], [16035, 16075], [16076, 16198], [16199, 17019], [17020, 17313], [17314, 17631], [17632, 17688], [17689, 18760], [18761, 20436], [20437, 20827], [20828, 22280], [22281, 23100], [23101, 23891], [23892, 25024], [25025, 25398], [25399, 25780], [25781, 26516], [26517, 26883], [26884, 27090], [27091, 27098]]
4002 (25486, 24) (25486, 24, 24)
     25486 [[0, 648], [649, 2096], [2097, 2540], [2541, 2869], [2870, 2911], [2912, 3643], [3644, 4345], [4346, 4968], [4969, 5308], [5309, 5317], [5318, 5710], [5711, 5939], [5940, 6240], [6241, 6243], [6244, 6502], [6503, 6717], [6718, 7011], [7012, 7078], [7079, 7528], [7529, 7989], [7990, 8303], [8304, 8509], [8510, 8512

In [26]:
gen_final(pid2train_norm, attri2idx,  n_prev=24, pred_window=12, name='train_pid2data_npy', valid_points = 12)

4001 (26951, 24) (26951, 24, 24)
     26951 [[0, 135], [136, 2062], [2063, 2252], [2253, 2986], [2987, 3932], [3933, 5871], [5872, 7763], [7764, 8003], [8004, 8014], [8015, 9646], [9647, 9804], [9805, 10500], [10501, 12440], [12441, 12636], [12637, 13778], [13779, 14308], [14309, 15196], [15197, 15388], [15389, 15917], [15918, 15958], [15959, 15995], [15996, 16112], [16113, 16935], [16936, 17223], [17224, 17545], [17546, 17596], [17597, 18662], [18663, 20332], [20333, 20723], [20724, 22170], [22171, 22984], [22985, 23775], [23776, 24902], [24903, 25272], [25273, 25648], [25649, 26382], [26383, 26743], [26744, 26948]]
4002 (25265, 24) (25265, 24, 24)
     25265 [[0, 642], [643, 2090], [2091, 2528], [2529, 2857], [2858, 2893], [2894, 3625], [3626, 4321], [4322, 4938], [4939, 5272], [5273, 5275], [5276, 5668], [5669, 5891], [5892, 6190], [6192, 6449], [6450, 6658], [6659, 6951], [6952, 7017], [7018, 7461], [7462, 7916], [7917, 8224], [8225, 8424], [8427, 9030], [9031, 9569], [9570, 10210]

In [27]:
gen_final(pid2valid_norm, attri2idx,  n_prev=24, pred_window=6, name='valid_pid2data_npy', valid_points = 12)

4001 (9019, 24) (9019, 24, 24)
     9019 [[0, 231], [232, 238], [239, 1035], [1036, 1172], [1173, 1517], [1518, 3473], [3474, 4278], [4279, 5181], [5182, 6232], [6233, 8202], [8203, 8722], [8723, 9018]]
4002 (8530, 24) (8530, 24, 24)
     8530 [[0, 479], [480, 1115], [1116, 3065], [3066, 3345], [3346, 3663], [3664, 5179], [5180, 6203], [6204, 7004], [7005, 8529]]
4003 (7160, 24) (7160, 24, 24)
     7160 [[0, 973], [974, 1127], [1128, 1641], [1642, 1647], [1648, 1763], [1764, 1977], [1978, 2213], [2214, 2406], [2407, 2443], [2444, 2585], [2586, 2642], [2643, 2668], [2669, 2766], [2767, 2847], [2848, 2924], [2925, 2940], [2941, 2948], [2949, 3009], [3010, 3223], [3224, 3247], [3248, 3567], [3568, 3811], [3812, 3818], [3819, 3973], [3974, 4054], [4055, 4208], [4209, 4295], [4296, 4445], [4446, 4524], [4525, 4606], [4607, 4760], [4761, 4816], [4817, 5298], [5299, 5597], [5598, 5672], [5673, 5805], [5806, 5832], [5833, 5893], [5894, 6104], [6105, 6133], [6134, 6329], [6330, 6543], [6544, 67

In [28]:
gen_final(pid2valid_norm, attri2idx,  n_prev=24, pred_window=12, name='valid_pid2data_npy', valid_points = 12)

4001 (8977, 24) (8977, 24, 24)
     8977 [[0, 231], [232, 1040], [1041, 1176], [1177, 1517], [1518, 3467], [3468, 4266], [4267, 5163], [5164, 6208], [6209, 8172], [8173, 8686], [8687, 8976]]
4002 (8488, 24) (8488, 24, 24)
     8488 [[0, 473], [474, 1103], [1104, 3047], [3048, 3321], [3322, 3639], [3640, 5149], [5150, 6172], [6173, 6968], [6969, 8487]]
4003 (6935, 24) (6935, 24, 24)
     6935 [[0, 968], [969, 1116], [1117, 1630], [1631, 1637], [1638, 1748], [1749, 1956], [1957, 2186], [2187, 2374], [2375, 2411], [2412, 2553], [2554, 2604], [2605, 2624], [2625, 2716], [2717, 2791], [2792, 2862], [2863, 2872], [2875, 2929], [2930, 3137], [3138, 3155], [3156, 3160], [3161, 3474], [3475, 3718], [3720, 3872], [3873, 3951], [3952, 4105], [4106, 4192], [4193, 4336], [4337, 4409], [4410, 4485], [4486, 4633], [4634, 4683], [4684, 5159], [5160, 5458], [5459, 5527], [5528, 5654], [5655, 5675], [5676, 5730], [5731, 5935], [5936, 5958], [5959, 6148], [6149, 6361], [6362, 6556], [6558, 6597], [6598, 

In [29]:
gen_final(pid2test_norm, attri2idx,  n_prev=24, pred_window=6, name='test_pid2data_npy', valid_points = 12)

4001 (9049, 24) (9049, 24, 24)
     9049 [[0, 882], [883, 1540], [1541, 1706], [1707, 2334], [2335, 8870], [8871, 9048]]
4002 (8570, 24) (8570, 24, 24)
     8570 [[0, 4982], [4983, 7729], [7730, 8569]]
4003 (7136, 24) (7136, 24, 24)
     7136 [[0, 40], [41, 90], [91, 223], [224, 291], [292, 361], [362, 418], [419, 706], [707, 1070], [1071, 1332], [1333, 1501], [1502, 1673], [1674, 1901], [1902, 2014], [2015, 2083], [2084, 2312], [2313, 2510], [2511, 2662], [2663, 2717], [2718, 2792], [2793, 2866], [2867, 3025], [3026, 3098], [3099, 3183], [3184, 3244], [3245, 3300], [3301, 3350], [3351, 3511], [3512, 3756], [3757, 4205], [4206, 4277], [4278, 4397], [4398, 4476], [4477, 4602], [4603, 4747], [4748, 4804], [4805, 4832], [4833, 4966], [4967, 5033], [5034, 5063], [5064, 5126], [5127, 5198], [5199, 5321], [5322, 5527], [5528, 5600], [5601, 5664], [5665, 5730], [5731, 5802], [5803, 5965], [5966, 6194], [6195, 6202], [6203, 6347], [6348, 6567], [6568, 6770], [6771, 6958], [6959, 7135]]
4004 (8

In [30]:
gen_final(pid2test_norm, attri2idx,  n_prev=24, pred_window=12, name='test_pid2data_npy', valid_points = 12)

4001 (9029, 24) (9029, 24, 24)
     9029 [[0, 876], [877, 1533], [1534, 1698], [1699, 2320], [2321, 8856], [8857, 9028]]
4002 (8564, 24) (8564, 24, 24)
     8564 [[0, 4982], [4983, 7729], [7730, 8563]]
4003 (6879, 24) (6879, 24, 24)
     6879 [[0, 34], [35, 84], [85, 211], [212, 273], [274, 337], [338, 394], [395, 682], [683, 1040], [1041, 1296], [1297, 1460], [1461, 1626], [1627, 1851], [1852, 1965], [1966, 2028], [2029, 2251], [2252, 2443], [2444, 2593], [2594, 2642], [2643, 2711], [2712, 2779], [2780, 2932], [2933, 2999], [3000, 3083], [3084, 3144], [3145, 3199], [3200, 3244], [3245, 3399], [3400, 3639], [3640, 4082], [4083, 4148], [4149, 4262], [4263, 4335], [4336, 4455], [4456, 4600], [4601, 4651], [4652, 4679], [4680, 4807], [4808, 4868], [4869, 4897], [4898, 4955], [4956, 5024], [5025, 5141], [5142, 5341], [5342, 5408], [5409, 5466], [5467, 5526], [5527, 5592], [5593, 5749], [5750, 5972], [5975, 6113], [6114, 6327], [6328, 6524], [6525, 6706], [6707, 6878]]
4004 (8083, 24) (8083