In [1]:
# Mount Google Drive at /content/drive; every dataset path used in this
# notebook is located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import os
pd.set_option('display.max_columns', None)
import pickle

In [3]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score # sensitivity
from sklearn.metrics import confusion_matrix

# T1DEXI

## data preprocess

In [None]:
def convert_to_datetime(date_str):
  """Parse a single timestamp value with ``pd.to_datetime``.

  The value is first parsed as given. Only if that raises ``ValueError`` is
  it retried with ' 00:00:00' appended, which requires ``date_str`` to
  support string concatenation.
  """
  try:
    parsed = pd.to_datetime(date_str)
  except ValueError:
    # Second attempt: add an explicit midnight time component.
    parsed = pd.to_datetime(date_str + ' 00:00:00')
  return parsed

In [None]:
# Read the T1DEXI CGM table (cgm_valid.csv) from the shared drive, then show
# its shape and the first five rows as a quick sanity check.
df = pd.read_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/T1DEXI_subset/cgm_valid.csv')
print(df.shape)
df.head(5)

(504623, 18)


Unnamed: 0.1,Unnamed: 0,STUDYID,DOMAIN,USUBJID,LBSEQ,LBTESTCD,LBTEST,LBCAT,LBORRES,LBORRESU,LBSTRESC,LBSTRESN,LBSTRESU,LBDTC,LBSOURCE,LBCRNORD,LBSETTNG,LBTMINT
0,1,T1DEXI,LB,1,1,GLUC,Glucose,CGM,115.0,mg/dL,115.0,115.0,mg/dL,2020-05-11 00:01:17,,,,
1,2,T1DEXI,LB,1,2,GLUC,Glucose,CGM,109.0,mg/dL,109.0,109.0,mg/dL,2020-05-11 00:06:17,,,,
2,3,T1DEXI,LB,1,3,GLUC,Glucose,CGM,105.0,mg/dL,105.0,105.0,mg/dL,2020-05-11 00:11:17,,,,
3,4,T1DEXI,LB,1,4,GLUC,Glucose,CGM,106.0,mg/dL,106.0,106.0,mg/dL,2020-05-11 00:16:18,,,,
4,5,T1DEXI,LB,1,5,GLUC,Glucose,CGM,110.0,mg/dL,110.0,110.0,mg/dL,2020-05-11 00:21:18,,,,


In [None]:
# Count the distinct subject ids present before any filtering.
print(len(pd.unique(df['USUBJID'])))
# Keep only the glucose test rows, reduced to subject id, reading and timestamp.
df = df.loc[df['LBTESTCD'] == 'GLUC', ['USUBJID', 'LBORRES', 'LBDTC']]
print(df.shape)
df.head(5)

64
(504559, 3)


Unnamed: 0,USUBJID,LBORRES,LBDTC
0,1,115.0,2020-05-11 00:01:17
1,1,109.0,2020-05-11 00:06:17
2,1,105.0,2020-05-11 00:11:17
3,1,106.0,2020-05-11 00:16:18
4,1,110.0,2020-05-11 00:21:18


In [None]:
def daily_filter(df, date_col='LBDTC', max_per_day=350):
  """Drop every row that falls on a calendar day holding too many readings.

  Parameters
  ----------
  df : DataFrame whose ``date_col`` column is datetime-like (read via ``.dt``).
  date_col : name of the timestamp column.
  max_per_day : a day with strictly more than this many rows is removed
    entirely; a day with exactly ``max_per_day`` rows is kept.

  Returns
  -------
  A new DataFrame containing the rows of the remaining days. ``df`` itself is
  not modified.
  """
  # Derive the calendar day once and reuse it for both counting and masking.
  day = df[date_col].dt.date
  samples_per_day = df.groupby(day).size()
  dates_over_limit = samples_per_day[samples_per_day > max_per_day]
  return df[~day.isin(dates_over_limit.index)]

In [None]:
def BG_value_filter(df, time_col='LBDTC', min_gap_minutes=.5):
  """Remove readings that follow the previous row too closely in time.

  A row is dropped when the gap to the row directly before it lies in the
  closed interval ``[0, min_gap_minutes]`` minutes. The first row (no
  predecessor) and rows with a negative gap are kept.

  Parameters
  ----------
  df : DataFrame whose ``time_col`` column is datetime-like.
  time_col : name of the timestamp column.
  min_gap_minutes : upper bound (inclusive) of the gaps treated as duplicates.

  Returns
  -------
  A new DataFrame with the same columns as ``df``. The input frame is left
  untouched: no helper column is inserted into it, so the function can be
  called repeatedly on the same frame.
  """
  # Gap to the previous row, expressed in minutes (NaN for the first row).
  gap_minutes = df[time_col].diff().dt.total_seconds() / 60
  too_close = gap_minutes.between(0, min_gap_minutes)
  return df[~too_close]

In [None]:
# uid 1127 has 3 days with more than 350 records, and these 3 days data are deleted
# uid 1509 is removed, all days have > 350 records

# Work on an explicit copy: assigning a column on a filtered slice of `df`
# triggers pandas' SettingWithCopyWarning and is not guaranteed to behave
# consistently.
cur_df = df[df['USUBJID'] == 1127].copy()
cur_df['LBDTC'] = cur_df['LBDTC'].apply(convert_to_datetime)
# Count readings per calendar day and list the days above the 350 threshold.
samples_per_day = cur_df.groupby(cur_df['LBDTC'].dt.date).size()
dates_with_350_plus_records = samples_per_day[samples_per_day > 350]
print(dates_with_350_plus_records)

LBDTC
2019-07-23    419
2019-07-24    576
2019-07-25    502
dtype: int64


In [None]:
# Kept so that warning behaviour in other cells is unchanged; the loop below
# no longer depends on it because it works on explicit copies.
pd.options.mode.chained_assignment = None  # Disable the warning

out_dir = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/T1DEXI_processed/'
# Make sure the target folder exists; to_csv does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

uid_lst = pd.unique(df['USUBJID'])

# For each subject: parse timestamps, drop over-dense days, drop readings that
# follow the previous one too closely, then write one CSV per subject.
for uid in uid_lst:
  print(uid)
  # .copy() gives an independent frame, so the column assignment is well defined.
  cur_df = df[df['USUBJID'] == uid].copy()
  print(cur_df.shape)
  cur_df['LBDTC'] = cur_df['LBDTC'].apply(convert_to_datetime)
  cur_df = daily_filter(cur_df)
  cur_df = BG_value_filter(cur_df)
  print(cur_df.shape)

  cur_df.to_csv(out_dir + str(uid) + '.csv', index=False)


1
(7887, 3)
(7887, 3)
1010
(7674, 3)
(7674, 3)
1015
(7242, 3)
(7242, 3)
103
(7218, 3)
(7215, 3)
1043
(7911, 3)
(7911, 3)
1082
(7959, 3)
(7959, 3)
11
(8003, 3)
(8003, 3)
1115
(7944, 3)
(7944, 3)
1121
(7935, 3)
(7935, 3)
1127
(8643, 3)
(7146, 3)
1139
(5832, 3)
(5832, 3)
114
(7812, 3)
(7812, 3)
1143
(7838, 3)
(7838, 3)
115
(7997, 3)
(7997, 3)
1171
(7947, 3)
(7947, 3)
1194
(7992, 3)
(7992, 3)
1201
(7948, 3)
(7948, 3)
1205
(7735, 3)
(7735, 3)
1211
(7926, 3)
(7926, 3)
1219
(7972, 3)
(7972, 3)
1230
(7883, 3)
(7883, 3)
1239
(7887, 3)
(7885, 3)
1271
(7993, 3)
(7993, 3)
1286
(8014, 3)
(8014, 3)
1311
(7789, 3)
(7789, 3)
1330
(7962, 3)
(7962, 3)
1336
(7890, 3)
(7890, 3)
1343
(7602, 3)
(7602, 3)
1345
(7637, 3)
(7637, 3)
1348
(7920, 3)
(7920, 3)
1361
(7798, 3)
(7798, 3)
1362
(6789, 3)
(6788, 3)
1363
(7883, 3)
(7883, 3)
1377
(7930, 3)
(7930, 3)
1381
(7875, 3)
(7875, 3)
1386
(7583, 3)
(7582, 3)
1408
(7648, 3)
(7648, 3)
1422
(7621, 3)
(7621, 3)
1427
(7725, 3)
(7725, 3)
1433
(7868, 3)
(7868, 3)
1435
(73

## baseline functions

In [None]:
def find_continuous_rows(df, seq_len=6, pred_len=6, max_gap_minutes=7, time_col='LBDTC'):
  """Return every sliding window of consecutive rows without a large time gap.

  A window holds ``seq_len + pred_len`` consecutive rows of ``df``. It is kept
  when each gap between neighbouring timestamps inside it, truncated to whole
  minutes, is at most ``max_gap_minutes``.

  Parameters
  ----------
  df : DataFrame with a timestamp column ``time_col`` (anything accepted by
    ``pd.to_datetime``).
  seq_len, pred_len : number of input rows and forecast rows per window.
  max_gap_minutes : largest accepted gap, as an integer number of minutes.
  time_col : name of the timestamp column.

  Returns
  -------
  list of DataFrame slices (``df.iloc[i:i + seq_len + pred_len]``) in row
  order; empty when ``df`` has fewer rows than one window.
  """
  window = seq_len + pred_len
  n_windows = len(df) - window + 1
  if n_windows <= 0:
    return []

  # Parse and diff the timestamps once for the whole frame instead of once per
  # window; the cast truncates each gap to whole minutes.
  gaps = np.diff(pd.to_datetime(df[time_col])).astype('timedelta64[m]')
  # NaT gaps compare False and are therefore treated as breaks.
  is_break = ~(gaps <= np.timedelta64(max_gap_minutes, 'm'))

  # breaks_before[k] = number of breaks among the first k gaps, so the window
  # starting at row i (gaps i .. i+window-2) is clean when the count does not
  # change between i and i+window-1.
  breaks_before = np.concatenate(([0], np.cumsum(is_break)))
  breaks_in_window = breaks_before[window - 1:] - breaks_before[:n_windows]

  return [df.iloc[i:i + window] for i in np.flatnonzero(breaks_in_window == 0)]

In [None]:
def data_process(cgm_df, test_ratio=0.2, seq_len=6):
  """Return the test portion of the continuous windows found in ``cgm_df``.

  The windows come from ``find_continuous_rows``. The test portion is the last
  ``int(n * test_ratio)`` windows plus the ``seq_len`` windows directly before
  them.
  NOTE(review): the extra ``seq_len`` windows overlap what would otherwise be
  validation data; this is kept as in the original split -- confirm it is
  intended.

  Parameters
  ----------
  cgm_df : frame passed unchanged to ``find_continuous_rows``.
  test_ratio : share of windows assigned to the test split.
  seq_len : number of extra leading windows included before the test split.

  Returns
  -------
  list of window DataFrames; an empty list (rather than ``None``) when no
  continuous window exists.
  """
  continuous_segments = find_continuous_rows(cgm_df)
  print(len(continuous_segments))
  if not continuous_segments:
    return []

  num_test = int(len(continuous_segments) * test_ratio)
  # Clamp at 0 so a short segment list cannot turn the start into a negative
  # (wrap-around) index.
  test_start = max(len(continuous_segments) - num_test - seq_len, 0)
  test_segments = continuous_segments[test_start:]
  print(f"Number of test segments: {len(test_segments)}")

  return test_segments

In [None]:
def data_process_population(cgm_df):
  """Return all continuous windows of ``cgm_df`` (no train/test split).

  This is a thin wrapper around ``find_continuous_rows``. It always returns
  that list, so an input without any continuous window yields ``[]`` instead
  of an implicit ``None``.
  """
  return find_continuous_rows(cgm_df)

In [None]:
def get_seq_pred(test_segments):
  """Split each segment into its input values and its final target value.

  For every segment the first six 'LBORRES' values form the feature array and
  the last 'LBORRES' value of the segment is the target.

  Returns ``(features_list, trues_list)``, two lists aligned with the order of
  ``test_segments``.
  """
  # Single pass over the segments, collecting (features, target) pairs.
  pairs = [
      (seg['LBORRES'].iloc[:6].values, seg['LBORRES'].values[-1])
      for seg in test_segments
  ]
  features_list = [feats for feats, _ in pairs]
  trues_list = [target for _, target in pairs]
  return features_list, trues_list

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def get_mse_rmse_mae(true, pred):
  """Compute MSE, RMSE, MAE and MAPE (in percent) for paired values.

  MSE and MAE come from the sklearn helpers; RMSE is the square root of MSE.
  MAPE divides by ``true`` element-wise, so zeros in ``true`` yield
  non-finite results.

  Returns the tuple ``(mse, rmse, mae, mape)``.
  """
  mse = mean_squared_error(true, pred)
  mae = mean_absolute_error(true, pred)
  rmse = np.sqrt(mse)

  true_arr = np.array(true)
  pred_arr = np.array(pred)
  # Relative absolute error per sample, averaged and scaled to percent.
  relative_error = np.abs((true_arr - pred_arr) / true_arr)
  mape = np.mean(relative_error) * 100

  return mse, rmse, mae, mape

In [None]:
def get_baseline_performance(test_segments):
    """Score the "repeat the last input value" baseline on the given segments.

    The prediction for each segment is the last value of its feature array and
    the ground truth is the segment's final value, both taken from
    ``get_seq_pred``. Prints the mean/std of the absolute difference and the
    rounded metrics, then returns ``(mse, rmse, mae, mape)`` unrounded.
    """
    history, targets = get_seq_pred(test_segments)

    # Persistence forecast: carry the last observed input value forward.
    last_seen = [window[-1] for window in history]

    abs_err = np.abs(np.array(targets) - np.array(last_seen))
    avg_err = round(np.mean(abs_err), 2)
    std_err = round(np.std(abs_err), 2)
    print('30 mins diff avg:', avg_err, 'std:', std_err)

    mse, rmse, mae, mape = get_mse_rmse_mae(targets, last_seen)
    print('MSE:', round(mse, 2), 'RMSE:', round(rmse, 2), 'MAE:', round(mae, 2), 'MAPE', round(mape, 2))
    return mse, rmse, mae, mape

## baseline calculation

### population (last 20% data (test))


In [None]:
# individual

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/T1DEXI_processed/'

# One [patient id, RMSE, MAE, MAPE] row per processed subject file.
res_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    df['LBDTC'] = df['LBDTC'].apply(convert_to_datetime)
    test_segments = data_process(df)
    # A subject without any continuous window yields nothing to score; skip it
    # instead of failing on len()/metric computation.
    if not test_segments:
      print('no continuous segments, skipped')
      continue
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)

    res_lst.append([file.split('.')[0], rmse, mae, mape])

1127.csv
7091
Number of test segments: 1424
1424
30 mins diff avg: 18.58 std: 15.82
MSE: 595.34 RMSE: 24.4 MAE: 18.58 MAPE 15.37
1.csv
7799
Number of test segments: 1565
1565
30 mins diff avg: 26.7 std: 25.97
MSE: 1386.93 RMSE: 37.24 MAE: 26.7 MAPE 21.09
1010.csv
7552
Number of test segments: 1516
1516
30 mins diff avg: 22.58 std: 20.79
MSE: 942.08 RMSE: 30.69 MAE: 22.58 MAPE 20.56
1015.csv
6173
Number of test segments: 1240
1240
30 mins diff avg: 10.23 std: 10.57
MSE: 216.44 RMSE: 14.71 MAE: 10.23 MAPE 7.22
103.csv
5701
Number of test segments: 1146
1146
30 mins diff avg: 10.77 std: 10.5
MSE: 226.25 RMSE: 15.04 MAE: 10.77 MAPE 8.93
1043.csv
7807
Number of test segments: 1567
1567
30 mins diff avg: 18.48 std: 19.0
MSE: 702.47 RMSE: 26.5 MAE: 18.48 MAPE 14.4
1082.csv
7890
Number of test segments: 1584
1584
30 mins diff avg: 16.66 std: 16.26
MSE: 541.98 RMSE: 23.28 MAE: 16.66 MAPE 14.64
11.csv
7970
Number of test segments: 1600
1600
30 mins diff avg: 14.05 std: 11.89
MSE: 338.81 RMSE: 18

In [None]:
# Show the collected per-patient rows, then persist them as a CSV table
# with one row per patient and one column per metric.
print(res_lst)

df = pd.DataFrame(res_lst, columns=['Test patient', 'RMSE', 'MAE', 'MAPE'])
df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/baseline_performance/T1DEXI_20test.csv', index=False)

[['1127', 24.39950151452491, 18.58005617977528, 15.372112304153607], ['1', 37.24145376585411, 26.69712460063898, 21.086846556423446], ['1010', 30.693361722461383, 22.579815303430077, 20.559094688344974], ['1015', 14.712047948491389, 10.228225806451613, 7.224643622495923], ['103', 15.041652465754005, 10.773123909249565, 8.931133253033899], ['1043', 26.50414471928069, 18.477345245692405, 14.402062797543866], ['1082', 23.280527368284734, 16.660984848484848, 14.635630578447776], ['11', 18.406843156826213, 14.053125, 14.501071920515612], ['1115', 21.0022883782423, 14.98408656906429, 6.670999285589993], ['1121', 30.084426083838324, 21.736409608091023, 19.607367644981586], ['1139', 23.237311649352904, 16.75296262534184, 11.051100448389892], ['114', 24.90495516399436, 17.923027166882278, 11.884958106867128], ['1143', 22.051773211113566, 15.79987004548408, 10.179903612061993], ['115', 31.57861888769475, 23.8285175879397, 17.560227992717238], ['1171', 24.025613058886172, 17.544933078393882, 14.3

### loocv (whole data)

In [None]:
# population

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/T1DEXI_processed/'

# One [patient id, RMSE, MAE, MAPE] row per processed subject file, computed
# over all continuous windows of that subject.
res_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file.split('.')[0].split('_')[-1])
    df = pd.read_csv(root + file)
    df['LBDTC'] = df['LBDTC'].apply(convert_to_datetime)
    test_segments = data_process_population(df)
    # A subject without any continuous window yields nothing to score; skip it
    # instead of failing on len()/metric computation.
    if not test_segments:
      print('no continuous segments, skipped')
      continue
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)
    res_lst.append([file.split('.')[0], rmse, mae, mape])

1127
7091
30 mins diff avg: 17.61 std: 15.69
MSE: 556.2 RMSE: 23.58 MAE: 17.61 MAPE 14.28
1
7799
30 mins diff avg: 22.41 std: 23.49
MSE: 1053.69 RMSE: 32.46 MAE: 22.41 MAPE 15.42
1010
7552
30 mins diff avg: 19.02 std: 20.03
MSE: 762.95 RMSE: 27.62 MAE: 19.02 MAPE 16.08
1015
6173
30 mins diff avg: 9.35 std: 9.69
MSE: 181.39 RMSE: 13.47 MAE: 9.35 MAPE 7.18
103
5701
30 mins diff avg: 9.64 std: 9.47
MSE: 182.62 RMSE: 13.51 MAE: 9.64 MAPE 8.06
1043
7807
30 mins diff avg: 16.81 std: 16.95
MSE: 569.77 RMSE: 23.87 MAE: 16.81 MAPE 12.62
1082
7890
30 mins diff avg: 14.16 std: 14.23
MSE: 402.98 RMSE: 20.07 MAE: 14.16 MAPE 12.92
11
7970
30 mins diff avg: 15.21 std: 14.08
MSE: 429.48 RMSE: 20.72 MAE: 15.21 MAPE 15.64
1115
7827
30 mins diff avg: 17.23 std: 18.61
MSE: 642.91 RMSE: 25.36 MAE: 17.23 MAPE 10.15
1121
7880
30 mins diff avg: 20.46 std: 19.9
MSE: 814.87 RMSE: 28.55 MAE: 20.46 MAPE 17.68
1139
5457
30 mins diff avg: 16.89 std: 15.46
MSE: 524.24 RMSE: 22.9 MAE: 16.89 MAPE 12.53
114
7700
30 min

In [None]:
# Show the collected per-patient rows, then persist them as a CSV table
# with one row per patient and one column per metric.
print(res_lst)

df = pd.DataFrame(res_lst, columns=['Test patient', 'RMSE', 'MAE', 'MAPE'])
df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/baseline_performance/T1DEXI_loocv.csv', index=False)

[['1127', 23.583880251135547, 17.61162036384149, 14.280914861912159], ['1', 32.460614579981645, 22.408001025772535, 15.422695831246823], ['1010', 27.62154380557665, 19.018273305084747, 16.083398222570803], ['1015', 13.46805954455484, 9.349100923375993, 7.181502798502512], ['103', 13.51357591114699, 9.63795825293808, 8.058932058368686], ['1043', 23.869956377662273, 16.811323171512743, 12.615797498450402], ['1082', 20.07432640169085, 14.158301647655259, 12.92493202339699], ['11', 20.723957974479198, 15.206398996235885, 15.642455109627896], ['1115', 25.35568365775996, 17.22677909799412, 10.154811379927672], ['1121', 28.54588878450734, 20.46294416243655, 17.6790618802732], ['1139', 22.896329206861587, 16.890599230346343, 12.526402604403945], ['114', 24.426069211865777, 17.905064935064935, 12.803172937243538], ['1143', 21.324485075934422, 15.626581452980306, 11.21157189103998], ['115', 27.766280532981682, 20.11373092926491, 14.58248719939995], ['1171', 24.929650250599096, 18.017656090071647

# DiaTrend

## data preprocess

In [14]:
def daily_filter(df, date_col='date', max_per_day=350):
  """Drop every row that falls on a calendar day holding too many readings.

  Parameters
  ----------
  df : DataFrame whose ``date_col`` column is datetime-like (read via ``.dt``).
  date_col : name of the timestamp column.
  max_per_day : a day with strictly more than this many rows is removed
    entirely; a day with exactly ``max_per_day`` rows is kept.

  Returns
  -------
  A new DataFrame containing the rows of the remaining days. ``df`` itself is
  not modified. The per-day counts of the removed days are printed.
  """
  # Derive the calendar day once and reuse it for both counting and masking.
  day = df[date_col].dt.date
  samples_per_day = df.groupby(day).size()
  dates_over_limit = samples_per_day[samples_per_day > max_per_day]
  print('-----', dates_over_limit)
  return df[~day.isin(dates_over_limit.index)]

In [15]:
def BG_value_filter(df, time_col='date', min_gap_minutes=.5):
  """Remove readings that follow the previous row too closely in time.

  A row is dropped when the gap to the row directly before it lies in the
  closed interval ``[0, min_gap_minutes]`` minutes. The first row (no
  predecessor) and rows with a negative gap are kept.

  Parameters
  ----------
  df : DataFrame whose ``time_col`` column is datetime-like.
  time_col : name of the timestamp column.
  min_gap_minutes : upper bound (inclusive) of the gaps treated as duplicates.

  Returns
  -------
  A new DataFrame with the same columns as ``df``. The input frame is left
  untouched: no helper column is inserted into it, so the function can be
  called repeatedly on the same frame.
  """
  # Gap to the previous row, expressed in minutes (NaN for the first row).
  gap_minutes = df[time_col].diff().dt.total_seconds() / 60
  too_close = gap_minutes.between(0, min_gap_minutes)
  return df[~too_close]

In [16]:
root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_subset/'

# For each subject file: parse timestamps, drop over-dense days, then drop
# readings that follow the previous one too closely, reporting the shape after
# each step.
for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    print("original size:", df.shape)
    df['date'] = pd.to_datetime(df['date'])
    df = daily_filter(df)
    # The labels were previously swapped: this shape is the result of the
    # per-day (> 350 rows) filter.
    print('remove days with > 350 values: ', df.shape)
    df = BG_value_filter(df)
    # ... and this one is the result of the near-duplicate reading filter.
    print('remove duplicated values: ', df.shape)


    # df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/' + file, index=False)

processed_cgm_data_Subject44.csv
original size: (13152, 2)
----- date
2021-10-27    412
2021-10-28    550
2021-10-29    571
2021-10-30    574
2021-10-31    576
dtype: int64
remove duplicated values:  (10469, 2)
remove days with > 350 values:  (10436, 2)
processed_cgm_data_Subject54.csv
original size: (8423, 2)
----- Series([], dtype: int64)
remove duplicated values:  (8423, 2)
remove days with > 350 values:  (8423, 2)
processed_cgm_data_Subject46.csv
original size: (11720, 2)
----- Series([], dtype: int64)
remove duplicated values:  (11720, 2)
remove days with > 350 values:  (11720, 2)
processed_cgm_data_Subject47.csv
original size: (11825, 2)
----- Series([], dtype: int64)
remove duplicated values:  (11825, 2)
remove days with > 350 values:  (11825, 2)
processed_cgm_data_Subject22.csv
original size: (11981, 2)
----- Series([], dtype: int64)
remove duplicated values:  (11981, 2)
remove days with > 350 values:  (11969, 2)
processed_cgm_data_Subject19.csv
original size: (11886, 2)
----- 

## baseline functions

In [None]:
def find_continuous_rows(df, seq_len=6, pred_len=6, max_gap_minutes=7, time_col='date'):
  """Return every sliding window of consecutive rows without a large time gap.

  A window holds ``seq_len + pred_len`` consecutive rows of ``df``. It is kept
  when each gap between neighbouring timestamps inside it, truncated to whole
  minutes, is at most ``max_gap_minutes``.

  Parameters
  ----------
  df : DataFrame with a timestamp column ``time_col`` (anything accepted by
    ``pd.to_datetime``).
  seq_len, pred_len : number of input rows and forecast rows per window.
  max_gap_minutes : largest accepted gap, as an integer number of minutes.
  time_col : name of the timestamp column.

  Returns
  -------
  list of DataFrame slices (``df.iloc[i:i + seq_len + pred_len]``) in row
  order; empty when ``df`` has fewer rows than one window.
  """
  window = seq_len + pred_len
  n_windows = len(df) - window + 1
  if n_windows <= 0:
    return []

  # Parse and diff the timestamps once for the whole frame instead of once per
  # window; the cast truncates each gap to whole minutes.
  gaps = np.diff(pd.to_datetime(df[time_col])).astype('timedelta64[m]')
  # NaT gaps compare False and are therefore treated as breaks.
  is_break = ~(gaps <= np.timedelta64(max_gap_minutes, 'm'))

  # breaks_before[k] = number of breaks among the first k gaps, so the window
  # starting at row i (gaps i .. i+window-2) is clean when the count does not
  # change between i and i+window-1.
  breaks_before = np.concatenate(([0], np.cumsum(is_break)))
  breaks_in_window = breaks_before[window - 1:] - breaks_before[:n_windows]

  return [df.iloc[i:i + window] for i in np.flatnonzero(breaks_in_window == 0)]

In [None]:
def data_process(cgm_df, test_ratio=0.2, seq_len=6):
  """Return the test portion of the continuous windows found in ``cgm_df``.

  The windows come from ``find_continuous_rows``. The test portion is the last
  ``int(n * test_ratio)`` windows plus the ``seq_len`` windows directly before
  them.
  NOTE(review): the extra ``seq_len`` windows overlap what would otherwise be
  validation data; this is kept as in the original split -- confirm it is
  intended.

  Parameters
  ----------
  cgm_df : frame passed unchanged to ``find_continuous_rows``.
  test_ratio : share of windows assigned to the test split.
  seq_len : number of extra leading windows included before the test split.

  Returns
  -------
  list of window DataFrames; an empty list (rather than ``None``) when no
  continuous window exists.
  """
  continuous_segments = find_continuous_rows(cgm_df)
  print(len(continuous_segments))
  if not continuous_segments:
    return []

  num_test = int(len(continuous_segments) * test_ratio)
  # Clamp at 0 so a short segment list cannot turn the start into a negative
  # (wrap-around) index.
  test_start = max(len(continuous_segments) - num_test - seq_len, 0)
  test_segments = continuous_segments[test_start:]
  print(f"Number of test segments: {len(test_segments)}")

  return test_segments

In [None]:
def data_process_population(cgm_df):
  """Return all continuous windows of ``cgm_df`` (no train/test split).

  This is a thin wrapper around ``find_continuous_rows``. It always returns
  that list, so an input without any continuous window yields ``[]`` instead
  of an implicit ``None``.
  """
  return find_continuous_rows(cgm_df)

In [None]:
def get_seq_pred(test_segments):
  """Split each segment into its input values and its final target value.

  For every segment the first six 'mg/dl' values form the feature array and
  the last 'mg/dl' value of the segment is the target.

  Returns ``(features_list, trues_list)``, two lists aligned with the order of
  ``test_segments``.
  """
  # Single pass over the segments, collecting (features, target) pairs.
  pairs = [
      (seg['mg/dl'].iloc[:6].values, seg['mg/dl'].values[-1])
      for seg in test_segments
  ]
  features_list = [feats for feats, _ in pairs]
  trues_list = [target for _, target in pairs]
  return features_list, trues_list

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def get_mse_rmse_mae(true, pred):
  """Compute MSE, RMSE, MAE and MAPE (in percent) for paired values.

  MSE and MAE come from the sklearn helpers; RMSE is the square root of MSE.
  MAPE divides by ``true`` element-wise, so zeros in ``true`` yield
  non-finite results.

  Returns the tuple ``(mse, rmse, mae, mape)``.
  """
  mse = mean_squared_error(true, pred)
  mae = mean_absolute_error(true, pred)
  rmse = np.sqrt(mse)

  true_arr = np.array(true)
  pred_arr = np.array(pred)
  # Relative absolute error per sample, averaged and scaled to percent.
  relative_error = np.abs((true_arr - pred_arr) / true_arr)
  mape = np.mean(relative_error) * 100

  return mse, rmse, mae, mape

In [None]:
def get_baseline_performance(test_segments):
    """Score the "repeat the last input value" baseline on the given segments.

    The prediction for each segment is the last value of its feature array and
    the ground truth is the segment's final value, both taken from
    ``get_seq_pred``. Prints the mean/std of the absolute difference and the
    rounded metrics, then returns ``(mse, rmse, mae, mape)`` unrounded.
    """
    history, targets = get_seq_pred(test_segments)

    # Persistence forecast: carry the last observed input value forward.
    last_seen = [window[-1] for window in history]

    abs_err = np.abs(np.array(targets) - np.array(last_seen))
    avg_err = round(np.mean(abs_err), 2)
    std_err = round(np.std(abs_err), 2)
    print('30 mins diff avg:', avg_err, 'std:', std_err)

    mse, rmse, mae, mape = get_mse_rmse_mae(targets, last_seen)
    print('MSE:', round(mse, 2), 'RMSE:', round(rmse, 2), 'MAE:', round(mae, 2), 'MAPE', round(mape, 2))
    return mse, rmse, mae, mape

## baseline calculation

### population

In [None]:
# individual

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/'

# Per-subject metric values, one entry per scored file.
MSE_lst = []
RMSE_lst = []
MAE_lst = []
MAPE_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    df['date'] = df['date'].apply(convert_to_datetime)
    test_segments = data_process(df)
    # A subject without any continuous window yields nothing to score; skip it
    # instead of failing on len()/metric computation.
    if not test_segments:
      print('no continuous segments, skipped')
      continue
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)
    MSE_lst.append(mse)
    RMSE_lst.append(rmse)
    MAE_lst.append(mae)
    MAPE_lst.append(mape)

processed_cgm_data_Subject1.csv
8611
Number of test segments: 1728
1728
----- 30 mins differnece (between features[-1], and pred[5]) ----
30 mins diff avg: 16.03 std: 14.58
MSE: 469.63 RMSE: 21.67 MAE: 16.03 MAPE 9.94
processed_cgm_data_Subject10.csv
11829
Number of test segments: 2371
2371
----- 30 mins differnece (between features[-1], and pred[5]) ----
30 mins diff avg: 23.59 std: 21.62
MSE: 1023.83 RMSE: 32.0 MAE: 23.59 MAPE 11.25
processed_cgm_data_Subject11.csv
11623
Number of test segments: 2330
2330
----- 30 mins differnece (between features[-1], and pred[5]) ----
30 mins diff avg: 21.76 std: 21.62
MSE: 941.14 RMSE: 30.68 MAE: 21.76 MAPE 13.07
processed_cgm_data_Subject12.csv
11783
Number of test segments: 2362
2362
----- 30 mins differnece (between features[-1], and pred[5]) ----
30 mins diff avg: 25.96 std: 23.49
MSE: 1225.56 RMSE: 35.01 MAE: 25.96 MAPE 14.45
processed_cgm_data_Subject13.csv
10688
Number of test segments: 2143
2143
----- 30 mins differnece (between features[-

In [None]:
# Summarise each metric across subjects: mean followed by standard deviation
# (np.std with its default settings), both rounded to two decimals.
print('MSE:', round(np.mean(MSE_lst), 2), round(np.std(MSE_lst), 2))
print('RMSE:', round(np.mean(RMSE_lst), 2), round(np.std(RMSE_lst), 2))
print('MAE:', round(np.mean(MAE_lst), 2), round(np.std(MAE_lst), 2))
print('MAPE:', round(np.mean(MAPE_lst), 2), round(np.std(MAPE_lst), 2))

MSE: 859.11 356.51
RMSE: 28.73 5.81
MAE: 20.82 4.2
MAPE: 12.42 2.81


In [None]:
# individual

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/'

# One [subject name, RMSE, MAE, MAPE] row per processed subject file.
res_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    df['date'] = df['date'].apply(convert_to_datetime)
    test_segments = data_process(df)
    # A subject without any continuous window yields nothing to score; skip it
    # instead of failing on len()/metric computation.
    if not test_segments:
      print('no continuous segments, skipped')
      continue
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)

    # The subject name is the last '_'-separated token of the file stem.
    res_lst.append([file.split('.')[0].split('_')[-1], rmse, mae, mape])

processed_cgm_data_Subject1.csv
8611
Number of test segments: 1728
1728
30 mins diff avg: 16.03 std: 14.58
MSE: 469.63 RMSE: 21.67 MAE: 16.03 MAPE 9.94
processed_cgm_data_Subject10.csv
11829
Number of test segments: 2371
2371
30 mins diff avg: 23.59 std: 21.62
MSE: 1023.83 RMSE: 32.0 MAE: 23.59 MAPE 11.25
processed_cgm_data_Subject11.csv
11623
Number of test segments: 2330
2330
30 mins diff avg: 21.76 std: 21.62
MSE: 941.14 RMSE: 30.68 MAE: 21.76 MAPE 13.07
processed_cgm_data_Subject12.csv
11783
Number of test segments: 2362
2362
30 mins diff avg: 25.96 std: 23.49
MSE: 1225.56 RMSE: 35.01 MAE: 25.96 MAPE 14.45
processed_cgm_data_Subject13.csv
10688
Number of test segments: 2143
2143
30 mins diff avg: 12.78 std: 13.89
MSE: 356.26 RMSE: 18.87 MAE: 12.78 MAPE 10.53
processed_cgm_data_Subject14.csv
11428
Number of test segments: 2291
2291
30 mins diff avg: 16.97 std: 20.49
MSE: 708.07 RMSE: 26.61 MAE: 16.97 MAPE 5.5
processed_cgm_data_Subject15.csv
11857
Number of test segments: 2377
2377


In [None]:
# Show the collected per-subject rows, then persist them as a CSV table
# with one row per subject and one column per metric.
print(res_lst)

df = pd.DataFrame(res_lst, columns=['Test patient', 'RMSE', 'MAE', 'MAPE'])
df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/baseline_performance/diatrend_20test.csv', index=False)

[['Subject1', 21.671059917848112, 16.03298611111111, 9.935905878186189], ['Subject10', 31.997304557502847, 23.587937579080556, 11.252483826318436], ['Subject11', 30.678017741245746, 21.763090128755366, 13.065541170101525], ['Subject12', 35.00801287253616, 25.95977984758679, 14.450174659625697], ['Subject13', 18.874885851635568, 12.784881007932805, 10.525128160180735], ['Subject14', 26.609598108195907, 16.971628109995635, 5.5016374320356745], ['Subject15', 31.451915206079526, 22.784602440050485, 13.40479668257849], ['Subject16', 32.667164652121855, 22.564810879728007, 10.926822805100135], ['Subject17', 29.793201922636836, 22.11836051861146, 10.12098679434024], ['Subject18', 21.421931578230122, 15.75593220338983, 8.962439985103675], ['Subject19', 31.79152162853529, 22.406779661016948, 9.01694745275826], ['Subject2', 25.340244988669525, 18.349195710455763, 10.033740674636066], ['Subject20', 33.13533943663976, 23.321849105974707, 14.04040640689112], ['Subject21', 28.77540610503349, 19.4568

### loocv

In [None]:
# population

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/'

# Per-subject metric values over all continuous windows, one entry per scored file.
MSE_lst = []
RMSE_lst = []
MAE_lst = []
MAPE_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    df['date'] = df['date'].apply(convert_to_datetime)
    test_segments = data_process_population(df)
    # A subject without any continuous window yields nothing to score; skip it
    # instead of failing on len()/metric computation.
    if not test_segments:
      print('no continuous segments, skipped')
      continue
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)
    MSE_lst.append(mse)
    RMSE_lst.append(rmse)
    MAE_lst.append(mae)
    MAPE_lst.append(mape)

processed_cgm_data_Subject1.csv
8611
30 mins diff avg: 15.71 std: 14.63
MSE: 460.77 RMSE: 21.47 MAE: 15.71 MAPE 9.9
processed_cgm_data_Subject10.csv
11829
30 mins diff avg: 26.19 std: 24.84
MSE: 1303.25 RMSE: 36.1 MAE: 26.19 MAPE 14.21
processed_cgm_data_Subject11.csv
11623
30 mins diff avg: 25.77 std: 25.8
MSE: 1330.01 RMSE: 36.47 MAE: 25.77 MAPE 15.74
processed_cgm_data_Subject12.csv
11783
30 mins diff avg: 26.08 std: 25.0
MSE: 1305.07 RMSE: 36.13 MAE: 26.08 MAPE 14.63
processed_cgm_data_Subject13.csv
10688
30 mins diff avg: 14.75 std: 16.36
MSE: 485.07 RMSE: 22.02 MAE: 14.75 MAPE 11.43
processed_cgm_data_Subject14.csv
11428
30 mins diff avg: 18.03 std: 21.72
MSE: 796.97 RMSE: 28.23 MAE: 18.03 MAPE 7.09
processed_cgm_data_Subject15.csv
11857
30 mins diff avg: 23.17 std: 20.63
MSE: 962.35 RMSE: 31.02 MAE: 23.17 MAPE 15.41
processed_cgm_data_Subject16.csv
11736
30 mins diff avg: 23.98 std: 25.53
MSE: 1227.17 RMSE: 35.03 MAE: 23.98 MAPE 10.85
processed_cgm_data_Subject17.csv
11929
30 mi

In [None]:
# Summarise each metric across subjects: mean followed by standard deviation
# (np.std with its default settings), both rounded to two decimals.
print('MSE:', round(np.mean(MSE_lst), 2), round(np.std(MSE_lst), 2))
print('RMSE:', round(np.mean(RMSE_lst), 2), round(np.std(RMSE_lst), 2))
print('MAE:', round(np.mean(MAE_lst), 2), round(np.std(MAE_lst), 2))
print('MAPE:', round(np.mean(MAPE_lst), 2), round(np.std(MAPE_lst), 2))

MSE: 852.27 311.77
RMSE: 28.72 5.23
MAE: 20.71 3.79
MAPE: 12.57 2.56


In [None]:
# population

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/'

# One [subject name, RMSE, MAE, MAPE] row per processed subject file, computed
# over all continuous windows of that subject.
res_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file.split('.')[0].split('_')[-1])
    df = pd.read_csv(root + file)
    df['date'] = df['date'].apply(convert_to_datetime)
    test_segments = data_process_population(df)
    # A subject without any continuous window yields nothing to score; skip it
    # instead of failing on len()/metric computation.
    if not test_segments:
      print('no continuous segments, skipped')
      continue
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)
    res_lst.append([file.split('.')[0].split('_')[-1], rmse, mae, mape])

Subject1
8611
30 mins diff avg: 15.71 std: 14.63
MSE: 460.77 RMSE: 21.47 MAE: 15.71 MAPE 9.9
Subject10
11829
30 mins diff avg: 26.19 std: 24.84
MSE: 1303.25 RMSE: 36.1 MAE: 26.19 MAPE 14.21
Subject11
11623
30 mins diff avg: 25.77 std: 25.8
MSE: 1330.01 RMSE: 36.47 MAE: 25.77 MAPE 15.74
Subject12
11783
30 mins diff avg: 26.08 std: 25.0
MSE: 1305.07 RMSE: 36.13 MAE: 26.08 MAPE 14.63
Subject13
10688
30 mins diff avg: 14.75 std: 16.36
MSE: 485.07 RMSE: 22.02 MAE: 14.75 MAPE 11.43
Subject14
11428
30 mins diff avg: 18.03 std: 21.72
MSE: 796.97 RMSE: 28.23 MAE: 18.03 MAPE 7.09
Subject15
11857
30 mins diff avg: 23.17 std: 20.63
MSE: 962.35 RMSE: 31.02 MAE: 23.17 MAPE 15.41
Subject16
11736
30 mins diff avg: 23.98 std: 25.53
MSE: 1227.17 RMSE: 35.03 MAE: 23.98 MAPE 10.85
Subject17
11929
30 mins diff avg: 22.23 std: 20.8
MSE: 926.71 RMSE: 30.44 MAE: 22.23 MAPE 10.33
Subject18
11771
30 mins diff avg: 17.74 std: 16.01
MSE: 570.98 RMSE: 23.9 MAE: 17.74 MAPE 11.31
Subject19
11771
30 mins diff avg: 22

In [None]:
# Show the collected per-subject rows, then persist them as a CSV table
# with one row per subject and one column per metric.
print(res_lst)

df = pd.DataFrame(res_lst, columns=['Test patient', 'RMSE', 'MAE', 'MAPE'])
df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/baseline_performance/diatrend_loocv.csv', index=False)

[['Subject1', 21.4656255660761, 15.711647892230868, 9.897633648018934], ['Subject10', 36.10057362441069, 26.19401470961197, 14.208083257291557], ['Subject11', 36.46932547961089, 25.77045513206573, 15.739481702456223], ['Subject12', 36.12570347643983, 26.081643045064926, 14.625285006497887], ['Subject13', 22.024268307053422, 14.749906437125748, 11.43444777049782], ['Subject14', 28.23072658817312, 18.029926496324816, 7.0855073592853115], ['Subject15', 31.02178429129553, 23.16580922661719, 15.414293539892507], ['Subject16', 35.03095086842119, 23.984832992501705, 10.854045352408038], ['Subject17', 30.44199232098296, 22.226506832089864, 10.325088980518764], ['Subject18', 23.89526143019642, 17.740463851839266, 11.310332396197799], ['Subject19', 31.434982112549026, 22.162857871038995, 10.123828025351044], ['Subject2', 30.302852116285862, 21.210653753026634, 12.882096417989713], ['Subject20', 33.15626808224288, 24.354813325172685, 15.089124524518184], ['Subject21', 28.543799903763972, 19.83651