In [1]:
# Mount Google Drive at /content/drive (Google Colab only); later cells read and
# write files below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import os
pd.set_option('display.max_columns', None)
import pickle

In [3]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score # sensitivity
from sklearn.metrics import confusion_matrix

# T1DEXI

## data preprocess

In [None]:
def convert_to_datetime(date_str):
  """Parse a single timestamp value with ``pd.to_datetime``.

  If the first parse raises ``ValueError``, the value is retried with a
  midnight time component (``' 00:00:00'``) appended. An error raised by the
  retry is propagated to the caller.
  """
  try:
    parsed = pd.to_datetime(date_str)
  except ValueError:
    with_midnight = date_str + ' 00:00:00'
    parsed = pd.to_datetime(with_midnight)
  return parsed

In [None]:
# Load the T1DEXI CSV (path is relative to the notebook's working directory),
# then show its shape and the first five rows as a sanity check.
df = pd.read_csv('../../datasets/t1dexi_subsetcgm_valid.csv')
print(df.shape)
df.head(5)

In [None]:
# Number of distinct USUBJID values before any filtering.
print(len(pd.unique(df['USUBJID'])))
# Keep only rows whose LBTESTCD equals 'GLUC'.
df = df[df['LBTESTCD'] == 'GLUC']
# Keep only the three columns used downstream: USUBJID (used as the per-subject key),
# LBORRES (used as the value series) and LBDTC (parsed as a timestamp later on).
df = df[['USUBJID', 'LBORRES', 'LBDTC']]
print(df.shape)
df.head(5)

In [None]:
def daily_filter(df):
  """Drop every calendar day that holds more than 350 records.

  Args:
    df: DataFrame with a datetime-typed ``LBDTC`` column.

  Returns:
    The rows of ``df`` whose ``LBDTC`` date has at most 350 records.
  """
  record_day = df['LBDTC'].dt.date
  day_sizes = df.groupby(record_day).size()
  overfull_days = day_sizes[day_sizes > 350].index
  on_overfull_day = record_day.isin(overfull_days)
  return df.loc[~on_overfull_day]

In [None]:
def BG_value_filter(df, time_col='LBDTC'):
  """Drop readings recorded within 0.5 minutes of the preceding row.

  The gap between each row and the row before it is measured in minutes. Rows
  whose gap lies in the closed interval [0, 0.5] are removed; the first row
  (no predecessor) and rows with a negative gap are kept.

  The input frame is not modified. The previous implementation inserted a
  temporary ``time_diffs`` column into the caller's frame, so calling the
  function twice on the same frame raised "already exists".

  Args:
    df: DataFrame with a datetime-typed timestamp column.
    time_col: Name of the timestamp column (defaults to ``'LBDTC'``).

  Returns:
    A new DataFrame with the same columns as ``df`` and the near-duplicate
    rows removed.
  """
  gap_minutes = df[time_col].diff().dt.total_seconds() / 60
  # between() is inclusive on both ends; NaN (first row) evaluates to False.
  is_near_duplicate = gap_minutes.between(0, .5)
  return df[~is_near_duplicate]

In [None]:
# uid 1127 has 3 days with more than 350 records, and these 3 days data are deleted
# uid 1509 is removed, all days have > 350 records

# Take an explicit copy: assigning a column on a filtered slice of `df` triggers
# pandas' SettingWithCopyWarning and may not write where intended.
cur_df = df[df['USUBJID'] == 1127].copy()
cur_df['LBDTC'] = cur_df['LBDTC'].apply(convert_to_datetime)
# Count records per calendar day and list the days exceeding the 350-record limit.
samples_per_day = cur_df.groupby(cur_df['LBDTC'].dt.date).size()
dates_with_350_plus_records = samples_per_day[samples_per_day > 350]
print(dates_with_350_plus_records)

LBDTC
2019-07-23    419
2019-07-24    576
2019-07-25    502
dtype: int64


In [None]:
# Kept so that the notebook-wide pandas setting stays as it was; the loop below
# no longer depends on it because it works on an explicit copy.
pd.options.mode.chained_assignment = None  # Disable the warning

uid_lst = pd.unique(df['USUBJID'])

# NOTE(review): the input CSV is read from '../../datasets/' while the output goes
# to '../datasets/' - confirm that the differing depth is intended.
out_dir = '../datasets/t1dexi_subset/'
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing directories

for uid in uid_lst:
  print(uid)
  # .copy() gives an independent frame, so the column assignment below cannot
  # hit a view of `df`.
  cur_df = df[df['USUBJID'] == uid].copy()
  print(cur_df.shape)
  cur_df['LBDTC'] = cur_df['LBDTC'].apply(convert_to_datetime)
  cur_df = daily_filter(cur_df)      # drop days with more than 350 records
  cur_df = BG_value_filter(cur_df)   # drop readings <= 0.5 min after the previous one
  print(cur_df.shape)

  cur_df.to_csv(out_dir + str(uid) + '.csv', index=False)


## baseline functions

In [None]:
def find_continuous_rows(df, seq_len=6, pred_len=6, max_gap_minutes=7, time_col='LBDTC'):
  """Return every sliding window of consecutive rows that has no large time gap.

  A window spans ``seq_len + pred_len`` consecutive rows. It is kept when every
  gap between neighbouring rows, truncated to whole minutes, is at most
  ``max_gap_minutes``. Truncation means a gap of e.g. 7 min 59 s still passes
  with the default limit; negative gaps also pass. This matches the original
  per-window check, so results are unchanged.

  Timestamps are parsed and differenced once for the whole frame instead of
  once per window, and the per-window check is done with a prefix sum.

  Args:
    df: DataFrame ordered by time, with a timestamp column.
    seq_len: Number of input rows per window.
    pred_len: Number of rows following the input rows.
    max_gap_minutes: Largest allowed gap between neighbouring rows (whole minutes).
    time_col: Name of the timestamp column.

  Returns:
    List of DataFrame slices (``df.iloc[i:i + seq_len + pred_len]``) in start
    order; empty when ``df`` has fewer rows than one window.

  Raises:
    ValueError: If ``seq_len + pred_len`` is smaller than 1.
  """
  window = seq_len + pred_len
  if window < 1:
    raise ValueError('seq_len + pred_len must be at least 1')

  n_rows = len(df)
  if n_rows < window:
    return []

  # gaps[k] is the time between row k and row k + 1, truncated to whole minutes.
  gaps = np.diff(pd.to_datetime(df[time_col])).astype('timedelta64[m]')
  gap_ok = gaps <= np.timedelta64(max_gap_minutes, 'm')

  # bad_before[k] = number of too-large gaps among gaps[0:k].
  bad_before = np.concatenate(([0], np.cumsum(~gap_ok)))
  starts = np.arange(n_rows - window + 1)
  # The window starting at row i covers gaps[i : i + window - 1].
  is_continuous = bad_before[starts + window - 1] == bad_before[starts]

  return [df.iloc[i:i + window] for i in np.flatnonzero(is_continuous)]

In [None]:
def data_process(cgm_df):
  """Return the test portion of the continuous windows found in ``cgm_df``.

  All continuous windows are collected with ``find_continuous_rows``. The test
  portion is the last ``int(0.2 * n)`` windows, extended by the 6 windows
  immediately before them.
  NOTE(review): the extra 6 presumably mirrors the 6-row input length - confirm.

  Args:
    cgm_df: Per-subject DataFrame accepted by ``find_continuous_rows``.

  Returns:
    List of window DataFrames. An empty list is returned when no continuous
    window exists (previously the function fell through and returned ``None``,
    which made the callers' ``len()`` fail).
  """
  continuous_segments = find_continuous_rows(cgm_df)
  n_segments = len(continuous_segments)
  print(n_segments)
  if not continuous_segments:
    return []

  num_test = int(n_segments * 0.2)
  # A negative start (very few windows) is interpreted by the slice as an
  # offset from the end, exactly as before.
  test_start = n_segments - num_test - 6
  test_segments = continuous_segments[test_start:n_segments]
  print(f"Number of test segments: {len(test_segments)}")

  return test_segments

In [None]:
def data_process_population(cgm_df):
  """Return all continuous windows of ``cgm_df`` (no train/test split).

  Args:
    cgm_df: Per-subject DataFrame accepted by ``find_continuous_rows``.

  Returns:
    List of window DataFrames. An empty list is returned when there is none
    (previously ``None`` was returned implicitly, which made the callers'
    ``len()`` fail).
  """
  return find_continuous_rows(cgm_df)

In [None]:
def get_seq_pred(test_segments):
  """Split each window into its input values and its final target value.

  Args:
    test_segments: Iterable of DataFrames that have an ``LBORRES`` column.

  Returns:
    ``(features_list, trues_list)``: for each window, the first six ``LBORRES``
    values (as an array) and the last ``LBORRES`` value.
  """
  pairs = [
    (window.iloc[:6]['LBORRES'].values, window['LBORRES'].values[-1])
    for window in test_segments
  ]
  features_list = [inputs for inputs, _ in pairs]
  trues_list = [target for _, target in pairs]
  return features_list, trues_list

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def get_mse_rmse_mae(true, pred):
  """Compute MSE, RMSE, MAE and MAPE between true and predicted values.

  Args:
    true: Sequence of reference values.
    pred: Sequence of predicted values, same length as ``true``.

  Returns:
    ``(mse, rmse, mae, mape)`` where ``mape`` is expressed in percent.
  """
  mse = mean_squared_error(true, pred)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(true, pred)

  # MAPE is computed by hand: mean of |true - pred| / true, times 100.
  true, pred = np.array(true), np.array(pred)
  relative_error = np.abs((true - pred) / true)
  mape = np.mean(relative_error) * 100

  return mse, rmse, mae, mape

In [None]:
def get_baseline_performance(test_segments):
    """Score the last-value baseline on the given windows and print a summary.

    The prediction for each window is the last of its input values; the target
    is the window's final value. With 6 input rows and 12-row windows the two
    are 6 rows apart (30 minutes, assuming 5-minute sampling).

    Args:
        test_segments: Windows accepted by ``get_seq_pred``.

    Returns:
        ``(mse, rmse, mae, mape)`` as returned by ``get_mse_rmse_mae``.
    """
    features_list, trues_list = get_seq_pred(test_segments)

    trues = trues_list
    preds = [inputs[-1] for inputs in features_list]

    # Absolute gap between the last observed value and the target value.
    abs_gap = abs(np.array(trues) - np.array(preds))
    gap_mean = round(np.mean(abs_gap), 2)
    gap_std = round(np.std(abs_gap), 2)
    print('30 mins diff avg:', gap_mean, 'std:', gap_std)

    mse, rmse, mae, mape = get_mse_rmse_mae(trues, preds)
    print('MSE:', round(mse, 2), 'RMSE:', round(rmse, 2), 'MAE:', round(mae, 2), 'MAPE', round(mape, 2))
    return mse, rmse, mae, mape

## baseline calculation

### population (last 20% data (test))


In [None]:
# individual
# For every CSV under `root`, evaluate the last-value baseline on the test
# windows returned by data_process and collect one result row per file.

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/T1DEXI_processed/'

# One [file stem, RMSE, MAE, MAPE] entry per processed file.
res_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    # Timestamps come back from CSV as text; parse them before windowing.
    df['LBDTC'] = df['LBDTC'].apply(convert_to_datetime)
    test_segments = data_process(df)
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)

    # File name up to the first '.' identifies the row; MSE is not stored.
    res_lst.append([file.split('.')[0], rmse, mae, mape])

In [None]:
# Show the collected rows, then write them to T1DEXI_20test.csv on the shared drive.
# Note: this rebinds the notebook-global `df` to the results table.
print(res_lst)

df = pd.DataFrame(res_lst, columns=['Test patient', 'RMSE', 'MAE', 'MAPE'])
df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/baseline_performance/T1DEXI_20test.csv', index=False)

### loocv (whole data)

In [None]:
# population
# For every CSV under `root`, evaluate the last-value baseline on ALL continuous
# windows of the file (data_process_population applies no train/test split).

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/T1DEXI_processed/'

# One [file stem, RMSE, MAE, MAPE] entry per processed file.
res_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    # Progress output: the part of the file stem after the last '_'.
    print(file.split('.')[0].split('_')[-1])
    df = pd.read_csv(root + file)
    df['LBDTC'] = df['LBDTC'].apply(convert_to_datetime)
    test_segments = data_process_population(df)
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)
    res_lst.append([file.split('.')[0], rmse, mae, mape])

In [None]:
# Show the collected rows, then write them to T1DEXI_loocv.csv on the shared drive.
# Note: this rebinds the notebook-global `df` to the results table.
print(res_lst)

df = pd.DataFrame(res_lst, columns=['Test patient', 'RMSE', 'MAE', 'MAPE'])
df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/baseline_performance/T1DEXI_loocv.csv', index=False)

[['1127', 23.583880251135547, 17.61162036384149, 14.280914861912159], ['1', 32.460614579981645, 22.408001025772535, 15.422695831246823], ['1010', 27.62154380557665, 19.018273305084747, 16.083398222570803], ['1015', 13.46805954455484, 9.349100923375993, 7.181502798502512], ['103', 13.51357591114699, 9.63795825293808, 8.058932058368686], ['1043', 23.869956377662273, 16.811323171512743, 12.615797498450402], ['1082', 20.07432640169085, 14.158301647655259, 12.92493202339699], ['11', 20.723957974479198, 15.206398996235885, 15.642455109627896], ['1115', 25.35568365775996, 17.22677909799412, 10.154811379927672], ['1121', 28.54588878450734, 20.46294416243655, 17.6790618802732], ['1139', 22.896329206861587, 16.890599230346343, 12.526402604403945], ['114', 24.426069211865777, 17.905064935064935, 12.803172937243538], ['1143', 21.324485075934422, 15.626581452980306, 11.21157189103998], ['115', 27.766280532981682, 20.11373092926491, 14.58248719939995], ['1171', 24.929650250599096, 18.017656090071647

# DiaTrend

## data preprocess

In [14]:
def daily_filter(df):
  """Drop every calendar day that holds more than 350 records.

  The days being dropped (with their record counts) are printed first.

  Args:
    df: DataFrame with a datetime-typed ``date`` column.

  Returns:
    The rows of ``df`` whose ``date`` day has at most 350 records.
  """
  record_day = df['date'].dt.date
  day_sizes = df.groupby(record_day).size()
  overfull = day_sizes[day_sizes > 350]
  print('-----', overfull)
  on_overfull_day = record_day.isin(overfull.index)
  return df.loc[~on_overfull_day]

In [15]:
def BG_value_filter(df, time_col='date'):
  """Drop readings recorded within 0.5 minutes of the preceding row.

  The gap between each row and the row before it is measured in minutes. Rows
  whose gap lies in the closed interval [0, 0.5] are removed; the first row
  (no predecessor) and rows with a negative gap are kept.

  The input frame is not modified. The previous implementation inserted a
  temporary ``time_diffs`` column into the caller's frame, so calling the
  function twice on the same frame raised "already exists".

  Args:
    df: DataFrame with a datetime-typed timestamp column.
    time_col: Name of the timestamp column (defaults to ``'date'``).

  Returns:
    A new DataFrame with the same columns as ``df`` and the near-duplicate
    rows removed.
  """
  gap_minutes = df[time_col].diff().dt.total_seconds() / 60
  # between() is inclusive on both ends; NaN (first row) evaluates to False.
  is_near_duplicate = gap_minutes.between(0, .5)
  return df[~is_near_duplicate]

In [None]:
root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_subset/'

# Run both cleaning filters on every DiaTrend CSV and report the shape after each step.
for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    print("original size:", df.shape)
    df['date'] = pd.to_datetime(df['date'])
    # The two progress labels were swapped: daily_filter is the step that drops
    # days with > 350 values, BG_value_filter the one that drops near-duplicates.
    df = daily_filter(df)
    print('remove days with > 350 values: ', df.shape)
    df = BG_value_filter(df)
    print('remove duplicated values: ', df.shape)


    # NOTE(review): saving is disabled, so this cell only reports shapes.
    # df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/' + file, index=False)

## baseline functions

In [None]:
def find_continuous_rows(df, seq_len=6, pred_len=6, max_gap_minutes=7, time_col='date'):
  """Return every sliding window of consecutive rows that has no large time gap.

  A window spans ``seq_len + pred_len`` consecutive rows. It is kept when every
  gap between neighbouring rows, truncated to whole minutes, is at most
  ``max_gap_minutes``. Truncation means a gap of e.g. 7 min 59 s still passes
  with the default limit; negative gaps also pass. This matches the original
  per-window check, so results are unchanged.

  Timestamps are parsed and differenced once for the whole frame instead of
  once per window, and the per-window check is done with a prefix sum.

  Args:
    df: DataFrame ordered by time, with a timestamp column.
    seq_len: Number of input rows per window.
    pred_len: Number of rows following the input rows.
    max_gap_minutes: Largest allowed gap between neighbouring rows (whole minutes).
    time_col: Name of the timestamp column.

  Returns:
    List of DataFrame slices (``df.iloc[i:i + seq_len + pred_len]``) in start
    order; empty when ``df`` has fewer rows than one window.

  Raises:
    ValueError: If ``seq_len + pred_len`` is smaller than 1.
  """
  window = seq_len + pred_len
  if window < 1:
    raise ValueError('seq_len + pred_len must be at least 1')

  n_rows = len(df)
  if n_rows < window:
    return []

  # gaps[k] is the time between row k and row k + 1, truncated to whole minutes.
  gaps = np.diff(pd.to_datetime(df[time_col])).astype('timedelta64[m]')
  gap_ok = gaps <= np.timedelta64(max_gap_minutes, 'm')

  # bad_before[k] = number of too-large gaps among gaps[0:k].
  bad_before = np.concatenate(([0], np.cumsum(~gap_ok)))
  starts = np.arange(n_rows - window + 1)
  # The window starting at row i covers gaps[i : i + window - 1].
  is_continuous = bad_before[starts + window - 1] == bad_before[starts]

  return [df.iloc[i:i + window] for i in np.flatnonzero(is_continuous)]

In [None]:
def data_process(cgm_df):
  """Return the test portion of the continuous windows found in ``cgm_df``.

  All continuous windows are collected with ``find_continuous_rows``. The test
  portion is the last ``int(0.2 * n)`` windows, extended by the 6 windows
  immediately before them.
  NOTE(review): the extra 6 presumably mirrors the 6-row input length - confirm.

  Args:
    cgm_df: Per-subject DataFrame accepted by ``find_continuous_rows``.

  Returns:
    List of window DataFrames. An empty list is returned when no continuous
    window exists (previously the function fell through and returned ``None``,
    which made the callers' ``len()`` fail).
  """
  continuous_segments = find_continuous_rows(cgm_df)
  n_segments = len(continuous_segments)
  print(n_segments)
  if not continuous_segments:
    return []

  num_test = int(n_segments * 0.2)
  # A negative start (very few windows) is interpreted by the slice as an
  # offset from the end, exactly as before.
  test_start = n_segments - num_test - 6
  test_segments = continuous_segments[test_start:n_segments]
  print(f"Number of test segments: {len(test_segments)}")

  return test_segments

In [None]:
def data_process_population(cgm_df):
  """Return all continuous windows of ``cgm_df`` (no train/test split).

  Args:
    cgm_df: Per-subject DataFrame accepted by ``find_continuous_rows``.

  Returns:
    List of window DataFrames. An empty list is returned when there is none
    (previously ``None`` was returned implicitly, which made the callers'
    ``len()`` fail).
  """
  return find_continuous_rows(cgm_df)

In [None]:
def get_seq_pred(test_segments):
  """Split each window into its input values and its final target value.

  Args:
    test_segments: Iterable of DataFrames that have an ``mg/dl`` column.

  Returns:
    ``(features_list, trues_list)``: for each window, the first six ``mg/dl``
    values (as an array) and the last ``mg/dl`` value.
  """
  pairs = [
    (window.iloc[:6]['mg/dl'].values, window['mg/dl'].values[-1])
    for window in test_segments
  ]
  features_list = [inputs for inputs, _ in pairs]
  trues_list = [target for _, target in pairs]
  return features_list, trues_list

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def get_mse_rmse_mae(true, pred):
  """Compute MSE, RMSE, MAE and MAPE between true and predicted values.

  Args:
    true: Sequence of reference values.
    pred: Sequence of predicted values, same length as ``true``.

  Returns:
    ``(mse, rmse, mae, mape)`` where ``mape`` is expressed in percent.
  """
  mse = mean_squared_error(true, pred)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(true, pred)

  # MAPE is computed by hand: mean of |true - pred| / true, times 100.
  true, pred = np.array(true), np.array(pred)
  relative_error = np.abs((true - pred) / true)
  mape = np.mean(relative_error) * 100

  return mse, rmse, mae, mape

In [None]:
def get_baseline_performance(test_segments):
    """Score the last-value baseline on the given windows and print a summary.

    The prediction for each window is the last of its input values; the target
    is the window's final value. With 6 input rows and 12-row windows the two
    are 6 rows apart (30 minutes, assuming 5-minute sampling).

    Args:
        test_segments: Windows accepted by ``get_seq_pred``.

    Returns:
        ``(mse, rmse, mae, mape)`` as returned by ``get_mse_rmse_mae``.
    """
    features_list, trues_list = get_seq_pred(test_segments)

    trues = trues_list
    preds = [inputs[-1] for inputs in features_list]

    # Absolute gap between the last observed value and the target value.
    abs_gap = abs(np.array(trues) - np.array(preds))
    gap_mean = round(np.mean(abs_gap), 2)
    gap_std = round(np.std(abs_gap), 2)
    print('30 mins diff avg:', gap_mean, 'std:', gap_std)

    mse, rmse, mae, mape = get_mse_rmse_mae(trues, preds)
    print('MSE:', round(mse, 2), 'RMSE:', round(rmse, 2), 'MAE:', round(mae, 2), 'MAPE', round(mape, 2))
    return mse, rmse, mae, mape

## baseline calculation

### population

In [None]:
# individual
# For every CSV under `root`, evaluate the last-value baseline on the test
# windows returned by data_process and keep each metric in its own list.

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/'

# Per-file metric values, in os.listdir order.
MSE_lst = []
RMSE_lst = []
MAE_lst = []
MAPE_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    # Timestamps come back from CSV as text; parse them before windowing.
    df['date'] = df['date'].apply(convert_to_datetime)
    test_segments = data_process(df)
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)
    MSE_lst.append(mse)
    RMSE_lst.append(rmse)
    MAE_lst.append(mae)
    MAPE_lst.append(mape)
    # break

In [None]:
# Mean and standard deviation (np.std, population form) of each metric across files,
# rounded to two decimals.
print('MSE:', round(np.mean(MSE_lst), 2), round(np.std(MSE_lst), 2))
print('RMSE:', round(np.mean(RMSE_lst), 2), round(np.std(RMSE_lst), 2))
print('MAE:', round(np.mean(MAE_lst), 2), round(np.std(MAE_lst), 2))
print('MAPE:', round(np.mean(MAPE_lst), 2), round(np.std(MAPE_lst), 2))

MSE: 859.11 356.51
RMSE: 28.73 5.81
MAE: 20.82 4.2
MAPE: 12.42 2.81


In [None]:
# individual
# Same evaluation as the per-metric-list cell, but collected as one row per file
# so that it can be exported as a table.

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/'

# One [label, RMSE, MAE, MAPE] entry per processed file.
res_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    df['date'] = df['date'].apply(convert_to_datetime)
    test_segments = data_process(df)
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)

    # Label = part of the file stem after the last '_'; MSE is not stored.
    res_lst.append([file.split('.')[0].split('_')[-1], rmse, mae, mape])

In [None]:
# Show the collected rows, then write them to diatrend_20test.csv on the shared drive.
# Note: this rebinds the notebook-global `df` to the results table.
print(res_lst)

df = pd.DataFrame(res_lst, columns=['Test patient', 'RMSE', 'MAE', 'MAPE'])
df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/baseline_performance/diatrend_20test.csv', index=False)

[['Subject1', 21.671059917848112, 16.03298611111111, 9.935905878186189], ['Subject10', 31.997304557502847, 23.587937579080556, 11.252483826318436], ['Subject11', 30.678017741245746, 21.763090128755366, 13.065541170101525], ['Subject12', 35.00801287253616, 25.95977984758679, 14.450174659625697], ['Subject13', 18.874885851635568, 12.784881007932805, 10.525128160180735], ['Subject14', 26.609598108195907, 16.971628109995635, 5.5016374320356745], ['Subject15', 31.451915206079526, 22.784602440050485, 13.40479668257849], ['Subject16', 32.667164652121855, 22.564810879728007, 10.926822805100135], ['Subject17', 29.793201922636836, 22.11836051861146, 10.12098679434024], ['Subject18', 21.421931578230122, 15.75593220338983, 8.962439985103675], ['Subject19', 31.79152162853529, 22.406779661016948, 9.01694745275826], ['Subject2', 25.340244988669525, 18.349195710455763, 10.033740674636066], ['Subject20', 33.13533943663976, 23.321849105974707, 14.04040640689112], ['Subject21', 28.77540610503349, 19.4568

### loocv

In [None]:
# population
# For every CSV under `root`, evaluate the last-value baseline on ALL continuous
# windows of the file (data_process_population applies no train/test split) and
# keep each metric in its own list.

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/'

# Per-file metric values, in os.listdir order.
MSE_lst = []
RMSE_lst = []
MAE_lst = []
MAPE_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    print(file)
    df = pd.read_csv(root + file)
    df['date'] = df['date'].apply(convert_to_datetime)
    test_segments = data_process_population(df)
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)
    MSE_lst.append(mse)
    RMSE_lst.append(rmse)
    MAE_lst.append(mae)
    MAPE_lst.append(mape)
    # break

In [None]:
# Mean and standard deviation (np.std, population form) of each metric across files,
# rounded to two decimals.
print('MSE:', round(np.mean(MSE_lst), 2), round(np.std(MSE_lst), 2))
print('RMSE:', round(np.mean(RMSE_lst), 2), round(np.std(RMSE_lst), 2))
print('MAE:', round(np.mean(MAE_lst), 2), round(np.std(MAE_lst), 2))
print('MAPE:', round(np.mean(MAPE_lst), 2), round(np.std(MAPE_lst), 2))

In [None]:
# population
# Same all-windows evaluation as the per-metric-list cell, but collected as one
# row per file so that it can be exported as a table.

root = '/content/drive/Shareddrives/Yanjun/ReproGenBG/ReproGenBG_Dataset/diatrend_processed/'

# One [label, RMSE, MAE, MAPE] entry per processed file.
res_lst = []

for file in os.listdir(root):
  if file.endswith('.csv'):
    # Label = part of the file stem after the last '_'.
    print(file.split('.')[0].split('_')[-1])
    df = pd.read_csv(root + file)
    df['date'] = df['date'].apply(convert_to_datetime)
    test_segments = data_process_population(df)
    print(len(test_segments))
    mse, rmse, mae, mape = get_baseline_performance(test_segments)
    res_lst.append([file.split('.')[0].split('_')[-1], rmse, mae, mape])

In [None]:
# Show the collected rows, then write them to diatrend_loocv.csv on the shared drive.
# Note: this rebinds the notebook-global `df` to the results table.
print(res_lst)

df = pd.DataFrame(res_lst, columns=['Test patient', 'RMSE', 'MAE', 'MAPE'])
df.to_csv('/content/drive/Shareddrives/Yanjun/ReproGenBG/baseline_performance/diatrend_loocv.csv', index=False)

[['Subject1', 21.4656255660761, 15.711647892230868, 9.897633648018934], ['Subject10', 36.10057362441069, 26.19401470961197, 14.208083257291557], ['Subject11', 36.46932547961089, 25.77045513206573, 15.739481702456223], ['Subject12', 36.12570347643983, 26.081643045064926, 14.625285006497887], ['Subject13', 22.024268307053422, 14.749906437125748, 11.43444777049782], ['Subject14', 28.23072658817312, 18.029926496324816, 7.0855073592853115], ['Subject15', 31.02178429129553, 23.16580922661719, 15.414293539892507], ['Subject16', 35.03095086842119, 23.984832992501705, 10.854045352408038], ['Subject17', 30.44199232098296, 22.226506832089864, 10.325088980518764], ['Subject18', 23.89526143019642, 17.740463851839266, 11.310332396197799], ['Subject19', 31.434982112549026, 22.162857871038995, 10.123828025351044], ['Subject2', 30.302852116285862, 21.210653753026634, 12.882096417989713], ['Subject20', 33.15626808224288, 24.354813325172685, 15.089124524518184], ['Subject21', 28.543799903763972, 19.83651