In [1]:
# Imports 
import glob
import pandas as pd
import datetime
import time
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

In [2]:
all_files = sorted(glob.glob('_data/Diabetes-Data/data*'))
df_list = []
for idx,filename in enumerate(all_files):
    df = pd.read_csv(filename, sep='\t', header=None)
    df['patient_id'] = idx+1
    df_list.append(df)
df = pd.concat(df_list, axis=0, ignore_index=True)
df.shape

(29330, 5)

In [3]:
# Drop nulls
df = df.dropna()
df

Unnamed: 0,0,1,2,3,patient_id
0,04-21-1991,9:09,58,100,1
1,04-21-1991,9:09,33,9,1
2,04-21-1991,9:09,34,13,1
3,04-21-1991,17:08,62,119,1
4,04-21-1991,17:08,33,7,1
...,...,...,...,...,...
29325,05-09-1989,08:00,33,1,70
29326,05-09-1989,08:00,34,7,70
29327,05-10-1989,08:00,34,7,70
29328,05-11-1989,08:00,34,7,70


In [4]:
codes = """
33 = Regular insulin dose
34 = NPH insulin dose
35 = UltraLente insulin dose
48 = Unspecified blood glucose measurement
57 = Unspecified blood glucose measurement
58 = Pre-breakfast blood glucose measurement
59 = Post-breakfast blood glucose measurement
60 = Pre-lunch blood glucose measurement
61 = Post-lunch blood glucose measurement
62 = Pre-supper blood glucose measurement
63 = Post-supper blood glucose measurement
64 = Pre-snack blood glucose measurement
65 = Hypoglycemic symptoms
66 = Typical meal ingestion
67 = More-than-usual meal ingestion
68 = Less-than-usual meal ingestion
69 = Typical exercise activity
70 = More-than-usual exercise activity
71 = Less-than-usual exercise activity
72 = Unspecified special event
"""
code_dict = {}
for code in codes.split('\n')[1:-1]:
    key,val = code.split(' = ')
    code_dict[int(key)] = val
inv_code_dict = {v: k for k, v in code_dict.items()}
code_dict

{33: 'Regular insulin dose',
 34: 'NPH insulin dose',
 35: 'UltraLente insulin dose',
 48: 'Unspecified blood glucose measurement',
 57: 'Unspecified blood glucose measurement',
 58: 'Pre-breakfast blood glucose measurement',
 59: 'Post-breakfast blood glucose measurement',
 60: 'Pre-lunch blood glucose measurement',
 61: 'Post-lunch blood glucose measurement',
 62: 'Pre-supper blood glucose measurement',
 63: 'Post-supper blood glucose measurement',
 64: 'Pre-snack blood glucose measurement',
 65: 'Hypoglycemic symptoms',
 66: 'Typical meal ingestion',
 67: 'More-than-usual meal ingestion',
 68: 'Less-than-usual meal ingestion',
 69: 'Typical exercise activity',
 70: 'More-than-usual exercise activity',
 71: 'Less-than-usual exercise activity',
 72: 'Unspecified special event'}

In [5]:
# figure out missing codes
non_standard_codes = set(code_dict.keys()).symmetric_difference(set(df[2].unique()))
non_standard_codes

{4, 36, 56}

In [6]:
df[df[2].isin(non_standard_codes)]

Unnamed: 0,0,1,2,3,patient_id
1707,07-21-1990,08:56,56,115,3
1712,07-22-1990,05:56,56,108,3
1722,07-23-1990,05:04,56,110,3
1731,07-24-1990,05:27,56,116,3
1740,07-25-1990,06:24,56,121,3
...,...,...,...,...,...
23926,07-11-1990,21:15,56,52,57
23934,07-13-1990,21:53,56,162,57
23935,07-13-1990,21:56,56,91,57
28944,08-24-1990,13:02,56,258,69


In [7]:
# remove non standard codes
df = df.drop(df[df[2].isin(non_standard_codes)].index)

In [8]:
# standardize duplicate codes
df[2] = df[2].replace(48, 57)

In [9]:
# Some Dates are wrong
for d in df[0]:
    try:
        datetime.datetime.strptime(d, '%m-%d-%Y')
    except:
        print(d)

06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991


In [10]:
# safe to drop
df = df.drop(df[df[0] =='06-31-1991'].index)
df = df.drop(df[df[1].isin(['56:35', '188:00'])].index)
#Drop more invalid data
non_ints = set([])
for val in df[3]:
    try:
        int(val)
    except:
        non_ints.add(val)
        print(val)
df = df.drop(df[df[3].isin(non_ints)].index)


0Hi
0Hi
0Hi
0Hi
0Hi
0Lo
0Hi
0''


In [11]:
# Add in timestamp
df['timestamp'] = df.apply(lambda x: datetime.datetime.strptime(f'{x[0]} {x[1]}', '%m-%d-%Y %H:%M'), axis=1)
df

Unnamed: 0,0,1,2,3,patient_id,timestamp
0,04-21-1991,9:09,58,100,1,1991-04-21 09:09:00
1,04-21-1991,9:09,33,9,1,1991-04-21 09:09:00
2,04-21-1991,9:09,34,13,1,1991-04-21 09:09:00
3,04-21-1991,17:08,62,119,1,1991-04-21 17:08:00
4,04-21-1991,17:08,33,7,1,1991-04-21 17:08:00
...,...,...,...,...,...,...
29325,05-09-1989,08:00,33,1,70,1989-05-09 08:00:00
29326,05-09-1989,08:00,34,7,70,1989-05-09 08:00:00
29327,05-10-1989,08:00,34,7,70,1989-05-10 08:00:00
29328,05-11-1989,08:00,34,7,70,1989-05-11 08:00:00


In [12]:
df[3] = df[3].astype(int)

In [13]:
glucose_indices = list(range(57,65))


In [14]:
tidy1 = df.pivot_table(index=['timestamp', 'patient_id'], columns=2, values=3).sort_values(['patient_id', 'timestamp']).reset_index().copy()

In [15]:
def display_tablenames(df):
    return df.rename(columns=code_dict)

In [16]:
display_tablenames(tidy1)

2,timestamp,patient_id,Regular insulin dose,NPH insulin dose,UltraLente insulin dose,Unspecified blood glucose measurement,Pre-breakfast blood glucose measurement,Post-breakfast blood glucose measurement,Pre-lunch blood glucose measurement,Post-lunch blood glucose measurement,...,Post-supper blood glucose measurement,Pre-snack blood glucose measurement,Hypoglycemic symptoms,Typical meal ingestion,More-than-usual meal ingestion,Less-than-usual meal ingestion,Typical exercise activity,More-than-usual exercise activity,Less-than-usual exercise activity,Unspecified special event
0,1991-04-21 09:09:00,1,9.0,13.0,,,100.0,,,,...,,,,,,,,,,
1,1991-04-21 17:08:00,1,7.0,,,,,,,,...,,,,,,,,,,
2,1991-04-21 22:51:00,1,,,,123.0,,,,,...,,,,,,,,,,
3,1991-04-22 07:35:00,1,10.0,13.0,,,216.0,,,,...,,,,,,,,,,
4,1991-04-22 13:40:00,1,2.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18198,1989-05-08 22:00:00,70,,,,145.0,,,,,...,,,,,,,,,,
18199,1989-05-09 08:00:00,70,1.0,7.0,,,259.0,,,,...,,,,,,,,,,
18200,1989-05-10 08:00:00,70,,7.0,,,,,,,...,,,,,,,,,,
18201,1989-05-11 08:00:00,70,,7.0,,,,,,,...,,,,,,,,,,


In [17]:
tidy1['measurement_type'] = tidy1[glucose_indices].apply(lambda x: code_dict.get(x.idxmax(), np.nan), axis=1)
tidy1['glucose_measurement'] = tidy1[glucose_indices].apply(lambda x: x.max(), axis=1)
tidy1

2,timestamp,patient_id,33,34,35,57,58,59,60,61,...,65,66,67,68,69,70,71,72,measurement_type,glucose_measurement
0,1991-04-21 09:09:00,1,9.0,13.0,,,100.0,,,,...,,,,,,,,,Pre-breakfast blood glucose measurement,100.0
1,1991-04-21 17:08:00,1,7.0,,,,,,,,...,,,,,,,,,Pre-supper blood glucose measurement,119.0
2,1991-04-21 22:51:00,1,,,,123.0,,,,,...,,,,,,,,,Unspecified blood glucose measurement,123.0
3,1991-04-22 07:35:00,1,10.0,13.0,,,216.0,,,,...,,,,,,,,,Pre-breakfast blood glucose measurement,216.0
4,1991-04-22 13:40:00,1,2.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18198,1989-05-08 22:00:00,70,,,,145.0,,,,,...,,,,,,,,,Unspecified blood glucose measurement,145.0
18199,1989-05-09 08:00:00,70,1.0,7.0,,,259.0,,,,...,,,,,,,,,Pre-breakfast blood glucose measurement,259.0
18200,1989-05-10 08:00:00,70,,7.0,,,,,,,...,,,,,,,,,,
18201,1989-05-11 08:00:00,70,,7.0,,,,,,,...,,,,,,,,,,


In [18]:
# Usable data
usable_df = tidy1[tidy1['glucose_measurement'].notna()]
usable_df

2,timestamp,patient_id,33,34,35,57,58,59,60,61,...,65,66,67,68,69,70,71,72,measurement_type,glucose_measurement
0,1991-04-21 09:09:00,1,9.0,13.0,,,100.0,,,,...,,,,,,,,,Pre-breakfast blood glucose measurement,100.0
1,1991-04-21 17:08:00,1,7.0,,,,,,,,...,,,,,,,,,Pre-supper blood glucose measurement,119.0
2,1991-04-21 22:51:00,1,,,,123.0,,,,,...,,,,,,,,,Unspecified blood glucose measurement,123.0
3,1991-04-22 07:35:00,1,10.0,13.0,,,216.0,,,,...,,,,,,,,,Pre-breakfast blood glucose measurement,216.0
5,1991-04-22 16:56:00,1,7.0,,,,,,,,...,,,,,,,,,Pre-supper blood glucose measurement,211.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18193,1989-05-07 12:00:00,70,,,,,,,151.0,,...,,,,,,,,,Pre-lunch blood glucose measurement,151.0
18195,1989-05-07 22:00:00,70,,,,265.0,,,,,...,,,,,,,,,Unspecified blood glucose measurement,265.0
18196,1989-05-08 08:00:00,70,1.0,7.0,,,248.0,,,,...,,,,,,,,,Pre-breakfast blood glucose measurement,248.0
18198,1989-05-08 22:00:00,70,,,,145.0,,,,,...,,,,,,,,,Unspecified blood glucose measurement,145.0


In [19]:
# split dataset to train and test PATIENTS
# our model should generalize to patients outside of the train data
test_patients = np.random.choice(usable_df['patient_id'].unique(),10)

In [20]:
test_df = usable_df[usable_df['patient_id'].isin(test_patients)]
train_df = usable_df[~usable_df['patient_id'].isin(test_patients)]

In [21]:
# Baseline prediction to beat
baseline_pred = train_df['glucose_measurement'].mean()

In [22]:
# Split test patients to known data and test data
# This form assumes that we would have a certain amount of known data to help our model
known_data = []
test_data = []
for _id in test_df['patient_id'].unique():
    known, test = np.array_split(test_df[test_df['patient_id']==_id], 2)
    known_data.append(known)
    test_data.append(test)
    
real_test_df = pd.concat(test_data)
known_df = pd.concat(known_data)

In [23]:
y_true = real_test_df['glucose_measurement']
y_pred = np.array(y_true.size * [baseline_pred])

In [24]:
# Gather preliminary metrics
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
print(f'RMSE: {rmse}\nMAE {mae}')

RMSE: 70.20748163133605
MAE 56.957333090617496


In [25]:
# Because our dataset consists of individual patient data and some patients have more observation than the others
# It's ideal to for our metrics to average out our metrics to counter the imbalance and to
# prevent our metrics from overfitting to certain patient types
rmse_errors = []
mae_errors = []
for pid in real_test_df['patient_id'].unique():
    y_true = real_test_df[real_test_df['patient_id']==pid]['glucose_measurement']
    y_pred = np.array(y_true.size * [baseline_pred])
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    rmse_errors.append(rmse)
    mae_errors.append(mae)
print(f'MRMSE: {np.mean(rmse_errors)}\nMMAE {np.mean(mae_errors)}')

MRMSE: 80.91185433915479
MMAE 69.31462306225406


In [26]:
engineered_df = train_df.copy()
engineered_df.shape

(11185, 23)

In [27]:
# get previous reading for each patient
engineered_df['previous_measurement'] = engineered_df['glucose_measurement'].shift()

In [28]:
engineered_df['previous_measurement_time'] = engineered_df['timestamp'].shift()

In [29]:
for pid in engineered_df['patient_id'].unique():
    engineered_df = engineered_df.drop(engineered_df[engineered_df['patient_id'] == pid][0:1].index)
engineered_df.shape

(11125, 25)

In [30]:
engineered_df['glucose_delta'] = engineered_df['glucose_measurement'] - engineered_df['previous_measurement']

In [31]:
engineered_df['time_delta'] = engineered_df['timestamp'] - engineered_df['previous_measurement_time']

In [32]:
engineered_df['time_delta_minutes'] = engineered_df['time_delta']/ np.timedelta64(1, 'm')

In [33]:
engineered_df

2,timestamp,patient_id,33,34,35,57,58,59,60,61,...,70,71,72,measurement_type,glucose_measurement,previous_measurement,previous_measurement_time,glucose_delta,time_delta,time_delta_minutes
1,1991-04-21 17:08:00,1,7.0,,,,,,,,...,,,,Pre-supper blood glucose measurement,119.0,100.0,1991-04-21 09:09:00,19.0,07:59:00,479.0
2,1991-04-21 22:51:00,1,,,,123.0,,,,,...,,,,Unspecified blood glucose measurement,123.0,119.0,1991-04-21 17:08:00,4.0,05:43:00,343.0
3,1991-04-22 07:35:00,1,10.0,13.0,,,216.0,,,,...,,,,Pre-breakfast blood glucose measurement,216.0,123.0,1991-04-21 22:51:00,93.0,08:44:00,524.0
5,1991-04-22 16:56:00,1,7.0,,,,,,,,...,,,,Pre-supper blood glucose measurement,211.0,216.0,1991-04-22 07:35:00,-5.0,09:21:00,561.0
6,1991-04-23 07:25:00,1,11.0,13.0,,,257.0,,,,...,,,,Pre-breakfast blood glucose measurement,257.0,211.0,1991-04-22 16:56:00,46.0,14:29:00,869.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18193,1989-05-07 12:00:00,70,,,,,,,151.0,,...,,,,Pre-lunch blood glucose measurement,151.0,378.0,1989-05-06 18:00:00,-227.0,18:00:00,1080.0
18195,1989-05-07 22:00:00,70,,,,265.0,,,,,...,,,,Unspecified blood glucose measurement,265.0,151.0,1989-05-07 12:00:00,114.0,10:00:00,600.0
18196,1989-05-08 08:00:00,70,1.0,7.0,,,248.0,,,,...,,,,Pre-breakfast blood glucose measurement,248.0,265.0,1989-05-07 22:00:00,-17.0,10:00:00,600.0
18198,1989-05-08 22:00:00,70,,,,145.0,,,,,...,,,,Unspecified blood glucose measurement,145.0,248.0,1989-05-08 08:00:00,-103.0,14:00:00,840.0


In [34]:
engineered_df[[33,34,35]] = engineered_df[[33,34,35]].fillna(0)

In [35]:
engineered_df

2,timestamp,patient_id,33,34,35,57,58,59,60,61,...,70,71,72,measurement_type,glucose_measurement,previous_measurement,previous_measurement_time,glucose_delta,time_delta,time_delta_minutes
1,1991-04-21 17:08:00,1,7.0,0.0,0.0,,,,,,...,,,,Pre-supper blood glucose measurement,119.0,100.0,1991-04-21 09:09:00,19.0,07:59:00,479.0
2,1991-04-21 22:51:00,1,0.0,0.0,0.0,123.0,,,,,...,,,,Unspecified blood glucose measurement,123.0,119.0,1991-04-21 17:08:00,4.0,05:43:00,343.0
3,1991-04-22 07:35:00,1,10.0,13.0,0.0,,216.0,,,,...,,,,Pre-breakfast blood glucose measurement,216.0,123.0,1991-04-21 22:51:00,93.0,08:44:00,524.0
5,1991-04-22 16:56:00,1,7.0,0.0,0.0,,,,,,...,,,,Pre-supper blood glucose measurement,211.0,216.0,1991-04-22 07:35:00,-5.0,09:21:00,561.0
6,1991-04-23 07:25:00,1,11.0,13.0,0.0,,257.0,,,,...,,,,Pre-breakfast blood glucose measurement,257.0,211.0,1991-04-22 16:56:00,46.0,14:29:00,869.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18193,1989-05-07 12:00:00,70,0.0,0.0,0.0,,,,151.0,,...,,,,Pre-lunch blood glucose measurement,151.0,378.0,1989-05-06 18:00:00,-227.0,18:00:00,1080.0
18195,1989-05-07 22:00:00,70,0.0,0.0,0.0,265.0,,,,,...,,,,Unspecified blood glucose measurement,265.0,151.0,1989-05-07 12:00:00,114.0,10:00:00,600.0
18196,1989-05-08 08:00:00,70,1.0,7.0,0.0,,248.0,,,,...,,,,Pre-breakfast blood glucose measurement,248.0,265.0,1989-05-07 22:00:00,-17.0,10:00:00,600.0
18198,1989-05-08 22:00:00,70,0.0,0.0,0.0,145.0,,,,,...,,,,Unspecified blood glucose measurement,145.0,248.0,1989-05-08 08:00:00,-103.0,14:00:00,840.0


In [36]:
model_df = engineered_df[['patient_id', 33, 34, 35, 'measurement_type', 'previous_measurement', 'glucose_delta', 'time_delta', 'time_delta_minutes', 'glucose_measurement']]
model_df = model_df.replace(inv_code_dict)

In [37]:
train1_df, test1_df = train_test_split(model_df)
train1_df = train1_df.copy()
test1_df = test1_df.copy()

In [38]:
known_avg_train = {}
for pid in train1_df['patient_id'].unique():
    known_avg_train[pid] = train1_df[train1_df['patient_id']==pid]['glucose_measurement'].mean()


In [39]:
train1_df['known_avg_glucose'] = train1_df.apply(lambda x: known_avg_train[x['patient_id']], axis=1)
test1_df['known_avg_glucose'] = test1_df.apply(lambda x: known_avg_train[x['patient_id']], axis=1)

In [40]:
RMSE = np.sqrt(mean_squared_error(test1_df['glucose_measurement'], test1_df['known_avg_glucose']))
MAE = mean_absolute_error(test1_df['glucose_measurement'], test1_df['known_avg_glucose'])
print('RMSE: ', RMSE)
print('MAE: ', MAE)

RMSE:  78.23713125567512
MAE:  61.798335361992144


In [41]:
features = ['measurement_type', 'previous_measurement', 'known_avg_glucose', 'time_delta_minutes']
target = 'glucose_measurement'
X = train1_df[features]
y = train1_df[target]

In [42]:
model = RandomForestRegressor(n_estimators=100)

In [43]:
model.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [44]:
# our model is barely able to beat the per-patient mean baseline. This is a terrible model!
y_pred = model.predict(test1_df[features])
y_true = test1_df[target]
RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
MAE = mean_absolute_error(y_true, y_pred)
print('RMSE: ', RMSE)
print('MAE: ', MAE)

RMSE:  79.35241976933668
MAE:  60.78894145172066


In [46]:
# model improves noticeably if we can figure out a way to predict insulin dosage.
# Unfortunately I don't have model to predict insulin dosage. 
# This demonstrate that some domain knowledge and or a good model on how much dosage of each type of 
# insulin will improve our predictions. 

model_with_insulin = RandomForestRegressor(n_estimators=100)
# Add insulin dosage to features
features_with_insulin = [33, 34, 35, 'measurement_type', 'previous_measurement', 'known_avg_glucose', 'time_delta_minutes']
X = train1_df[features_with_insulin]
y = train1_df[target]
model_with_insulin.fit(X,y)
y_pred = model_with_insulin.predict(test1_df[features_with_insulin])
y_true = test1_df[target]
RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
MAE = mean_absolute_error(y_true, y_pred)
print('RMSE: ', RMSE)
print('MAE: ', MAE)

RMSE:  70.67741044614634
MAE:  53.01017143176138


In [47]:
# Final layer predictions
patient_mean_glucose = {pid: known_df[known_df['patient_id']==pid]['glucose_measurement'].mean() for pid in known_df.patient_id.unique()}
patient_mean_glucose

{3: 132.34285714285716,
 11: 149.26470588235293,
 29: 153.5487012987013,
 34: 145.41666666666666,
 40: 149.5,
 41: 177.8360655737705,
 50: 91.28571428571429,
 55: 165.73597359735973,
 60: 124.07142857142857,
 62: 204.5}

In [79]:
RMSE_list = []
MAE_list = []
for pid in test_df['patient_id'].unique():
    patient_df = real_test_df[real_test_df['patient_id'] == pid].copy()
    last_observed_time = known_df[known_df['patient_id'] == pid].iloc[-1:]['timestamp'].values[0]
    last_measured_glucose = known_df[known_df['patient_id'] == pid].iloc[-1:]['glucose_measurement'].values[0]
    # now refering to previous "prediction" time since
    patient_df['previous_measurement_time'] = patient_df['timestamp'].shift()
    # use last predicted time to form output
    patient_df['previous_measurement_time'] = patient_df['previous_measurement_time'].fillna(last_observed_time)
    
    # apply same transformation we did from the test data
    patient_df = patient_df.replace(inv_code_dict)
    patient_df['time_delta'] = patient_df['timestamp'] - patient_df['previous_measurement_time']
    patient_df['time_delta_minutes'] = patient_df['time_delta']/ np.timedelta64(1, 'm')
    patient_df[[33,34,35]] = patient_df[[33,34,35]].fillna(0)
    # we can't populate 'previous_measurement' the same way since that relied
    # on the 'glucose_measurement' from the test data
    # we have to populate our predictions 1 by 1 since it relies on the result of the previous prediction
    y_pred = []
    for idx,row in patient_df.iterrows():
        time_delta_minutes = (row['timestamp'] - last_observed_time)/ np.timedelta64(1, 'm')
        x = [row['measurement_type'], last_measured_glucose, patient_mean_glucose[pid], time_delta_minutes]
        result = model.predict([x])[0]
        # set the time and value for the next prediction
        last_measured_glucose = result
        last_observed_time = row['timestamp']
        y_pred.append(result)
    y_true = patient_df['glucose_measurement']
    RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
    MAE = mean_absolute_error(y_true, y_pred)
    RMSE_list.append(RMSE)
    MAE_list.append(MAE)
    print(f'Patient #{pid}: MAE - {MAE}, RMSE - {RMSE}')
print('MMRSE: ', np.mean(RMSE_list))
print('MMAE: ', np.mean(MAE_list))

Patient #3: MAE - 65.21055555555554, RMSE - 77.27901727755642
Patient #11: MAE - 79.12672366522365, RMSE - 93.19150470196362
Patient #29: MAE - 45.48282003710575, RMSE - 58.19001988618263
Patient #34: MAE - 60.63252222222222, RMSE - 72.72761761046118
Patient #40: MAE - 78.8537777777778, RMSE - 96.00285461421332
Patient #41: MAE - 64.00467493112947, RMSE - 76.67157499458018
Patient #50: MAE - 32.74231829573935, RMSE - 38.81467350274758
Patient #55: MAE - 52.64936324584839, RMSE - 64.05787215341216
Patient #60: MAE - 64.40068407287158, RMSE - 85.77231379906716
Patient #62: MAE - 91.06923232323233, RMSE - 111.04539135910126
MMRSE:  77.37528398992855
MMAE:  63.41726721267061


In [None]:
# We got a somewhat better(than baseline) model but the predictions are still terrible.

In [51]:
import pickle

In [55]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [84]:
meal_times = usable_df[['timestamp', 'measurement_type']].copy()
meal_times['minutes'] = meal_times['timestamp'].apply(lambda x: x.hour * 60 + x.minute)
meal_times

2,timestamp,measurement_type,minutes
0,1991-04-21 09:09:00,Pre-breakfast blood glucose measurement,549
1,1991-04-21 17:08:00,Pre-supper blood glucose measurement,1028
2,1991-04-21 22:51:00,Unspecified blood glucose measurement,1371
3,1991-04-22 07:35:00,Pre-breakfast blood glucose measurement,455
5,1991-04-22 16:56:00,Pre-supper blood glucose measurement,1016
...,...,...,...
18193,1989-05-07 12:00:00,Pre-lunch blood glucose measurement,720
18195,1989-05-07 22:00:00,Unspecified blood glucose measurement,1320
18196,1989-05-08 08:00:00,Pre-breakfast blood glucose measurement,480
18198,1989-05-08 22:00:00,Unspecified blood glucose measurement,1320


In [105]:
meal_times = meal_times.groupby(['measurement_type']).mean()
meal_times

2,minutes
measurement_type,Unnamed: 1_level_1
Post-breakfast blood glucose measurement,595.85
Post-lunch blood glucose measurement,860.5
Post-supper blood glucose measurement,1221.050228
Pre-breakfast blood glucose measurement,443.046075
Pre-lunch blood glucose measurement,728.675099
Pre-snack blood glucose measurement,1249.265487
Pre-supper blood glucose measurement,1072.207924
Unspecified blood glucose measurement,1151.472822


In [117]:
# setting time for timedelta calculations
# Ideal times for matching timedelta and mesurement_type features for model prediction
meal_times['avg_time'] = meal_times['minutes'].apply(lambda x: ':'.join([str(x) for x in divmod(round(x), 60)]))
meal_times

2,minutes,avg_time
measurement_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Post-breakfast blood glucose measurement,595.85,9:56
Post-lunch blood glucose measurement,860.5,14:20
Post-supper blood glucose measurement,1221.050228,20:21
Pre-breakfast blood glucose measurement,443.046075,7:23
Pre-lunch blood glucose measurement,728.675099,12:9
Pre-snack blood glucose measurement,1249.265487,20:49
Pre-supper blood glucose measurement,1072.207924,17:52
Unspecified blood glucose measurement,1151.472822,19:11
