In [110]:
import pandas as pd
import numpy as np
from datetime import timedelta
from scipy.fftpack import fft, ifft,rfft
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, RepeatedKFold
from joblib import dump, load

In [134]:
insulin_data_df=pd.read_csv('InsulinData.csv',usecols=['Date','Time','BWZ Carb Input (grams)'])
insulin_data_df

Unnamed: 0,Date,Time,BWZ Carb Input (grams)
0,2/12/2018,13:20:53,
1,2/12/2018,13:20:48,
2,2/12/2018,13:18:48,
3,2/12/2018,13:18:48,
4,2/12/2018,13:12:33,
...,...,...,...
41430,7/24/2017,19:00:01,
41431,7/24/2017,18:59:44,
41432,7/24/2017,18:59:44,
41433,7/24/2017,18:59:44,


In [133]:
cgm_data_df=pd.read_csv('CGMData.csv',usecols=['Date','Time','Sensor Glucose (mg/dL)'])
cgm_data_df

Unnamed: 0,Date,Time,Sensor Glucose (mg/dL)
0,2/12/2018,13:22:27,118.0
1,2/12/2018,13:17:27,122.0
2,2/12/2018,13:12:27,
3,2/12/2018,13:07:27,
4,2/12/2018,13:02:27,
...,...,...,...
55338,7/25/2017,12:28:54,311.0
55339,7/25/2017,12:23:54,311.0
55340,7/25/2017,12:18:54,309.0
55341,7/25/2017,12:13:54,310.0


In [113]:
insulin_data_df['date_time_stamp']=pd.to_datetime(insulin_data_df['Date'] + ' ' + insulin_data_df['Time'])
cgm_data_df['date_time_stamp']=pd.to_datetime(cgm_data_df['Date'] + ' ' + cgm_data_df['Time'])
insulin_data_df.shape[0]

41435

In [114]:
insulin_data_df_1=pd.read_csv('Insulin_patient2.csv',low_memory=False,usecols=['Date','Time','BWZ Carb Input (grams)'])
cgm_data_df_1=pd.read_csv('CGM_patient2.csv',low_memory=False,usecols=['Date','Time','Sensor Glucose (mg/dL)'])

insulin_data_df_1['date_time_stamp']=pd.to_datetime(insulin_data_df_1['Date'] + ' ' + insulin_data_df_1['Time'])
cgm_data_df_1['date_time_stamp']=pd.to_datetime(cgm_data_df_1['Date'] + ' ' + cgm_data_df_1['Time'])

insulin_data_df_1

Unnamed: 0,Date,Time,BWZ Carb Input (grams),date_time_stamp
0,2018-03-07,11:36:15,,2018-03-07 11:36:15
1,2018-03-07,11:35:56,,2018-03-07 11:35:56
2,2018-03-07,11:33:19,,2018-03-07 11:33:19
3,2018-03-07,11:33:19,,2018-03-07 11:33:19
4,2018-03-07,11:31:04,,2018-03-07 11:31:04
...,...,...,...,...
23180,2017-09-05 00:00:00,08:26:00,,2017-09-05 08:26:00
23181,2017-09-05 00:00:00,08:25:35,,2017-09-05 08:25:35
23182,2017-09-05 00:00:00,08:25:35,,2017-09-05 08:25:35
23183,2017-09-05 00:00:00,08:25:35,,2017-09-05 08:25:35


In [115]:
def createmealdata(insulin_data_df,cgm_data_df,dateidentifier):
    insulin_df=insulin_data_df.copy()
    insulin_df=insulin_df.set_index('date_time_stamp')
    find_timestamp_with_2_5_hours_df=insulin_df.sort_values(by='date_time_stamp',ascending=True).dropna().reset_index()
    find_timestamp_with_2_5_hours_df['BWZ Carb Input (grams)'].replace(0.0,np.nan)#,inplace=True)
    #print(find_timestamp_with_2_5_hours_df)
    
    find_timestamp_with_2_5_hours_df=find_timestamp_with_2_5_hours_df.dropna()
    #print(find_timestamp_with_2_5_hours_df)

    find_timestamp_with_2_5_hours_df=find_timestamp_with_2_5_hours_df.reset_index().drop(columns='index')
    #print(find_timestamp_with_2_5_hours_df)
    
    valid_timestamp_list=[]
    value=0
    for idx,i in enumerate(find_timestamp_with_2_5_hours_df['date_time_stamp']):
        try:
            value=(find_timestamp_with_2_5_hours_df['date_time_stamp'][idx+1]-i).seconds / 60.0
            #print(value,i)
            if value >= 120:
                valid_timestamp_list.append(i)
        except KeyError:
            break
    
    list1=[]
    #print(dateidentifier)
    if dateidentifier==1:
        for idx,i in enumerate(valid_timestamp_list):
            start=pd.to_datetime(i - timedelta(minutes=30))
            end=pd.to_datetime(i + timedelta(minutes=120))
            print("start",start)
            print("end",end)
            get_date=i.date().strftime("%m/%d/%Y")
            #print(get_date)
            list1.append(cgm_data_df.loc[cgm_data_df['Date']==get_date].set_index('date_time_stamp').between_time(start_time=start.strftime('%H:%M:%S'),end_time=end.strftime('%H:%M:%S'))['Sensor Glucose (mg/dL)'].values.tolist())
        return pd.DataFrame(list1)
    else:
        for idx,i in enumerate(valid_timestamp_list):
            start=pd.to_datetime(i - timedelta(minutes=30))
            end=pd.to_datetime(i + timedelta(minutes=120))
            get_date=i.date().strftime('%Y-%m-%d')
            list1.append(cgm_data_df.loc[cgm_data_df['Date']==get_date].set_index('date_time_stamp').between_time(start_time=start.strftime('%H:%M:%S'),end_time=end.strftime('%H:%M:%S'))['Sensor Glucose (mg/dL)'].values.tolist())
        return pd.DataFrame(list1)
    return 0


In [116]:
meal_data=createmealdata(insulin_data_df,cgm_data_df,1)
meal_data1=createmealdata(insulin_data_df_1,cgm_data_df_1,2)

meal_data=meal_data.iloc[:,0:24]
meal_data1=meal_data1.iloc[:,0:24]

# ### No-Meal data extraction
meal_data

start 2017-07-25 10:09:46
end 2017-07-25 12:39:46
start 2017-07-25 13:15:08
end 2017-07-25 15:45:08
start 2017-07-25 18:01:40
end 2017-07-25 20:31:40
start 2017-07-26 12:18:41
end 2017-07-26 14:48:41
start 2017-07-26 18:45:06
end 2017-07-26 21:15:06
start 2017-07-27 05:15:51
end 2017-07-27 07:45:51
start 2017-07-27 08:49:41
end 2017-07-27 11:19:41
start 2017-07-27 13:26:49
end 2017-07-27 15:56:49
start 2017-07-27 18:28:04
end 2017-07-27 20:58:04
start 2017-07-27 22:34:49
end 2017-07-28 01:04:49
start 2017-07-28 08:19:55
end 2017-07-28 10:49:55
start 2017-07-28 14:46:56
end 2017-07-28 17:16:56
start 2017-07-28 21:06:49
end 2017-07-28 23:36:49
start 2017-07-29 17:37:03
end 2017-07-29 20:07:03
start 2017-07-29 21:00:44
end 2017-07-29 23:30:44
start 2017-07-30 17:59:02
end 2017-07-30 20:29:02
start 2017-07-31 01:06:00
end 2017-07-31 03:36:00
start 2017-07-31 05:17:22
end 2017-07-31 07:47:22
start 2017-07-31 09:24:25
end 2017-07-31 11:54:25
start 2017-07-31 12:05:32
end 2017-07-31 14:35:32


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719,,,,,,,,,,,...,,,,,,,,,,
720,,,,,,,,,,,...,,,,,,,,,,
721,,,,,,,,,,,...,,,,,,,,,,
722,,,,,,,,,,,...,,,,,,,,,,


In [117]:
meal_data1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,218.0,205.0,211.0,224.0,239.0,242.0,245.0,247.0,246.0,244.0,...,258.0,250.0,243.0,243.0,243.0,226.0,214.0,201.0,183.0,182.0
1,180.0,180.0,172.0,166.0,163.0,160.0,169.0,180.0,186.0,188.0,...,237.0,236.0,226.0,221.0,219.0,219.0,221.0,224.0,225.0,226.0
2,73.0,69.0,67.0,68.0,76.0,83.0,88.0,93.0,97.0,99.0,...,100.0,108.0,119.0,127.0,137.0,142.0,144.0,147.0,154.0,148.0
3,204.0,217.0,223.0,226.0,229.0,227.0,223.0,219.0,212.0,201.0,...,156.0,145.0,134.0,127.0,121.0,118.0,120.0,122.0,126.0,129.0
4,129.0,133.0,137.0,140.0,142.0,147.0,153.0,164.0,173.0,178.0,...,167.0,180.0,189.0,193.0,190.0,183.0,178.0,175.0,173.0,172.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,179.0,172.0,164.0,153.0,147.0,147.0,149.0,150.0,149.0,144.0,...,153.0,151.0,149.0,147.0,142.0,135.0,132.0,134.0,143.0,147.0
545,115.0,120.0,130.0,132.0,130.0,129.0,126.0,128.0,131.0,139.0,...,198.0,208.0,223.0,245.0,259.0,263.0,254.0,248.0,233.0,222.0
546,159.0,161.0,164.0,169.0,174.0,179.0,182.0,184.0,186.0,194.0,...,200.0,205.0,207.0,210.0,211.0,214.0,216.0,209.0,205.0,206.0
547,195.0,192.0,181.0,165.0,139.0,133.0,125.0,126.0,136.0,143.0,...,155.0,154.0,155.0,158.0,158.0,160.0,165.0,169.0,169.0,160.0


In [118]:
def createnomealdata(insulin_data_df,cgm_data_df):
    insulin_no_meal_df=insulin_data_df.copy()
    test1_df=insulin_no_meal_df.sort_values(by='date_time_stamp',ascending=True).replace(0.0,np.nan).dropna().copy()
    test1_df=test1_df.reset_index().drop(columns='index')
    valid_timestamp=[]
    for idx,i in enumerate(test1_df['date_time_stamp']):
        try:
            value=(test1_df['date_time_stamp'][idx+1]-i).seconds//3600
            if value >=4:
                valid_timestamp.append(i)
        except KeyError:
            break
    dataset=[]
    for idx, i in enumerate(valid_timestamp):
        iteration_dataset=1
        try:
            length_of_24_dataset=len(cgm_data_df.loc[(cgm_data_df['date_time_stamp']>=valid_timestamp[idx]+pd.Timedelta(hours=2))&(cgm_data_df['date_time_stamp']<valid_timestamp[idx+1])])//24
            while (iteration_dataset<=length_of_24_dataset):
                if iteration_dataset==1:
                    dataset.append(cgm_data_df.loc[(cgm_data_df['date_time_stamp']>=valid_timestamp[idx]+pd.Timedelta(hours=2))&(cgm_data_df['date_time_stamp']<valid_timestamp[idx+1])]['Sensor Glucose (mg/dL)'][:iteration_dataset*24].values.tolist())
                    iteration_dataset+=1
                else:
                    dataset.append(cgm_data_df.loc[(cgm_data_df['date_time_stamp']>=valid_timestamp[idx]+pd.Timedelta(hours=2))&(cgm_data_df['date_time_stamp']<valid_timestamp[idx+1])]['Sensor Glucose (mg/dL)'][(iteration_dataset-1)*24:(iteration_dataset)*24].values.tolist())
                    iteration_dataset+=1
        except IndexError:
            break
    return pd.DataFrame(dataset)

In [119]:
no_meal_data=createnomealdata(insulin_data_df,cgm_data_df)
no_meal_data1=createnomealdata(insulin_data_df_1,cgm_data_df_1)
no_meal_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,102.0,81.0,71.0,63.0,59.0,58.0,54.0,51.0,48.0,47.0,...,45.0,50.0,57.0,64.0,71.0,75.0,81.0,86.0,81.0,71.0
1,63.0,58.0,59.0,62.0,65.0,70.0,73.0,77.0,82.0,85.0,...,106.0,109.0,113.0,120.0,129.0,134.0,145.0,160.0,174.0,186.0
2,255.0,259.0,268.0,281.0,292.0,304.0,315.0,323.0,334.0,342.0,...,368.0,362.0,350.0,339.0,327.0,324.0,326.0,328.0,326.0,327.0
3,321.0,318.0,316.0,316.0,300.0,289.0,288.0,276.0,260.0,249.0,...,200.0,199.0,200.0,199.0,197.0,197.0,197.0,198.0,198.0,195.0
4,192.0,192.0,191.0,193.0,196.0,196.0,194.0,193.0,193.0,192.0,...,184.0,188.0,192.0,192.0,190.0,185.0,185.0,187.0,185.0,184.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1704,114.0,111.0,107.0,116.0,126.0,141.0,150.0,144.0,140.0,131.0,...,75.0,71.0,67.0,62.0,58.0,70.0,75.0,76.0,81.0,84.0
1705,86.0,87.0,87.0,89.0,92.0,94.0,95.0,96.0,96.0,94.0,...,105.0,100.0,97.0,95.0,93.0,103.0,108.0,110.0,110.0,108.0
1706,106.0,105.0,105.0,105.0,106.0,108.0,109.0,109.0,109.0,109.0,...,98.0,99.0,98.0,97.0,95.0,93.0,91.0,90.0,89.0,87.0
1707,84.0,79.0,77.0,77.0,77.0,76.0,77.0,79.0,79.0,73.0,...,67.0,66.0,65.0,66.0,66.0,63.0,62.0,61.0,59.0,60.0


In [120]:
def createmealfeaturematrix(meal_data):
    #pd.DataFrame(meal_data).to_csv('meal.csv')
    print("meal_data",meal_data.shape)
    index=meal_data.isna().sum(axis=1).replace(0,np.nan).dropna().where(lambda x:x>6).dropna().index
    meal_data_cleaned=meal_data.drop(meal_data.index[index]).reset_index().drop(columns='index')
    meal_data_cleaned=meal_data_cleaned.interpolate(method='linear',axis=1)
    index_to_drop_again=meal_data_cleaned.isna().sum(axis=1).replace(0,np.nan).dropna().index
    meal_data_cleaned=meal_data_cleaned.drop(meal_data.index[index_to_drop_again]).reset_index().drop(columns='index')
    #meal_data_cleaned['tau_time']=(meal_data_cleaned.iloc[:,22:25].idxmin(axis=1)-meal_data_cleaned.iloc[:,5:19].idxmax(axis=1))*5
    #meal_data_cleaned['difference_in_glucose_normalized']=(meal_data_cleaned.iloc[:,5:19].max(axis=1)-meal_data_cleaned.iloc[:,22:25].min(axis=1))/(meal_data_cleaned.iloc[:,22:25].min(axis=1))
    #pd.DataFrame(meal_data_cleaned).to_csv('cleaned_meal.csv') 
    
    print("meal_data_cleaned",meal_data_cleaned.shape)
    meal_data_cleaned=meal_data_cleaned.dropna().reset_index().drop(columns='index')

    power_first_max=[]
    index_first_max=[]
    power_second_max=[]
    index_second_max=[]
    power_third_max=[]
    print("length=",len(meal_data_cleaned))
    print(meal_data_cleaned)
    
    for i in range(len(meal_data_cleaned)):
        print(i)
        array=abs(rfft(meal_data_cleaned.iloc[:,0:25].iloc[i].values.tolist())).tolist()
        sorted_array=abs(rfft(meal_data_cleaned.iloc[:,0:25].iloc[i].values.tolist())).tolist()
        sorted_array.sort()
        #print("Arr=",sorted_array)
        #print("Arr[-2]",sorted_array[-2])
        #print("Arr[-3]",sorted_array[-3])
        power_first_max.append(sorted_array[-2])
        power_second_max.append(sorted_array[-3])
        power_third_max.append(sorted_array[-4])
        index_first_max.append(array.index(sorted_array[-2]))
        index_second_max.append(array.index(sorted_array[-3]))
    meal_feature_matrix=pd.DataFrame()
    meal_feature_matrix['power_second_max']=power_second_max
    meal_feature_matrix['power_third_max']=power_third_max
    #print(meal_data_cleaned.iloc[:,5:19])
    tm=meal_data_cleaned.iloc[:,22:25].idxmin(axis=1)
    maximum=meal_data_cleaned.iloc[:,5:19].idxmax(axis=1)
    #print(maximum)
    list1=[]
    second_differential_data=[]
    standard_deviation=[]
    for i in range(len(meal_data_cleaned)):
        list1.append(np.diff(meal_data_cleaned.iloc[:,maximum[i]:tm[i]].iloc[i].tolist()).max())
        second_differential_data.append(np.diff(np.diff(meal_data_cleaned.iloc[:,maximum[i]:tm[i]].iloc[i].tolist())).max())
        standard_deviation.append(np.std(meal_data_cleaned.iloc[i]))
    meal_feature_matrix['2ndDifferential']=second_differential_data
    meal_feature_matrix['standard_deviation']=standard_deviation
    return meal_feature_matrix

In [121]:
meal_feature_matrix=createmealfeaturematrix(meal_data)

meal_feature_matrix1=createmealfeaturematrix(meal_data1)
meal_feature_matrix=pd.concat([meal_feature_matrix,meal_feature_matrix1]).reset_index().drop(columns='index')


meal_data (724, 24)
meal_data_cleaned (208, 24)
length= 208
        0      1      2      3      4      5      6      7      8      9   \
0    206.0  203.0  203.0  205.0  209.0  213.0  214.0  214.0  214.0  210.0   
1    140.0  139.0  140.0  139.0  139.0  145.0  150.0  154.0  157.0  159.0   
2    134.0  135.0  139.0  144.0  149.0  156.0  163.0  170.0  178.0  187.0   
3     76.0   72.0   70.0   67.0   64.0   63.0   65.0   69.0   76.0   86.0   
4    140.0  142.0  145.0  148.0  151.0  152.0  153.0  156.0  159.0  163.0   
..     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
203  214.0  214.0  208.0  217.0  224.0  227.0  237.0  240.0  246.0  246.0   
204  239.0  239.0  234.0  228.0  225.0  223.0  223.0  223.0  222.0  224.0   
205  171.0  174.0  177.0  179.0  182.0  179.0  181.0  165.0  158.0  155.0   
206  106.0  117.0  124.0  123.0  120.0  125.0  133.0  144.0  159.0  172.0   
207  118.0  117.0  117.0  111.0  117.0  121.0  130.0  134.0  142.0  146.0   

     ...     14

In [122]:
def createnomealfeaturematrix(non_meal_data):
    index_to_remove_non_meal=non_meal_data.isna().sum(axis=1).replace(0,np.nan).dropna().where(lambda x:x>5).dropna().index
    non_meal_data_cleaned=non_meal_data.drop(non_meal_data.index[index_to_remove_non_meal]).reset_index().drop(columns='index')
    non_meal_data_cleaned=non_meal_data_cleaned.interpolate(method='linear',axis=1)
    index_to_drop_again=non_meal_data_cleaned.isna().sum(axis=1).replace(0,np.nan).dropna().index
    non_meal_data_cleaned=non_meal_data_cleaned.drop(non_meal_data_cleaned.index[index_to_drop_again]).reset_index().drop(columns='index')
    non_meal_feature_matrix=pd.DataFrame()
    #non_meal_data_cleaned['tau_time']=(24-non_meal_data_cleaned.iloc[:,0:19].idxmax(axis=1))*5
    #non_meal_data_cleaned['difference_in_glucose_normalized']=(non_meal_data_cleaned.iloc[:,0:19].max(axis=1)-non_meal_data_cleaned.iloc[:,24])/(non_meal_data_cleaned.iloc[:,24])
    power_first_max=[]
    index_first_max=[]
    power_second_max=[]
    index_second_max=[]
    power_third_max=[]
    for i in range(len(non_meal_data_cleaned)):
        array=abs(rfft(non_meal_data_cleaned.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array=abs(rfft(non_meal_data_cleaned.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array.sort()
        power_first_max.append(sorted_array[-2])
        power_second_max.append(sorted_array[-3])
        power_third_max.append(sorted_array[-4])
        index_first_max.append(array.index(sorted_array[-2]))
        index_second_max.append(array.index(sorted_array[-3]))
    non_meal_feature_matrix['power_second_max']=power_second_max
    non_meal_feature_matrix['power_third_max']=power_third_max
    first_differential_data=[]
    second_differential_data=[]
    standard_deviation=[]
    for i in range(len(non_meal_data_cleaned)):
        first_differential_data.append(np.diff(non_meal_data_cleaned.iloc[:,0:24].iloc[i].tolist()).max())
        second_differential_data.append(np.diff(np.diff(non_meal_data_cleaned.iloc[:,0:24].iloc[i].tolist())).max())
        standard_deviation.append(np.std(non_meal_data_cleaned.iloc[i]))
    non_meal_feature_matrix['2ndDifferential']=second_differential_data
    non_meal_feature_matrix['standard_deviation']=standard_deviation
    return non_meal_feature_matrix

In [123]:
non_meal_feature_matrix=createnomealfeaturematrix(no_meal_data)
non_meal_feature_matrix1=createnomealfeaturematrix(no_meal_data1)
non_meal_feature_matrix=pd.concat([non_meal_feature_matrix,non_meal_feature_matrix1]).reset_index().drop(columns='index')

In [130]:
meal_feature_matrix['label']=1
non_meal_feature_matrix['label']=0
#pd.DataFrame(meal_feature_matrix).to_csv('mftr.csv')
#pd.DataFrame(non_meal_feature_matrix).to_csv('nmftr.csv')
total_data=pd.concat([meal_feature_matrix,non_meal_feature_matrix]).reset_index().drop(columns='index')
dataset=shuffle(total_data,random_state=1).reset_index().drop(columns='index')
#pd.DataFrame(dataset).to_csv('data_set.csv')
kfold = KFold(n_splits=10,shuffle=False)
principaldata=dataset.drop(columns='label')
#print(principaldata)
scores_rf = []
model=DecisionTreeClassifier(criterion="entropy")
for train_index, test_index in kfold.split(principaldata):
    #print(train_index,test_index)
    X_train,X_test,y_train,y_test = principaldata.loc[train_index],principaldata.loc[test_index],dataset.label.loc[train_index],dataset.label.loc[test_index]
    model.fit(X_train,y_train)
    scores_rf.append(model.score(X_test,y_test)) 
    y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred, target_names=['0','1']))
print(scores_rf)



              precision    recall  f1-score   support

           0       0.83      0.83      0.83       236
           1       0.39      0.40      0.39        65

    accuracy                           0.73       301
   macro avg       0.61      0.61      0.61       301
weighted avg       0.74      0.73      0.74       301

[0.7649006622516556, 0.7582781456953642, 0.7185430463576159, 0.7251655629139073, 0.7086092715231788, 0.7516556291390728, 0.7582781456953642, 0.7615894039735099, 0.7483443708609272, 0.7342192691029901]


In [125]:
classifier=DecisionTreeClassifier(criterion='entropy')
X, y= principaldata, dataset['label']
classifier.fit(X,y)

In [135]:
dump(classifier, 'model.pickle')

['model.pickle']