In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from scipy.fftpack import fft, ifft,rfft
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, RepeatedKFold
from joblib import dump, load
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
insulin_data_df=pd.read_csv('InsulinData.csv',low_memory=False,usecols=['Date','Time','BWZ Carb Input (grams)'])
print(insulin_data_df)
# insulin_data_df=pd.read_csv('~\InsulinData.csv',low_memory=False,usecols=['Date','Time','BWZ Carb Input (grams)'])

            Date      Time  BWZ Carb Input (grams)
0      2/12/2018  13:20:53                     NaN
1      2/12/2018  13:20:48                     NaN
2      2/12/2018  13:18:48                     NaN
3      2/12/2018  13:18:48                     NaN
4      2/12/2018  13:12:33                     NaN
...          ...       ...                     ...
41430  7/24/2017  19:00:01                     NaN
41431  7/24/2017  18:59:44                     NaN
41432  7/24/2017  18:59:44                     NaN
41433  7/24/2017  18:59:44                     NaN
41434  7/24/2017  18:59:42                     NaN

[41435 rows x 3 columns]


In [3]:
cgm_data_df=pd.read_csv('CGMData.csv',low_memory=False,usecols=['Date','Time','Sensor Glucose (mg/dL)'])
print(cgm_data_df)
# cgm_data_df=pd.read_csv('~\CGMData.csv',low_memory=False,usecols=['Date','Time','Sensor Glucose (mg/dL)'])

            Date      Time  Sensor Glucose (mg/dL)
0      2/12/2018  13:22:27                   118.0
1      2/12/2018  13:17:27                   122.0
2      2/12/2018  13:12:27                     NaN
3      2/12/2018  13:07:27                     NaN
4      2/12/2018  13:02:27                     NaN
...          ...       ...                     ...
55338  7/25/2017  12:28:54                   311.0
55339  7/25/2017  12:23:54                   311.0
55340  7/25/2017  12:18:54                   309.0
55341  7/25/2017  12:13:54                   310.0
55342  7/25/2017  12:08:54                   314.0

[55343 rows x 3 columns]


In [4]:
insulin_data_df['date_time_stamp']=pd.to_datetime(insulin_data_df['Date'] + ' ' + insulin_data_df['Time'])
cgm_data_df['date_time_stamp']=pd.to_datetime(cgm_data_df['Date'] + ' ' + cgm_data_df['Time'])
print(insulin_data_df)
print(cgm_data_df)

            Date      Time  BWZ Carb Input (grams)     date_time_stamp
0      2/12/2018  13:20:53                     NaN 2018-02-12 13:20:53
1      2/12/2018  13:20:48                     NaN 2018-02-12 13:20:48
2      2/12/2018  13:18:48                     NaN 2018-02-12 13:18:48
3      2/12/2018  13:18:48                     NaN 2018-02-12 13:18:48
4      2/12/2018  13:12:33                     NaN 2018-02-12 13:12:33
...          ...       ...                     ...                 ...
41430  7/24/2017  19:00:01                     NaN 2017-07-24 19:00:01
41431  7/24/2017  18:59:44                     NaN 2017-07-24 18:59:44
41432  7/24/2017  18:59:44                     NaN 2017-07-24 18:59:44
41433  7/24/2017  18:59:44                     NaN 2017-07-24 18:59:44
41434  7/24/2017  18:59:42                     NaN 2017-07-24 18:59:42

[41435 rows x 4 columns]
            Date      Time  Sensor Glucose (mg/dL)     date_time_stamp
0      2/12/2018  13:22:27                   118.0 

In [5]:
insulin_data_df_1=pd.read_csv('Insulin_patient2.csv',low_memory=False,usecols=['Date','Time','BWZ Carb Input (grams)'])
# insulin_data_df_1=pd.read_csv('~\Insulin_patient2.csv',low_memory=False,usecols=['Date','Time','BWZ Carb Input (grams)'])

In [6]:
cgm_data_df_1=pd.read_csv('CGM_patient2.csv',low_memory=False,usecols=['Date','Time','Sensor Glucose (mg/dL)'])
# cgm_data_df_1=pd.read_csv('~\CGM_patient2.csv',low_memory=False,usecols=['Date','Time','Sensor Glucose (mg/dL)'])
print(cgm_data_df_1)

             Date      Time  Sensor Glucose (mg/dL)
0      2018-03-07  11:30:48                   172.0
1      2018-03-07  11:25:48                   172.0
2      2018-03-07  11:20:48                   172.0
3      2018-03-07  11:15:48                   170.0
4      2018-03-07  11:10:48                   176.0
...           ...       ...                     ...
33243  2017-09-05  11:06:21                   226.0
33244  2017-09-05  11:01:21                   214.0
33245  2017-09-05  10:56:21                   201.0
33246  2017-09-05  10:51:21                   183.0
33247  2017-09-05  10:46:21                   182.0

[33248 rows x 3 columns]


In [7]:
insulin_data_df_1['date_time_stamp']=pd.to_datetime(insulin_data_df_1['Date'] + ' ' + insulin_data_df_1['Time'])
cgm_data_df_1['date_time_stamp']=pd.to_datetime(cgm_data_df_1['Date'] + ' ' + cgm_data_df_1['Time'])

In [8]:
def createmealdata(insulin_data_df,cgm_data_df,dateidentifier):
    insulin_df=insulin_data_df.copy()
    #print("1",insulin_df)
    insulin_df=insulin_df.set_index('date_time_stamp')
    #print("2",insulin_df)
    find_timestamp_with_2_5_hours_df=insulin_df.sort_values(by='date_time_stamp',ascending=True).dropna().reset_index()
    #print("3",find_timestamp_with_2_5_hours_df)
    find_timestamp_with_2_5_hours_df['BWZ Carb Input (grams)'].replace(0.0,np.nan,inplace=True)
    #print("4",find_timestamp_with_2_5_hours_df)
    find_timestamp_with_2_5_hours_df=find_timestamp_with_2_5_hours_df.dropna()
    #print("5",find_timestamp_with_2_5_hours_df)
    find_timestamp_with_2_5_hours_df=find_timestamp_with_2_5_hours_df.reset_index().drop(columns='index')
    #print("6",find_timestamp_with_2_5_hours_df)
    valid_timestamp_list=[]
    value=0
    for idx,i in enumerate(find_timestamp_with_2_5_hours_df['date_time_stamp']):
        try:
            value=(find_timestamp_with_2_5_hours_df['date_time_stamp'][idx+1]-i).seconds / 60.0
            if value >= 120:
                valid_timestamp_list.append(i)
        except KeyError:
            break
    #print(valid_timestamp_list)
    
    list1=[]
    if dateidentifier==1:
        for idx,i in enumerate(valid_timestamp_list):
            start=pd.to_datetime(i - timedelta(minutes=30))
            end=pd.to_datetime(i + timedelta(minutes=120))
            get_date=i.date().strftime('%#m/%#d/%Y')
            list1.append(cgm_data_df.loc[cgm_data_df['Date']==get_date].set_index('date_time_stamp').between_time(start_time=start.strftime('%#H:%#M:%#S'),end_time=end.strftime('%#H:%#M:%#S'))['Sensor Glucose (mg/dL)'].values.tolist())
        return pd.DataFrame(list1)
    else:
        for idx,i in enumerate(valid_timestamp_list):
            start=pd.to_datetime(i - timedelta(minutes=30))
            end=pd.to_datetime(i + timedelta(minutes=120))
            get_date=i.date().strftime('%Y-%m-%d')
            print(get_date)
            print(cgm_data_df['Date'])
            list1.append(cgm_data_df.loc[cgm_data_df['Date']==get_date].set_index('date_time_stamp').between_time(start_time=start.strftime('%H:%M:%S'),end_time=end.strftime('%H:%M:%S'))['Sensor Glucose (mg/dL)'].values.tolist())
        return pd.DataFrame(list1)
    
        

In [9]:
meal_data=createmealdata(insulin_data_df,cgm_data_df,1)
meal_data=meal_data.iloc[:,0:30]
print(meal_data)

        0      1      2      3      4      5      6      7      8      9   \
0    312.0  311.0  311.0  311.0  309.0  310.0  314.0    NaN    NaN    NaN   
1    196.0  203.0  198.0  195.0  190.0  184.0  178.0  169.0  164.0  168.0   
2    278.0  283.0  284.0  274.0  267.0  267.0  269.0  274.0  277.0  270.0   
3     81.0   77.0   74.0   67.0   70.0   72.0   74.0   75.0   71.0   67.0   
4    209.0  210.0  209.0  210.0  210.0  213.0  216.0  212.0  213.0  210.0   
..     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
598  202.0  197.0  191.0  185.0  197.0  206.0  209.0  207.0  174.0  168.0   
599  208.0  215.0  212.0  206.0  203.0  196.0  190.0  180.0  184.0  180.0   
600  107.0  106.0  123.0  128.0  137.0  147.0  145.0  142.0    NaN    NaN   
601  177.0  194.0  200.0  201.0  189.0  162.0  166.0  173.0  176.0  169.0   
602    NaN   93.0   92.0   86.0   84.0   80.0   87.0   98.0  105.0  113.0   

     ...     20     21     22     23     24     25     26     27     28    

In [10]:
meal_data1=createmealdata(insulin_data_df_1,cgm_data_df_1,2)
meal_data1=meal_data1.iloc[:,0:30]
print(meal_data1)

2017-09-05
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-09-05
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-09-06
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-09-06
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017

0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-09-26
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-09-27
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-09-27
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
3324

2017-10-22
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-10-22
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-10-23
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-10-23
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017

0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-11-25
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-11-25
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2017-11-25
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
3324

2018-02-09
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2018-02-09
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2018-02-10
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017-09-05
33246    2017-09-05
33247    2017-09-05
Name: Date, Length: 33248, dtype: object
2018-02-10
0        2018-03-07
1        2018-03-07
2        2018-03-07
3        2018-03-07
4        2018-03-07
            ...    
33243    2017-09-05
33244    2017-09-05
33245    2017

### No-Meal data extraction

In [11]:
def createnomealdata(insulin_data_df,cgm_data_df):
    insulin_no_meal_df=insulin_data_df.copy()
    test1_df=insulin_no_meal_df.sort_values(by='date_time_stamp',ascending=True).replace(0.0,np.nan).dropna().copy()
    test1_df=test1_df.reset_index().drop(columns='index')
    valid_timestamp=[]
    for idx,i in enumerate(test1_df['date_time_stamp']):
        try:
            value=(test1_df['date_time_stamp'][idx+1]-i).seconds//3600
            if value >=4:
                valid_timestamp.append(i)
        except KeyError:
            break
    dataset=[]
    for idx, i in enumerate(valid_timestamp):
        iteration_dataset=1
        try:
            length_of_24_dataset=len(cgm_data_df.loc[(cgm_data_df['date_time_stamp']>=valid_timestamp[idx]+pd.Timedelta(hours=2))&(cgm_data_df['date_time_stamp']<valid_timestamp[idx+1])])//24
            while (iteration_dataset<=length_of_24_dataset):
                if iteration_dataset==1:
                    dataset.append(cgm_data_df.loc[(cgm_data_df['date_time_stamp']>=valid_timestamp[idx]+pd.Timedelta(hours=2))&(cgm_data_df['date_time_stamp']<valid_timestamp[idx+1])]['Sensor Glucose (mg/dL)'][:iteration_dataset*24].values.tolist())
                    iteration_dataset+=1
                else:
                    dataset.append(cgm_data_df.loc[(cgm_data_df['date_time_stamp']>=valid_timestamp[idx]+pd.Timedelta(hours=2))&(cgm_data_df['date_time_stamp']<valid_timestamp[idx+1])]['Sensor Glucose (mg/dL)'][(iteration_dataset-1)*24:(iteration_dataset)*24].values.tolist())
                    iteration_dataset+=1
        except IndexError:
            break
    return pd.DataFrame(dataset)

In [12]:
no_meal_data=createnomealdata(insulin_data_df,cgm_data_df)
no_meal_data1=createnomealdata(insulin_data_df_1,cgm_data_df_1)

### Create Feature matrix from extracted meal data. Following are the features created,
1. Fast Fourier Transform (FFT) of 2nd highest and 3rd highest power as well as their indexes.
2. tau_time - time interval between max glucose level and min glucose level.
3. 1st order differential - 1st order differential of glucose level w.r.t time
4. 2nd order differential - 2nd order differential of glucose level w.r.t time
5. difference in glucose normalized - change in glucose between min and max levels over min glucose level.

In [13]:
def createmealfeaturematrix(meal_data):
    index=meal_data.isna().sum(axis=1).replace(0,np.nan).dropna().where(lambda x:x>6).dropna().index
    meal_data_cleaned=meal_data.drop(meal_data.index[index]).reset_index().drop(columns='index')
    meal_data_cleaned=meal_data_cleaned.interpolate(method='linear',axis=1)
    index_to_drop_again=meal_data_cleaned.isna().sum(axis=1).replace(0,np.nan).dropna().index
    meal_data_cleaned=meal_data_cleaned.drop(meal_data.index[index_to_drop_again]).reset_index().drop(columns='index')
    meal_data_cleaned['tau_time']=(meal_data_cleaned.iloc[:,22:25].idxmin(axis=1)-meal_data_cleaned.iloc[:,5:19].idxmax(axis=1))*5
    meal_data_cleaned['difference_in_glucose_normalized']=(meal_data_cleaned.iloc[:,5:19].max(axis=1)-meal_data_cleaned.iloc[:,22:25].min(axis=1))/(meal_data_cleaned.iloc[:,22:25].min(axis=1))
    meal_data_cleaned=meal_data_cleaned.dropna().reset_index().drop(columns='index')
    power_first_max=[]
    index_first_max=[]
    power_second_max=[]
    index_second_max=[]
    for i in range(len(meal_data_cleaned)):
        array=abs(rfft(meal_data_cleaned.iloc[:,0:30].iloc[i].values.tolist())).tolist()
        sorted_array=abs(rfft(meal_data_cleaned.iloc[:,0:30].iloc[i].values.tolist())).tolist()
        sorted_array.sort()
        power_first_max.append(sorted_array[-2])
        power_second_max.append(sorted_array[-3])
        index_first_max.append(array.index(sorted_array[-2]))
        index_second_max.append(array.index(sorted_array[-3]))
    meal_feature_matrix=pd.DataFrame()
    meal_feature_matrix['tau_time']=meal_data_cleaned['tau_time']
    meal_feature_matrix['difference_in_glucose_normalized']=meal_data_cleaned['difference_in_glucose_normalized']
    meal_feature_matrix['power_first_max']=power_first_max
    meal_feature_matrix['power_second_max']=power_second_max
    meal_feature_matrix['index_first_max']=index_first_max
    meal_feature_matrix['index_second_max']=index_second_max
    tm=meal_data_cleaned.iloc[:,22:25].idxmin(axis=1)
    maximum=meal_data_cleaned.iloc[:,5:19].idxmax(axis=1)
    list1=[]
    second_differential_data=[]
    standard_deviation=[]
    for i in range(len(meal_data_cleaned)):
        list1.append(np.diff(meal_data_cleaned.iloc[:,maximum[i]:tm[i]].iloc[i].tolist()).max())
        second_differential_data.append(np.diff(np.diff(meal_data_cleaned.iloc[:,maximum[i]:tm[i]].iloc[i].tolist())).max())
        standard_deviation.append(np.std(meal_data_cleaned.iloc[i]))
    meal_feature_matrix['1stDifferential']=list1
    meal_feature_matrix['2ndDifferential']=second_differential_data
    return meal_feature_matrix

In [14]:
meal_feature_matrix=createmealfeaturematrix(meal_data)
meal_feature_matrix1=createmealfeaturematrix(meal_data1)
meal_feature_matrix=pd.concat([meal_feature_matrix,meal_feature_matrix1]).reset_index().drop(columns='index')

### Just like Meal feature matrix, Create Feature matrix from extracted non meal data. (logic is same.) Following are the features created,
1. Fast Fourier Transform (FFT) of 2nd highest and 3rd highest power as well as their indexes.
2. tau_time - time interval between max glucose level and min glucose level.
3. 1st order differential - 1st order differential of glucose level w.r.t time
4. 2nd order differential - 2nd order differential of glucose level w.r.t time
5. difference in glucose normalized - change in glucose between min and max levels over min glucose level.

In [15]:
def createnomealfeaturematrix(non_meal_data):
    index_to_remove_non_meal=non_meal_data.isna().sum(axis=1).replace(0,np.nan).dropna().where(lambda x:x>5).dropna().index
    non_meal_data_cleaned=non_meal_data.drop(non_meal_data.index[index_to_remove_non_meal]).reset_index().drop(columns='index')
    non_meal_data_cleaned=non_meal_data_cleaned.interpolate(method='linear',axis=1)
    index_to_drop_again=non_meal_data_cleaned.isna().sum(axis=1).replace(0,np.nan).dropna().index
    non_meal_data_cleaned=non_meal_data_cleaned.drop(non_meal_data_cleaned.index[index_to_drop_again]).reset_index().drop(columns='index')
    non_meal_feature_matrix=pd.DataFrame()
    non_meal_data_cleaned['tau_time']=(24-non_meal_data_cleaned.iloc[:,0:19].idxmax(axis=1))*5
    non_meal_data_cleaned['difference_in_glucose_normalized']=(non_meal_data_cleaned.iloc[:,0:19].max(axis=1)-non_meal_data_cleaned.iloc[:,24])/(non_meal_data_cleaned.iloc[:,24])
    power_first_max,index_first_max,power_second_max,index_second_max=[],[],[],[]
    for i in range(len(non_meal_data_cleaned)):
        array=abs(rfft(non_meal_data_cleaned.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array=abs(rfft(non_meal_data_cleaned.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array.sort()
        power_first_max.append(sorted_array[-2])
        power_second_max.append(sorted_array[-3])
        index_first_max.append(array.index(sorted_array[-2]))
        index_second_max.append(array.index(sorted_array[-3]))
    non_meal_feature_matrix['tau_time']=non_meal_data_cleaned['tau_time']
    non_meal_feature_matrix['difference_in_glucose_normalized']=non_meal_data_cleaned['difference_in_glucose_normalized']
    non_meal_feature_matrix['power_first_max']=power_first_max
    non_meal_feature_matrix['power_second_max']=power_second_max
    non_meal_feature_matrix['index_first_max']=index_first_max
    non_meal_feature_matrix['index_second_max']=index_second_max
    first_differential_data=[]
    second_differential_data=[]
    for i in range(len(non_meal_data_cleaned)):
        first_differential_data.append(np.diff(non_meal_data_cleaned.iloc[:,0:24].iloc[i].tolist()).max())
        second_differential_data.append(np.diff(np.diff(non_meal_data_cleaned.iloc[:,0:24].iloc[i].tolist())).max())
    non_meal_feature_matrix['1stDifferential']=first_differential_data
    non_meal_feature_matrix['2ndDifferential']=second_differential_data
    return non_meal_feature_matrix

In [16]:
non_meal_feature_matrix=createnomealfeaturematrix(no_meal_data)
non_meal_feature_matrix1=createnomealfeaturematrix(no_meal_data1)
non_meal_feature_matrix=pd.concat([non_meal_feature_matrix,non_meal_feature_matrix1]).reset_index().drop(columns='index')

### Subject the data onto Decision Tree Classifier to predict 0 / 1 label using KFold Cross Validation.

In [1]:

meal_feature_matrix['label']=1
non_meal_feature_matrix['label']=0
total_data=pd.concat([meal_feature_matrix,non_meal_feature_matrix]).reset_index().drop(columns='index')
dataset=shuffle(total_data,random_state=1).reset_index().drop(columns='index')
kfold = KFold(n_splits=10,shuffle=True,random_state=1)
principaldata=dataset.drop(columns='label')

accuracy, f1, precision, recall = [], [], [], []
model=DecisionTreeClassifier(criterion="entropy")
for train_index, test_index in kfold.split(principaldata):
    X_train,X_test,y_train,y_test = principaldata.loc[train_index],principaldata.loc[test_index],\
    dataset.label.loc[train_index],dataset.label.loc[test_index]
    model.fit(X_train,y_train)
    y_test_pred=model.predict(X_test)
    accuracy.append(accuracy_score(y_test,y_test_pred))
    f1.append(f1_score(y_test,y_test_pred))
    precision.append(precision_score(y_test,y_test_pred))
    recall.append(recall_score(y_test,y_test_pred))

NameError: name 'meal_feature_matrix' is not defined

In [18]:
print('Accuracy score is',np.mean(accuracy)*100)
print('F1 Score score is',np.mean(f1)*100)
print('Precision score is',np.mean(precision)*100)
print('Recall score is',np.mean(recall)*100)


Accuracy score is 97.55002858776443
F1 Score score is 95.35143742886112
Precision score is 95.34137603084923
Recall score is 95.4079684933136


In [19]:
classifier=DecisionTreeClassifier(criterion='entropy')
X, y= principaldata, dataset['label']
classifier.fit(X,y)
dump(classifier, 'DecisionTreeClassifier.pickle')

['DecisionTreeClassifier.pickle']