In [76]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import cleaning_fns as cf
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from fancyimpute import MICE
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
sns.set_style("whitegrid")

In [3]:
df = pd.read_csv('type1-and-type2-data-cleaned.csv')

In [4]:
df['diagnosisType'].value_counts(dropna=False)

type1    328
type2     58
Name: diagnosisType, dtype: int64

In [5]:
cf.get_bad_columns(df)

['basal.count',
 'bolus.count',
 'deviceEvent.count',
 'pumpSettings.count',
 'wizard.count',
 'bolus.nUniqueDeviceIds',
 'basal.nDaysWithData',
 'bolus.nDaysWithData',
 'deviceEvent.nDaysWithData',
 'pumpSettings.nDaysWithData',
 'wizard.nDaysWithData',
 'cgmSettings.count',
 'cgmSettings.nDaysWithData']

In [6]:
bad_cols = set(cf.get_bad_columns(df))

In [7]:
columns = set(df.columns)

In [8]:
good_columns = list(columns.difference(bad_cols))

In [9]:
df.isnull().mean()

hashID                        0.000000
diagnosisType                 0.000000
currentAge                    0.000000
diagnosisAge                  0.000000
yearsLivingWithDiabetes       0.000000
basal.count                   0.559585
bolus.count                   0.559585
cbg.count                     0.253886
deviceEvent.count             0.406736
pumpSettings.count            0.559585
smbg.count                    0.295337
upload.count                  0.000000
wizard.count                  0.562176
cbg.nUniqueDeviceIds          0.253886
bolus.nUniqueDeviceIds        0.559585
smbg.nUniqueDeviceIds         0.295337
basal.nDaysWithData           0.559585
bolus.nDaysWithData           0.559585
cbg.nDaysWithData             0.253886
deviceEvent.nDaysWithData     0.406736
pumpSettings.nDaysWithData    0.559585
smbg.nDaysWithData            0.295337
upload.nDaysWithData          0.000000
wizard.nDaysWithData          0.562176
cgmSettings.count             0.831606
cgmSettings.nDaysWithData

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386 entries, 0 to 385
Data columns (total 26 columns):
hashID                        386 non-null object
diagnosisType                 386 non-null object
currentAge                    386 non-null int64
diagnosisAge                  386 non-null int64
yearsLivingWithDiabetes       386 non-null int64
basal.count                   170 non-null float64
bolus.count                   170 non-null float64
cbg.count                     288 non-null float64
deviceEvent.count             229 non-null float64
pumpSettings.count            170 non-null float64
smbg.count                    272 non-null float64
upload.count                  386 non-null int64
wizard.count                  169 non-null float64
cbg.nUniqueDeviceIds          288 non-null float64
bolus.nUniqueDeviceIds        170 non-null float64
smbg.nUniqueDeviceIds         272 non-null float64
basal.nDaysWithData           170 non-null float64
bolus.nDaysWithData           170 non-

In [11]:
df.head()

Unnamed: 0,hashID,diagnosisType,currentAge,diagnosisAge,yearsLivingWithDiabetes,basal.count,bolus.count,cbg.count,deviceEvent.count,pumpSettings.count,smbg.count,upload.count,wizard.count,cbg.nUniqueDeviceIds,bolus.nUniqueDeviceIds,smbg.nUniqueDeviceIds,basal.nDaysWithData,bolus.nDaysWithData,cbg.nDaysWithData,deviceEvent.nDaysWithData,pumpSettings.nDaysWithData,smbg.nDaysWithData,upload.nDaysWithData,wizard.nDaysWithData,cgmSettings.count,cgmSettings.nDaysWithData
0,92c2847d30b6514ac8a6385f04b3bf3c92ee87a8b89cb8...,type2,59,20,38,,,,,,71.0,1,,,,1.0,,,,,,48.0,1,,,
1,6b720ecfc94a90f7c75a39f80a97eac27969ca95a2e6fb...,type2,65,45,19,,,,,,303.0,22,,,,1.0,,,,,,41.0,14,,,
2,e0ef840bd07fc79f10b8e21a7f6f3d53ef3b7594d94ec0...,type1,32,15,17,2712.0,1268.0,29811.0,637.0,24.0,1797.0,6,1257.0,12.0,12.0,12.0,213.0,212.0,110.0,150.0,23.0,213.0,5,212.0,,
3,8974377383af45404d560543b58749b240c78885ca0710...,type1,47,14,32,,,10279.0,2.0,,19.0,1,,1.0,,1.0,,,112.0,2.0,,16.0,1,,,
4,f92417af60cc16f254600f310df4d15d4a50973de26223...,type2,62,53,8,,,,,,750.0,1,,,,1.0,,,,,,400.0,1,,,


In [12]:
good_columns

['upload.count',
 'upload.nDaysWithData',
 'yearsLivingWithDiabetes',
 'smbg.nDaysWithData',
 'currentAge',
 'hashID',
 'smbg.count',
 'diagnosisAge',
 'cbg.count',
 'diagnosisType',
 'cbg.nDaysWithData',
 'smbg.nUniqueDeviceIds',
 'cbg.nUniqueDeviceIds']

In [13]:
df['smbg.count'].value_counts(dropna=False)

NaN        114
 750.0       7
 1000.0      7
 2.0         3
 709.0       3
 798.0       2
 232.0       2
 11.0        2
 111.0       2
 1.0         2
 49.0        2
 66.0        2
 2000.0      2
 45.0        2
 102.0       2
 3000.0      2
 614.0       2
 278.0       1
 596.0       1
 169.0       1
 213.0       1
 804.0       1
 368.0       1
 464.0       1
 2893.0      1
 28.0        1
 33.0        1
 338.0       1
 35.0        1
 230.0       1
          ... 
 2567.0      1
 393.0       1
 770.0       1
 1844.0      1
 970.0       1
 479.0       1
 275.0       1
 677.0       1
 509.0       1
 1138.0      1
 718.0       1
 439.0       1
 549.0       1
 457.0       1
 1859.0      1
 997.0       1
 303.0       1
 391.0       1
 642.0       1
 845.0       1
 1107.0      1
 263.0       1
 981.0       1
 477.0       1
 727.0       1
 267.0       1
 2550.0      1
 646.0       1
 725.0       1
 71.0        1
Name: smbg.count, Length: 245, dtype: int64

In [14]:
df.head(10)

Unnamed: 0,hashID,diagnosisType,currentAge,diagnosisAge,yearsLivingWithDiabetes,basal.count,bolus.count,cbg.count,deviceEvent.count,pumpSettings.count,smbg.count,upload.count,wizard.count,cbg.nUniqueDeviceIds,bolus.nUniqueDeviceIds,smbg.nUniqueDeviceIds,basal.nDaysWithData,bolus.nDaysWithData,cbg.nDaysWithData,deviceEvent.nDaysWithData,pumpSettings.nDaysWithData,smbg.nDaysWithData,upload.nDaysWithData,wizard.nDaysWithData,cgmSettings.count,cgmSettings.nDaysWithData
0,92c2847d30b6514ac8a6385f04b3bf3c92ee87a8b89cb8...,type2,59,20,38,,,,,,71.0,1,,,,1.0,,,,,,48.0,1,,,
1,6b720ecfc94a90f7c75a39f80a97eac27969ca95a2e6fb...,type2,65,45,19,,,,,,303.0,22,,,,1.0,,,,,,41.0,14,,,
2,e0ef840bd07fc79f10b8e21a7f6f3d53ef3b7594d94ec0...,type1,32,15,17,2712.0,1268.0,29811.0,637.0,24.0,1797.0,6,1257.0,12.0,12.0,12.0,213.0,212.0,110.0,150.0,23.0,213.0,5,212.0,,
3,8974377383af45404d560543b58749b240c78885ca0710...,type1,47,14,32,,,10279.0,2.0,,19.0,1,,1.0,,1.0,,,112.0,2.0,,16.0,1,,,
4,f92417af60cc16f254600f310df4d15d4a50973de26223...,type2,62,53,8,,,,,,750.0,1,,,,1.0,,,,,,400.0,1,,,
5,b96eee9a57d57b6fcf3d9326ae8372bd196706da93caa4...,type1,4,4,0,,,,,,646.0,29,,,,2.0,,,,,,66.0,14,,,
6,8a2da88b5859a2cf5635a3ca35da1d3456fadd4272f9e3...,type1,26,7,19,,,4409.0,1.0,,,1,,1.0,,,,,51.0,1.0,,,1,,,
7,73d26fe0f67c6731617902c43fba936e5ff55c36bd12ff...,type1,18,8,10,,,14149.0,,,,2294,,1.0,,,,,55.0,,,,25,,,
8,db17945b2ce9557cb456172f94d43ea170d045d92282b8...,type1,45,5,40,,,5443.0,,,,1215,,1.0,,,,,21.0,,,,20,,,
9,1476f11455ec5e9c2d2d6392545178d8c6184257134b45...,type2,39,23,16,,,344.0,,,,1,,1.0,,,,,5.0,,,,1,,,


In [15]:
df['diagnosis_bin'] = df['diagnosisType'].map(lambda x: 0 if x=='type2' else 1)

In [16]:
t2 = df['diagnosisType']=='type2' # people with type 2
t1 = df['diagnosisType']=='type1' # people with type 1

In [17]:
df[t1].isnull().mean()

hashID                        0.000000
diagnosisType                 0.000000
currentAge                    0.000000
diagnosisAge                  0.000000
yearsLivingWithDiabetes       0.000000
basal.count                   0.503049
bolus.count                   0.503049
cbg.count                     0.225610
deviceEvent.count             0.371951
pumpSettings.count            0.503049
smbg.count                    0.280488
upload.count                  0.000000
wizard.count                  0.506098
cbg.nUniqueDeviceIds          0.225610
bolus.nUniqueDeviceIds        0.503049
smbg.nUniqueDeviceIds         0.280488
basal.nDaysWithData           0.503049
bolus.nDaysWithData           0.503049
cbg.nDaysWithData             0.225610
deviceEvent.nDaysWithData     0.371951
pumpSettings.nDaysWithData    0.503049
smbg.nDaysWithData            0.280488
upload.nDaysWithData          0.000000
wizard.nDaysWithData          0.506098
cgmSettings.count             0.804878
cgmSettings.nDaysWithData

In [18]:
df[t2].isnull().mean()

hashID                        0.000000
diagnosisType                 0.000000
currentAge                    0.000000
diagnosisAge                  0.000000
yearsLivingWithDiabetes       0.000000
basal.count                   0.879310
bolus.count                   0.879310
cbg.count                     0.413793
deviceEvent.count             0.603448
pumpSettings.count            0.879310
smbg.count                    0.379310
upload.count                  0.000000
wizard.count                  0.879310
cbg.nUniqueDeviceIds          0.413793
bolus.nUniqueDeviceIds        0.879310
smbg.nUniqueDeviceIds         0.379310
basal.nDaysWithData           0.879310
bolus.nDaysWithData           0.879310
cbg.nDaysWithData             0.413793
deviceEvent.nDaysWithData     0.603448
pumpSettings.nDaysWithData    0.879310
smbg.nDaysWithData            0.379310
upload.nDaysWithData          0.000000
wizard.nDaysWithData          0.879310
cgmSettings.count             0.982759
cgmSettings.nDaysWithData

In [19]:
df[t1].T.isnull().sum()

2       2
3      11
5      16
6      14
7      16
8      16
10      2
11      2
12      2
13      2
14      2
15     14
16      2
20      2
21     12
23      2
24     12
25      2
26     16
28      2
29      0
30     16
32     16
33     11
34     13
35     14
37     16
38     16
39      2
40      2
       ..
351     0
352    16
353    16
354     2
355     2
356    16
358    16
360     5
361     2
362    12
363     2
364     0
365    14
366    16
367     0
368    16
369    16
370     5
371     2
372    16
374    16
375     2
376     5
378    12
379    16
380     5
381    16
382    16
383    16
384     0
Length: 328, dtype: int64

In [20]:
df[t2].T.isnull().sum()

0      16
1      16
4      16
9      16
17     16
18     16
19     16
22     14
27     11
31     14
36     14
41     14
72     16
77     16
79     16
90     14
110    16
112    16
115    11
123    11
130    16
133    14
139    14
142     5
143    16
147    16
165    16
166     2
168    13
171    16
177    16
184    16
212    16
220    14
222    11
251    16
254     2
262    16
287    14
294    16
299    16
301    13
302    16
308    16
309    14
310    16
312    16
314     2
315    16
316    16
323     0
336    14
339    14
357    16
359    16
373    16
377     2
385     2
dtype: int64

In [21]:
df['mean_null'] = df.isnull().mean(axis=1)

In [24]:
df[t2].isnull().mean() / df[t1].isnull().mean()

hashID                             NaN
diagnosisType                      NaN
currentAge                         NaN
diagnosisAge                       NaN
yearsLivingWithDiabetes            NaN
basal.count                   1.747962
bolus.count                   1.747962
cbg.count                     1.834110
deviceEvent.count             1.622386
pumpSettings.count            1.747962
smbg.count                    1.352324
upload.count                       NaN
wizard.count                  1.737432
cbg.nUniqueDeviceIds          1.834110
bolus.nUniqueDeviceIds        1.747962
smbg.nUniqueDeviceIds         1.352324
basal.nDaysWithData           1.747962
bolus.nDaysWithData           1.747962
cbg.nDaysWithData             1.834110
deviceEvent.nDaysWithData     1.622386
pumpSettings.nDaysWithData    1.747962
smbg.nDaysWithData            1.352324
upload.nDaysWithData               NaN
wizard.nDaysWithData          1.737432
cgmSettings.count             1.221003
cgmSettings.nDaysWithData

In [39]:
df['upload.nDaysWithData']

0        1
1       14
2        5
3        1
4        1
5       14
6        1
7       25
8       20
9        1
10      56
11       4
12      19
13       5
14       2
15       2
16      32
17       1
18       1
19       7
20       3
21       3
22       1
23       5
24       1
25      19
26       9
27       2
28      61
29       9
      ... 
356      2
357      1
358      2
359      1
360      1
361     24
362      2
363      3
364      1
365      1
366      8
367      3
368      1
369     22
370      1
371      1
372      2
373      1
374      1
375      6
376      4
377      7
378     30
379      6
380      3
381      3
382      1
383     20
384    259
385     23
Name: upload.nDaysWithData, Length: 386, dtype: int64

In [25]:
df.shape[1]

28

In [40]:
df['sum_null'] = df.isnull().sum(axis=1)
df['mean_null'] = df.isnull().mean(axis=1)


In [28]:
cbg = [column for column in df.columns if 'cbg' in str(column)]

In [29]:
df[cbg].isnull().sum(axis=1).value_counts()

0    288
3     98
dtype: int64

In [30]:
cbg_null_index = df[df[cbg]['cbg.count'].isnull()].index.tolist()

In [31]:
df['cbg_null'] = df['cbg.count'].isnull()

In [32]:
# pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])

In [33]:
pump = [column for column in df.columns if 'pump' in str(column)]
pump

['pumpSettings.count', 'pumpSettings.nDaysWithData']

In [34]:
df[pump].isnull().sum(axis=1).value_counts()

2    216
0    170
dtype: int64

In [36]:
df['pump_null'] = df['pumpSettings.count'].isnull() # missing -> 1, 

In [42]:
smbg = [column for column in df.columns if 'smbg' in str(column)]

df[smbg].isnull().sum(axis=1).value_counts()

In [47]:
df['smbg_null'] = df['smbg.count'].isnull()

In [38]:
pd.crosstab(df['diagnosisType'], df['cbg_null'], rownames=['diagnosisType'], colnames=['cbg_null'])

cbg_null,False,True
diagnosisType,Unnamed: 1_level_1,Unnamed: 2_level_1
type1,254,74
type2,34,24


In [48]:
pd.crosstab(df['diagnosisType'], df['smbg_null'], rownames=['diagnosisType'], colnames=['smbg_null'])

smbg_null,False,True
diagnosisType,Unnamed: 1_level_1,Unnamed: 2_level_1
type1,236,92
type2,36,22


In [37]:
pd.crosstab(df['diagnosisType'], df['pump_null'], rownames=['diagnosisType'], colnames=['pump_null'])

pump_null,False,True
diagnosisType,Unnamed: 1_level_1,Unnamed: 2_level_1
type1,163,165
type2,7,51


In [49]:
df.head()

Unnamed: 0,hashID,diagnosisType,currentAge,diagnosisAge,yearsLivingWithDiabetes,basal.count,bolus.count,cbg.count,deviceEvent.count,pumpSettings.count,smbg.count,upload.count,wizard.count,cbg.nUniqueDeviceIds,bolus.nUniqueDeviceIds,smbg.nUniqueDeviceIds,basal.nDaysWithData,bolus.nDaysWithData,cbg.nDaysWithData,deviceEvent.nDaysWithData,pumpSettings.nDaysWithData,smbg.nDaysWithData,upload.nDaysWithData,wizard.nDaysWithData,cgmSettings.count,cgmSettings.nDaysWithData,diagnosis_bin,mean_null,sum_null,cbg_null,pump_null,smbg_null
0,92c2847d30b6514ac8a6385f04b3bf3c92ee87a8b89cb8...,type2,59,20,38,,,,,,71.0,1,,,,1.0,,,,,,48.0,1,,,,0,0.516129,16,True,True,False
1,6b720ecfc94a90f7c75a39f80a97eac27969ca95a2e6fb...,type2,65,45,19,,,,,,303.0,22,,,,1.0,,,,,,41.0,14,,,,0,0.516129,16,True,True,False
2,e0ef840bd07fc79f10b8e21a7f6f3d53ef3b7594d94ec0...,type1,32,15,17,2712.0,1268.0,29811.0,637.0,24.0,1797.0,6,1257.0,12.0,12.0,12.0,213.0,212.0,110.0,150.0,23.0,213.0,5,212.0,,,1,0.064516,2,False,False,False
3,8974377383af45404d560543b58749b240c78885ca0710...,type1,47,14,32,,,10279.0,2.0,,19.0,1,,1.0,,1.0,,,112.0,2.0,,16.0,1,,,,1,0.354839,11,False,True,False
4,f92417af60cc16f254600f310df4d15d4a50973de26223...,type2,62,53,8,,,,,,750.0,1,,,,1.0,,,,,,400.0,1,,,,0,0.516129,16,True,True,False


In [None]:
# df['deviceEvent_null'] = df['deviceEvent.count'].isnull()
# df['deviceEvent.nDaysWithData_null'] = df['deviceEvent.nDaysWithData'].isnull()
# df['wizard_null'] = df['wizard.count'].isnull()
# df['wizard.nDaysWithData_null'] = df['wizard.nDaysWithData'].isnull()

# df.drop(['deviceEvent.count', 'deviceEvent.nDaysWithData', 'wizard.count', 'wizard.nDaysWithData'], axis=1, inplace=True)

In [51]:
(df['deviceEvent.count'].isnull() == df['deviceEvent.nDaysWithData'].isnull()).all()

True

In [53]:
(df['wizard.count'].isnull() == df['wizard.nDaysWithData'].isnull()).all()

True

In [54]:
df['device_null'] = df['deviceEvent.count'].isnull()

In [55]:
df['wizard_null'] = df['wizard.count'].isnull()

In [56]:
df.head()

Unnamed: 0,hashID,diagnosisType,currentAge,diagnosisAge,yearsLivingWithDiabetes,basal.count,bolus.count,cbg.count,deviceEvent.count,pumpSettings.count,smbg.count,upload.count,wizard.count,cbg.nUniqueDeviceIds,bolus.nUniqueDeviceIds,smbg.nUniqueDeviceIds,basal.nDaysWithData,bolus.nDaysWithData,cbg.nDaysWithData,deviceEvent.nDaysWithData,pumpSettings.nDaysWithData,smbg.nDaysWithData,upload.nDaysWithData,wizard.nDaysWithData,cgmSettings.count,cgmSettings.nDaysWithData,diagnosis_bin,mean_null,sum_null,cbg_null,pump_null,smbg_null,device_null,wizard_null
0,92c2847d30b6514ac8a6385f04b3bf3c92ee87a8b89cb8...,type2,59,20,38,,,,,,71.0,1,,,,1.0,,,,,,48.0,1,,,,0,0.516129,16,True,True,False,True,True
1,6b720ecfc94a90f7c75a39f80a97eac27969ca95a2e6fb...,type2,65,45,19,,,,,,303.0,22,,,,1.0,,,,,,41.0,14,,,,0,0.516129,16,True,True,False,True,True
2,e0ef840bd07fc79f10b8e21a7f6f3d53ef3b7594d94ec0...,type1,32,15,17,2712.0,1268.0,29811.0,637.0,24.0,1797.0,6,1257.0,12.0,12.0,12.0,213.0,212.0,110.0,150.0,23.0,213.0,5,212.0,,,1,0.064516,2,False,False,False,False,False
3,8974377383af45404d560543b58749b240c78885ca0710...,type1,47,14,32,,,10279.0,2.0,,19.0,1,,1.0,,1.0,,,112.0,2.0,,16.0,1,,,,1,0.354839,11,False,True,False,False,True
4,f92417af60cc16f254600f310df4d15d4a50973de26223...,type2,62,53,8,,,,,,750.0,1,,,,1.0,,,,,,400.0,1,,,,0,0.516129,16,True,True,False,True,True


In [57]:
df.columns

Index(['hashID', 'diagnosisType', 'currentAge', 'diagnosisAge',
       'yearsLivingWithDiabetes', 'basal.count', 'bolus.count', 'cbg.count',
       'deviceEvent.count', 'pumpSettings.count', 'smbg.count', 'upload.count',
       'wizard.count', 'cbg.nUniqueDeviceIds', 'bolus.nUniqueDeviceIds',
       'smbg.nUniqueDeviceIds', 'basal.nDaysWithData', 'bolus.nDaysWithData',
       'cbg.nDaysWithData', 'deviceEvent.nDaysWithData',
       'pumpSettings.nDaysWithData', 'smbg.nDaysWithData',
       'upload.nDaysWithData', 'wizard.nDaysWithData', 'cgmSettings.count',
       'cgmSettings.nDaysWithData', 'diagnosis_bin', 'mean_null', 'sum_null',
       'cbg_null', 'pump_null', 'smbg_null', 'device_null', 'wizard_null'],
      dtype='object')

In [71]:
droppers = ['basal.count', 'bolus.count', 'cbg.count', 'deviceEvent.count', 
            'pumpSettings.count', 'smbg.count', 'wizard.count', 'cbg.nUniqueDeviceIds', 
            'bolus.nUniqueDeviceIds', 'smbg.nUniqueDeviceIds', 'basal.nDaysWithData', 
            'bolus.nDaysWithData', 'cbg.nDaysWithData', 'deviceEvent.nDaysWithData',
            'pumpSettings.nDaysWithData', 'smbg.nDaysWithData','wizard.nDaysWithData', 
            'cgmSettings.count', 'cgmSettings.nDaysWithData']

In [62]:
bolus = [column for column in df.columns if 'bolus' in str(column)]
df['bolus_null'] = df[bolus].isnull().sum(axis=1)
df['bolus_null'] = df['bolus_null'].replace(3, 1)

basal = [column for column in df.columns if 'basal' in str(column)]
df['basal_null'] = df[basal].isnull().sum(axis=1)
df['basal_null'] = df['basal_null'].replace(2, 1)

cgm = [column for column in df.columns if 'cgm' in str(column)]
df['cgm_null'] = df[cgm].isnull().sum(axis=1)
df['cgm_null'] = df['cgm_null'].replace(2, 1)

In [63]:
df.head()

Unnamed: 0,hashID,diagnosisType,currentAge,diagnosisAge,yearsLivingWithDiabetes,basal.count,bolus.count,cbg.count,deviceEvent.count,pumpSettings.count,smbg.count,upload.count,wizard.count,cbg.nUniqueDeviceIds,bolus.nUniqueDeviceIds,smbg.nUniqueDeviceIds,basal.nDaysWithData,bolus.nDaysWithData,cbg.nDaysWithData,deviceEvent.nDaysWithData,pumpSettings.nDaysWithData,smbg.nDaysWithData,upload.nDaysWithData,wizard.nDaysWithData,cgmSettings.count,cgmSettings.nDaysWithData,diagnosis_bin,mean_null,sum_null,cbg_null,pump_null,smbg_null,device_null,wizard_null,bolus_null,basal_null,cgm_null
0,92c2847d30b6514ac8a6385f04b3bf3c92ee87a8b89cb8...,type2,59,20,38,,,,,,71.0,1,,,,1.0,,,,,,48.0,1,,,,0,0.516129,16,True,True,False,True,True,1,1,1
1,6b720ecfc94a90f7c75a39f80a97eac27969ca95a2e6fb...,type2,65,45,19,,,,,,303.0,22,,,,1.0,,,,,,41.0,14,,,,0,0.516129,16,True,True,False,True,True,1,1,1
2,e0ef840bd07fc79f10b8e21a7f6f3d53ef3b7594d94ec0...,type1,32,15,17,2712.0,1268.0,29811.0,637.0,24.0,1797.0,6,1257.0,12.0,12.0,12.0,213.0,212.0,110.0,150.0,23.0,213.0,5,212.0,,,1,0.064516,2,False,False,False,False,False,0,0,1
3,8974377383af45404d560543b58749b240c78885ca0710...,type1,47,14,32,,,10279.0,2.0,,19.0,1,,1.0,,1.0,,,112.0,2.0,,16.0,1,,,,1,0.354839,11,False,True,False,False,True,1,1,1
4,f92417af60cc16f254600f310df4d15d4a50973de26223...,type2,62,53,8,,,,,,750.0,1,,,,1.0,,,,,,400.0,1,,,,0,0.516129,16,True,True,False,True,True,1,1,1


In [64]:
refits = ['cbg_null', 'pump_null', 'smbg_null', 'device_null', 'wizard_null']

In [68]:
df['cbg_null'] = df['cbg_null'].map(lambda x: 1 if x else 0)
df['pump_null'] = df['pump_null'].map(lambda x: 1 if x else 0)
df['smbg_null'] = df['smbg_null'].map(lambda x: 1 if x else 0)
df['device_null'] = df['device_null'].map(lambda x: 1 if x else 0)
df['wizard_null'] = df['wizard_null'].map(lambda x: 1 if x else 0)

In [69]:
df.head()

Unnamed: 0,hashID,diagnosisType,currentAge,diagnosisAge,yearsLivingWithDiabetes,basal.count,bolus.count,cbg.count,deviceEvent.count,pumpSettings.count,smbg.count,upload.count,wizard.count,cbg.nUniqueDeviceIds,bolus.nUniqueDeviceIds,smbg.nUniqueDeviceIds,basal.nDaysWithData,bolus.nDaysWithData,cbg.nDaysWithData,deviceEvent.nDaysWithData,pumpSettings.nDaysWithData,smbg.nDaysWithData,upload.nDaysWithData,wizard.nDaysWithData,cgmSettings.count,cgmSettings.nDaysWithData,diagnosis_bin,mean_null,sum_null,cbg_null,pump_null,smbg_null,device_null,wizard_null,bolus_null,basal_null,cgm_null
0,92c2847d30b6514ac8a6385f04b3bf3c92ee87a8b89cb8...,type2,59,20,38,,,,,,71.0,1,,,,1.0,,,,,,48.0,1,,,,0,0.516129,16,1,1,0,1,1,1,1,1
1,6b720ecfc94a90f7c75a39f80a97eac27969ca95a2e6fb...,type2,65,45,19,,,,,,303.0,22,,,,1.0,,,,,,41.0,14,,,,0,0.516129,16,1,1,0,1,1,1,1,1
2,e0ef840bd07fc79f10b8e21a7f6f3d53ef3b7594d94ec0...,type1,32,15,17,2712.0,1268.0,29811.0,637.0,24.0,1797.0,6,1257.0,12.0,12.0,12.0,213.0,212.0,110.0,150.0,23.0,213.0,5,212.0,,,1,0.064516,2,0,0,0,0,0,0,0,1
3,8974377383af45404d560543b58749b240c78885ca0710...,type1,47,14,32,,,10279.0,2.0,,19.0,1,,1.0,,1.0,,,112.0,2.0,,16.0,1,,,,1,0.354839,11,0,1,0,0,1,1,1,1
4,f92417af60cc16f254600f310df4d15d4a50973de26223...,type2,62,53,8,,,,,,750.0,1,,,,1.0,,,,,,400.0,1,,,,0,0.516129,16,1,1,0,1,1,1,1,1


In [73]:
df.drop(droppers, axis=1, inplace=True)

In [75]:
df.head(10)

Unnamed: 0,hashID,diagnosisType,currentAge,diagnosisAge,yearsLivingWithDiabetes,upload.count,upload.nDaysWithData,diagnosis_bin,mean_null,sum_null,cbg_null,pump_null,smbg_null,device_null,wizard_null,bolus_null,basal_null,cgm_null
0,92c2847d30b6514ac8a6385f04b3bf3c92ee87a8b89cb8...,type2,59,20,38,1,1,0,0.516129,16,1,1,0,1,1,1,1,1
1,6b720ecfc94a90f7c75a39f80a97eac27969ca95a2e6fb...,type2,65,45,19,22,14,0,0.516129,16,1,1,0,1,1,1,1,1
2,e0ef840bd07fc79f10b8e21a7f6f3d53ef3b7594d94ec0...,type1,32,15,17,6,5,1,0.064516,2,0,0,0,0,0,0,0,1
3,8974377383af45404d560543b58749b240c78885ca0710...,type1,47,14,32,1,1,1,0.354839,11,0,1,0,0,1,1,1,1
4,f92417af60cc16f254600f310df4d15d4a50973de26223...,type2,62,53,8,1,1,0,0.516129,16,1,1,0,1,1,1,1,1
5,b96eee9a57d57b6fcf3d9326ae8372bd196706da93caa4...,type1,4,4,0,29,14,1,0.516129,16,1,1,0,1,1,1,1,1
6,8a2da88b5859a2cf5635a3ca35da1d3456fadd4272f9e3...,type1,26,7,19,1,1,1,0.451613,14,0,1,1,0,1,1,1,1
7,73d26fe0f67c6731617902c43fba936e5ff55c36bd12ff...,type1,18,8,10,2294,25,1,0.516129,16,0,1,1,1,1,1,1,1
8,db17945b2ce9557cb456172f94d43ea170d045d92282b8...,type1,45,5,40,1215,20,1,0.516129,16,0,1,1,1,1,1,1,1
9,1476f11455ec5e9c2d2d6392545178d8c6184257134b45...,type2,39,23,16,1,1,0,0.516129,16,0,1,1,1,1,1,1,1


In [74]:
df.to_csv('diabetes_rev0.csv', index=False)