In [1]:
! mkdir _data

In [3]:
! wget -P _data https://archive.ics.uci.edu/ml/machine-learning-databases/diabetes/diabetes-data.tar.Z

--2019-08-26 11:46:34--  https://archive.ics.uci.edu/ml/machine-learning-databases/diabetes/diabetes-data.tar.Z
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 187551 (183K) [application/x-httpd-php]
Saving to: ‘_data/diabetes-data.tar.Z’


2019-08-26 11:46:34 (913 KB/s) - ‘_data/diabetes-data.tar.Z’ saved [187551/187551]



In [13]:
! tar xzf _data/diabetes-data.tar.Z -C _data

In [317]:
# Imports 
import glob
import pandas as pd
import datetime
import time
import numpy as np

In [359]:
# Read every matching data file (tab-separated, no header row) and tag its rows
# with a 1-based patient number taken from the file's position in sorted order.
all_files = sorted(glob.glob('_data/Diabetes-Data/data*'))
df_list = [
    pd.read_csv(path, sep='\t', header=None).assign(patient_id=position)
    for position, path in enumerate(all_files, start=1)
]
# Stack the per-file frames into one table with a fresh 0..n-1 row index.
df = pd.concat(df_list, axis=0, ignore_index=True)
df.shape

(29330, 5)

In [360]:
# Keep only the rows in which every field is populated.
df = df.dropna(axis=0, how='any')
df

Unnamed: 0,0,1,2,3,patient_id
0,04-21-1991,9:09,58,100,1
1,04-21-1991,9:09,33,9,1
2,04-21-1991,9:09,34,13,1
3,04-21-1991,17:08,62,119,1
4,04-21-1991,17:08,33,7,1
...,...,...,...,...,...
29325,05-09-1989,08:00,33,1,70
29326,05-09-1989,08:00,34,7,70
29327,05-10-1989,08:00,34,7,70
29328,05-11-1989,08:00,34,7,70


In [361]:
# Tally how many rows carry each code in column 2, most frequent first.
event_codes = df[2]
event_codes.value_counts()

33    9485
34    3830
58    3518
62    3160
60    2771
48    1883
35    1053
57     990
64     904
65     331
67     326
63     219
66     154
70     139
56     119
71      98
72      94
69      68
61      66
68      34
59      20
36       1
4        1
Name: 2, dtype: int64

In [362]:
# Event-code legend, one "<code> = <description>" entry per line.
# Codes 48 and 57 carry the same description.
# NOTE(review): presumably copied from the dataset's own documentation — confirm the source.
codes = """
33 = Regular insulin dose
34 = NPH insulin dose
35 = UltraLente insulin dose
48 = Unspecified blood glucose measurement
57 = Unspecified blood glucose measurement
58 = Pre-breakfast blood glucose measurement
59 = Post-breakfast blood glucose measurement
60 = Pre-lunch blood glucose measurement
61 = Post-lunch blood glucose measurement
62 = Pre-supper blood glucose measurement
63 = Post-supper blood glucose measurement
64 = Pre-snack blood glucose measurement
65 = Hypoglycemic symptoms
66 = Typical meal ingestion
67 = More-than-usual meal ingestion
68 = Less-than-usual meal ingestion
69 = Typical exercise activity
70 = More-than-usual exercise activity
71 = Less-than-usual exercise activity
72 = Unspecified special event
"""

In [363]:
# Turn each "<code> = <description>" line of the legend into an int -> str entry.
# The [1:-1] slice drops the empty strings produced by the leading and trailing newline.
legend_lines = codes.split('\n')[1:-1]
code_dict = {
    int(number): description
    for number, description in (entry.split(' = ') for entry in legend_lines)
}
code_dict

{33: 'Regular insulin dose',
 34: 'NPH insulin dose',
 35: 'UltraLente insulin dose',
 48: 'Unspecified blood glucose measurement',
 57: 'Unspecified blood glucose measurement',
 58: 'Pre-breakfast blood glucose measurement',
 59: 'Post-breakfast blood glucose measurement',
 60: 'Pre-lunch blood glucose measurement',
 61: 'Post-lunch blood glucose measurement',
 62: 'Pre-supper blood glucose measurement',
 63: 'Post-supper blood glucose measurement',
 64: 'Pre-snack blood glucose measurement',
 65: 'Hypoglycemic symptoms',
 66: 'Typical meal ingestion',
 67: 'More-than-usual meal ingestion',
 68: 'Less-than-usual meal ingestion',
 69: 'Typical exercise activity',
 70: 'More-than-usual exercise activity',
 71: 'Less-than-usual exercise activity',
 72: 'Unspecified special event'}

In [364]:
# Codes that occur in the data but have no entry in the legend.
# A one-sided set difference is used rather than a symmetric difference, so that
# legend codes which merely never occur in the data are not reported as non-standard.
non_standard_codes = set(df[2].unique()) - set(code_dict)
non_standard_codes

{4, 36, 56}

In [365]:
# Show the rows whose code is in the non-standard set.
has_unknown_code = df[2].isin(non_standard_codes)
df[has_unknown_code]

Unnamed: 0,0,1,2,3,patient_id
1707,07-21-1990,08:56,56,115,3
1712,07-22-1990,05:56,56,108,3
1722,07-23-1990,05:04,56,110,3
1731,07-24-1990,05:27,56,116,3
1740,07-25-1990,06:24,56,121,3
...,...,...,...,...,...
23926,07-11-1990,21:15,56,52,57
23934,07-13-1990,21:53,56,162,57
23935,07-13-1990,21:56,56,91,57
28944,08-24-1990,13:02,56,258,69


In [366]:
# Discard the rows carrying non-standard codes.
unknown_code_rows = df.index[df[2].isin(non_standard_codes)]
df = df.drop(index=unknown_code_rows)

In [367]:
# Codes 48 and 57 are both labelled "Unspecified blood glucose measurement";
# fold 48 into 57 so that measurement is represented by a single code.
df[2] = df[2].replace({48: 57})

In [368]:
# Some dates are wrong: print every value in column 0 that does not parse as a
# real calendar date in MM-DD-YYYY form.
for d in df[0]:
    try:
        datetime.datetime.strptime(d, '%m-%d-%Y')
    except (ValueError, TypeError):
        # ValueError: malformed or impossible date; TypeError: non-string value.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        print(d)

06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991


In [369]:
# Look up the rows dated 05-12-1989. Dates live in column 0 (column 1 holds the
# time of day), so the comparison is made against column 0.
df[df[0] == '05-12-1989']

Unnamed: 0,0,1,2,3,patient_id


In [370]:
# Some times are wrong: print every value in column 1 that does not parse as a
# 24-hour HH:MM time of day.
for t in df[1]:
    try:
        datetime.datetime.strptime(t, '%H:%M')
    except (ValueError, TypeError):
        # ValueError: malformed or out-of-range time; TypeError: non-string value.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        print(t)

56:35
56:35
56:35
188:00
188:00


In [371]:
# Inspect the rows recorded with the date 06-31-1991, which failed to parse.
df.loc[df[0].eq('06-31-1991')]

Unnamed: 0,0,1,2,3,patient_id
6505,06-31-1991,07:50,58,149,20
6506,06-31-1991,07:50,33,4,20
6507,06-31-1991,07:50,34,24,20
6508,06-31-1991,13:30,60,162,20
6509,06-31-1991,13:30,33,5,20
6510,06-31-1991,19:45,62,213,20
6511,06-31-1991,19:45,33,11,20


In [372]:
# Inspect the rows whose time of day failed to parse.
invalid_times = ['56:35', '188:00']
df.loc[df[1].isin(invalid_times)]

Unnamed: 0,0,1,2,3,patient_id
28070,04-09-1991,56:35,58,237,67
28071,04-09-1991,56:35,33,16,67
28072,04-09-1991,56:35,34,40,67
28171,04-21-1991,188:00,62,128,67
28172,04-21-1991,188:00,33,14,67


In [373]:
# The rest of this patient's data looks normal.
df.loc[df['patient_id'].eq(20)]

Unnamed: 0,0,1,2,3,patient_id
6142,05-12-1991,06:55,58,223,20
6143,05-12-1991,06:55,33,6,20
6144,05-12-1991,06:55,34,24,20
6145,05-12-1991,11:20,60,260,20
6146,05-12-1991,11:20,33,8,20
...,...,...,...,...,...
7140,09-23-1991,12:15,33,5,20
7141,09-23-1991,17:15,62,264,20
7142,09-23-1991,17:15,33,13,20
7143,09-23-1991,21:10,57,256,20


In [374]:
# All rows recorded for patient 67.
df.loc[df['patient_id'].eq(67)]

Unnamed: 0,0,1,2,3,patient_id
27278,01-01-1991,9:10,58,235,67
27279,01-01-1991,9:10,33,18,67
27280,01-01-1991,9:10,34,40,67
27281,01-01-1991,13:40,60,195,67
27282,01-01-1991,13:40,33,10,67
...,...,...,...,...,...
28240,04-30-1991,12:05,60,64,67
28241,04-30-1991,12:05,33,10,67
28242,04-30-1991,18:10,62,100,67
28243,04-30-1991,18:10,33,14,67


In [375]:
# Only a few rows carry the unparseable date/time values, so they are removed.
has_invalid_date = df[0] == '06-31-1991'
has_invalid_time = df[1].isin(['56:35', '188:00'])
df = df.drop(index=df.index[has_invalid_date | has_invalid_time])


In [376]:
# Combine the date (column 0) and time (column 1) strings into one datetime column.
# pd.to_datetime parses the whole column in a single vectorized call, replacing the
# row-by-row strptime; the format string is unchanged.
df['timestamp'] = pd.to_datetime(df[0] + ' ' + df[1], format='%m-%d-%Y %H:%M')
df

Unnamed: 0,0,1,2,3,patient_id,timestamp
0,04-21-1991,9:09,58,100,1,1991-04-21 09:09:00
1,04-21-1991,9:09,33,9,1,1991-04-21 09:09:00
2,04-21-1991,9:09,34,13,1,1991-04-21 09:09:00
3,04-21-1991,17:08,62,119,1,1991-04-21 17:08:00
4,04-21-1991,17:08,33,7,1,1991-04-21 17:08:00
...,...,...,...,...,...,...
29325,05-09-1989,08:00,33,1,70,1989-05-09 08:00:00
29326,05-09-1989,08:00,34,7,70,1989-05-09 08:00:00
29327,05-10-1989,08:00,34,7,70,1989-05-10 08:00:00
29328,05-11-1989,08:00,34,7,70,1989-05-11 08:00:00


In [420]:
# Reshape patient 1's records into a wide table: one row per timestamp,
# one column per code (column 2), cells holding the recorded value (column 3).
is_patient_1 = df['patient_id'] == 1
patient_1 = df[is_patient_1].pivot(index='timestamp', columns=2, values=3)
patient_1

2,33,34,57,58,60,62,65
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1991-04-21 09:09:00,9,13,,100,,,
1991-04-21 17:08:00,7,,,,,119,
1991-04-21 22:51:00,,,123,,,,
1991-04-22 07:35:00,10,13,,216,,,
1991-04-22 13:40:00,2,,,,,,
...,...,...,...,...,...,...,...
1991-09-02 08:51:00,9,16,,168,,,
1991-09-02 13:00:00,4,,,,,,
1991-09-02 17:30:00,7,,,,,61,
1991-09-02 23:00:00,,,155,,,,


In [415]:
# Preview the table with numeric codes replaced by their descriptions
# (returns a renamed copy; patient_1 itself is left unchanged).
patient_1.rename(code_dict, axis='columns')

2,Regular insulin dose,NPH insulin dose,Unspecified blood glucose measurement,Pre-breakfast blood glucose measurement,Pre-lunch blood glucose measurement,Pre-supper blood glucose measurement,Hypoglycemic symptoms
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1991-04-21 09:09:00,9,13,,100,,,
1991-04-21 17:08:00,7,,,,,119,
1991-04-21 22:51:00,,,123,,,,
1991-04-22 07:35:00,10,13,,216,,,
1991-04-22 13:40:00,2,,,,,,
...,...,...,...,...,...,...,...
1991-09-02 08:51:00,9,16,,168,,,
1991-09-02 13:00:00,4,,,,,,
1991-09-02 17:30:00,7,,,,,61,
1991-09-02 23:00:00,,,155,,,,


In [416]:
# Re-display the code -> description mapping for reference.
code_dict

{33: 'Regular insulin dose',
 34: 'NPH insulin dose',
 35: 'UltraLente insulin dose',
 48: 'Unspecified blood glucose measurement',
 57: 'Unspecified blood glucose measurement',
 58: 'Pre-breakfast blood glucose measurement',
 59: 'Post-breakfast blood glucose measurement',
 60: 'Pre-lunch blood glucose measurement',
 61: 'Post-lunch blood glucose measurement',
 62: 'Pre-supper blood glucose measurement',
 63: 'Post-supper blood glucose measurement',
 64: 'Pre-snack blood glucose measurement',
 65: 'Hypoglycemic symptoms',
 66: 'Typical meal ingestion',
 67: 'More-than-usual meal ingestion',
 68: 'Less-than-usual meal ingestion',
 69: 'Typical exercise activity',
 70: 'More-than-usual exercise activity',
 71: 'Less-than-usual exercise activity',
 72: 'Unspecified special event'}

In [418]:
# Codes 57 through 64 are the blood glucose measurement codes in the legend.
glucose_indices = [*range(57, 65)]
# Disabled: row-wise mean over the glucose columns.
#patient_1['Glucose measurement'] = patient_1.apply(lambda x: x[glucose_indices].mean(), axis=1)

In [381]:
# Preview the table with descriptive column names; labels without an entry in
# code_dict are kept unchanged.
patient_1.rename(columns=lambda label: code_dict.get(label, label))

2,Regular insulin dose,NPH insulin dose,Unspecified blood glucose measurement,Pre-breakfast blood glucose measurement,Pre-lunch blood glucose measurement,Pre-supper blood glucose measurement,Hypoglycemic symptoms,Glucose measurement
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1991-04-21 09:09:00,9,13,,100,,,,100.0
1991-04-21 17:08:00,7,,,,,119,,119.0
1991-04-21 22:51:00,,,123,,,,,123.0
1991-04-22 07:35:00,10,13,,216,,,,216.0
1991-04-22 13:40:00,2,,,,,,,
...,...,...,...,...,...,...,...,...
1991-09-02 08:51:00,9,16,,168,,,,168.0
1991-09-02 13:00:00,4,,,,,,,
1991-09-02 17:30:00,7,,,,,61,,61.0
1991-09-02 23:00:00,,,155,,,,,155.0


In [422]:
# Select the glucose-measurement columns, one column per code in glucose_indices.
# reindex() creates all-NaN columns for codes this patient never recorded; indexing
# each row with the full label list raises KeyError on pandas >= 1.0 when any label
# is missing. astype(float) gives a uniformly numeric frame, NaN where nothing was
# recorded.
patient_1.reindex(columns=glucose_indices).astype(float)

2,57,58,59,60,61,62,63,64
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1991-04-21 09:09:00,,100.0,,,,,,
1991-04-21 17:08:00,,,,,,119.0,,
1991-04-21 22:51:00,123.0,,,,,,,
1991-04-22 07:35:00,,216.0,,,,,,
1991-04-22 13:40:00,,,,,,,,
...,...,...,...,...,...,...,...,...
1991-09-02 08:51:00,,168.0,,,,,,
1991-09-02 13:00:00,,,,,,,,
1991-09-02 17:30:00,,,,,,61.0,,
1991-09-02 23:00:00,155.0,,,,,,,


In [425]:
# Display the pivoted table for patient 1 (columns are still the numeric codes).
patient_1

2,33,34,57,58,60,62,65
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1991-04-21 09:09:00,9,13,,100,,,
1991-04-21 17:08:00,7,,,,,119,
1991-04-21 22:51:00,,,123,,,,
1991-04-22 07:35:00,10,13,,216,,,
1991-04-22 13:40:00,2,,,,,,
...,...,...,...,...,...,...,...
1991-09-02 08:51:00,9,16,,168,,,
1991-09-02 13:00:00,4,,,,,,
1991-09-02 17:30:00,7,,,,,61,
1991-09-02 23:00:00,,,155,,,,
