In [1]:
# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/diabetes/diabetes-data.tar.Z

In [2]:
# !tar xzf diabetes-data.tar.Z

In [3]:
import pandas as pd
import numpy as np
from glob import glob
import datetime
import time

In [4]:
# merge csv files
def merge_files(SOURCE):
  """Read every file whose path starts with SOURCE and stack them into one frame.

  Files are visited in sorted path order. Each one is parsed as a
  tab-separated table without a header row and tagged with a 1-based
  ``patient`` number reflecting its position in that order.

  Returns the row-wise concatenation of all files with a fresh 0..n-1 index.
  """
  frames = [
      pd.read_csv(file_path, sep='\t', header=None).assign(patient=patient_id)
      for patient_id, file_path in enumerate(sorted(glob(SOURCE + '*')), start=1)
  ]
  return pd.concat(frames, join='outer', axis=0, sort=True, ignore_index=True)

# Path prefix of the per-patient files; merge_files appends '*' to it when globbing.
SOURCE = 'Diabetes-Data/data'
# Combined frame: columns 0-3 come from the files, 'patient' is added by merge_files.
df = merge_files(SOURCE)
print(df.shape)
df.head()

(29330, 5)


Unnamed: 0,0,1,2,3,patient
0,04-21-1991,9:09,58,100,1
1,04-21-1991,9:09,33,9,1
2,04-21-1991,9:09,34,13,1
3,04-21-1991,17:08,62,119,1
4,04-21-1991,17:08,33,7,1


In [5]:
# Drop nulls: remove every row that has a missing value in any column
df = df.dropna()
print(df.shape)

(29264, 5)


In [6]:
df[2].value_counts()

33    9485
34    3830
58    3518
62    3160
60    2771
48    1883
35    1053
57     990
64     904
65     331
67     326
63     219
66     154
70     139
56     119
71      98
72      94
69      68
61      66
68      34
59      20
36       1
4        1
Name: 2, dtype: int64

In [7]:
# Legend for the event codes stored in column 2, one "code = description" entry per line.
# The text starts and ends with a newline.
codes = """
33 = Regular insulin dose
34 = NPH insulin dose
35 = UltraLente insulin dose
48 = Unspecified blood glucose measurement
57 = Unspecified blood glucose measurement
58 = Pre-breakfast blood glucose measurement
59 = Post-breakfast blood glucose measurement
60 = Pre-lunch blood glucose measurement
61 = Post-lunch blood glucose measurement
62 = Pre-supper blood glucose measurement
63 = Post-supper blood glucose measurement
64 = Pre-snack blood glucose measurement
65 = Hypoglycemic symptoms
66 = Typical meal ingestion
67 = More-than-usual meal ingestion
68 = Less-than-usual meal ingestion
69 = Typical exercise activity
70 = More-than-usual exercise activity
71 = Less-than-usual exercise activity
72 = Unspecified special event
"""

In [8]:
# Build {numeric code: description} from the "NN = description" lines of `codes`.
# The first and last split pieces are the empty strings around the leading and
# trailing newline, so they are sliced off.
code_dict = {
    int(number): description
    for number, description in (
        entry.split(' = ') for entry in codes.split('\n')[1:-1]
    )
}
code_dict

{33: 'Regular insulin dose',
 34: 'NPH insulin dose',
 35: 'UltraLente insulin dose',
 48: 'Unspecified blood glucose measurement',
 57: 'Unspecified blood glucose measurement',
 58: 'Pre-breakfast blood glucose measurement',
 59: 'Post-breakfast blood glucose measurement',
 60: 'Pre-lunch blood glucose measurement',
 61: 'Post-lunch blood glucose measurement',
 62: 'Pre-supper blood glucose measurement',
 63: 'Post-supper blood glucose measurement',
 64: 'Pre-snack blood glucose measurement',
 65: 'Hypoglycemic symptoms',
 66: 'Typical meal ingestion',
 67: 'More-than-usual meal ingestion',
 68: 'Less-than-usual meal ingestion',
 69: 'Typical exercise activity',
 70: 'More-than-usual exercise activity',
 71: 'Less-than-usual exercise activity',
 72: 'Unspecified special event'}

In [9]:
# Codes that occur in the data (column 2) but have no entry in code_dict.
# A one-sided set difference is used on purpose: a symmetric difference would
# also report documented codes that simply never occur in the data.
undefined_code = set(df[2].unique()) - set(code_dict)
undefined_code

{4, 36, 56}

In [10]:
df[df[2].isin(undefined_code)]

Unnamed: 0,0,1,2,3,patient
1707,07-21-1990,08:56,56,115,3
1712,07-22-1990,05:56,56,108,3
1722,07-23-1990,05:04,56,110,3
1731,07-24-1990,05:27,56,116,3
1740,07-25-1990,06:24,56,121,3
1749,07-26-1990,05:15,56,115,3
1764,07-27-1990,20:10,56,263,3
1848,08-09-1990,06:12,56,111,3
1943,08-21-1990,17:00,56,303,3
1947,08-21-1990,17:04,56,132,3


In [11]:
df.shape

(29264, 5)

In [12]:
# Discard every row whose code (column 2) is one of the undefined codes
undefined_rows = df.index[df[2].isin(undefined_code)]
df = df.drop(undefined_rows)
df.shape

(29143, 5)

In [13]:
# Codes 48 and 57 share the description "Unspecified blood glucose measurement",
# so fold 48 into 57.
df[2] = df[2].replace({48: 57})

In [14]:
# Some dates are wrong (Richmond's code): print every value in column 0 that
# does not parse as a month-day-year date.
for d in df[0]:
    try:
        datetime.datetime.strptime(d, '%m-%d-%Y')
    except (ValueError, TypeError):
        # ValueError: malformed or nonexistent date; TypeError: value is not a string.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        print(d)

06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991


In [15]:
# Look up the rows recorded on a given date. Dates are stored in column 0
# (column 1 holds the time of day), so the comparison must use column 0.
df[df[0] == '05-12-1989']

Unnamed: 0,0,1,2,3,patient


In [16]:
# Some times are wrong: print every value in column 1 that does not parse as
# an hour:minute clock time.
for t in df[1]:
    try:
        datetime.datetime.strptime(t, '%H:%M')
    except (ValueError, TypeError):
        # ValueError: malformed or out-of-range time; TypeError: value is not a string.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        print(t)


56:35
56:35
56:35
188:00
188:00


In [17]:
df[df[0] =='06-31-1991']

Unnamed: 0,0,1,2,3,patient
6505,06-31-1991,07:50,58,149,20
6506,06-31-1991,07:50,33,4,20
6507,06-31-1991,07:50,34,24,20
6508,06-31-1991,13:30,60,162,20
6509,06-31-1991,13:30,33,5,20
6510,06-31-1991,19:45,62,213,20
6511,06-31-1991,19:45,33,11,20


In [18]:
df[df[1].isin(['56:35', '188:00'])]

Unnamed: 0,0,1,2,3,patient
28070,04-09-1991,56:35,58,237,67
28071,04-09-1991,56:35,33,16,67
28072,04-09-1991,56:35,34,40,67
28171,04-21-1991,188:00,62,128,67
28172,04-21-1991,188:00,33,14,67


In [19]:
# dropping the abnormal values: rows dated 06-31-1991 and rows whose time is
# 56:35 or 188:00
invalid_date = df[0] == '06-31-1991'
invalid_time = df[1].isin(['56:35', '188:00'])
df = df.drop(df.index[invalid_date | invalid_time])
df.shape

(29131, 5)

In [20]:
# create timestamp feature
# Join the date (column 0) and time (column 1) text and parse the whole column
# in one vectorized call instead of a Python-level strptime per row.
# astype(str) mirrors the string formatting of the two fields before parsing.
df['timestamp'] = pd.to_datetime(
    df[0].astype(str) + ' ' + df[1].astype(str),
    format='%m-%d-%Y %H:%M',
)
df.shape

(29131, 6)

In [21]:
df.dtypes

0                    object
1                    object
2                     int64
3                    object
patient               int64
timestamp    datetime64[ns]
dtype: object

In [22]:
# Determine why df[3] is an object
df[3].unique()

array([100, 9, 13, 119, 7, 123, 216, 10, 2, 211, 257, 11, 129, 239, 14, 4,
       340, 5, 67, 206, 288, 77, 228, 259, 256, 8, 109, 96, 200, 128, 0,
       192, 263, 81, 179, 88, 185, 104, 86, 60, 163, 147, 207, 305, 151,
       85, 133, 63, 183, 282, 91, 229, 121, 251, 135, 87, 255, 343, 97,
       136, 82, 110, 236, 175, 108, 300, 76, 65, 43, 130, 6, 182, 117, 75,
       69, 201, 244, 125, 92, 122, 162, 270, 297, 148, 79, 246, 165, 149,
       232, 312, 115, 180, 295, 15, 220, 64, 94, 273, 240, 217, 187, 3,
       46, 127, 313, 102, 134, 103, 120, 227, 16, 105, 287, 113, 142, 173,
       70, 153, 141, 146, 202, 111, 212, 171, 107, 35, 156, 106, 55, 204,
       74, 233, 194, 159, 164, 72, 139, 68, 262, 126, 152, 66, 269, 184,
       219, 84, 132, 172, 112, 168, 114, 58, 265, 205, 80, 51, 193, 306,
       303, 197, 181, 101, 198, 267, 154, 166, 222, 59, 225, 169, 177,
       145, 61, 271, 258, 89, 188, 191, 226, 158, 178, 189, 203, 195, 261,
       118, 160, 208, 54, 237, 62, 247, 144, 

In [23]:
# Column 3 holds a mix of ints, floats, strings and some unusable values.
# Convert it to numeric; errors='coerce' turns anything unparseable into NaN.
df[3] = pd.to_numeric(df[3], errors='coerce')

In [24]:
# number of resulting NaNs
df[3].isnull().sum()

8

In [25]:
df.shape

(29131, 6)

In [26]:
# Remove the rows left with NaN after the numeric coercion of column 3.
# Note: dropna() without `subset` drops a row if ANY column is missing.
df = df.dropna()
df.shape

(29123, 6)

In [27]:
df.dtypes

0                    object
1                    object
2                     int64
3                   float64
patient               int64
timestamp    datetime64[ns]
dtype: object

In [28]:
# Parse the month-day-year strings in column 0 into datetime values
df[0] = pd.to_datetime(df[0], format='%m-%d-%Y')

In [29]:
df.dtypes

0            datetime64[ns]
1                    object
2                     int64
3                   float64
patient               int64
timestamp    datetime64[ns]
dtype: object

In [30]:
df.describe()

Unnamed: 0,2,3,patient
count,29123.0,29123.0,29123.0
mean,47.03832,79.130258,36.4903
std,13.629464,93.453944,20.117226
min,33.0,0.0,1.0
25%,33.0,6.0,21.0
50%,57.0,22.0,34.0
75%,60.0,141.0,55.0
max,72.0,501.0,70.0


In [31]:
df.describe(exclude='number')

Unnamed: 0,0,1,timestamp
count,29123,29123,29123
unique,1140,1291,14649
top,1990-09-19 00:00:00,08:00,1989-04-07 08:00:00
freq,126,3100,15
first,1988-03-27 00:00:00,,1988-03-27 08:00:00
last,1991-09-23 00:00:00,,1991-09-23 21:10:00


In [32]:
# Distinct values of column 3 in ascending order
my_list = sorted(df[3].unique().tolist())

In [33]:
my_list

[0.0,
 1.0,
 1.5,
 2.0,
 2.5,
 3.0,
 3.5,
 4.0,
 4.5,
 5.0,
 6.0,
 6.5,
 7.0,
 7.5,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 103.0,
 104.0,
 105.0,
 106.0,
 107.0,
 108.0,
 109.0,
 110.0,
 111.0,
 112.0,
 113.0,
 114.0,
 115.0,
 116.0,
 117.0,
 118.0,
 119.0,
 120.0,
 121.0,
 122.0,
 123.0,
 124.0,
 125.0,
 126.0,
 127.0,
 128.0,
 129.0,
 130.0,
 131.0,
 132.0,
 133.0,
 1

In [34]:
df.head()

Unnamed: 0,0,1,2,3,patient,timestamp
0,1991-04-21,9:09,58,100.0,1,1991-04-21 09:09:00
1,1991-04-21,9:09,33,9.0,1,1991-04-21 09:09:00
2,1991-04-21,9:09,34,13.0,1,1991-04-21 09:09:00
3,1991-04-21,17:08,62,119.0,1,1991-04-21 17:08:00
4,1991-04-21,17:08,33,7.0,1,1991-04-21 17:08:00


In [35]:
# Reshape to wide format: one row per (patient, timestamp), one column per code
# (column 2), cell values taken from column 3.
# NOTE: pivot_table aggregates with its default (mean) when several rows share
# the same patient, timestamp and code.
dfpiv = df.pivot_table(index=['patient','timestamp'], columns=2, values=3)
dfpiv

Unnamed: 0_level_0,2,33,34,35,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72
patient,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1991-04-21 09:09:00,9.0,13.0,,,100.0,,,,,,,,,,,,,,
1,1991-04-21 17:08:00,7.0,,,,,,,,119.0,,,,,,,,,,
1,1991-04-21 22:51:00,,,,123.0,,,,,,,,,,,,,,,
1,1991-04-22 07:35:00,10.0,13.0,,,216.0,,,,,,,,,,,,,,
1,1991-04-22 13:40:00,2.0,,,,,,,,,,,,,,,,,,
1,1991-04-22 16:56:00,7.0,,,,,,,,211.0,,,,,,,,,,
1,1991-04-23 07:25:00,11.0,13.0,,,257.0,,,,,,,,,,,,,,
1,1991-04-23 17:25:00,7.0,,,,,,,,129.0,,,,,,,,,,
1,1991-04-24 07:52:00,10.0,14.0,,,239.0,,,,,,,,,,,,,,
1,1991-04-24 12:00:00,4.0,,,,,,,,,,,,,,,,,,


In [36]:
# Replace the numeric code column labels with their descriptions from code_dict
dfpiv = dfpiv.rename(columns=code_dict)
dfpiv

Unnamed: 0_level_0,2,Regular insulin dose,NPH insulin dose,UltraLente insulin dose,Unspecified blood glucose measurement,Pre-breakfast blood glucose measurement,Post-breakfast blood glucose measurement,Pre-lunch blood glucose measurement,Post-lunch blood glucose measurement,Pre-supper blood glucose measurement,Post-supper blood glucose measurement,Pre-snack blood glucose measurement,Hypoglycemic symptoms,Typical meal ingestion,More-than-usual meal ingestion,Less-than-usual meal ingestion,Typical exercise activity,More-than-usual exercise activity,Less-than-usual exercise activity,Unspecified special event
patient,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1991-04-21 09:09:00,9.0,13.0,,,100.0,,,,,,,,,,,,,,
1,1991-04-21 17:08:00,7.0,,,,,,,,119.0,,,,,,,,,,
1,1991-04-21 22:51:00,,,,123.0,,,,,,,,,,,,,,,
1,1991-04-22 07:35:00,10.0,13.0,,,216.0,,,,,,,,,,,,,,
1,1991-04-22 13:40:00,2.0,,,,,,,,,,,,,,,,,,
1,1991-04-22 16:56:00,7.0,,,,,,,,211.0,,,,,,,,,,
1,1991-04-23 07:25:00,11.0,13.0,,,257.0,,,,,,,,,,,,,,
1,1991-04-23 17:25:00,7.0,,,,,,,,129.0,,,,,,,,,,
1,1991-04-24 07:52:00,10.0,14.0,,,239.0,,,,,,,,,,,,,,
1,1991-04-24 12:00:00,4.0,,,,,,,,,,,,,,,,,,


In [39]:
# Number of non-missing entries in each exercise column
for exercise_column in (
    'Typical exercise activity',
    'More-than-usual exercise activity',
    'Less-than-usual exercise activity',
):
    print(dfpiv[exercise_column].notna().sum())

68
139
98


In [41]:
# Number of non-missing entries in each insulin-dose column
for insulin_column in (
    'Regular insulin dose',
    'NPH insulin dose',
    'UltraLente insulin dose',
):
    print(dfpiv[insulin_column].notna().sum())

9469
3821
1053
