In [73]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/diabetes/diabetes-data.tar.Z

--2019-08-27 19:22:57--  https://archive.ics.uci.edu/ml/machine-learning-databases/diabetes/diabetes-data.tar.Z
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 187551 (183K) [application/x-httpd-php]
Saving to: ‘diabetes-data.tar.Z.1’


2019-08-27 19:22:59 (371 KB/s) - ‘diabetes-data.tar.Z.1’ saved [187551/187551]



In [0]:
!tar xzf diabetes-data.tar.Z

## File Names and format:

(1) Date in MM-DD-YYYY format

(2) Time in XX:YY format

(3) Code

(4) Value

In [0]:
import numpy as np
import pandas as pd
from glob import glob
from datetime import datetime

In [104]:
# merge csv files
def merge_files(SOURCE):
  files_list = sorted(glob(SOURCE + '*'))
  df_list = []
  for num, path in enumerate(files_list):
    df = pd.read_csv(path, sep='\t', header=None)
    df['patient_id'] = num+1
    df_list.append(df)
  df = pd.concat(df_list, join='outer', ignore_index=True, axis=0, sort=True)
  
  return df

SOURCE = 'Diabetes-Data/data'
df = merge_files(SOURCE)
print(df.shape)
df.head()

(29330, 5)


Unnamed: 0,0,1,2,3,patient_id
0,04-21-1991,9:09,58,100,1
1,04-21-1991,9:09,33,9,1
2,04-21-1991,9:09,34,13,1
3,04-21-1991,17:08,62,119,1
4,04-21-1991,17:08,33,7,1


In [90]:
df.isnull().sum()

0             33
1              0
2              0
3             33
patient_id     0
dtype: int64

In [91]:
df = df.dropna()
df.shape

(29264, 5)

In [92]:
# check if date and time can be transformed into datetime
print('Incorrect dates:')
for d in df[0]:
  try:
    datetime.strptime(d, '%m-%d-%Y')
  except:
    print(d)
print('\nIncorrect times:')
for t in df[1]:
  try:
    datetime.strptime(t, '%H:%M')
  except:
    print(t)
print('\nIncorrect values:')
# check if all values are integers
for i in df[3]:
  try:
    float(i)
  except:
    print(i)

Incorrect dates:
06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991
06-31-1991

Incorrect times:
56:35
56:35
56:35
188:00
188:00

Incorrect values:
0Hi
0Hi
0Hi
0Hi
0Hi
0Lo
0Hi
0''


In [93]:
codes = """
33 = Regular insulin dose
34 = NPH insulin dose
35 = UltraLente insulin dose
48 = Unspecified blood glucose measurement
57 = Unspecified blood glucose measurement
58 = Pre-breakfast blood glucose measurement
59 = Post-breakfast blood glucose measurement
60 = Pre-lunch blood glucose measurement
61 = Post-lunch blood glucose measurement
62 = Pre-supper blood glucose measurement
63 = Post-supper blood glucose measurement
64 = Pre-snack blood glucose measurement
65 = Hypoglycemic symptoms
66 = Typical meal ingestion
67 = More-than-usual meal ingestion
68 = Less-than-usual meal ingestion
69 = Typical exercise activity
70 = More-than-usual exercise activity
71 = Less-than-usual exercise activity
72 = Unspecified special event
"""
code_dict = {}
for code in codes.split('\n')[1:-1]:
    key,val = code.split(' = ')
    code_dict[int(key)] = val
code_dict

{33: 'Regular insulin dose',
 34: 'NPH insulin dose',
 35: 'UltraLente insulin dose',
 48: 'Unspecified blood glucose measurement',
 57: 'Unspecified blood glucose measurement',
 58: 'Pre-breakfast blood glucose measurement',
 59: 'Post-breakfast blood glucose measurement',
 60: 'Pre-lunch blood glucose measurement',
 61: 'Post-lunch blood glucose measurement',
 62: 'Pre-supper blood glucose measurement',
 63: 'Post-supper blood glucose measurement',
 64: 'Pre-snack blood glucose measurement',
 65: 'Hypoglycemic symptoms',
 66: 'Typical meal ingestion',
 67: 'More-than-usual meal ingestion',
 68: 'Less-than-usual meal ingestion',
 69: 'Typical exercise activity',
 70: 'More-than-usual exercise activity',
 71: 'Less-than-usual exercise activity',
 72: 'Unspecified special event'}

In [94]:
# figure out missing codes
non_standard_codes = set(code_dict.keys()).symmetric_difference(set(df[2].unique()))
non_standard_codes

{4, 36, 56}

In [95]:
df[df[2].isin(non_standard_codes)].index

Int64Index([  3,   8,  18,  27,  36,  45,  60, 144, 239, 243,
            ...
            137, 113, 124, 274, 100,  32,  40,  41,   6,  10],
           dtype='int64', length=121)

In [103]:
df[df[3].isin(['0Hi', '0Lo', "0''"])].shape

(8, 5)

In [105]:
# remove non standard codes (121 rows)
df = df.drop(df[df[2].isin(non_standard_codes)].index)
# remove the values that are not correct
df = df.drop(df[df[0] =='06-31-1991'].index) # 7
df = df.drop(df[df[1].isin(['56:35', '188:00'])].index) # 5
df = df.drop(df[df[3].isin(['0Hi', '0Lo', "0''"])].index) # 8
# standardize duplicate codes
df[2] = df[2].replace(48, 57)
df.shape

(29189, 5)

In [51]:
# create new feature with timestamp
df['timestamp'] = df.apply(lambda x: datetime.strptime(f'{x[0]} {x[1]}', '%m-%d-%Y %H:%M'), axis=1)
# convert date to datetime
df = df.rename(columns={ 0:'date', 1:'time', 2:'code', 3:'value'})
df['value'] = df['value'].astype(float)
df.dtypes

date                  object
time                  object
code                   int64
value                float64
patient_id             int64
timestamp     datetime64[ns]
dtype: object

In [79]:
print(df.shape)
df.head()

(22491, 5)


Unnamed: 0,0,1,2,3,patient_id
12,04-23-1991,7:25,58,257,1
20,04-24-1991,12:00,33,4,1
22,04-24-1991,22:09,57,340,1
23,04-24-1991,22:09,33,5,1
28,04-25-1991,17:24,62,206,1


In [106]:
# Unspecified special event (code 72) 54 rows
# 94
df[(df[2]==72)].shape

(94, 5)

In [107]:
# Hypoglycemic symptoms 229 rows
# 331
df[(df[2]==65)].shape

(331, 5)

In [108]:
# meal ingestion 343 rows
# 514
df[(df[2]==66)|(df[2]==67)|(df[2]==68)].shape

(514, 5)

In [109]:
# exercise activity 194 rows
# 331
df[(df[2]==69)|(df[2]==70)|(df[2]==71)].shape

(305, 5)

In [0]:
df = pd.read_csv('Diabetes-Data/data-21', sep='\t', header=None)
df = df.rename(columns={0:'date', 1:'time',2:'code',3:'value'})
df['date'] = pd.to_datetime(df['date'], format='%m-%d-%Y', infer_datetime_format=True)
print(df.shape)
df.head()

(517, 4)


Unnamed: 0,date,time,code,value
0,1989-09-03,08:00,58,125
1,1989-09-03,08:00,33,10
2,1989-09-03,08:00,34,18
3,1989-09-03,12:00,60,193
4,1989-09-03,12:00,33,10


In [0]:
df.describe(include='all')

Unnamed: 0,date,time,code,value,patient
count,29264,29264,29264.0,29264.0,29264.0
unique,1141,1295,,741.0,
top,09-19-1990,08:00,,6.0,
freq,126,3101,,1522.0,
mean,,,46.496104,,33.491457
std,,,13.370059,,19.254279
min,,,4.0,,1.0
25%,,,33.0,,18.0
50%,,,48.0,,33.0
75%,,,60.0,,50.0


In [0]:
df = pd.read_csv('Diabetes-Data/data-43', sep='\t', header=None)
df = df.rename(columns={0:'date', 1:'time',2:'code',3:'value'})
print(df.shape)
df.head()

(300, 4)


Unnamed: 0,date,time,code,value
0,07-13-1990,11:36,57,84
1,07-13-1990,11:39,33,3
2,07-13-1990,16:43,65,0
3,07-13-1990,16:44,66,0
4,07-13-1990,16:44,62,180


In [0]:
df.describe(include='all')

Unnamed: 0,0
count,942
unique,942
top,08-19-1991\t19:30\t62\t285
freq,1


In [0]:
df = pd.read_csv('Diabetes-Data/data-03')
df.head(10)

Unnamed: 0,07-21-1990	06:43	58	202
0,07-21-1990\t07:03\t33\t4
1,07-21-1990\t07:03\t35\t9
2,07-21-1990\t08:56\t56\t115
3,07-21-1990\t10:57\t60\t134
4,07-21-1990\t16:14\t62\t100
5,07-21-1990\t16:16\t35\t8
6,07-21-1990\t20:12\t64\t156
7,07-22-1990\t05:56\t56\t108
8,07-22-1990\t05:58\t58\t92
9,07-22-1990\t06:00\t33\t3


In [0]:
df_index = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/diabetes/Index')
df_index

Unnamed: 0,Index of diabetes
0,02 Dec 1996 115 Index
1,29 May 1995 682 README
2,30 Aug 1993 187551 diabetes-data.tar.Z


In [0]:
# sdjfdskjfh
"""sdfsdf"""