# Diabetes prediction with Synthea data

###### Mostly from https://github.com/IBM/example-health-machine-learning/blob/master/diabetes-prediction.ipynb

### Load data into pandas dataframes

In [2]:
import pandas as pd 
import numpy as np

# Load the Synthea CSV exports into pandas dataframes.
data_dir = "../../data/synthea/"

# Resolve every CSV path against the shared data directory.
# NOTE(review): `observatios_file` is misspelled; the name is kept as-is so that
# any later cell referring to it still resolves.
conditions_file, medications_file, observatios_file, patients_file = (
    data_dir + csv_name
    for csv_name in (
        "conditions.csv",
        "medications.csv",
        "observations.csv",
        "patients.csv",
    )
)

# One dataframe per CSV file.
df_cond = pd.read_csv(conditions_file)
df_med = pd.read_csv(medications_file)
df_obs = pd.read_csv(observatios_file)
df_pat = pd.read_csv(patients_file)

In [9]:
# Preview the first rows of the observations table.
df_obs.head()

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,VALUE,UNITS,TYPE
0,2011-03-31T02:00:17Z,7d3e489a-7789-9cd6-2a1b-711074af481b,814174f3-2e0e-1625-de48-9c40732c9149,8302-2,Body Height,167.0,cm,numeric
1,2011-03-31T02:00:17Z,7d3e489a-7789-9cd6-2a1b-711074af481b,814174f3-2e0e-1625-de48-9c40732c9149,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,3.0,{score},numeric
2,2011-03-31T02:00:17Z,7d3e489a-7789-9cd6-2a1b-711074af481b,814174f3-2e0e-1625-de48-9c40732c9149,29463-7,Body Weight,71.1,kg,numeric
3,2011-03-31T02:00:17Z,7d3e489a-7789-9cd6-2a1b-711074af481b,814174f3-2e0e-1625-de48-9c40732c9149,39156-5,Body Mass Index,25.5,kg/m2,numeric
4,2011-03-31T02:00:17Z,7d3e489a-7789-9cd6-2a1b-711074af481b,814174f3-2e0e-1625-de48-9c40732c9149,59576-9,Body mass index (BMI) [Percentile] Per age and...,83.6,%,numeric


In [29]:
# Preview the first rows of the patients table.
df_pat.head()

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE
0,7d3e489a-7789-9cd6-2a1b-711074af481b,1993-01-28,,999-95-8631,S99916705,X24646789X,Mr.,Jon665,Pacocha935,,...,Lawrence Massachusetts US,942 Fahey Overpass Apt 21,Natick,Massachusetts,Middlesex County,,42.309347,-71.349633,569019.69,2293.12
1,a3795ec8-54f3-e99e-a4b1-4c067f3141d7,1971-12-01,,999-62-4431,S99941017,X38787090X,Mr.,Dick869,Streich926,,...,Swansea Massachusetts US,1064 Hickle View Apt 7,Chicopee,Massachusetts,Hampden County,1020.0,42.198239,-72.554752,18755.46,0.0
2,3829c803-1f4c-74ed-0d8f-36e502cadd0f,2005-01-07,,999-21-2332,,,,Cordell41,Eichmann909,,...,Chelmsford Massachusetts US,560 Ritchie Way Suite 68,Swansea,Massachusetts,Bristol County,,41.748125,-71.182914,361770.0,2768.96
3,d7acfddb-f4c2-69f4-2081-ad1fb8490448,1990-07-04,,999-53-1990,S99932677,X67053099X,Mrs.,Cheri871,Oberbrunner298,,...,Cambridge Massachusetts US,268 Hansen Loaf Apt 62,Lowell,Massachusetts,Middlesex County,1850.0,42.66252,-71.368933,703332.77,5551.19
4,474766f3-ee93-f5d6-84c3-db38ba803394,2012-04-03,,999-57-2653,,,,Desmond566,O'Conner199,,...,Cohasset Massachusetts US,831 Schumm Lock Apt 62,Westborough,Massachusetts,Worcester County,,42.253951,-71.563825,206450.27,2284.86


### Feature selection

Select the features of interest from the observations, identified by their codes:

- Systolic blood pressure readings (code 8480-6).
- Diastolic blood pressure readings (code 8462-4).
- HDL cholesterol readings (code 2085-9).
- LDL cholesterol readings (code 18262-6).
- BMI (body mass index) readings (code 39156-5).


In [89]:
def feature_selection_obs(df, code):
    """Return the de-duplicated readings recorded under one observation code.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that has at least the columns CODE, PATIENT, DATE and VALUE.
    code : object
        Value compared for equality against the CODE column.

    Returns
    -------
    pandas.DataFrame
        The PATIENT, DATE and VALUE columns of the matching rows, with exact
        duplicate rows removed and a fresh 0..n-1 index.
    """
    wanted_columns = ["PATIENT", "DATE", "VALUE"]
    matches_code = df["CODE"] == code
    # Row filter and column projection in a single .loc selection.
    readings = df.loc[matches_code, wanted_columns]
    return readings.drop_duplicates().reset_index(drop=True)

# Select features from observations: one dataframe per observation code, with
# the generic VALUE column renamed to the feature it holds.
df_systolic, df_diastolic, df_hdl, df_ldl, df_bmi = (
    feature_selection_obs(df_obs, obs_code).rename(columns={"VALUE": feature_name})
    for obs_code, feature_name in (
        ("8480-6", "SYSTOLIC_BP"),
        ("8462-4", "DIASTOLIC_BP"),
        ("2085-9", "HDL"),
        ("18262-6", "LDL"),
        ("39156-5", "BMI"),
    )
)

In [90]:
# Row count of each feature dataframe, in selection order.
tuple(len(frame) for frame in (df_systolic, df_diastolic, df_hdl, df_ldl, df_bmi))

(83540, 83541, 26900, 26900, 57880)

Merge the dataframes on PATIENT and DATE (inner join for now, to avoid dealing with missing values).

In [91]:
# Join the feature dataframes one at a time on the shared key columns; an inner
# join keeps only the (PATIENT, DATE) pairs present in every frame joined so far.
merge_keys = ["PATIENT", "DATE"]
df1 = df_systolic.merge(df_diastolic, on=merge_keys, how="inner")
df2 = df1.merge(df_hdl, on=merge_keys, how="inner")
df3 = df2.merge(df_ldl, on=merge_keys, how="inner")
df4 = df3.merge(df_bmi, on=merge_keys, how="inner")

In [92]:
# Number of rows left after the chain of inner joins.
len(df4)

21224

In [93]:
# Preview the first rows of the merged feature table.
df4.head()

Unnamed: 0,PATIENT,DATE,SYSTOLIC_BP,DIASTOLIC_BP,HDL,LDL,BMI
0,a3795ec8-54f3-e99e-a4b1-4c067f3141d7,2013-01-16T22:06:58Z,128.0,88.0,64.5,89.2,22.4
1,a3795ec8-54f3-e99e-a4b1-4c067f3141d7,2017-12-20T22:06:58Z,116.0,71.0,64.9,78.0,22.4
2,9bafdf36-6e60-e93e-7925-c8d15a49ea62,2012-11-25T09:32:01Z,125.0,82.0,72.9,97.3,27.6
3,9bafdf36-6e60-e93e-7925-c8d15a49ea62,2015-12-13T09:32:01Z,104.0,89.0,64.3,71.3,27.6
4,9bafdf36-6e60-e93e-7925-c8d15a49ea62,2018-12-30T09:32:01Z,121.0,77.0,61.2,77.8,27.6
