# Project: Predicting Diabetes Onset from FHIR EHR Data
### Notebook: 03_Feature_Engineering.ipynb
### Purpose: Develop features for modeling

In [2]:
# import libraries
import pandas as pd
import numpy as np
from typing import List, Tuple


In [3]:
# Read the cleaned cohort table; the three date columns are parsed as datetimes.
cohort_csv_path = '/Users/sanasiddiqui/Desktop/Current Desktop/Data Portfolio Projects/Diabetes_Onset_Prediction_FHIR/Data_Diabetes_Prediction/Cleaned/df_cohort.csv'
cohort_date_columns = ['index_date', 'diabetes_onset_date', 'last_encounter_date']
df_cohort = pd.read_csv(cohort_csv_path, parse_dates=cohort_date_columns)

In [4]:
# Read the cleaned observations table; 'time' and 'index_date' are parsed as datetimes.
obs_features_csv_path = '/Users/sanasiddiqui/Desktop/Current Desktop/Data Portfolio Projects/Diabetes_Onset_Prediction_FHIR/Data_Diabetes_Prediction/Cleaned/df_obs_features.csv'
obs_date_columns = ['time', 'index_date']
df_obs_features = pd.read_csv(obs_features_csv_path, parse_dates=obs_date_columns)

In [5]:
# inspect cohort df: first rows, (rows, columns) shape and per-column dtypes,
# displayed together as a single tuple
df_cohort.head(), df_cohort.shape, df_cohort.dtypes

(                             patient_id       diabetes_onset_date  \
 0  a8a4c7d6-722a-a914-04b6-a23a7c3496e2 2018-07-20 09:10:07+00:00   
 1  779b82f4-c8cd-9764-2f80-28f0b176a418                       NaT   
 2  e696153b-4bda-a741-04fe-a1fd43f60fd9                       NaT   
 3  7e0366e0-5cf1-1f51-f609-3ec9c2197140 2020-05-29 04:33:52+00:00   
 4  f286e228-2d19-4405-da1c-23dcae75169b 2021-05-29 14:35:55+00:00   
 
         last_encounter_date  label                index_date  
 0 2025-01-21 09:10:07+00:00      1 2017-07-20 09:10:07+00:00  
 1 2025-05-24 03:32:57+00:00      0 2024-05-24 03:32:57+00:00  
 2 2022-06-10 14:14:34+00:00      0 2021-06-10 14:14:34+00:00  
 3 2025-10-31 04:33:52+00:00      1 2019-05-30 04:33:52+00:00  
 4 2025-10-11 14:35:55+00:00      1 2020-05-29 14:35:55+00:00  ,
 (956, 5),
 patient_id                          object
 diabetes_onset_date    datetime64[ns, UTC]
 last_encounter_date    datetime64[ns, UTC]
 label                                int64
 index

In [6]:
# inspect obs features df: first rows, (rows, columns) shape and per-column dtypes
df_obs_features.head(), df_obs_features.shape, df_obs_features.dtypes

(                             patient_id     code  \
 0  a8a4c7d6-722a-a914-04b6-a23a7c3496e2   8302-2   
 1  a8a4c7d6-722a-a914-04b6-a23a7c3496e2  29463-7   
 2  a8a4c7d6-722a-a914-04b6-a23a7c3496e2  39156-5   
 3  a8a4c7d6-722a-a914-04b6-a23a7c3496e2   8867-4   
 4  a8a4c7d6-722a-a914-04b6-a23a7c3496e2   9279-1   
 
                             name  value   unit                      time  \
 0                    Body Height  155.8     cm 2017-06-23 09:10:07+00:00   
 1                    Body Weight   56.3     kg 2017-06-23 09:10:07+00:00   
 2  Body mass index (BMI) [Ratio]   23.2  kg/m2 2017-06-23 09:10:07+00:00   
 3                     Heart rate   73.0   /min 2017-06-23 09:10:07+00:00   
 4               Respiratory rate   12.0   /min 2017-06-23 09:10:07+00:00   
 
                  index_date  
 0 2017-07-20 09:10:07+00:00  
 1 2017-07-20 09:10:07+00:00  
 2 2017-07-20 09:10:07+00:00  
 3 2017-07-20 09:10:07+00:00  
 4 2017-07-20 09:10:07+00:00  ,
 (2962, 7),
 patient_id      

In [7]:
# Before merging, confirm that the index_date recorded for each patient_id
# is identical in the observations frame and in the cohort frame.
obs_index_dates = df_obs_features[['patient_id','index_date']].drop_duplicates()
cohort_index_dates = df_cohort[['patient_id','index_date']]
check = obs_index_dates.merge(
    cohort_index_dates,
    on='patient_id',
    suffixes=('_obs','_cohort')
)

# Count the rows whose two index dates disagree (0 means every pair matches).
index_date_mismatch = check['index_date_obs'] != check['index_date_cohort']
index_date_mismatch.sum()

0

In [8]:
# Drop index_date from the obs features df; it is re-attached from the cohort
# df by the merge that follows.
# errors='ignore' keeps this cell safe to re-run: once the column has been
# dropped, a second execution would otherwise raise KeyError.
df_obs_features = df_obs_features.drop(columns=['index_date'], errors='ignore')
df_obs_features.head()

Unnamed: 0,patient_id,code,name,value,unit,time
0,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,8302-2,Body Height,155.8,cm,2017-06-23 09:10:07+00:00
1,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,29463-7,Body Weight,56.3,kg,2017-06-23 09:10:07+00:00
2,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,39156-5,Body mass index (BMI) [Ratio],23.2,kg/m2,2017-06-23 09:10:07+00:00
3,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,8867-4,Heart rate,73.0,/min,2017-06-23 09:10:07+00:00
4,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,9279-1,Respiratory rate,12.0,/min,2017-06-23 09:10:07+00:00


In [9]:
# Attach each patient's cohort index_date to their observations.
# Inner join: only patients present in both frames are kept.
cohort_keys = df_cohort[['patient_id', 'index_date']]
df_features = df_obs_features.merge(cohort_keys, how='inner', on='patient_id')

In [10]:
# Keep only observations in the 365 days leading up to (and strictly before)
# each patient's index_date.
lookback_start = df_features['index_date'] - pd.Timedelta(days=365)
is_before_index = df_features['time'] < df_features['index_date']
is_within_lookback = df_features['time'] >= lookback_start
df_features = df_features[is_before_index & is_within_lookback]

In [11]:
# Final verification checkpoint:
#  - index_date has been dropped from the raw observations frame, so the
#    index_date in df_features comes from the cohort df only;
#  - every retained observation is strictly before its index_date;
#  - every retained observation is no older than the 365-day lookback window.
assert 'index_date' not in df_obs_features.columns, \
    'index_date should have been dropped from df_obs_features'
assert (df_features['time'] < df_features['index_date']).all(), \
    'found observations on or after index_date'
assert (
    df_features['time'] >= df_features['index_date'] - pd.Timedelta(days=365)
).all(), 'found observations older than the 365-day lookback window'

In [12]:
# preview the windowed observations with their cohort index_date
df_features.head()

Unnamed: 0,patient_id,code,name,value,unit,time,index_date
0,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,8302-2,Body Height,155.8,cm,2017-06-23 09:10:07+00:00,2017-07-20 09:10:07+00:00
1,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,29463-7,Body Weight,56.3,kg,2017-06-23 09:10:07+00:00,2017-07-20 09:10:07+00:00
2,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,39156-5,Body mass index (BMI) [Ratio],23.2,kg/m2,2017-06-23 09:10:07+00:00,2017-07-20 09:10:07+00:00
3,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,8867-4,Heart rate,73.0,/min,2017-06-23 09:10:07+00:00,2017-07-20 09:10:07+00:00
4,a8a4c7d6-722a-a914-04b6-a23a7c3496e2,9279-1,Respiratory rate,12.0,/min,2017-06-23 09:10:07+00:00,2017-07-20 09:10:07+00:00


In [13]:
# Rename the observation timestamp column from 'time' to 'obs_time' for clarity.
# Reassign rather than use inplace=True: df_features was produced by a boolean
# filter, and rebinding the name avoids mutating that object in place.
df_features = df_features.rename(columns={'time': 'obs_time'})

In [14]:
# Aggregate observations per (patient, code).
# 'last' returns the final non-null entry of each group in row order, so the
# frame is sorted chronologically first. Without the sort, last_value and
# last_date (and anything derived from them) would silently depend on the
# row order of the input CSV. A stable sort keeps tied timestamps in their
# original relative order.
lab_agg = (
    df_features
    .sort_values('obs_time', kind='stable')
    .groupby(['patient_id', 'code'])
    .agg(
        mean_value=('value', 'mean'),
        median_value=('value', 'median'),
        min_value=('value', 'min'),
        max_value=('value', 'max'),
        std_value=('value', 'std'),      # NaN when a group has a single observation
        count_value=('value', 'count'),
        last_value=('value', 'last'),
        last_date=('obs_time', 'last'),
        index_date=('index_date', 'first')
    )
    .reset_index()
)

lab_agg.head()


Unnamed: 0,patient_id,code,mean_value,median_value,min_value,max_value,std_value,count_value,last_value,last_date,index_date
0,011dcab7-8543-84f8-5842-fa84972bbf25,2085-9,55.95,55.95,55.95,55.95,,1,55.95,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00
1,011dcab7-8543-84f8-5842-fa84972bbf25,2093-3,200.35,200.35,200.35,200.35,,1,200.35,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00
2,011dcab7-8543-84f8-5842-fa84972bbf25,2571-8,140.7,140.7,140.7,140.7,,1,140.7,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00
3,011dcab7-8543-84f8-5842-fa84972bbf25,29463-7,102.9,102.9,102.9,102.9,,1,102.9,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00
4,011dcab7-8543-84f8-5842-fa84972bbf25,39156-5,29.75,29.75,29.75,29.75,,1,29.75,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00


In [15]:
# days_since_last: whole days elapsed between the most recent observation of
# a code and the patient's index date
elapsed_since_last_obs = lab_agg['index_date'] - lab_agg['last_date']
lab_agg['days_since_last'] = elapsed_since_last_obs.dt.days

lab_agg.head()

Unnamed: 0,patient_id,code,mean_value,median_value,min_value,max_value,std_value,count_value,last_value,last_date,index_date,days_since_last
0,011dcab7-8543-84f8-5842-fa84972bbf25,2085-9,55.95,55.95,55.95,55.95,,1,55.95,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6
1,011dcab7-8543-84f8-5842-fa84972bbf25,2093-3,200.35,200.35,200.35,200.35,,1,200.35,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6
2,011dcab7-8543-84f8-5842-fa84972bbf25,2571-8,140.7,140.7,140.7,140.7,,1,140.7,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6
3,011dcab7-8543-84f8-5842-fa84972bbf25,29463-7,102.9,102.9,102.9,102.9,,1,102.9,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6
4,011dcab7-8543-84f8-5842-fa84972bbf25,39156-5,29.75,29.75,29.75,29.75,,1,29.75,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6


In [16]:
# Confirm each patient carries a single index_date: the largest per-patient
# count of distinct index dates should be 1.
index_dates_per_patient = lab_agg.groupby('patient_id')['index_date'].nunique()
index_dates_per_patient.max()

1

In [17]:
# Translate observation codes into short feature-friendly names.
#
# NOTE(review): the df_obs_features preview earlier in this notebook lists code
# 9279-1 with the name 'Respiratory rate', so the 'creatinine' label below looks
# wrong. It is left unchanged here because later cells select columns such as
# 'creatinine_mean' by name; renaming must be done together with those cells.
# NOTE(review): 2571-8 is, to my knowledge, the LOINC code for triglycerides
# rather than LDL cholesterol -- verify against the source data / LOINC.
lab_dict = {
    '8867-4':  'heart_rate',
    '9279-1':  'creatinine',
    '29463-7': 'body_weight',
    '8302-2':  'body_height',
    '39156-5': 'bmi',
    '70274-6': 'blood_glucose',
    '2085-9':  'hdl_cholesterol',
    '2093-3':  'total_cholesterol',
    '2571-8':  'ldl_cholesterol',
    '4544-3':  'hba1c',
}

# Apply the dictionary; codes without an entry become NaN.
lab_agg['lab_name'] = lab_agg['code'].map(lab_dict)

# List any codes that were left unmapped (an empty array means full coverage).
is_unmapped = lab_agg['lab_name'].isna()
lab_agg.loc[is_unmapped, 'code'].unique()


array([], dtype=object)

In [18]:
# preview the aggregated table with the new lab_name column
lab_agg.head()

Unnamed: 0,patient_id,code,mean_value,median_value,min_value,max_value,std_value,count_value,last_value,last_date,index_date,days_since_last,lab_name
0,011dcab7-8543-84f8-5842-fa84972bbf25,2085-9,55.95,55.95,55.95,55.95,,1,55.95,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6,hdl_cholesterol
1,011dcab7-8543-84f8-5842-fa84972bbf25,2093-3,200.35,200.35,200.35,200.35,,1,200.35,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6,total_cholesterol
2,011dcab7-8543-84f8-5842-fa84972bbf25,2571-8,140.7,140.7,140.7,140.7,,1,140.7,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6,ldl_cholesterol
3,011dcab7-8543-84f8-5842-fa84972bbf25,29463-7,102.9,102.9,102.9,102.9,,1,102.9,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6,body_weight
4,011dcab7-8543-84f8-5842-fa84972bbf25,39156-5,29.75,29.75,29.75,29.75,,1,29.75,2019-02-01 17:52:29+00:00,2019-02-07 17:52:29+00:00,6,bmi


In [19]:
# Two codes were given the wrong names in the mapping: they actually
# correspond to generalized anxiety disorder and hematocrit, so their rows
# are removed.
drop_codes = ['70274-6','4544-3']

is_dropped_code = lab_agg['code'].isin(drop_codes)
lab_agg = lab_agg[~is_dropped_code]

In [20]:
# Helper function to pivot columns

def pivot_features(df, index: str, columns: str, values: str) -> pd.DataFrame:
    """Reshape a long table into a wide one via ``DataFrame.pivot``.

    Args:
        df: long-format frame to reshape.
        index: column whose values become the row index.
        columns: column whose distinct values become the output columns.
        values: column supplying the cell values.

    Returns:
        The pivoted frame. ``DataFrame.pivot`` raises ``ValueError`` if an
        (index, columns) pair occurs more than once.
    """
    pivot_spec = {'index': index, 'columns': columns, 'values': values}
    return df.pivot(**pivot_spec)

# Helper function to add suffix to column names
def add_suffix(df, suffix: str) -> List[str]:
    """Build ``"<column>_<suffix>"`` names for every column of ``df``.

    Args:
        df: frame whose columns are renamed.
        suffix: text appended to each column name after an underscore.

    Returns:
        The new column names as a plain list, in column order.

    Note:
        ``df.columns`` is also overwritten in place with the new names. That
        side effect is kept for backward compatibility, so callers may either
        rely on it or assign the returned list to ``df.columns`` themselves.
    """
    names = [f"{c}_{suffix}" for c in df.columns]
    df.columns = names
    # Return the list itself (not df.columns, which is a pandas Index) so the
    # result matches the declared List[str] return type.
    return names

In [21]:
# Wide table of per-patient mean values, one column per lab name.
mean_features = pivot_features(lab_agg, 'patient_id', 'lab_name', 'mean_value')
mean_column_names = add_suffix(mean_features, 'mean')
mean_features.columns = mean_column_names
mean_features.head()

Unnamed: 0_level_0,bmi_mean,body_height_mean,body_weight_mean,creatinine_mean,hdl_cholesterol_mean,heart_rate_mean,ldl_cholesterol_mean,total_cholesterol_mean
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
011dcab7-8543-84f8-5842-fa84972bbf25,29.75,186.0,102.9,15.0,55.95,72.0,140.7,200.35
019dd1e7-5307-5d8a-c04f-8693c0ebdf35,15.995,98.2,15.45,15.0,,79.0,,
02aa3df1-6e90-e28f-abf3-8a5647f49707,22.53,185.4,77.4,16.0,,96.0,,
02c16628-45fc-609a-fed2-bc5a582f33bd,29.92,173.9,90.5,12.0,63.86,90.0,138.86,204.17
043cc39f-934c-f0ad-ba8e-c6b241697203,27.09,178.5,86.3,15.0,,93.0,,


In [22]:
# Wide table of each patient's most recent value, one column per lab name.
last_value = pivot_features(lab_agg, 'patient_id', 'lab_name', 'last_value')
last_value_column_names = add_suffix(last_value, 'last_value')
last_value.columns = last_value_column_names
last_value.head()

Unnamed: 0_level_0,bmi_last_value,body_height_last_value,body_weight_last_value,creatinine_last_value,hdl_cholesterol_last_value,heart_rate_last_value,ldl_cholesterol_last_value,total_cholesterol_last_value
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
011dcab7-8543-84f8-5842-fa84972bbf25,29.75,186.0,102.9,15.0,55.95,72.0,140.7,200.35
019dd1e7-5307-5d8a-c04f-8693c0ebdf35,15.57,100.0,15.6,14.0,,74.0,,
02aa3df1-6e90-e28f-abf3-8a5647f49707,22.53,185.4,77.4,16.0,,96.0,,
02c16628-45fc-609a-fed2-bc5a582f33bd,29.92,173.9,90.5,12.0,63.86,90.0,138.86,204.17
043cc39f-934c-f0ad-ba8e-c6b241697203,27.09,178.5,86.3,15.0,,93.0,,


In [23]:
# Wide table of per-patient observation counts, one column per lab name.
count_features = pivot_features(lab_agg, 'patient_id', 'lab_name', 'count_value')
count_column_names = add_suffix(count_features, 'count_value')
count_features.columns = count_column_names
count_features.head()

Unnamed: 0_level_0,bmi_count_value,body_height_count_value,body_weight_count_value,creatinine_count_value,hdl_cholesterol_count_value,heart_rate_count_value,ldl_cholesterol_count_value,total_cholesterol_count_value
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
011dcab7-8543-84f8-5842-fa84972bbf25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
019dd1e7-5307-5d8a-c04f-8693c0ebdf35,2.0,2.0,2.0,2.0,,2.0,,
02aa3df1-6e90-e28f-abf3-8a5647f49707,1.0,1.0,1.0,1.0,,1.0,,
02c16628-45fc-609a-fed2-bc5a582f33bd,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
043cc39f-934c-f0ad-ba8e-c6b241697203,1.0,1.0,1.0,1.0,,1.0,,


In [24]:
# Wide table of days between the last observation and index date, per lab name.
recency_features = pivot_features(lab_agg, 'patient_id', 'lab_name', 'days_since_last')
recency_column_names = add_suffix(recency_features, 'days_since_last')
recency_features.columns = recency_column_names
recency_features.head()

Unnamed: 0_level_0,bmi_days_since_last,body_height_days_since_last,body_weight_days_since_last,creatinine_days_since_last,hdl_cholesterol_days_since_last,heart_rate_days_since_last,ldl_cholesterol_days_since_last,total_cholesterol_days_since_last
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
011dcab7-8543-84f8-5842-fa84972bbf25,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
019dd1e7-5307-5d8a-c04f-8693c0ebdf35,139.0,139.0,139.0,139.0,,139.0,,
02aa3df1-6e90-e28f-abf3-8a5647f49707,175.0,175.0,175.0,175.0,,175.0,,
02c16628-45fc-609a-fed2-bc5a582f33bd,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
043cc39f-934c-f0ad-ba8e-c6b241697203,6.0,6.0,6.0,6.0,,6.0,,


In [25]:
# Combine the four wide feature tables into one frame keyed by patient_id.
# mean_features is the base; each further table is left-joined onto it.
df_patient_features = mean_features
for feature_table in (last_value, count_features, recency_features):
    df_patient_features = df_patient_features.merge(
        feature_table, on='patient_id', how='left'
    )

In [26]:
# inspect patient features dataframe (mean, last value, count and recency
# columns side by side for each patient)
df_patient_features.head()

Unnamed: 0_level_0,bmi_mean,body_height_mean,body_weight_mean,creatinine_mean,hdl_cholesterol_mean,heart_rate_mean,ldl_cholesterol_mean,total_cholesterol_mean,bmi_last_value,body_height_last_value,...,ldl_cholesterol_count_value,total_cholesterol_count_value,bmi_days_since_last,body_height_days_since_last,body_weight_days_since_last,creatinine_days_since_last,hdl_cholesterol_days_since_last,heart_rate_days_since_last,ldl_cholesterol_days_since_last,total_cholesterol_days_since_last
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
011dcab7-8543-84f8-5842-fa84972bbf25,29.75,186.0,102.9,15.0,55.95,72.0,140.7,200.35,29.75,186.0,...,1.0,1.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
019dd1e7-5307-5d8a-c04f-8693c0ebdf35,15.995,98.2,15.45,15.0,,79.0,,,15.57,100.0,...,,,139.0,139.0,139.0,139.0,,139.0,,
02aa3df1-6e90-e28f-abf3-8a5647f49707,22.53,185.4,77.4,16.0,,96.0,,,22.53,185.4,...,,,175.0,175.0,175.0,175.0,,175.0,,
02c16628-45fc-609a-fed2-bc5a582f33bd,29.92,173.9,90.5,12.0,63.86,90.0,138.86,204.17,29.92,173.9,...,1.0,1.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
043cc39f-934c-f0ad-ba8e-c6b241697203,27.09,178.5,86.3,15.0,,93.0,,,27.09,178.5,...,,,6.0,6.0,6.0,6.0,,6.0,,


In [27]:
# Add one 0/1 indicator per feature column: 1 where the feature is NaN.
is_missing = df_patient_features.isna()
missing_flags = is_missing.astype(int)
missing_flags.columns = [f'{col}_missing' for col in missing_flags.columns]

# Index-to-index join so each patient's flags sit beside their features.
df_patient_features = df_patient_features.merge(
    missing_flags, left_index=True, right_index=True
)

In [28]:
# preview the feature table with the *_missing indicator columns appended
df_patient_features.head()

Unnamed: 0_level_0,bmi_mean,body_height_mean,body_weight_mean,creatinine_mean,hdl_cholesterol_mean,heart_rate_mean,ldl_cholesterol_mean,total_cholesterol_mean,bmi_last_value,body_height_last_value,...,ldl_cholesterol_count_value_missing,total_cholesterol_count_value_missing,bmi_days_since_last_missing,body_height_days_since_last_missing,body_weight_days_since_last_missing,creatinine_days_since_last_missing,hdl_cholesterol_days_since_last_missing,heart_rate_days_since_last_missing,ldl_cholesterol_days_since_last_missing,total_cholesterol_days_since_last_missing
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
011dcab7-8543-84f8-5842-fa84972bbf25,29.75,186.0,102.9,15.0,55.95,72.0,140.7,200.35,29.75,186.0,...,0,0,0,0,0,0,0,0,0,0
019dd1e7-5307-5d8a-c04f-8693c0ebdf35,15.995,98.2,15.45,15.0,,79.0,,,15.57,100.0,...,1,1,0,0,0,0,1,0,1,1
02aa3df1-6e90-e28f-abf3-8a5647f49707,22.53,185.4,77.4,16.0,,96.0,,,22.53,185.4,...,1,1,0,0,0,0,1,0,1,1
02c16628-45fc-609a-fed2-bc5a582f33bd,29.92,173.9,90.5,12.0,63.86,90.0,138.86,204.17,29.92,173.9,...,0,0,0,0,0,0,0,0,0,0
043cc39f-934c-f0ad-ba8e-c6b241697203,27.09,178.5,86.3,15.0,,93.0,,,27.09,178.5,...,1,1,0,0,0,0,1,0,1,1


In [29]:
# Utilization features per patient, computed from the windowed observations:
#   total_labs  - number of observation rows
#   unique_labs - number of distinct observation codes
observations_by_patient = df_features.groupby('patient_id')
util_features = observations_by_patient.agg(
    total_labs=('code', 'count'),
    unique_labs=('code', 'nunique'),
)

In [30]:
# Append the utilization features to the patient feature table, matching the
# two frames on their indexes.
index_join = {'left_index': True, 'right_index': True}
df_patient_features = df_patient_features.merge(util_features, **index_join)

In [68]:
# Attach the outcome label from the cohort df to every patient's features.
# Left join: every row of df_patient_features is kept.
patient_labels = df_cohort[['patient_id','label']]
X = df_patient_features.merge(patient_labels, how='left', on='patient_id')

In [72]:
# list every column of the merged frame before choosing which ones to keep
X.columns

Index(['patient_id', 'bmi_mean', 'body_height_mean', 'body_weight_mean',
       'creatinine_mean', 'hdl_cholesterol_mean', 'heart_rate_mean',
       'ldl_cholesterol_mean', 'total_cholesterol_mean', 'bmi_last_value',
       'body_height_last_value', 'body_weight_last_value',
       'creatinine_last_value', 'hdl_cholesterol_last_value',
       'heart_rate_last_value', 'ldl_cholesterol_last_value',
       'total_cholesterol_last_value', 'bmi_count_value',
       'body_height_count_value', 'body_weight_count_value',
       'creatinine_count_value', 'hdl_cholesterol_count_value',
       'heart_rate_count_value', 'ldl_cholesterol_count_value',
       'total_cholesterol_count_value', 'bmi_days_since_last',
       'body_height_days_since_last', 'body_weight_days_since_last',
       'creatinine_days_since_last', 'hdl_cholesterol_days_since_last',
       'heart_rate_days_since_last', 'ldl_cholesterol_days_since_last',
       'total_cholesterol_days_since_last', 'bmi_mean_missing',
       'body_

In [74]:
# Select the final modeling columns, built group by group so the retained
# set is easy to audit. Of the *_missing flags only four *_mean_missing
# columns are kept; the rest are dropped as redundant.
id_and_label_cols = ['patient_id', 'label']

mean_cols = [
    'bmi_mean', 'body_height_mean', 'body_weight_mean', 'creatinine_mean',
    'hdl_cholesterol_mean', 'heart_rate_mean', 'ldl_cholesterol_mean', 'total_cholesterol_mean',
]
last_value_cols = [
    'bmi_last_value', 'body_height_last_value', 'body_weight_last_value', 'creatinine_last_value',
    'hdl_cholesterol_last_value', 'heart_rate_last_value', 'ldl_cholesterol_last_value', 'total_cholesterol_last_value',
]
count_cols = [
    'bmi_count_value', 'body_height_count_value', 'body_weight_count_value', 'creatinine_count_value',
    'hdl_cholesterol_count_value', 'heart_rate_count_value', 'ldl_cholesterol_count_value', 'total_cholesterol_count_value',
]
recency_cols = [
    'bmi_days_since_last', 'body_height_days_since_last', 'body_weight_days_since_last', 'creatinine_days_since_last',
    'hdl_cholesterol_days_since_last', 'heart_rate_days_since_last', 'ldl_cholesterol_days_since_last', 'total_cholesterol_days_since_last',
]
missing_flag_cols = [
    'creatinine_mean_missing', 'hdl_cholesterol_mean_missing', 'ldl_cholesterol_mean_missing', 'total_cholesterol_mean_missing',
]
utilization_cols = ['total_labs', 'unique_labs']

cols_to_keep = (
    id_and_label_cols
    + mean_cols
    + last_value_cols
    + count_cols
    + recency_cols
    + missing_flag_cols
    + utilization_cols
)

# Copy so the master table is independent of X.
df_features_master  = X.loc[:, cols_to_keep].copy()

In [76]:
# inspect master features dataset (identifier, label and the retained
# feature columns) before saving it

df_features_master.head()

Unnamed: 0,patient_id,label,bmi_mean,body_height_mean,body_weight_mean,creatinine_mean,hdl_cholesterol_mean,heart_rate_mean,ldl_cholesterol_mean,total_cholesterol_mean,...,hdl_cholesterol_days_since_last,heart_rate_days_since_last,ldl_cholesterol_days_since_last,total_cholesterol_days_since_last,creatinine_mean_missing,hdl_cholesterol_mean_missing,ldl_cholesterol_mean_missing,total_cholesterol_mean_missing,total_labs,unique_labs
0,011dcab7-8543-84f8-5842-fa84972bbf25,1,29.75,186.0,102.9,15.0,55.95,72.0,140.7,200.35,...,6.0,6.0,6.0,6.0,0,0,0,0,8,8
1,019dd1e7-5307-5d8a-c04f-8693c0ebdf35,0,15.995,98.2,15.45,15.0,,79.0,,,...,,139.0,,,0,1,1,1,10,5
2,02aa3df1-6e90-e28f-abf3-8a5647f49707,0,22.53,185.4,77.4,16.0,,96.0,,,...,,175.0,,,0,1,1,1,6,6
3,02c16628-45fc-609a-fed2-bc5a582f33bd,1,29.92,173.9,90.5,12.0,63.86,90.0,138.86,204.17,...,300.0,300.0,300.0,300.0,0,0,0,0,9,9
4,043cc39f-934c-f0ad-ba8e-c6b241697203,1,27.09,178.5,86.3,15.0,,93.0,,,...,,6.0,,,0,1,1,1,6,6


In [78]:
# Write the master features table to CSV, leaving out the row index.
master_features_csv_path = '/Users/sanasiddiqui/Desktop/Current Desktop/Data Portfolio Projects/Diabetes_Onset_Prediction_FHIR/Data_Diabetes_Prediction/Cleaned/df_features_master.csv'
df_features_master.to_csv(master_features_csv_path, index=False)