In [77]:
# %load ds_imp_set.py
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Display configuration: plain (non-scientific) floats with 4 decimals in numpy
# output, no limit on displayed DataFrame columns, seaborn notebook/whitegrid look.
np.set_printoptions(precision=4, suppress=True)
pd.set_option('display.max_columns', None)
sns.set_context('notebook')
sns.set_style('whitegrid')


In [78]:
# Load the source table. Later cells read its 'deceased', 'con_start',
# 'birthdate', 'patient' and 'condition' columns.
df = pd.read_csv('csv/diabetic_model.csv')

In [79]:
# Row labels of every record flagged as deceased.
is_deceased = df['deceased'].eq(1)
dead = df.loc[is_deceased].index.tolist()

In [80]:
# Remove deceased patients and renumber the remaining rows 0..n-1.
# The filter is computed from the 'deceased' column itself instead of dropping
# previously saved row labels: once the index has been renumbered those labels
# refer to different (living) patients, so re-running a label-based drop would
# silently delete valid rows or raise KeyError. This form is safe to re-run.
df = df[df['deceased'] != 1].reset_index(drop=True)

In [81]:
# Parse the two date columns so they can be subtracted.
df['con_start'] = pd.to_datetime(df['con_start'])
df['birthdate'] = pd.to_datetime(df['birthdate'])

# Age at condition start, in years (whole days / 365).
# `.dt.days` reads the day count directly instead of parsing the Timedelta's
# string form; a missing date now yields NaN rather than raising on int('NaT').
df['diag_age'] = (df['con_start'] - df['birthdate']).dt.days / 365

In [82]:
# One row per (patient, condition) record, ordered by patient, then by start
# date, then by condition name; only the three columns needed downstream are kept.
sort_keys = ['patient', 'con_start', 'condition']
kept_columns = ['patient', 'condition', 'con_start']
dfmod = df.sort_values(by=sort_keys, ascending=True)[kept_columns]

In [83]:
# Relabel the sorted rows 0..n-1 so labels follow the sorted order.
dfmod.index = list(range(len(dfmod)))

In [84]:
# Print the patient id of the first three rows.
for patient_id in dfmod['patient'].iloc[:3]:
    print(patient_id)

0003d26b-8757-449d-a4d6-c6dbaa822426
00156155-152c-4836-8902-cadb73995b17
00156155-152c-4836-8902-cadb73995b17


In [85]:
# Show every record of one example patient.
dfmod.loc[dfmod['patient'].eq('fff0c5a2-0a50-44e6-8b0b-886233a5e739')]

Unnamed: 0,patient,condition,con_start
45002,fff0c5a2-0a50-44e6-8b0b-886233a5e739,Chronic intractable migraine without aura,2000-01-24
45003,fff0c5a2-0a50-44e6-8b0b-886233a5e739,Prediabetes,2002-10-26


Patients for whom Prediabetes is their first diagnosis.

In [86]:
# First condition per patient (rows are already date-ordered within a patient);
# keep the ids of patients for whom that first condition is 'Prediabetes'.
first_condition = dfmod.groupby('patient').first()['condition']
predi_1st_dn = first_condition[first_condition == 'Prediabetes'].index.tolist()

In [87]:
# Ids of patients with at least one 'Diabetes' record.
# Author's recorded sizes: 2140 rows × 3 columns before grouping,
# 1902 rows × 2 columns after taking the first row per patient.
is_diabetes = dfmod['condition'] == 'Diabetes'
diabetes_rows = dfmod.loc[is_diabetes]
pats_w_diabe = diabetes_rows.groupby('patient').first().index.tolist()
# Sanity check: no patient id appears twice.
len(set(pats_w_diabe)) == len(pats_w_diabe)

True

In [88]:
# Patients whose first diagnosis is Prediabetes AND who have a Diabetes record.
# Sorted so the list (which is pickled in a later cell) has a reproducible order;
# plain list(set(...)) order is arbitrary and can change between kernel sessions.
p_then_d = sorted(set(predi_1st_dn) & set(pats_w_diabe))

In [89]:
# (patients with prediabetes first and a diabetes record, patients with prediabetes first)
tuple(map(len, (p_then_d, predi_1st_dn)))

(249, 5101)

### `p_then_d` is the list of patients whose first diagnosis was prediabetes
### and who were later diagnosed with diabetes.
### It will be used for classification and regression.

In [90]:
import pickle

# Persist both patient-id lists for use by other notebooks.
for pkl_path, patient_ids in (('p_then_d.pkl', p_then_d),
                              ('predi_1st_dn.pkl', predi_1st_dn)):
    with open(pkl_path, 'wb') as fh:
        pickle.dump(patient_ids, fh)

In [15]:
# True for every dfmod row that belongs to a patient in p_then_d.
predi_first_mask = dfmod.loc[:, 'patient'].isin(p_then_d)

In [16]:
# Row masks for the two conditions of interest.
p = dfmod['condition'].eq('Prediabetes')
d = dfmod['condition'].eq('Diabetes')

In [17]:
# Labels of rows whose condition is neither Prediabetes nor Diabetes.
irrelevent = dfmod.loc[~(p | d)].index

In [18]:
# Keep only Prediabetes/Diabetes rows that belong to patients in p_then_d.
# The patient mask was built on the full dfmod, so it is first restricted to the
# rows that survive the drop. Indexing the reduced frame with the full-length mask
# relies on implicit reindexing and makes pandas emit a
# "Boolean Series key will be reindexed" warning.
relevant_rows = dfmod.drop(irrelevent, axis='index')
df_t = relevant_rows.loc[predi_first_mask.loc[relevant_rows.index]]

  if __name__ == '__main__':


In [19]:
# Preview the first rows of the filtered Prediabetes/Diabetes records.
df_t.head()

Unnamed: 0,patient,condition,con_start
257,01b8caf1-f10c-46d5-810c-649414cd3008,Prediabetes,1967-03-27
258,01b8caf1-f10c-46d5-810c-649414cd3008,Diabetes,1970-01-08
301,0204fb22-601c-4687-a1be-cfe16b7de4b2,Prediabetes,1996-03-22
302,0204fb22-601c-4687-a1be-cfe16b7de4b2,Diabetes,2002-01-25
359,0287f2d7-8796-4129-84b4-fce5213c7a27,Prediabetes,1973-12-24


In [20]:
# Move the current row labels into an 'index' column and renumber rows 0..n-1.
df_t = df_t.reset_index()

In [21]:
# Spread con_start into one column per condition, keyed by the original row
# label: each row holds a date in its own condition's column and NaT in the other.
dfnp = (
    df_t
    .set_index(['index', 'condition'])['con_start']
    .unstack('condition')
)

In [22]:
# Forward-fill each column so a row also carries the most recent date seen in
# the other condition's column (e.g. a Diabetes row picks up the preceding
# Prediabetes date). `ffill()` replaces `fillna(method='ffill')`, whose `method`
# argument is deprecated in current pandas.
# NOTE(review): the fill also carries a previous patient's date into the next
# patient's first row; presumably harmless because later cells keep only every
# second row — confirm.
dfnp.ffill(inplace=True)

In [23]:
# Relabel the pivoted rows 0..n-1.
dfnp.index = list(range(len(dfnp)))

In [24]:
# Give dfnp exactly the same row labels as df_t so the two frames line up
# row-for-row when concatenated.
# NOTE(review): this is a positional relabel — it presumably relies on the pivot
# returning rows in the same order as df_t; confirm.
dfnp.index = df_t.index

In [25]:
# Attach the per-condition date columns next to the original record columns.
frames_side_by_side = [df_t, dfnp]
df_p = pd.concat(frames_side_by_side, axis='columns')

In [26]:
# Keep only rows with an odd label.
# NOTE(review): presumably this assumes rows strictly alternate
# Prediabetes / Diabetes per patient, so odd rows are the Diabetes records;
# a patient with more than two rows would break that pattern — confirm.
has_odd_label = (df_p.index % 2) == 1
df_p = df_p.loc[has_odd_label]

In [27]:
# (rows, columns) remaining after the odd-row selection.
df_p.shape

(267, 6)

Now combine with previously established data.

In [28]:
# Load the previously saved feature table.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df_q = pd.read_pickle('./dfnum.pkl')

In [29]:
# Turn the index into a regular column so it can be used as a merge key.
df_q = df_q.reset_index()

In [30]:
# List the available feature columns ('patient' and 'con_start' are the merge keys used next).
df_q.columns

Index(['patient', 'Body Height', 'Body Mass Index', 'Body Weight', 'Calcium',
       'Carbon Dioxide', 'Chloride', 'Diastolic Blood Pressure', 'Glucose',
       'Potassium', 'Sodium', 'Systolic Blood Pressure', 'Urea Nitrogen',
       'deceased', 'persistent_condition', 'pro_reason_given',
       'enc_reason_given', 'mcr_reported', 'fev_fvc_reported',
       'glomerular_reported', 'triglyc_choles_reported', 'age',
       'single_condition', 'diag_age', 'marital_S', 'marital_nan', 'sex_M',
       'sex_nan', 'con_start'],
      dtype='object')

In [31]:
# Inner-join each patient's row with the feature row recorded at that patient's
# Prediabetes date (df_p.Prediabetes == df_q.con_start).
dff = pd.merge(
    df_p,
    df_q,
    how='inner',
    left_on=['patient', 'Prediabetes'],
    right_on=['patient', 'con_start'],
)

In [32]:
# Candidates to drop after the merge: index, condition, con_start (now con_start_x / con_start_y)

In [33]:
# Preview the merged frame; the overlapping 'con_start' column appears as con_start_x / con_start_y.
dff.head()

Unnamed: 0,index,patient,condition,con_start_x,Diabetes,Prediabetes,Body Height,Body Mass Index,Body Weight,Calcium,Carbon Dioxide,Chloride,Diastolic Blood Pressure,Glucose,Potassium,Sodium,Systolic Blood Pressure,Urea Nitrogen,deceased,persistent_condition,pro_reason_given,enc_reason_given,mcr_reported,fev_fvc_reported,glomerular_reported,triglyc_choles_reported,age,single_condition,diag_age,marital_S,marital_nan,sex_M,sex_nan,con_start_y
0,302,0204fb22-601c-4687-a1be-cfe16b7de4b2,Diabetes,2002-01-25,2002-01-25,1996-03-22,177.06,27.69,86.8,9.66,27.0,103.0,70.0,104.0,5.01,142.0,122.0,7.0,0,1,0,0,1,0,1,1,38,0,23.608219,0,0,1,0,1996-03-22
1,360,0287f2d7-8796-4129-84b4-fce5213c7a27,Diabetes,1976-10-08,1976-10-08,1973-12-24,163.37,30.76,82.11,9.55,23.0,109.0,70.0,82.0,4.22,142.0,108.0,8.0,0,1,0,0,1,0,1,1,63,0,26.676712,0,0,0,0,1973-12-24
2,782,0441d5fe-e95a-4cdd-b1d7-c6cadf8332b6,Diabetes,1986-07-06,1986-07-06,1983-07-02,174.71,37.49,114.43,9.6,24.0,109.0,94.0,159.0,4.04,138.0,173.0,17.0,0,1,0,0,1,0,1,1,59,0,32.054795,1,0,1,0,1983-07-02
3,2092,09b874ca-046c-4f29-8a7d-ea55673609ea,Diabetes,1999-11-29,1999-11-29,1997-12-06,163.03,38.95,103.54,9.59,22.0,106.0,104.0,193.0,4.43,139.0,165.0,17.0,0,1,0,0,1,0,1,1,59,0,46.813699,1,0,0,0,1997-12-06
4,2112,09fbcb5e-cc43-49cf-b0bc-f37bcbe6da86,Diabetes,2005-02-23,2005-02-23,1999-02-27,178.83,32.2,102.96,9.97,24.0,111.0,117.0,94.0,3.84,140.0,161.0,7.0,0,1,0,0,1,0,1,1,48,0,36.69589,0,0,1,0,1999-02-27


In [34]:
# Remove bookkeeping columns, the duplicated date columns and the flag columns
# that are not used further in this notebook.
unused_columns = [
    'index', 'condition', 'con_start_x', 'con_start_y',
    'deceased', 'persistent_condition',
    'pro_reason_given', 'enc_reason_given',
    'mcr_reported', 'fev_fvc_reported',
    'glomerular_reported', 'triglyc_choles_reported',
    'single_condition',
]
dff = dff.drop(unused_columns, axis='columns')

In [35]:
# Drop rows whose Diabetes date is earlier than the Prediabetes date.
# The rows are selected by that condition instead of a hard-coded row label:
# a fixed label removes a different, valid row every time the cell is re-run
# (the index is renumbered below) or whenever the input data changes.
diabetes_before_prediabetes = dff.index[dff['Diabetes'] < dff['Prediabetes']]
dff.drop(diabetes_before_prediabetes, axis=0, inplace=True)
dff.index = list(range(dff.shape[0]))

In [36]:
# Time from the Prediabetes date to the Diabetes date, in 365-day years.
SECONDS_PER_YEAR = 60**2 * 24 * 365
elapsed = dff['Diabetes'] - dff['Prediabetes']
dff['delta_time'] = elapsed.apply(lambda gap: gap.total_seconds() / SECONDS_PER_YEAR)

In [37]:
# Preview the frame with the new delta_time column.
dff.head()

Unnamed: 0,patient,Diabetes,Prediabetes,Body Height,Body Mass Index,Body Weight,Calcium,Carbon Dioxide,Chloride,Diastolic Blood Pressure,Glucose,Potassium,Sodium,Systolic Blood Pressure,Urea Nitrogen,age,diag_age,marital_S,marital_nan,sex_M,sex_nan,delta_time
0,0204fb22-601c-4687-a1be-cfe16b7de4b2,2002-01-25,1996-03-22,177.06,27.69,86.8,9.66,27.0,103.0,70.0,104.0,5.01,142.0,122.0,7.0,38,23.608219,0,0,1,0,5.849315
1,0287f2d7-8796-4129-84b4-fce5213c7a27,1976-10-08,1973-12-24,163.37,30.76,82.11,9.55,23.0,109.0,70.0,82.0,4.22,142.0,108.0,8.0,63,26.676712,0,0,0,0,2.791781
2,0441d5fe-e95a-4cdd-b1d7-c6cadf8332b6,1986-07-06,1983-07-02,174.71,37.49,114.43,9.6,24.0,109.0,94.0,159.0,4.04,138.0,173.0,17.0,59,32.054795,1,0,1,0,3.013699
3,09b874ca-046c-4f29-8a7d-ea55673609ea,1999-11-29,1997-12-06,163.03,38.95,103.54,9.59,22.0,106.0,104.0,193.0,4.43,139.0,165.0,17.0,59,46.813699,1,0,0,0,1.980822
4,09fbcb5e-cc43-49cf-b0bc-f37bcbe6da86,2005-02-23,1999-02-27,178.83,32.2,102.96,9.97,24.0,111.0,117.0,94.0,3.84,140.0,161.0,7.0,48,36.69589,0,0,1,0,5.994521


In [38]:
# Load the saved HbA1c readings.
# The context manager closes the file handle, which a bare open() inside
# pickle.load() never does.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
with open('hb_a1c.pkl', 'rb') as fh:
    hb_a1c = pickle.load(fh)

In [50]:
# Each key of hb_a1c unpacks into (encounter, patient, date); build one row per key.
a1c_keys = list(hb_a1c.keys())
df_a1c = pd.DataFrame(a1c_keys, columns=['encounter', 'patient', 'date'])

In [51]:
# Attach the reading stored under each key, in the same order as the keys.
a1c_values = list(hb_a1c.values())
df_a1c['hb_a1c'] = pd.Series(a1c_values, index=df_a1c.index)

In [56]:
# Convert the reading to a float and the date to a datetime so the frame can be
# merged on the Prediabetes date.
df_a1c['hb_a1c'] = df_a1c['hb_a1c'].map(float)
df_a1c['date'] = pd.to_datetime(df_a1c['date'])

In [62]:
# The encounter id is not needed for the merge.
df_a1c = df_a1c.drop('encounter', axis='columns')

In [72]:
# Preview only (result is not stored): HbA1c readings taken on the Prediabetes date.
# The match can contain repeated rows for a patient;
# .drop_duplicates(subset=['patient']) would collapse them.
pd.merge(
    dff,
    df_a1c,
    left_on=['patient', 'Prediabetes'],
    right_on=['patient', 'date'],
)

Unnamed: 0,patient,Diabetes,Prediabetes,Body Height,Body Mass Index,Body Weight,Calcium,Carbon Dioxide,Chloride,Diastolic Blood Pressure,Glucose,Potassium,Sodium,Systolic Blood Pressure,Urea Nitrogen,age,diag_age,marital_S,marital_nan,sex_M,sex_nan,delta_time,date,hb_a1c
0,62c12865-d83b-4815-a23e-6c2c9d3f8dd0,2017-04-11,2013-10-20,168.3,46.32,131.22,8.97,25.0,110.0,87.0,79.0,3.94,141.0,111.0,7.0,47,47.087671,0,0,0,0,3.476712,2013-10-20,5.9
1,62c12865-d83b-4815-a23e-6c2c9d3f8dd0,2017-04-11,2013-10-20,168.3,46.32,131.22,8.97,25.0,110.0,87.0,79.0,3.94,141.0,111.0,7.0,47,47.087671,0,0,0,0,3.476712,2013-10-20,5.9


Never mind hb_a1c: it matches too few observations to be useful, so it is not added to the dataset.

Drop columns that chronologically follow the prediabetes diagnosis.

In [75]:
# Remove the columns that are only known after the prediabetes diagnosis.
post_diagnosis_columns = ['Diabetes', 'age']
dff = dff.drop(post_diagnosis_columns, axis='columns')

In [76]:
# Save the regression dataset. With to_csv's defaults the row index is written
# as the first, unnamed column.
dff.to_csv('./csv/diabetic_regression.csv')