In [6]:
import pandas as pd
import numpy as np
from data_processing import bin_features, extract_df_with_features

# Load the binned feature table and the glucose-derived feature table.
df = pd.read_csv("all_features_all_patients_binned.csv")
df_clinical1 = pd.read_csv(r"dataframe_glucose_feats.csv")

# Keep only the join key, the glucose summary features and the study-group id.
df_clinical = df_clinical1[[
    'participant_id',
    'mean_glucose',
    'spike_resolutions',
    'expected_daily_spikes',
    'hyper_time_pcts',
    'relative_spikes',
    'nocturnal_hypoglycemias',
    'study_group_id',
]]

print(df.shape)
print(df_clinical.columns)
print(df_clinical1.shape)

# Default (inner) join: rows whose participant_id is missing from either table are dropped,
# which is why the shape is printed before and after.
df = pd.merge(df, df_clinical, on="participant_id")
print(df.shape)


(1067, 267)
Index(['participant_id', 'mean_glucose', 'spike_resolutions',
       'expected_daily_spikes', 'hyper_time_pcts', 'relative_spikes',
       'nocturnal_hypoglycemias', 'study_group_id'],
      dtype='object')
(1011, 10)
(1011, 274)


In [7]:
# Number of bins passed to bin_features for every column listed below.
num_bins = 16

# Columns handed to bin_features.
# NOTE(review): 'relative_spikes' is merged in from the glucose table but is not listed
# here — confirm that leaving it unbinned is intentional.
col_lst = [
    'mean_glucose',
    'spike_resolutions',
    'expected_daily_spikes',
    'hyper_time_pcts',
    'nocturnal_hypoglycemias',
    'cestl, CESD-10 Score',
    'moca_total_score_time',
    'paate, How many hours since you last ate? (number',
    "cage, Age (in years)",
]

# NOTE(review): `df` is rebound to the result, so re-running this cell alone feeds the
# previous result back into bin_features — confirm that is harmless or re-run from the load cell.
df = bin_features(df, col_lst, num_bins)

In [8]:
# Columns removed up front, before the remaining features are screened.
c = [
    "Unnamed: 0",
    'years_of_education',
    "brthyy, Year (e.g. 1967)",
    "studyid, Participant Study ID",
]
df = df.drop(columns=c)

thresh = 50                          # a numeric column whose values sum below this is flagged
sentinels = [55, 555, 88, 99, 999]   # 55.0 matches 55 automatically
alt = 15                             # value written in place of every sentinel

# Every cell equal to one of the sentinel values, in any column, is rewritten to `alt`.
# NOTE(review): the replacement is frame-wide, so a genuine measurement or identifier
# that happens to equal a sentinel is rewritten too — confirm no such column exists.
df = df.replace(sentinels, alt)

# Find features with very high class imbalance to drop: a numeric column is flagged when
# the sum of its values is below `thresh`.
# NOTE(review): the sum only equals the number of positive rows for 0/1 columns; for any
# other numeric column it is not a count of non-zero entries — confirm this is the intended test.
numeric_cols = df.select_dtypes(include=[np.number]).columns
drop_columns = [name for name in numeric_cols if df[name].sum() < thresh]
for name in drop_columns:
    print(name)


mhoccur_ad, Dementia (Examples: Alzheimer's Disea
mhoccur_cogn, Mild cognitive impairment (known as
mhoccur_ms, Multiple sclerosis
mhoccur_pd, Parkinson's disease
mhoccur_rvo, Retinal vascular occlusion ("stroke 


In [4]:
# These are features that contain information about diabetes itself, so removed.
invalid_feats = [
    'paid_scrd, Feeling scared when you think about li',
    'paid_wr, Worrying about the future and the possib',
    'paid_eng, Feeling that diabetes is taking up too ',
    'paid_cml, Coping with complications of diabetes',
    'paid_dpr, Feeling depressed when you think about ',
    'paidscore, PAID score',
    'dmlfeet, How often do you inspect your feet?',
    'dmlmd, Which of the following options best descri',
    'dmledu, How often would you say that you engage i',
    # NOTE(review): an ibuprofen item is listed among the diabetes-information features — confirm.
    'cm_ibp, Have you taken ibuprofen or ibuprofen-con',
]

redundant_feats = [
    'pulse_vsorres_2, Heart Rate (bpm)',     # This is a duplicate
    'Carbon Dioxide, Total (mEq/L)',         # This feature is not relevant to diabetes progression
    'mhoccur_clsh, High blood cholesterol',  # Cholesterol measurements are present
]

# Additional columns removed; presumably because they state the diabetes status
# directly — TODO confirm the rationale.
drop_feat = [
    'mhterm_predm, Pre-diabetes',
    'mhterm_dm2, Type II Diabetes',
    'mh_a1c, Elevated A1C levels (elevated blood sugar',
    'mhoccur_pdr, Diabetic retinopathy (in one or both',
]

# Features starting with cmtrt_a1c refer to taking a diabetes measurement, so they go too.
a1c_cols = list(df.filter(regex=r"^cmtrt_a1c").columns)

# `drop_columns` is produced by the low-sum screening cell, which must have been run first.
print(drop_columns)

# One non-inplace drop instead of a chain of in-place drops: if any listed name is
# missing, pandas raises KeyError before anything is removed, so `df` is never left
# half-pruned and the cell can be fixed and re-run.
df = df.drop(columns=a1c_cols + list(drop_columns) + invalid_feats + redundant_feats + drop_feat)
print(df.shape)

print(df.columns[0:20])

["mhoccur_ad, Dementia (Examples: Alzheimer's Disea", 'mhoccur_cogn, Mild cognitive impairment (known as', 'mhoccur_ms, Multiple sclerosis', "mhoccur_pd, Parkinson's disease", 'mhoccur_rvo, Retinal vascular occlusion ("stroke ']
(1011, 248)
Index(['participant_id', 'age', 'study_group', 'clinical_site',
       'mhoccur_amd, Age-related macular degeneration (AM',
       'mhoccur_ca, Cancer (any type)',
       'mhoccur_circ, Circulation problems (Examples: art',
       'mhoccur_cns, Other neurological conditions',
       'mhoccur_crt, Cataracts (in one or both eyes)',
       'mhoccur_cvdot, Other heart issues (Examples: pace',
       'mhoccur_ded, Dry eye (in one or both eyes)',
       'mhoccur_ear, Hearing impairment',
       'mhoccur_gi, Digestive problems (Examples: stomach',
       'mhoccur_glc, Glaucoma (in one or both eyes)',
       'mhoccur_hbp, High blood pressure', 'mhoccur_lbp, Low blood pressure',
       'mhoccur_mi, Heart attack', 'mhoccur_oa, Osteoporosis',
       'mhoccur_o

In [5]:
# Number of top-ranked features requested from extract_df_with_features for each export.
num_top_features = 16

# One export per pair of adjacent study groups. The loop replaces three copy-pasted
# stanzas; the prints, the file names and the final values of `study_groups`,
# `df_subset` and `fname` are the same as before.
for study_groups in ([0, 1], [1, 2], [2, 3]):
    df_subset = extract_df_with_features(df, study_groups, num_top_features)
    print(df_subset.shape)
    print(df_subset.columns)
    fname = 'dataset_progression_analysis_{}_{}.csv'.format(study_groups[0], study_groups[1])
    # NOTE(review): to_csv keeps its default index=True, so the row index is written as an
    # unnamed leading column — confirm downstream readers expect that.
    df_subset.to_csv(fname)

                                           Measurement  AvgAbsCorrelation
40                                           HbA1c (%)           0.279153
241                                       mean_glucose           0.218942
244                                    hyper_time_pcts           0.200654
21                        mhoccur_rnl, Kidney problems           0.197664
246                            nocturnal_hypoglycemias           0.197612
242                                  spike_resolutions           0.183711
38                                     Glucose (mg/dL)           0.178575
14                    mhoccur_hbp, High blood pressure           0.175141
18                                mhoccur_obs, Obesity           0.161307
164  dmlsugar, When reflecting on your typical eating            0.133272
52                                    bmi_vsorres, BMI           0.123262
41                                     INSULIN (ng/mL)           0.121865
232         sualckncf, Have you ever c

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


                                           Measurement  AvgAbsCorrelation
244                                    hyper_time_pcts           0.384884
40                                           HbA1c (%)           0.353122
241                                       mean_glucose           0.331168
38                                     Glucose (mg/dL)           0.307549
42                 LDL Cholesterol Calculation (mg/dL)           0.305892
47                           Total Cholesterol (mg/dL)           0.299597
242                                  spike_resolutions           0.298989
246                            nocturnal_hypoglycemias           0.250721
129              whr_vsorres, Waist to Hip Ratio (WHR)           0.199008
39                             HDL Cholesterol (mg/dL)           0.195166
14                    mhoccur_hbp, High blood pressure           0.194265
127            waist_vsorres, Waist Circumference (cm)           0.188936
104                    pulse_vsorres, 

  c /= stddev[:, None]
  c /= stddev[None, :]
