In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Preamble 
We have noticed that glucose levels has a strong relation with determing if a person is diabetic or not. 

This note book will look at other features when compared to diabetic or not.

link: https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=2007


# Data Preparation

In [63]:
def is_diabetic(row):
    '''
    This is classification based on survey. 
    DIQ010 --> This is a survey of diabetic or not.
    '''
    if row['DIQ010']==1:
        return 1
    else:
        return 0

def merge_data_diabetes(df_one,df_two):
    '''
    This will make a df for classification.
    '''
    df_one['HasDiabetes'] = df_one.apply(lambda row: is_diabetic(row), axis=1)
    df_one = df_one[['SEQN','HasDiabetes']]
    df_merged = df_one.merge(right=df_two,on='SEQN',how='inner')
    return df_merged

def build_diabetes_classification(df):
    '''
    Build the classification.
    '''
    df['HasDiabetes'] = df.apply(lambda row: is_diabetic(row), axis=1)
    df = df[['SEQN','HasDiabetes']]
    return df

def diabetes_corr(df_merged):
    correlation = df_merged.corr()
    return correlation['HasDiabetes'].abs().sort_values(ascending=False)

def import_dataset(name,col_dic=None):
    df = pd.read_sas(base_dir+name+'.XPT')
    if col_dic==None:
        return df
    else: 
        return df[[*col_dic]].rename(columns=col_dic)
    
def expand_col(df,col_name,dic_col_val):
    for new_col in dic_col_val:
        df[new_col] = 1
        df.loc[df[col_name]==dic_col_val[new_col],new_col] = 2
    return df

Diabetic classifier options: 
* diabetic survey --> may have survey records of people eating and living clean after being diagnosed. 
* glycohemoglobin --> above 6.5% indicates diabetic person. 
* fasting glucose level --> indicator to diabetic person. 

Datasets interested in: 
* demographic information
    * Has age
* overweight --> MCQ_E --> MCQ080
* Fasting Glucose (mg/dL) --> GLU_E --> LBXGLU
* Dietry --> nutrient information --> DR1TOT_E & DR2TOT_E
* Occupation --> OCQ_E
* Income --> INQ_E
* Food Security --> FSQ_E

References: Influece Factors: 
* https://www.niddk.nih.gov/health-information/diabetes/overview/risk-factors-type-2-diabetes

Blood pressure: 
* Hard to obtain if you don't have a BP device. 
* But, certain groups of society will have a BP device. Therefore, it is practical to use BP as a feature. 
* Also, if a person has their recent recording from there doctor, they could possible us that reading. 

![image.png](attachment:image.png)

Modeling notes:
* Classifications to be put on sperate columns. 
* zero and one values need to be changed to one and two values. 

![image.png](attachment:image.png)

In [64]:
# OLD IMPORTS
# # blood pressure questionaire
# df_bp = pd.read_sas('./EDA/CDC/shotgun_approach/BPQ_E.XPT')
# # blood pressure results
# df_bpr = pd.read_sas('./EDA/CDC/Blood_pressure/BPX_E.XPT')
# # medical conditions
# df_mc = pd.read_sas('./EDA/CDC/shotgun_approach/MCQ_E.XPT')

In [65]:
# I want to selectively import features rather than chucking in variables.
# df that will be used to classify diabetes
df_diabetes = pd.read_sas('./EDA/CDC/Questionare/Diabetes/DIQ_E.XPT') #https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/DIQ_E.htm#DIQ010
df_diabetes = build_diabetes_classification(df_diabetes)
df_diabetes.head()

Unnamed: 0,SEQN,HasDiabetes
0,41475.0,0
1,41476.0,0
2,41477.0,1
3,41478.0,0
4,41479.0,0


In [66]:
df_glyc_raw = pd.read_sas('./EDA/CDC/Glycohemoglobin/GHB_E.XPT')
df_glyc_raw.LBXGH.loc[df_glyc_raw.LBXGH<6.5] = 0
df_glyc_raw.LBXGH.loc[df_glyc_raw.LBXGH>=6.5] = 1
df_glyc_raw.head()

Unnamed: 0,SEQN,LBXGH
0,41475.0,0.0
1,41477.0,1.0
2,41479.0,0.0
3,41481.0,
4,41482.0,1.0


In [67]:
df = df_diabetes.merge(right=df_glyc_raw,on='SEQN',how='outer',indicator='indicator')
# df_diabetes_combined.groupby(['indicator']).count()
# https://www.uofmhealth.org/health-library/hw8432
# We will want to keep the whole merge. 
df.loc[df.indicator=='right_only']

Unnamed: 0,SEQN,HasDiabetes,LBXGH,indicator


In [68]:
df.loc[df.indicator=='left_only']

Unnamed: 0,SEQN,HasDiabetes,LBXGH,indicator
1,41476.0,0,,left_only
3,41478.0,0,,left_only
5,41480.0,0,,left_only
12,41488.0,0,,left_only
15,41491.0,0,,left_only
...,...,...,...,...
9643,51600.0,0,,left_only
9644,51601.0,0,,left_only
9645,51602.0,0,,left_only
9649,51607.0,0,,left_only


In [69]:
df.loc[(df.indicator=='left_only')&(df.HasDiabetes==1)]

Unnamed: 0,SEQN,HasDiabetes,LBXGH,indicator
799,42316.0,1,,left_only
1128,42663.0,1,,left_only
1377,42928.0,1,,left_only
1636,43200.0,1,,left_only
1745,43312.0,1,,left_only
1829,43400.0,1,,left_only
1857,43429.0,1,,left_only
2062,43643.0,1,,left_only
2133,43722.0,1,,left_only
2174,43766.0,1,,left_only


In [70]:
# (not(df.HasDiabetes.isna()))
df_both = df.loc[(df.indicator=='both')]

# Nulls in HasDiabetes are being filled
df_both.loc[(df_both.HasDiabetes.isna())&(df_both.LBXGH.notna()),'HasDiabetes'] = \
df_both.loc[(df_both.HasDiabetes.isna())&(df_both.LBXGH.notna()),'LBXGH']

# 0 in HasDiabetes are being filled with 1s from LBXGH
df_both.loc[(df_both.HasDiabetes==0)&(df_both.LBXGH==1),'HasDiabetes'] = \
df_both.loc[(df_both.HasDiabetes==0)&(df_both.LBXGH==1),'LBXGH']

df.loc[(df.indicator=='both')] = df_both

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [71]:
df.loc[df.indicator=='both']

Unnamed: 0,SEQN,HasDiabetes,LBXGH,indicator
0,41475.0,0.0,0.0,both
2,41477.0,1.0,1.0,both
4,41479.0,0.0,0.0,both
6,41481.0,0.0,,both
7,41482.0,1.0,1.0,both
...,...,...,...,...
9661,51619.0,0.0,0.0,both
9662,51620.0,0.0,0.0,both
9663,51621.0,0.0,0.0,both
9664,51622.0,0.0,0.0,both


In [72]:
df.loc[df.indicator=='right_only']

Unnamed: 0,SEQN,HasDiabetes,LBXGH,indicator


In [73]:
df.loc[(df.indicator=='right_only'),'HasDiabetes'] = df.loc[(df.indicator=='right_only'),'LBXGH']

# Assign df_classifer

In [76]:
df_classifier = df
df_classifier = df_classifier.drop(columns=['LBXGH','indicator'])

# Need to change 1 to 2 and 0 to 1
df_classifier.loc[df_classifier.HasDiabetes==1,'HasDiabetes'] = 2
df_classifier.loc[df_classifier.HasDiabetes==0,'HasDiabetes'] = 1
df_classifier

Unnamed: 0,SEQN,HasDiabetes
0,41475.0,1.0
1,41476.0,1.0
2,41477.0,2.0
3,41478.0,1.0
4,41479.0,1.0
...,...,...
9661,51619.0,1.0
9662,51620.0,1.0
9663,51621.0,1.0
9664,51622.0,1.0


# Features

In [77]:
base_dir = './EDA/CDC/shotgun_approach/'
# demographic
df_demo = import_dataset('DEMO_E',{
    'SEQN':'SEQN',
    'RIDRETH1':'Ethnicity',
    'RIDAGEYR':'Age_in_yrs_at_exam', # Ages above 80yrs are lumped with 80yrs.
#     'INDFMPIR':'Ratio_of_family_income_to_poverty'
})

df_demo = expand_col(df_demo,'Ethnicity',{
    'Mexican_American':1,
    'Other_Hispanic':2,
    'Non-Hispanic_White':3,
    'Non-Hispanic_Black':4,
    'Other_and_Multi-Racial':5,
})

df_demo.head()

Unnamed: 0,SEQN,Ethnicity,Age_in_yrs_at_exam,Mexican_American,Other_Hispanic,Non-Hispanic_White,Non-Hispanic_Black,Other_and_Multi-Racial
0,41475.0,5.0,62.0,1,1,1,1,2
1,41476.0,5.0,6.0,1,1,1,1,2
2,41477.0,3.0,71.0,1,1,2,1,1
3,41478.0,3.0,1.0,1,1,2,1,1
4,41479.0,1.0,52.0,2,1,1,1,1


In [78]:
# Medical Question: overweight, stroke
df_medical_qs = import_dataset('MCQ_E',{
    'SEQN':'SEQN',
    'MCQ080':'Overweight'
})
df_medical_qs = expand_col(df_medical_qs,'Overweight',{
    'Yes_overweight':1,
    'No_overweight':2
#     'Refused':7,
#     'Do_not_know':9
})

df_medical_qs.head()

Unnamed: 0,SEQN,Overweight,Yes_overweight,No_overweight
0,41475.0,1.0,2,1
1,41476.0,,1,1
2,41477.0,2.0,1,2
3,41478.0,,1,1
4,41479.0,2.0,1,2


In [79]:
# BMI
df_bmi = import_dataset('BMX_E',{
    'SEQN':'SEQN',
    'BMXWAIST':'Waist_cm',
    'BMXBMI':'Body_mass_index_kgpm2'
})

df_bmi.head()

Unnamed: 0,SEQN,Waist_cm,Body_mass_index_kgpm2
0,41475.0,156.3,58.04
1,41476.0,52.7,15.18
2,41477.0,109.5,30.05
3,41478.0,,
4,41479.0,95.4,27.56


In [80]:
# Fasting Glucose levels GLU_E:LBXGLU
df_fasting_glucose = pd.read_sas(base_dir+'GLU_E.XPT')[['SEQN','LBXGLU']].rename(columns={'LBXGLU':'Fasting_glucose'})
# df_fasting_glucose.Fasting_glucose.hist()
df_fasting_glucose.head()

Unnamed: 0,SEQN,Fasting_glucose
0,41479.0,113.0
1,41481.0,
2,41485.0,97.0
3,41486.0,123.0
4,41487.0,108.0


In [81]:
df_food_security = import_dataset('FSQ_E',{
    'SEQN':'SEQN',
#     'FSD032A':'Worried_run_out_of_food',
    'FSD032C':'Aford_balance_meals'
#     'FSDAD':'Adult_food_security'
#     'FSD431':'Hungry'
})
df_food_security = expand_col(df_food_security,'Aford_balance_meals',{
    'Aford_balance_meals_Often_true':1,
    'Aford_balance_meals_Sometimes_true':2,
    'Aford_balance_meals_Never_true':3
})
df_food_security.head()

Unnamed: 0,SEQN,Aford_balance_meals,Aford_balance_meals_Often_true,Aford_balance_meals_Sometimes_true,Aford_balance_meals_Never_true
0,41475.0,3.0,1,1,2
1,41476.0,3.0,1,1,2
2,41477.0,3.0,1,1,2
3,41478.0,2.0,1,2,1
4,41479.0,3.0,1,1,2


In [82]:
df_day_one_diet = import_dataset('DR1TOT_E',{
    'SEQN':'SEQN',
    'DR1TKCAL':'Energy', 
})

In [83]:
# df_depression = import_dataset('DPQ_E',{
#     'DPQ020':''
# })

In [120]:
key = 'SEQN'

df_merged = df_classifier.merge(right=df_demo,on=key,how='inner')
df_merged = df_merged.merge(right=df_medical_qs,on=key,how='inner')
df_merged = df_merged.merge(right=df_bmi,on=key,how='inner')
# df_merged = df_merged.merge(right=df_fasting_glucose,on=key,how='inner')
df_merged = df_merged.merge(right=df_food_security,on=key,how='inner')
# df_merged = df_merged.merge(right=df_day_one_diet,on=key,how='inner')

df_merged.dtypes

SEQN                                  float64
HasDiabetes                           float64
Ethnicity                             float64
Age_in_yrs_at_exam                    float64
Mexican_American                        int64
Other_Hispanic                          int64
Non-Hispanic_White                      int64
Non-Hispanic_Black                      int64
Other_and_Multi-Racial                  int64
Overweight                            float64
Yes_overweight                          int64
No_overweight                           int64
Waist_cm                              float64
Body_mass_index_kgpm2                 float64
Aford_balance_meals                   float64
Aford_balance_meals_Often_true          int64
Aford_balance_meals_Sometimes_true      int64
Aford_balance_meals_Never_true          int64
dtype: object

In [121]:
df_merged.columns

Index(['SEQN', 'HasDiabetes', 'Ethnicity', 'Age_in_yrs_at_exam',
       'Mexican_American', 'Other_Hispanic', 'Non-Hispanic_White',
       'Non-Hispanic_Black', 'Other_and_Multi-Racial', 'Overweight',
       'Yes_overweight', 'No_overweight', 'Waist_cm', 'Body_mass_index_kgpm2',
       'Aford_balance_meals', 'Aford_balance_meals_Often_true',
       'Aford_balance_meals_Sometimes_true', 'Aford_balance_meals_Never_true'],
      dtype='object')

In [181]:
# OPTION 1: Drop Table Method
# col_drop = [
#     'SEQN',
#     'Ethnicity',
#     'Overweight',
#     'Waist_cm',
#     'Body_mass_index_kgpm2',
#     'Fasting_glucose',
#     'Aford_balance_meals'
# #     'Energy'
# ]
# df_merged = df_merged.drop(columns=col_drop)

# OPTION 2: Select Features Method
df_merged = df_merged[['HasDiabetes', 'Yes_overweight', 'Waist_cm', 'Body_mass_index_kgpm2',
   'Aford_balance_meals_Often_true', 'Aford_balance_meals_Sometimes_true', 'Aford_balance_meals_Never_true'
]]

# Summary
df_merged.dtypes

HasDiabetes                           float64
Yes_overweight                          int64
Waist_cm                              float64
Body_mass_index_kgpm2                 float64
Aford_balance_meals_Often_true          int64
Aford_balance_meals_Sometimes_true      int64
Aford_balance_meals_Never_true          int64
dtype: object

In [182]:
# Dropping NaNs
df_merged = df_merged.dropna()
df_merged

Unnamed: 0,HasDiabetes,Yes_overweight,Waist_cm,Body_mass_index_kgpm2,Aford_balance_meals_Often_true,Aford_balance_meals_Sometimes_true,Aford_balance_meals_Never_true
0,1.0,2,156.3,58.04,1,1,2
1,1.0,1,52.7,15.18,1,1,2
2,2.0,1,109.5,30.05,1,1,2
4,1.0,1,95.4,27.56,1,1,2
5,1.0,1,60.3,17.93,1,2,1
...,...,...,...,...,...,...,...
9302,1.0,2,108.5,30.08,1,1,2
9303,1.0,1,101.1,28.31,1,1,2
9304,1.0,1,69.4,17.72,1,2,1
9305,1.0,2,111.9,31.53,1,1,2


In [183]:
df_merged.corr()['HasDiabetes'].abs().sort_values(ascending=False)

HasDiabetes                           1.000000
Waist_cm                              0.323403
Body_mass_index_kgpm2                 0.300683
Yes_overweight                        0.282924
Aford_balance_meals_Often_true        0.004213
Aford_balance_meals_Never_true        0.002616
Aford_balance_meals_Sometimes_true    0.001848
Name: HasDiabetes, dtype: float64

In [184]:
def print_change_in_rows(df_one,df_two): 
    original_rows = df_one.shape[0]
    final_rows = df_two.shape[0]
    change_in_rows = original_rows-final_rows
    percentage_change = round((1-final_rows/original_rows)*100)
    print('The change in rows is {}'.format(change_in_rows))
    print('The percentage change is {}%\n'.format(percentage_change))
    return None

def summary_on_diabetes(df): 
    print('TABLE OF DIABETIC COUNT')
    print(df.HasDiabetes.value_counts().rename({1:'non-diabetic',
                                                2:'diabetic'}))
    return None

print_change_in_rows(df_classifier,df_merged)
summary_on_diabetes(df_merged)

The change in rows is 1193
The percentage change is 12%

TABLE OF DIABETIC COUNT
non-diabetic    7623
diabetic         850
Name: HasDiabetes, dtype: int64


# Modeling

In [185]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, \
    classification_report
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [186]:
try: 
    df_final = df_merged.drop(columns='SEQN')
except:
    df_final = df_merged

def percentage_isna(df,threshold):
    total_rows = df.shape[0]
    df_out = df
    for col in df.columns: 
        try: 
            non_na_count = df[col].isna().value_counts().loc[False]
        except:
            non_na_count = total_rows-df[col].isna().value_counts().loc[True]
        percentage = round((total_rows-non_na_count)/total_rows*100)
        print('Percentage {}%'.format(percentage))
        if percentage>=threshold:
            print(col)
            df_out = df_out.drop(columns=col)
            print(df_out.shape[1])
    return df_out

# df_final_na_removed = percentage_isna(df_final,1).dropna()
# print(df_final_na_removed.shape)
df_final_na_removed = df_final.dropna()
df_final_na_removed

Unnamed: 0,HasDiabetes,Yes_overweight,Waist_cm,Body_mass_index_kgpm2,Aford_balance_meals_Often_true,Aford_balance_meals_Sometimes_true,Aford_balance_meals_Never_true
0,1.0,2,156.3,58.04,1,1,2
1,1.0,1,52.7,15.18,1,1,2
2,2.0,1,109.5,30.05,1,1,2
4,1.0,1,95.4,27.56,1,1,2
5,1.0,1,60.3,17.93,1,2,1
...,...,...,...,...,...,...,...
9302,1.0,2,108.5,30.08,1,1,2
9303,1.0,1,101.1,28.31,1,1,2
9304,1.0,1,69.4,17.72,1,2,1
9305,1.0,2,111.9,31.53,1,1,2


In [187]:
x = df_final_na_removed.drop(columns='HasDiabetes')
y = df_final_na_removed['HasDiabetes']

In [188]:
x

Unnamed: 0,Yes_overweight,Waist_cm,Body_mass_index_kgpm2,Aford_balance_meals_Often_true,Aford_balance_meals_Sometimes_true,Aford_balance_meals_Never_true
0,2,156.3,58.04,1,1,2
1,1,52.7,15.18,1,1,2
2,1,109.5,30.05,1,1,2
4,1,95.4,27.56,1,1,2
5,1,60.3,17.93,1,2,1
...,...,...,...,...,...,...
9302,2,108.5,30.08,1,1,2
9303,1,101.1,28.31,1,1,2
9304,1,69.4,17.72,1,2,1
9305,2,111.9,31.53,1,1,2


In [189]:
y

0       1.0
1       1.0
2       2.0
4       1.0
5       1.0
       ... 
9302    1.0
9303    1.0
9304    1.0
9305    1.0
9306    1.0
Name: HasDiabetes, Length: 8473, dtype: float64

In [190]:
models = [('LR', LogisticRegression()),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('RF', RandomForestClassifier()),
          ('SVR', SVC(gamma='auto')),
          ('GB',GradientBoostingClassifier())]

# evaluate each model in turn
results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10, random_state=None)
    cv_results = cross_val_score(model, x, y, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LR: 0.898264 (0.012342)




KNN: 0.883630 (0.011798)
CART: 0.837836 (0.012528)
RF: 0.878791 (0.011726)
SVR: 0.897910 (0.011835)
GB: 0.897202 (0.013868)


In [191]:
# LR
train, test = train_test_split(df_final_na_removed, test_size=0.3, random_state=0, stratify=y)
train_x = train[train.columns[:-1]]
test_x = test[test.columns[:-1]]
train_y = train['HasDiabetes']
test_y = test['HasDiabetes']

lr_model = LogisticRegression()

lr_model.fit(train_x, train_y)

prediction = lr_model.predict(test_x)

metrics.accuracy_score(prediction, test_y)

metrics.confusion_matrix(prediction, test_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[2287,    0],
       [   0,  255]], dtype=int64)

In [192]:
svr_model = SVC(gamma='auto')

In [193]:
svr_model.fit(train_x, train_y)

SVC(gamma='auto')

In [194]:
prediction_svr = svr_model.predict(test_x)

metrics.accuracy_score(prediction_svr, test_y)

metrics.confusion_matrix(prediction_svr, test_y)

array([[2284,   28],
       [   3,  227]], dtype=int64)

In [195]:
tn, fp, fn, tp = metrics.confusion_matrix(prediction_svr, test_y).ravel()

In [196]:
# Decision Tree

In [197]:
cart_model = DecisionTreeClassifier()

In [198]:
cart_model.fit(train_x, train_y)

DecisionTreeClassifier()

In [199]:
prediction_cart = cart_model.predict(test_x)

metrics.accuracy_score(prediction_cart, test_y)

metrics.confusion_matrix(prediction_cart, test_y)

array([[2287,    0],
       [   0,  255]], dtype=int64)

In [200]:
print('DONE')

DONE
