In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Data Preparation

In [2]:
def is_diabetic(row):
    '''
    This is classification based on survey. 
    '''
    if row['DIQ010']==1:
        return 1
    else:
        return 0

def merge_data_diabetes(df_one,df_two):
    '''
    This will make a df for classification.
    '''
    df_one['HasDiabetes'] = df_one.apply(lambda row: is_diabetic(row), axis=1)
    df_one = df_one[['SEQN','HasDiabetes']]
    df_merged = df_one.merge(right=df_two,on='SEQN',how='inner')
    return df_merged

def build_diabetes_classification(df):
    '''
    Build the classification.
    '''
    df['HasDiabetes'] = df.apply(lambda row: is_diabetic(row), axis=1)
    df = df[['SEQN','HasDiabetes']]
    return df

def diabetes_corr(df_merged):
    correlation = df_merged.corr()
    return correlation['HasDiabetes'].abs().sort_values(ascending=False)


In [3]:
# df that will be used to classify diabetes
df_diabetes = pd.read_sas('./EDA/CDC/Questionare/Diabetes/DIQ_E.XPT')
df_diabetes = build_diabetes_classification(df_diabetes)

# demographic
df_demo = pd.read_sas('./EDA/CDC/shotgun_approach/DEMO_E.XPT')
# blood pressure questionaire
df_bp = pd.read_sas('./EDA/CDC/shotgun_approach/BPQ_E.XPT')
# blood pressure results
df_bpr = pd.read_sas('./EDA/CDC/Blood_pressure/BPX_E.XPT')
# medical conditions
df_mc = pd.read_sas('./EDA/CDC/shotgun_approach/MCQ_E.XPT')

key = 'SEQN'
df_merged = df_diabetes.merge(right=df_demo,on=key,how='inner')
df_merged = df_merged.merge(right=df_bp,on=key,how='inner')
df_merged = df_merged.merge(right=df_bpr,on=key,how='inner')
df_merged = df_merged.merge(right=df_mc,on=key,how='inner')

In [4]:
def print_change_in_rows(df_one,df_two): 
    original_rows = df_one.shape[0]
    final_rows = df_two.shape[0]
    change_in_rows = original_rows-final_rows
    percentage_change = round((1-final_rows/original_rows)*100)
    print('The change in rows is {}'.format(change_in_rows))
    print('The percentage change is {}%\n'.format(percentage_change))
    return None

def summary_on_diabetes(df): 
    print('TABLE OF DIABETIC COUNT')
    print(df.HasDiabetes.value_counts().rename({0:'non-diabetic',
                                                1:'diabetic'}))
    return None

print_change_in_rows(df_diabetes,df_merged)
summary_on_diabetes(df_merged)

The change in rows is 3359
The percentage change is 35%

TABLE OF DIABETIC COUNT
non-diabetic    5576
diabetic         731
Name: HasDiabetes, dtype: int64


# Modeling

In [5]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, \
    classification_report
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [6]:
df_final = df_merged.drop(columns='SEQN')
# print(df_final['MCQ265'].isna().value_counts().loc[False])

def percentage_isna(df,threshold):
    total_rows = df.shape[0]
    df_out = df
    for col in df.columns: 
        try: 
            non_na_count = df[col].isna().value_counts().loc[False]
        except:
            non_na_count = total_rows-df[col].isna().value_counts().loc[True]
        percentage = round((total_rows-non_na_count)/total_rows*100)
        print('Percentage {}%'.format(percentage))
        if percentage>=threshold:
            print(col)
            df_out = df_out.drop(columns=col)
            print(df_out.shape[1])
    return df_out

df_final_na_removed = percentage_isna(df_final,1).dropna()
print(df_final_na_removed.shape)

Percentage 0%
Percentage 0%
Percentage 0%
Percentage 0%
Percentage 0%
Percentage 0%
Percentage 6%
RIDAGEMN
177
Percentage 6%
RIDAGEEX
176
Percentage 0%
Percentage 3%
DMQMILIT
175
Percentage 0%
Percentage 0%
Percentage 77%
DMDYRSUS
174
Percentage 90%
DMDEDUC3
173
Percentage 10%
DMDEDUC2
172
Percentage 91%
DMDSCHOL
171
Percentage 10%
DMDMARTL
170
Percentage 0%
Percentage 0%
Percentage 1%
INDHHIN2
169
Percentage 1%
INDFMIN2
168
Percentage 9%
INDFMPIR
167
Percentage 81%
RIDEXPRG
166
Percentage 0%
Percentage 0%
Percentage 2%
DMDHRBR2
165
Percentage 2%
DMDHREDU
164
Percentage 3%
DMDHRMAR
163
Percentage 49%
DMDHSEDU
162
Percentage 0%
Percentage 0%
Percentage 0%
Percentage 1%
FIALANG
161
Percentage 1%
FIAPROXY
160
Percentage 1%
FIAINTRP
159
Percentage 8%
MIALANG
158
Percentage 8%
MIAPROXY
157
Percentage 8%
MIAINTRP
156
Percentage 0%
Percentage 0%
Percentage 0%
Percentage 0%
Percentage 0%
Percentage 0%
Percentage 67%
BPQ030
155
Percentage 68%
BPD035
154
Percentage 67%
BPQ040A
153
Percentage 71%

In [7]:
x = df_final_na_removed.drop(columns='HasDiabetes')
y = df_final_na_removed['HasDiabetes']

In [8]:
x

Unnamed: 0,SDDSRVYR,RIDSTATR,RIDEXMON,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN2,DMDCITZN,DMDHHSIZ,DMDFMSIZ,...,BPQ020,BPQ052,PEASCST1,MCQ010,MCQ053,MCQ080,MCQ092,MCQ140,MCQ245A,MCQ300B
0,5.0,2.0,2.0,2.0,62.0,5.0,1.0,1.0,2.0,2.0,...,1.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0,2.0
1,5.0,2.0,2.0,1.0,71.0,3.0,1.0,1.0,2.0,2.0,...,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,5.0,2.0,1.0,1.0,52.0,1.0,2.0,1.0,5.0,5.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
3,5.0,2.0,1.0,1.0,21.0,4.0,1.0,1.0,4.0,4.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
4,5.0,2.0,2.0,1.0,64.0,1.0,2.0,2.0,6.0,6.0,...,1.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6302,5.0,2.0,1.0,1.0,61.0,1.0,2.0,1.0,7.0,7.0,...,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0
6303,5.0,2.0,2.0,2.0,50.0,3.0,1.0,1.0,2.0,2.0,...,2.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0
6304,5.0,2.0,1.0,1.0,17.0,2.0,4.0,2.0,7.0,5.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
6305,5.0,2.0,1.0,2.0,60.0,4.0,5.0,2.0,6.0,6.0,...,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0


In [9]:
y

0       0
1       1
2       0
3       0
4       0
       ..
6302    0
6303    0
6304    0
6305    0
6306    0
Name: HasDiabetes, Length: 6305, dtype: int64

In [10]:
models = [('LR', LogisticRegression()),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('RF', RandomForestClassifier()),
          ('SVR', SVC(gamma='auto')),
          ('GB',GradientBoostingClassifier())]

# evaluate each model in turn
results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10, random_state=None)
    cv_results = cross_val_score(model, x, y, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LR: 0.884216 (0.018730)




KNN: 0.874701 (0.016113)
CART: 0.816972 (0.016557)
RF: 0.882315 (0.016847)
SVR: 0.881837 (0.017496)
GB: 0.885011 (0.017124)


In [12]:
# LR
train, test = train_test_split(df_final_na_removed, test_size=0.3, random_state=0, stratify=y)
train_x = train[train.columns[:-1]]
test_x = test[test.columns[:-1]]
train_y = train['HasDiabetes']
test_y = test['HasDiabetes']

lr_model = LogisticRegression()

lr_model.fit(train_x, train_y)

prediction = lr_model.predict(test_x)

metrics.accuracy_score(prediction, test_y)

metrics.confusion_matrix(prediction, test_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[1665,   20],
       [   8,  199]], dtype=int64)

In [13]:
svr_model = SVC(gamma='auto')

In [14]:
svr_model.fit(train_x, train_y)

SVC(gamma='auto')

In [15]:
prediction_svr = svr_model.predict(test_x)

metrics.accuracy_score(prediction_svr, test_y)

metrics.confusion_matrix(prediction_svr, test_y)

array([[1671,  217],
       [   2,    2]], dtype=int64)

In [16]:
tn, fp, fn, tp = metrics.confusion_matrix(prediction_svr, test_y).ravel()

In [17]:
# Decision Tree

In [18]:
cart_model = DecisionTreeClassifier()

In [19]:
cart_model.fit(train_x, train_y)

DecisionTreeClassifier()

In [20]:
prediction_cart = cart_model.predict(test_x)

metrics.accuracy_score(prediction_cart, test_y)

metrics.confusion_matrix(prediction_cart, test_y)

array([[1673,    0],
       [   0,  219]], dtype=int64)