In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Data Preparation

In [4]:
def is_diabetic(row):
    '''
    This is classification based on survey. 
    '''
    if row['DIQ010']==1:
        return 1
    else:
        return 0

def merge_data_diabetes(df_one,df_two):
    '''
    This will make a df for classification.
    '''
    df_one['HasDiabetes'] = df_one.apply(lambda row: is_diabetic(row), axis=1)
    df_one = df_one[['SEQN','HasDiabetes']]
    df_merged = df_one.merge(right=df_two,on='SEQN',how='inner')
    return df_merged

def build_diabetes_classification(df):
    '''
    Build the classification.
    '''
    df['HasDiabetes'] = df.apply(lambda row: is_diabetic(row), axis=1)
    df = df[['SEQN','HasDiabetes']]
    return df

def diabetes_corr(df_merged):
    correlation = df_merged.corr()
    return correlation['HasDiabetes'].abs().sort_values(ascending=False)


In [22]:
# df that will be used to classify diabetes
df_diabetes = pd.read_sas('./EDA/CDC/Questionare/Diabetes/DIQ_E.XPT')
df_diabetes = build_diabetes_classification(df_diabetes)

# demographic
df_demo = pd.read_sas('./EDA/CDC/shotgun_approach/DEMO_E.XPT')
# blood pressure questionaire
df_bp = pd.read_sas('./EDA/CDC/shotgun_approach/BPQ_E.XPT')
# blood pressure results
df_bpr = pd.read_sas('./EDA/CDC/Blood_pressure/BPX_E.XPT')
# medical conditions
df_mc = pd.read_sas('./EDA/CDC/shotgun_approach/MCQ_E.XPT')

key = 'SEQN'
df_merged = df_diabetes.merge(right=df_demo,on=key,how='inner')
df_merged = df_merged.merge(right=df_bp,on=key,how='inner')
df_merged = df_merged.merge(right=df_bpr,on=key,how='inner')
df_merged = df_merged.merge(right=df_mc,on=key,how='inner')

In [23]:
def print_change_in_rows(df_one,df_two): 
    original_rows = df_one.shape[0]
    final_rows = df_two.shape[0]
    change_in_rows = original_rows-final_rows
    percentage_change = round((1-final_rows/original_rows)*100)
    print('The change in rows is {}'.format(change_in_rows))
    print('The percentage change is {}%\n'.format(percentage_change))
    return None

def summary_on_diabetes(df): 
    print('TABLE OF DIABETIC COUNT')
    print(df.HasDiabetes.value_counts().rename({0:'non-diabetic',
                                                1:'diabetic'}))
    return None

print_change_in_rows(df_diabetes,df_merged)
summary_on_diabetes(df_merged)

The change in rows is 3359
The percentage change is 35%

TABLE OF DIABETIC COUNT
non-diabetic    5576
diabetic         731
Name: HasDiabetes, dtype: int64


# Modeling

In [24]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, \
    classification_report
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [47]:
df_final = df_merged.drop(columns='SEQN')
df_final = df_final.dropna()

x = df_final.drop(columns='HasDiabetes')
y = df_final['HasDiabetes']

(6307, 178)


In [39]:
x

Unnamed: 0,SDDSRVYR,RIDSTATR,RIDEXMON,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDAGEEX,RIDRETH1,DMQMILIT,DMDBORN2,...,MCQ265,MCQ268A,MCQ268B,MCQ268C,MCQ268D,MCQ310,MCQ320,MCD330,MCQ340,MCQ350
0,5.0,2.0,2.0,2.0,62.0,751.0,752.0,5.0,2.0,1.0,...,,,,,,,,,,
1,5.0,2.0,2.0,1.0,71.0,859.0,860.0,3.0,1.0,1.0,...,2.0,,,,,1.0,65.0,122.0,2.0,2.0
2,5.0,2.0,1.0,1.0,52.0,629.0,630.0,1.0,2.0,2.0,...,2.0,,,,,2.0,,,,
3,5.0,2.0,1.0,1.0,21.0,254.0,254.0,4.0,2.0,1.0,...,,,,,,,,,,
4,5.0,2.0,2.0,1.0,64.0,778.0,779.0,1.0,2.0,2.0,...,2.0,,,,,1.0,50.0,365.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6302,5.0,2.0,1.0,1.0,61.0,739.0,740.0,1.0,2.0,2.0,...,2.0,,,,,1.0,45.0,91.0,2.0,2.0
6303,5.0,2.0,2.0,2.0,50.0,611.0,612.0,3.0,2.0,1.0,...,,,,,,,,,,
6304,5.0,2.0,1.0,1.0,17.0,212.0,212.0,2.0,2.0,4.0,...,,,,,,,,,,
6305,5.0,2.0,1.0,2.0,60.0,727.0,727.0,4.0,2.0,5.0,...,,,,,,,,,,


In [40]:
y

0       0
1       1
2       0
3       0
4       0
       ..
6302    0
6303    0
6304    0
6305    0
6306    0
Name: HasDiabetes, Length: 6307, dtype: int64

In [41]:
models = [('LR', LogisticRegression()),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('RF', RandomForestClassifier()),
          ('SVR', SVC(gamma='auto')),
          ('GB',GradientBoostingClassifier())]

# evaluate each model in turn
results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10, random_state=None)
    cv_results = cross_val_score(model, x, y, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 1508, in fit
    X, y = self._validate_data(
  File "c:\python39\lib\site-packages\sklearn\base.py", line 572, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\python39\lib\site-packages\sklearn\utils\validation.py", line 956, in check_X_y
    X = check_array(
  File "c:\python39\lib\site-packages\sklearn\utils\validation.py"

LR: nan (nan)
KNN: nan (nan)
CART: nan (nan)


10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python39\lib\site-packages\sklearn\ensemble\_forest.py", line 326, in fit
    X, y = self._validate_data(
  File "c:\python39\lib\site-packages\sklearn\base.py", line 572, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\python39\lib\site-packages\sklearn\utils\validation.py", line 956, in check_X_y
    X = check_array(
  File "c:\python39\lib\site-packages\sklearn\utils\validation.py", line 

RF: nan (nan)
SVR: nan (nan)
GB: nan (nan)


10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python39\lib\site-packages\sklearn\ensemble\_gb.py", line 486, in fit
    X, y = self._validate_data(
  File "c:\python39\lib\site-packages\sklearn\base.py", line 572, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\python39\lib\site-packages\sklearn\utils\validation.py", line 956, in check_X_y
    X = check_array(
  File "c:\python39\lib\site-packages\sklearn\utils\validation.py", line 792,

In [42]:
# LR
train, test = train_test_split(df_final, test_size=0.3, random_state=0, stratify=y)
train_x = train[train.columns[:-1]]
test_x = test[test.columns[:-1]]
train_y = train['HasDiabetes']
test_y = test['HasDiabetes']

lr_model = LogisticRegression()

lr_model.fit(train_x, train_y)

prediction = lr_model.predict(test_x)

metrics.accuracy_score(prediction, test_y)

metrics.confusion_matrix(prediction, test_y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [251]:
svr_model = SVC(gamma='auto')

In [252]:
svr_model.fit(train_x, train_y)

SVC(gamma='auto')

In [253]:
prediction_svr = svr_model.predict(test_x)

metrics.accuracy_score(prediction_svr, test_y)

metrics.confusion_matrix(prediction_svr, test_y)

array([[1515,  164],
       [   1,    0]])

In [254]:
tn, fp, fn, tp = metrics.confusion_matrix(prediction_svr, test_y).ravel()

In [255]:
# Decision Tree

In [256]:
cart_model = DecisionTreeClassifier()

In [257]:
cart_model.fit(train_x, train_y)

DecisionTreeClassifier()

In [258]:
prediction_cart = cart_model.predict(test_x)

metrics.accuracy_score(prediction_cart, test_y)

metrics.confusion_matrix(prediction_cart, test_y)

array([[1360,  130],
       [ 156,   34]])