In [None]:
## import necessary libraries
import numpy as np
import pandas as pd

In [None]:
## Load the raw CSV into a DataFrame and preview it
df = pd.read_csv('raw_data/diabetes_012_health_indicators_BRFSS2015.csv')

# Report the frame's dimensions (df.shape is a (rows, columns) pair), then
# list every column name
print('The dataframe has {} rows and {} columns'.format(*df.shape))
print(df.columns.values)

# Last expression of the cell, so the first five rows are rendered as output
df.head()

In [None]:
## Check the variable type of each column: df.info() prints every column's
## dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Cast the listed columns to integer dtype (the column list is named once so
# the selection and the assignment cannot drift apart)
int_columns = ['Diabetes_012', 'GenHlth', 'Age', 'Education', 'Income']
df[int_columns] = df[int_columns].astype(int)

# Print the column summary again so the new dtypes can be checked
df.info()

In [None]:
# Split the frame by position: column 0 becomes the label, every remaining
# column becomes a feature
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Show both shapes, one per line
print(X.shape, y.shape, sep='\n')

In [None]:
# Identify discrete columns with more than 2 categories: any non-float64
# column that holds more than two distinct values. float64 columns are left
# untouched.
discrete_columns = [
    col for col in X.columns
    if X[col].nunique() > 2 and X[col].dtype != 'float64'
]

# Create dummy variables for these columns; drop_first=True keeps k-1
# indicator columns for a column with k levels.
df_with_dummies = pd.get_dummies(X, columns=discrete_columns, drop_first=True)

# The encoded frame replaces X as the feature matrix from this cell onwards.
X = df_with_dummies

# Only the last expression of a cell is rendered, so the preview is done once,
# here, rather than also on a discarded mid-cell line.
X.head()

In [None]:
## This is the baseline dummy classifier which classifies all as the majority class

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

# Splitting the data into train and test sets: 20% of the rows are held out,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# strategy='most_frequent' always predicts the most common training label.
dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_dummy_prediction = dummy_majority.predict(X_test)

## calculate metrics
# The baseline predicts a single class, so precision (and therefore F1) is
# undefined for any class it never predicts. zero_division=0 scores those
# classes as 0 -- the same value scikit-learn's default produces -- but
# without emitting an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_dummy_prediction)
precision = precision_score(y_test, y_dummy_prediction, average='weighted', zero_division=0)
recall = recall_score(y_test, y_dummy_prediction, average='weighted')
f1 = f1_score(y_test, y_dummy_prediction, average='weighted', zero_division=0)

print('accuracy : %s  precision : %s \n recall : %s  f1 : %s'
      % (accuracy, precision, recall, f1))

In [None]:
# Disabled experiment: grid search over the logistic-regression penalty type,
# scored with micro-averaged recall under 5-fold cross-validation.
# NOTE(review): LogisticRegression's default solver does not support "l1" or
# "elasticnet"; a compatible solver (e.g. 'saga', plus l1_ratio for
# elasticnet) would be needed before re-enabling this -- confirm against the
# installed scikit-learn version.
#
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import GridSearchCV

# paramSpace = {'penalty' : ["l1", "l2", "elasticnet", None]}
# logit = LogisticRegression()
# clf = GridSearchCV(logit, param_grid = paramSpace, scoring = 'recall_micro', cv=5)
# clf.fit(X, y)
# print(clf.best_params_) ## the best parameter
# print(clf.best_score_) ## the best recall

In [None]:
# Moving forward with l2 logistic regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt


# Splitting the data into train and test sets. Same arguments and seed as the
# baseline cell, so the split is identical and this cell can run on its own
# once X and y exist.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the logistic regression model.
# max_iter is raised from the default of 100: the features are not scaled
# here, and the default solver presumably stops on the iteration cap before
# converging on data like this (NOTE(review): confirm by checking for a
# ConvergenceWarning with the default setting).
clf = LogisticRegression(penalty='l2', max_iter=1000)


# cross_score = cross_val_score(clf, X, y, cv=5, scoring = 'recall_weighted')
# print(cross_score)
# print(cross_score.mean())

clf.fit(X_train, y_train)

# Making predictions
y_pred = clf.predict(X_test)
# Column index 1 of the per-class probability matrix; only the disabled ROC
# code below consumes it.
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Calculating metrics (weighted = per-class scores averaged by class support).
# zero_division=0 makes the score for a never-predicted class an explicit 0
# instead of a warning; the numeric result is the same as the default.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# NOTE(review): roc_curve accepts binary targets only; if y holds more than
# two classes the block below needs a one-vs-rest treatment before it can be
# re-enabled.
# # ROC Curve and AUC
# fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
# roc_auc = auc(fpr, tpr)

# # Plotting the ROC Curve
# plt.figure()
# plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver Operating Characteristic')
# plt.legend(loc="lower right")
# plt.show()

print('accuracy : %s  precision : %s \n recall : %s  f1 : %s'
      % (accuracy, precision, recall, f1))

In [None]:
## Logistic regression, the baseline model, performs poorly,
## so a k-nearest-neighbours classifier is tried next

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Build the 5-neighbour classifier and fit it on the training split
# (fit returns the estimator itself, so the two steps chain)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted labels, plus column index 1 of the per-class probability matrix
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)[:, 1]

# Accuracy, then the three support-weighted scores in the order
# precision, recall, f1
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Performing 5-fold cross-validation
# cv_scores = cross_val_score(knn, X, y, cv=5)
# cv_scores

summary_template = 'accuracy : %s  precision : %s \n recall : %s  f1 : %s'
print(summary_template % (accuracy, precision, recall, f1))

In [None]:
## Per-class precision / recall / F1 table, printed to four decimal places.
## y_pred is a notebook global: the report describes whichever model cell
## assigned it most recently.
from sklearn.metrics import classification_report

report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)

In [None]:
## we can see that both logistic regression and KNN perform very poorly, let's try more models

## Decision trees
from sklearn import tree

# random_state is fixed so the fitted tree -- and every metric below -- is the
# same on each run; scikit-learn shuffles candidate features at every split,
# so an unseeded tree can differ between runs. 42 matches the seed used for
# the train/test split.
tree_clf = tree.DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

# Making predictions: labels, plus column index 1 of the per-class
# probability matrix
y_pred = tree_clf.predict(X_test)
y_pred_proba = tree_clf.predict_proba(X_test)[:, 1]

# Calculating metrics (weighted = per-class scores averaged by class support)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Performing 5-fold cross-validation
# cv_scores = cross_val_score(tree_clf, X, y, cv=5)
# cv_scores

print('accuracy : %s  precision : %s \n recall : %s  f1 : %s'
      % (accuracy, precision, recall, f1))

In [15]:
## Support vector classifier trained on standardized features
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply those same
# statistics to the test split. Re-fitting on X_test would scale the two
# splits differently and leak test-set statistics into the evaluation.
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

# probability=True is required for predict_proba to exist on an SVC. It adds
# an internal cross-validation at fit time, so training is noticeably slower;
# drop it together with y_pred_proba if the probabilities are not needed.
# random_state seeds that internal procedure so the probabilities are
# repeatable (42 matches the seed used for the train/test split).
svc_clf = SVC(probability=True, random_state=42)
svc_clf.fit(X_train_scaled, y_train)

# Making predictions: labels, plus column index 1 of the per-class
# probability matrix
y_pred = svc_clf.predict(X_test_scaled)
y_pred_proba = svc_clf.predict_proba(X_test_scaled)[:, 1]

# Calculating metrics (weighted = per-class scores averaged by class support)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Performing 5-fold cross-validation
# cv_scores = cross_val_score(svc_clf, X, y, cv=5)
# cv_scores

print('accuracy : %s  precision : %s \n recall : %s  f1 : %s'
      % (accuracy, precision, recall, f1))