In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn import preprocessing,svm
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
data = pd.read_csv('diabetes.csv', header=None)
print(data.shape)
data.columns = ['Pregnancies','Glucose','Blood Pressure','Skin Thickness',
                  'Insulin', 'BMI','Diabetes Pedigree Index', 'Age', 'Outcome']
print(data.shape)
data.head()

(768, 9)
(768, 9)


Unnamed: 0,Pregnancies,Glucose,Blood Pressure,Skin Thickness,Insulin,BMI,Diabetes Pedigree Index,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
positive_counts = data['Outcome'].value_counts()
print(positive_counts)

0    500
1    268
Name: Outcome, dtype: int64


In [6]:
Features = np.array(data[['Pregnancies','Glucose','Blood Pressure','Skin Thickness',
                  'Insulin', 'BMI','Diabetes Pedigree Index', 'Age']])

In [7]:
y = np.asarray(data['Outcome'])
print(y.shape)

(768,)


# Preprocessing the data

In [6]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(Features).transform(Features)

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (614, 8) (614,)
Test set: (154, 8) (154,)


# LOGISTIC REGRESSION

#### This function implements logistic regression and can use different numerical optimizers to find parameters, including ‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’ solvers.

In [8]:
Labels = y.reshape(y.shape[0],)
Labels = np.array(Labels)
Features = np.array(Features)
print(Labels.shape)
nr.seed(123)
inside = ms.KFold(n_splits=5, shuffle = True)
nr.seed(321)
outside = ms.KFold(n_splits=5, shuffle = True)

(768,)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [18]:
nr.seed(3456)
## Define the dictionary for the grid search and the model object to search on
param_grid = {"C": [0.1, 1, 10, 100, 1000]}
## Define the logistic regression model
logistic_clf =LogisticRegression() 

## Perform the grid search over the parameters
lr_clf = ms.GridSearchCV(estimator = logistic_clf, param_grid = param_grid, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
lr_clf.fit(Features, Labels)
lr_c=lr_clf.best_estimator_.C
print(lr_clf.best_estimator_.C)

100


In [19]:
nr.seed(3456)
## Define the dictionary for the grid search and the model object to search on
param_grid = {"solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga' ],"max_iter":[10000]}
## Define the logistic regression model
logistic_clf =LogisticRegression() 

## Perform the grid search over the parameters
lr_clf = ms.GridSearchCV(estimator = logistic_clf, param_grid = param_grid,
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
lr_clf.fit(Features, Labels)
lr_solver=lr_clf.best_estimator_.solver
print(lr_clf.best_estimator_.solver)

newton-cg


In [20]:
nr.seed(1115)
logistic_mod = LogisticRegression(C = lr_c,solver = lr_solver,max_iter=10000) 
logistic_mod.fit(X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

# SUPPORT VECTOR MACHINES (SVM)

The mathematical function used for the transformation is known as the kernel function, and can be of different types, such as:

    1.Linear
    2.Polynomial
    3.Radial basis function (RBF)
    4.Sigmoid

In [22]:
nr.seed(3456)
## Define the dictionary for the grid search and the model object to search on
param_grid = {"kernel":['linear','rbf','sigmoid']}
## Define the SVM model
svc_clf = svm.SVC() 
## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator = svc_clf, param_grid = param_grid, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
clf.fit(Features, Labels)
svm_kernel=clf.best_estimator_.kernel
print(clf.best_estimator_.kernel)

linear


In [24]:
nr.seed(3456)
## Define the dictionary for the grid search and the model object to search on
param_grid = {"C": [1, 10, 100, 1000], "gamma":[1.0/50.0, 1.0/200.0, 1.0/500.0, 1.0/1000.0]}
## Define the SVM model
svc_clf = svm.SVC() 

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator = svc_clf, param_grid = param_grid, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
clf.fit(Features, Labels)
svm_c=clf.best_estimator_.C
svm_gamma=clf.best_estimator_.gamma
print(clf.best_estimator_.C)
print(clf.best_estimator_.gamma)

1
0.001


In [25]:
nr.seed(1115)
svm_mod = svm.SVC(C = svm_c,gamma = svm_gamma,
                  kernel = svm_kernel,
                  probability=True) 
svm_mod.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# RANDOM FOREST

In [26]:
param_grid = {"n_estimators":[1,10,100,500,1000]}
## Define the random forest model
nr.seed(3456)
rf_clf = RandomForestClassifier(class_weight = "balanced") # class_weight = {0:0.33, 1:0.67}) 

## Perform the grid search over the parameters
nr.seed(4455)
rf_clf = ms.GridSearchCV(estimator = rf_clf, param_grid = param_grid, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
rf_clf.fit(Features, Labels)
rf_n=rf_clf.best_estimator_.n_estimators
print(rf_clf.best_estimator_.n_estimators)

500


In [27]:
param_grid = {"min_samples_leaf":[3, 5, 10, 20], "criterion":['gini','entropy']}
## Define the random forest model
nr.seed(3456)
rf_clf = RandomForestClassifier() # class_weight = {0:0.33, 1:0.67}) 

## Perform the grid search over the parameters
nr.seed(4455)
rf_clf = ms.GridSearchCV(estimator = rf_clf, param_grid = param_grid, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
rf_clf.fit(Features, Labels)
rf_criterion=rf_clf.best_estimator_.criterion
rf_leaf=rf_clf.best_estimator_.min_samples_leaf
print(rf_clf.best_estimator_.criterion)
print(rf_clf.best_estimator_.min_samples_leaf)

entropy
20


In [29]:
rf_clf = RandomForestClassifier(n_estimators=rf_n,max_depth=4,criterion=rf_criterion,min_samples_leaf=rf_leaf)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
probabilities =logistic_mod.predict_proba(X_test)
scores = score_model(probabilities, 0.5)
print('Accuracy of logistic regression  %f' % sklm.accuracy_score(y_test, scores))
a=sklm.accuracy_score(y_test, scores)
probabilities1 = svm_mod.predict_proba(X_test)
scores1 = score_model(probabilities1, 0.5)
print('Accuracy of SVM  %f' % sklm.accuracy_score(y_test, scores1))
b=sklm.accuracy_score(y_test, scores1)
probabilities2 = rf_clf.predict_proba(X_test)
scores2 = score_model(probabilities2, 0.5)
print('Accuracy of Random Forest  %f' % sklm.accuracy_score(y_test, scores2))
c=sklm.accuracy_score(y_test, scores2)

Accuracy of logistic regression  0.805195
Accuracy of SVM  0.811688
Accuracy of Random Forest  0.779221


In [62]:
def print_metrics(labels, scores):
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])


    
print_metrics(y_test, scores)    

                 Confusion matrix
                 Score positive    Score negative
Actual positive        89                13
Actual negative        17                35

 
           Positive      Negative
Num case      102            52
Precision    0.84          0.73
Recall       0.87          0.67
F1           0.86          0.70


In [63]:
def print_metrics(labels, scores):
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])


    
print_metrics(y_test, scores1)    

                 Confusion matrix
                 Score positive    Score negative
Actual positive        91                11
Actual negative        18                34

 
           Positive      Negative
Num case      102            52
Precision    0.83          0.76
Recall       0.89          0.65
F1           0.86          0.70


In [61]:
def print_metrics(labels, scores):
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])


    
print_metrics(y_test, scores2)    

                 Confusion matrix
                 Score positive    Score negative
Actual positive        91                11
Actual negative        23                29

 
           Positive      Negative
Num case      102            52
Precision    0.80          0.72
Recall       0.89          0.56
F1           0.84          0.63


In [57]:
final=a*scores + b*scores1 + c*scores2
finalans=final/3
def score_model1(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:]])
finalscores = score_model1(finalans, 0.5)
print('Accuracy of Our Algorithm  %f' % sklm.accuracy_score(y_test, finalscores))

Accuracy of Our Algorithm  0.798701
