## 0 - General Imports

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## 1 - Load Dataset into a Pandas DataFrame

In [5]:
df = pd.read_csv('diabetes.csv',header=0,sep=',')
df.columns=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## 2 - Get Features into a NumPy Array

In [7]:
X=df.loc[:, df.columns != 'Outcome'].values

## 3 - Get Class Labels into a NumPy Array

In [8]:
y=df['Outcome'].values

## 4 - Splitting data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = df.Outcome, random_state=0)

## 5 - Model Selection

### 5.1 - Perceptron 

### Combining transformers and estimators  in a pipeline

In [79]:
pipe_lc=Pipeline([('scl1',StandardScaler()),('clf1',
linear_model.Perceptron(random_state=42, max_iter=10, tol=0.001))])
lc_train=pipe_lc.fit(X_train, y_train)
lc_test=pipe_lc.fit(X_test, y_test)


### Fine Tuning Perceptron via Grid Search

In [22]:
param1_range=[0.00001, 0.0001, 0.001, 0.01, 1.0, 10.0]
param2_range=[5, 10, 15, 20, 25, 30, 40, 50, 60, 100]
param_grid = [{'clf1__max_iter': param2_range, 'clf1__tol':param1_range}]
gd=GridSearchCV(estimator=pipe_lc, param_grid=param_grid, cv=10,n_jobs=-1)
gd.fit(X_train, y_train)
print(gd.best_score_)
print(gd.best_params_)

0.6979166666666666
{'clf1__max_iter': 15, 'clf1__tol': 1e-05}




### Estimating model performance


In [83]:
y_pred_test=pipe_lc.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_test))
cm1 = confusion_matrix(y_test, y_pred_test)
print(cm1)

Accuracy: 0.78
[[105  20]
 [ 22  45]]


### 5.2 - Adaline 

###  Combining transformers and estimators  in a pipeline


In [84]:
pipe_lcA=Pipeline([('scl2',StandardScaler()),('clf2',
linear_model.SGDClassifier(max_iter=10, tol=1e-2, loss='perceptron', random_state=42))])
lc_train=pipe_lcA.fit(X_train, y_train)
lc_test=pipe_lcA.fit(X_test, y_test)



###  Fine Tuning Adaline via Grid Search

In [33]:
param_gridA = [{'clf2__max_iter': param2_range, 'clf2__tol':param1_range}]
gda=GridSearchCV(estimator=pipe_lcA, param_grid=param_gridA, cv=10,n_jobs=-1)
gda.fit(X_train, y_train)
print(gda.best_score_)
print(gda.best_params_)

0.7100694444444444
{'clf2__max_iter': 60, 'clf2__tol': 0.01}





### Estimating model performance


In [89]:
print('Test Accuracy: %.3f' % pipe_lcA.score(X_test, y_test))
y_pred_test=pipe_lcA.predict(X_test)
print('SKlearn Accuracy: %.2f' % accuracy_score(y_test, y_pred_test))
cm2 = confusion_matrix(y_test, y_pred_test)
print(cm2)

Test Accuracy: 0.760
SKlearn Accuracy: 0.76
[[94 31]
 [15 52]]


### 5.3 kNN 

###  Combining transformers and estimators  in a pipeline


In [91]:
pipe_knn=Pipeline([('scl3',StandardScaler()),('clf3',KNeighborsClassifier(n_neighbors=3))])
knn_train=pipe_knn.fit(X_train, y_train)
knn_test=pipe_knn.fit(X_test, y_test)


### Fine Tuning kNN via Grid Search

In [52]:
param_range=np.arange(1,15)
param_gridKnn = [{'clf3__n_neighbors': param_range}]
gd_knn=GridSearchCV(estimator=pipe_knn, param_grid=param_gridKnn, cv=10, n_jobs=-1)
gd_knn.fit(X_train, y_train)
print(gd_knn.best_score_)
print(gd_knn.best_params_)

0.7378472222222222
{'clf3__n_neighbors': 13}




### Estimating model performance

In [92]:
y_pred_knn_test=pipe_knn.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_knn_test))
cm3 = confusion_matrix(y_test, y_pred_knn_test)
print(cm3)
print(classification_report(y_test,y_pred_knn_test))

Accuracy: 0.86
[[115  10]
 [ 16  51]]
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       125
           1       0.84      0.76      0.80        67

   micro avg       0.86      0.86      0.86       192
   macro avg       0.86      0.84      0.85       192
weighted avg       0.86      0.86      0.86       192



### 5.4 - Logiostic Regerssion

### Combining transformers and estimators in a pipeline

In [93]:
pipe_lr=Pipeline([('scl4',StandardScaler()),('clf4',LogisticRegression())])
lr_train=pipe_lr.fit(X_train, y_train)
lr_test=pipe_lr.fit(X_test, y_test)




### Fine Tuning Logistic Regression via Grid Search

In [64]:
'''param_range = list(np.arange(1, 10))
param_gridlr = [
    {'clf4': param_range, 'clf4__penalty': ['l1'], 'clf4__solver' : ['liblinear'], 'clf4__multi_class' : ['ovr']},
    {'clf4': param_range, 'clf4__penalty': ['l2'], 'clf4__solver' : ['liblinear', 'newton-cg', 'lbfgs'], 'clf4__multi_class' : ['ovr']}
]'''
grid_values = {'clf4__penalty': ['l1', 'l2'],'clf4__C':[0.001,.009,0.01,.09,1,5,10,25]}
gd_lr=GridSearchCV(estimator=pipe_lr, param_grid=grid_values, cv=10, n_jobs=-1)
gd_lr.fit(X_train, y_train)
print(gd_lr.best_score_)
print(gd_lr.best_params_)

0.765625
{'clf4__C': 0.09, 'clf4__penalty': 'l1'}




### Estimating model performance

In [94]:
y_pred_lr_test=pipe_lr.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_lr_test))
cm4 = confusion_matrix(y_test, y_pred_lr_test)
print(cm4)
print(classification_report(y_test,y_pred_lr_test))

Accuracy: 0.81
[[113  12]
 [ 24  43]]
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       125
           1       0.78      0.64      0.70        67

   micro avg       0.81      0.81      0.81       192
   macro avg       0.80      0.77      0.78       192
weighted avg       0.81      0.81      0.81       192



### 5.5 -  SVM

### Combining transformers and estimators in a pipeline

In [95]:
pipe_SVM=Pipeline([('scl5',StandardScaler()),('clf5',SVC())])
svm_train=pipe_SVM.fit(X_train, y_train)
svm_test=pipe_SVM.fit(X_test, y_test)


### Fine Tuning SVM via Grid Search

In [74]:
param_gridsvm = {'clf5__C': [0.1, 1, 10, 100, 1000],  
              'clf5__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'clf5__kernel': ['rbf']}  
  
gd_svm=GridSearchCV(estimator=pipe_SVM, param_grid=param_gridsvm, cv=10, n_jobs=-1)
gd_svm.fit(X_train, y_train)
print(gd_svm.best_score_)
print(gd_svm.best_params_)

0.7743055555555556
{'clf5__C': 1, 'clf5__gamma': 0.01, 'clf5__kernel': 'rbf'}




### Estimating model performance

In [98]:
y_pred_svm_test=pipe_SVM.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_svm_test))
cm4 = confusion_matrix(y_test, y_pred_svm_test)
print(cm4)
print(classification_report(y_test,y_pred_svm_test))

Accuracy: 0.89
[[121   4]
 [ 17  50]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       125
           1       0.93      0.75      0.83        67

   micro avg       0.89      0.89      0.89       192
   macro avg       0.90      0.86      0.87       192
weighted avg       0.89      0.89      0.89       192



## 6 - Algorithm selection with nested cross-validation


In [77]:
scores_P=cross_val_score(gd, X_train, y_train, scoring='accuracy', cv=2)
print('Perceptron accuracy: %.3f +/- %.3f' % (np.mean(scores_P), np.std(scores_P)))

scores_GD=cross_val_score(gda, X_train, y_train, scoring='accuracy', cv=2)
print('Adaline accuracy: %.3f +/- %.3f' % (np.mean(scores_GD),np.std(scores_GD)))

scores_KNN=cross_val_score(gd_knn, X_train, y_train, scoring='accuracy', cv=2)
print('KNN accuracy: %.3f +/- %.3f' % (np.mean(scores_KNN), np.std(scores_KNN)))

scores_LR=cross_val_score(gd_lr, X_train, y_train, scoring='accuracy', cv=2)
print('LR accuracy: %.3f +/- %.3f' % (np.mean(scores_LR), np.std(scores_LR)))

scores_SVM=cross_val_score(gd_svm, X_train, y_train, scoring='accuracy', cv=2)
print('SVM accuracy: %.3f +/- %.3f' % (np.mean(scores_SVM), np.std(scores_SVM)))



Perceptron accuracy: 0.677 +/- 0.037




Adaline accuracy: 0.694 +/- 0.015




KNN accuracy: 0.715 +/- 0.008




LR accuracy: 0.752 +/- 0.008




SVM accuracy: 0.743 +/- 0.003




## 7 -  Fitting the selected algorithm



In [100]:
lr1 = LogisticRegression()
lr1.fit(X_train,y_train)
y_pred_lr1 = lr1.predict(X_test)

Test Accuracy: 0.776
[[110  15]
 [ 28  39]]
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       125
           1       0.72      0.58      0.64        67

   micro avg       0.78      0.78      0.78       192
   macro avg       0.76      0.73      0.74       192
weighted avg       0.77      0.78      0.77       192





## 8 - Estimating final model performance

In [101]:
print('Test Accuracy: %.3f' % lr1.score(X_test, y_test))
print(confusion_matrix(y_test,y_pred_lr1))
print(classification_report(y_test,y_pred_lr1))

Test Accuracy: 0.776
[[110  15]
 [ 28  39]]
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       125
           1       0.72      0.58      0.64        67

   micro avg       0.78      0.78      0.78       192
   macro avg       0.76      0.73      0.74       192
weighted avg       0.77      0.78      0.77       192

