In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import boxcox

from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer 

from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier

In [2]:
# Dataset downloaded from: https://www.kaggle.com/uciml/pima-indians-diabetes-database?select=diabetes.csv
data = pd.read_csv('diabetes.csv')

### Data cleaning

In [3]:
# No need to full columns
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
print((data['Glucose'] == 0).sum())
print((data['BloodPressure'] == 0).sum())
print((data['SkinThickness'] == 0).sum())
print((data['Insulin'] == 0).sum())
print((data['SkinThickness'] == 0).sum())
print((data['BMI'] == 0).sum())

5
35
227
374
227
11


In [5]:
data['Glucose'] = data['Glucose'].replace(0, data['Glucose'].median())
data['BloodPressure'] = data['BloodPressure'].replace(0, data['BloodPressure'].median())
data['SkinThickness'] = data['SkinThickness'].replace(0, data['SkinThickness'].median())
data['Insulin'] = data['Insulin'].replace(0, data['Insulin'].median())
data['SkinThickness'] = data['SkinThickness'].replace(0, data['SkinThickness'].median())
data['BMI'] = data['BMI'].replace(0, data['BMI'].median())

In [6]:
print((data == 0).sum())

Pregnancies                 111
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64


### Data describing

In [None]:
data.info()

In [None]:
data.describe()

### Data visualization

In [None]:
sns.heatmap(data.corr(), annot=True)

In [None]:
fig = data.hist(figsize=(10,10))

### 

### Data transform

In [7]:
# Get skewness 
data_cp = data
#data_cp.agg(['skew', 'kurtosis']).transpose()

In [None]:
column_trans = ColumnTransformer(
    [('Pregnancies_yj', PowerTransformer(method='yeo-johnson', standardize=True), ['Pregnancies']),
     ('Glucose_yj', PowerTransformer(method='yeo-johnson', standardize=True), ['Glucose']),
     ('BloodPressure_yj', PowerTransformer(method='yeo-johnson', standardize=True), ['BloodPressure']),
     ('SkinThickness_yj', PowerTransformer(method='yeo-johnson', standardize=True), ['SkinThickness']),
     ('Insulin_yj', PowerTransformer(method='yeo-johnson', standardize=True), ['Insulin']),
     ('BMI_yj', PowerTransformer(method='yeo-johnson', standardize=True), ['BMI']),
     ('DiabetesPedigreeFunction_yj', PowerTransformer(method='yeo-johnson', standardize=True), ['DiabetesPedigreeFunction']),
     ('Age_yj', PowerTransformer(method='yeo-johnson', standardize=True), ['Age']),
     ('Outcome_yj', PowerTransformer(method='yeo-johnson', standardize=True), ['Outcome']),
    ])

transformed_yeojohnson = column_trans.fit_transform(data_cp)
new_cols = ['Pregnancies_yj', 'Glucose_yj', 'BloodPressure_yj', 'SkinThickness_yj', 'Insulin_yj', 'BMI_yj',
            'DiabetesPedigreeFunction_yj','Age_yj', 'Outcome_yj']

pima_yj = pd.DataFrame(transformed_yeojohnson, columns=new_cols)
pd.concat([ pima_yj], axis = 1)
pima_yj.head()


In [None]:
#pima_yj.describe()

In [None]:
data_cp.agg(['skew', 'kurtosis']).transpose()

In [None]:
#fig = pima_yj.hist(figsize=(10,10))

### Data standarizarion

In [8]:
data_tr = data_cp
scaler = StandardScaler()
scaler.fit(data_tr.drop('Outcome', axis=1))
scaler_features = scaler.transform(data_tr.drop('Outcome', axis=1))
data_feat = pd.DataFrame(scaler_features, columns = data_tr.columns[:-1])

In [9]:
data_feat['Outcome'] = data_tr['Outcome'].astype(int)
data_tr = data_feat.copy()
data_tr.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639947,0.866045,-0.03199,0.831114,-0.608201,0.16724,0.468492,1.425995,1
1,-0.844885,-1.205066,-0.528319,0.180566,-0.608201,-0.851551,-0.365061,-0.190672,0
2,1.23388,2.016662,-0.693761,-0.469981,-0.608201,-1.331838,0.604397,-0.105584,1
3,-0.844885,-1.073567,-0.528319,-0.469981,-0.006185,-0.633239,-0.920763,-1.041549,0
4,-1.141852,0.504422,-2.679076,0.831114,0.695378,1.549885,5.484909,-0.020496,1


### Split data

In [10]:
X=data_tr.drop('Outcome',axis=1)
Y=data_tr['Outcome']
X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=.60, random_state=11)        
X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=.50, random_state=17)  

print(X_train.shape)
print(Y_train.shape)
print(X_val.shape)
print(Y_val.shape)
print(X_test.shape)
print(Y_test.shape)

(460, 8)
(460,)
(154, 8)
(154,)
(154, 8)
(154,)


### Compare classification models

In [None]:
models = []

models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))

In [None]:
seed = 8
results = []
names = []
scoring = 'accuracy'

for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

### Classification models 

In [None]:
#### Multi Layer Perceptron without Hyper-parameter Tuning

In [30]:
model = MLPClassifier(hidden_layer_sizes=(4,),activation="relu",random_state=1, solver='adam')

model.fit(X_train, Y_train)

predictions = model.predict(X_test)
print(classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.88      0.71      0.78        99
           1       0.61      0.82      0.70        55

    accuracy                           0.75       154
   macro avg       0.74      0.76      0.74       154
weighted avg       0.78      0.75      0.75       154





In [None]:
#### Multi Layer Perceptron using Grid Search

In [41]:
model = MLPClassifier(max_iter=100)

parameter_space = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

clf = GridSearchCV(model, parameter_space, n_jobs=-1, scoring='accuracy', cv=5)
clf.fit(X_train, Y_train)







GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=100), n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.0001, 0.05],
                         'hidden_layer_sizes': [(10, 30, 10), (20,)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']},
             scoring='accuracy')

In [42]:
# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 30, 10), 'learning_rate': 'constant', 'solver': 'adam'}
0.733 (+/-0.056) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 30, 10), 'learning_rate': 'constant', 'solver': 'sgd'}
0.750 (+/-0.046) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 30, 10), 'learning_rate': 'constant', 'solver': 'adam'}
0.720 (+/-0.069) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 30, 10), 'learning_rate': 'adaptive', 'solver': 'sgd'}
0.752 (+/-0.052) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 30, 10), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.739 (+/-0.074) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate': 'constant', 'solver': 'sgd'}
0.748 (+/-0.059) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'learning_rate': 'constant', 'solver': 'adam'}
0.737 

In [43]:
y_true, y_pred = Y_test , clf.predict(X_test)

from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Results on the test set:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86        99
           1       0.78      0.64      0.70        55

    accuracy                           0.81       154
   macro avg       0.80      0.77      0.78       154
weighted avg       0.80      0.81      0.80       154





#### Support Vector Classifier without Hyper-parameter Tuning

In [11]:
model = SVC()
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
print(classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.74      0.91      0.82        99
           1       0.73      0.44      0.55        55

    accuracy                           0.74       154
   macro avg       0.74      0.67      0.68       154
weighted avg       0.74      0.74      0.72       154



#### Support Vector Classifier using Grid Search 

In [12]:
param_grid = {'C': [0.001, 0.0001, 0.00001, 0.000001, 0.0000001, 0.1, 1],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001],
              'kernel': ['linear','rbf']}
 
grid = GridSearchCV(model, param_grid, refit = True, verbose = 3)
 
# fitting the model for grid search
grid.fit(X_train, Y_train)

# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)


grid_predictions = grid.predict(X_test)
 
# print classification report
print(classification_report(Y_test, grid_predictions))

Fitting 5 folds for each of 112 candidates, totalling 560 fits
[CV 1/5] END ...C=0.001, gamma=1, kernel=linear;, score=0.663 total time=   0.0s
[CV 2/5] END ...C=0.001, gamma=1, kernel=linear;, score=0.663 total time=   0.0s
[CV 3/5] END ...C=0.001, gamma=1, kernel=linear;, score=0.652 total time=   0.0s
[CV 4/5] END ...C=0.001, gamma=1, kernel=linear;, score=0.652 total time=   0.0s
[CV 5/5] END ...C=0.001, gamma=1, kernel=linear;, score=0.652 total time=   0.0s
[CV 1/5] END ......C=0.001, gamma=1, kernel=rbf;, score=0.663 total time=   0.0s
[CV 2/5] END ......C=0.001, gamma=1, kernel=rbf;, score=0.663 total time=   0.0s
[CV 3/5] END ......C=0.001, gamma=1, kernel=rbf;, score=0.652 total time=   0.0s
[CV 4/5] END ......C=0.001, gamma=1, kernel=rbf;, score=0.652 total time=   0.0s
[CV 5/5] END ......C=0.001, gamma=1, kernel=rbf;, score=0.652 total time=   0.0s
[CV 1/5] END .C=0.001, gamma=0.1, kernel=linear;, score=0.663 total time=   0.0s
[CV 2/5] END .C=0.001, gamma=0.1, kernel=linea

[CV 2/5] END ..C=0.0001, gamma=0.01, kernel=rbf;, score=0.663 total time=   0.0s
[CV 3/5] END ..C=0.0001, gamma=0.01, kernel=rbf;, score=0.652 total time=   0.0s
[CV 4/5] END ..C=0.0001, gamma=0.01, kernel=rbf;, score=0.652 total time=   0.0s
[CV 5/5] END ..C=0.0001, gamma=0.01, kernel=rbf;, score=0.652 total time=   0.0s
[CV 1/5] END C=0.0001, gamma=0.001, kernel=linear;, score=0.663 total time=   0.0s
[CV 2/5] END C=0.0001, gamma=0.001, kernel=linear;, score=0.663 total time=   0.0s
[CV 3/5] END C=0.0001, gamma=0.001, kernel=linear;, score=0.652 total time=   0.0s
[CV 4/5] END C=0.0001, gamma=0.001, kernel=linear;, score=0.652 total time=   0.0s
[CV 5/5] END C=0.0001, gamma=0.001, kernel=linear;, score=0.652 total time=   0.0s
[CV 1/5] END .C=0.0001, gamma=0.001, kernel=rbf;, score=0.663 total time=   0.0s
[CV 2/5] END .C=0.0001, gamma=0.001, kernel=rbf;, score=0.663 total time=   0.0s
[CV 3/5] END .C=0.0001, gamma=0.001, kernel=rbf;, score=0.652 total time=   0.0s
[CV 4/5] END .C=0.

[CV 1/5] END ..C=1e-05, gamma=1e-05, kernel=rbf;, score=0.663 total time=   0.0s
[CV 2/5] END ..C=1e-05, gamma=1e-05, kernel=rbf;, score=0.663 total time=   0.0s
[CV 3/5] END ..C=1e-05, gamma=1e-05, kernel=rbf;, score=0.652 total time=   0.0s
[CV 4/5] END ..C=1e-05, gamma=1e-05, kernel=rbf;, score=0.652 total time=   0.0s
[CV 5/5] END ..C=1e-05, gamma=1e-05, kernel=rbf;, score=0.652 total time=   0.0s
[CV 1/5] END C=1e-05, gamma=1e-06, kernel=linear;, score=0.663 total time=   0.0s
[CV 2/5] END C=1e-05, gamma=1e-06, kernel=linear;, score=0.663 total time=   0.0s
[CV 3/5] END C=1e-05, gamma=1e-06, kernel=linear;, score=0.652 total time=   0.0s
[CV 4/5] END C=1e-05, gamma=1e-06, kernel=linear;, score=0.652 total time=   0.0s
[CV 5/5] END C=1e-05, gamma=1e-06, kernel=linear;, score=0.652 total time=   0.0s
[CV 1/5] END ..C=1e-05, gamma=1e-06, kernel=rbf;, score=0.663 total time=   0.0s
[CV 2/5] END ..C=1e-05, gamma=1e-06, kernel=rbf;, score=0.663 total time=   0.0s
[CV 3/5] END ..C=1e-05,

[CV 2/5] END ......C=1e-07, gamma=1, kernel=rbf;, score=0.663 total time=   0.0s
[CV 3/5] END ......C=1e-07, gamma=1, kernel=rbf;, score=0.652 total time=   0.0s
[CV 4/5] END ......C=1e-07, gamma=1, kernel=rbf;, score=0.652 total time=   0.0s
[CV 5/5] END ......C=1e-07, gamma=1, kernel=rbf;, score=0.652 total time=   0.0s
[CV 1/5] END .C=1e-07, gamma=0.1, kernel=linear;, score=0.663 total time=   0.0s
[CV 2/5] END .C=1e-07, gamma=0.1, kernel=linear;, score=0.663 total time=   0.0s
[CV 3/5] END .C=1e-07, gamma=0.1, kernel=linear;, score=0.652 total time=   0.0s
[CV 4/5] END .C=1e-07, gamma=0.1, kernel=linear;, score=0.652 total time=   0.0s
[CV 5/5] END .C=1e-07, gamma=0.1, kernel=linear;, score=0.652 total time=   0.0s
[CV 1/5] END ....C=1e-07, gamma=0.1, kernel=rbf;, score=0.663 total time=   0.0s
[CV 2/5] END ....C=1e-07, gamma=0.1, kernel=rbf;, score=0.663 total time=   0.0s
[CV 3/5] END ....C=1e-07, gamma=0.1, kernel=rbf;, score=0.652 total time=   0.0s
[CV 4/5] END ....C=1e-07, ga

[CV 2/5] END C=0.1, gamma=0.0001, kernel=linear;, score=0.750 total time=   0.0s
[CV 3/5] END C=0.1, gamma=0.0001, kernel=linear;, score=0.739 total time=   0.0s
[CV 4/5] END C=0.1, gamma=0.0001, kernel=linear;, score=0.772 total time=   0.0s
[CV 5/5] END C=0.1, gamma=0.0001, kernel=linear;, score=0.783 total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.663 total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.663 total time=   0.0s
[CV 3/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.652 total time=   0.0s
[CV 4/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.652 total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.652 total time=   0.0s
[CV 1/5] END .C=0.1, gamma=1e-05, kernel=linear;, score=0.761 total time=   0.0s
[CV 2/5] END .C=0.1, gamma=1e-05, kernel=linear;, score=0.750 total time=   0.0s
[CV 3/5] END .C=0.1, gamma=1e-05, kernel=linear;, score=0.739 total time=   0.0s
[CV 4/5] END .C=0.1, gamma=1

[CV 5/5] END ...C=1, gamma=1e-07, kernel=linear;, score=0.783 total time=   0.0s
[CV 1/5] END ......C=1, gamma=1e-07, kernel=rbf;, score=0.663 total time=   0.0s
[CV 2/5] END ......C=1, gamma=1e-07, kernel=rbf;, score=0.663 total time=   0.0s
[CV 3/5] END ......C=1, gamma=1e-07, kernel=rbf;, score=0.652 total time=   0.0s
[CV 4/5] END ......C=1, gamma=1e-07, kernel=rbf;, score=0.652 total time=   0.0s
[CV 5/5] END ......C=1, gamma=1e-07, kernel=rbf;, score=0.652 total time=   0.0s
{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
SVC(C=1, gamma=0.01)
              precision    recall  f1-score   support

           0       0.74      0.93      0.83        99
           1       0.77      0.42      0.54        55

    accuracy                           0.75       154
   macro avg       0.75      0.67      0.68       154
weighted avg       0.75      0.75      0.72       154



#### Logistic Regression without Hyper-parameter Tuning

In [15]:
model = LogisticRegression()
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
print(classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.76      0.93      0.84        99
           1       0.79      0.47      0.59        55

    accuracy                           0.77       154
   macro avg       0.77      0.70      0.71       154
weighted avg       0.77      0.77      0.75       154



In [None]:
#### Logistic regression using Grid Search 

In [20]:
model = LogisticRegression()
param_grid = {'C': [0.001, 0.0001, 0.00001, 0.000001, 0.0000001, 0.1, 1],
              'penalty': ['l1', 'l2']
             }

#grid = GridSearchCV(model, param_grid, refit = True, verbose = 3)
grid = GridSearchCV(model, param_grid) 
# fitting the model for grid search
print(grid.fit(X_train, Y_train))

# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)


grid_predictions = grid.predict(X_test)
 
# print classification report
print(classification_report(Y_test, grid_predictions))

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.0001, 1e-05, 1e-06, 1e-07, 0.1, 1],
                         'penalty': ['l1', 'l2']})
{'C': 0.1, 'penalty': 'l2'}
LogisticRegression(C=0.1)
              precision    recall  f1-score   support

           0       0.75      0.93      0.83        99
           1       0.78      0.45      0.57        55

    accuracy                           0.76       154
   macro avg       0.77      0.69      0.70       154
weighted avg       0.76      0.76      0.74       154



35 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/home/bryan/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/bryan/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/bryan/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.65