In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.filterwarnings('ignore')
np.random.seed(707)

In [2]:
data = pd.read_csv('./diabetes.csv');

In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.drop(['SkinThickness','Insulin','DiabetesPedigreeFunction'],axis=1,inplace=True);

In [5]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,Age,Outcome
0,6,148,72,33.6,50,1
1,1,85,66,26.6,31,0
2,8,183,64,23.3,32,1
3,1,89,66,28.1,21,0
4,0,137,40,43.1,33,1


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pregnancies    768 non-null    int64  
 1   Glucose        768 non-null    int64  
 2   BloodPressure  768 non-null    int64  
 3   BMI            768 non-null    float64
 4   Age            768 non-null    int64  
 5   Outcome        768 non-null    int64  
dtypes: float64(1), int64(5)
memory usage: 36.1 KB


Missing Values are represented as zeroes in the datasets

In [7]:
print('The number of missing values in Glucose are : ',data[data['Glucose'] == 0].count()[1])
print('The number of missing values in BloodPressure are : ',data[data['BloodPressure'] == 0].count()[1])
print('The number of missing values in BMI are : ',data[data['BMI'] == 0].count()[1])
# print('The number of missing values in DiabetesPedigreeFunction are : ',data[data['DiabetesPedigreeFunction'] == 0].count()[1])
print('The number of missing values in Age are : ',data[data['Age'] == 0].count()[1])

The number of missing values in Glucose are :  5
The number of missing values in BloodPressure are :  35
The number of missing values in BMI are :  11
The number of missing values in Age are :  0


In [8]:
data[['Glucose','BloodPressure','BMI']] = data[['Glucose','BloodPressure','BMI']].replace(to_replace=0,value=np.nan)

In [9]:
data['Glucose'] = data['Glucose'].replace(to_replace=np.nan,value=data['Glucose'].mean())
data['BloodPressure'] = data['BloodPressure'].replace(to_replace=np.nan,value=data['BloodPressure'].mean())
data['BMI'] = data['BMI'].replace(to_replace=np.nan,value=data['BMI'].mean())

# Feature Engineering

In [10]:
data.isna().sum()

Pregnancies      0
Glucose          0
BloodPressure    0
BMI              0
Age              0
Outcome          0
dtype: int64

In [11]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [12]:
data.to_csv('diabete_new.csv') 

In [58]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

array([ 0.63994726,  0.86510807, -0.03351824,  0.16629174,  1.4259954 ])

In [14]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
x_train =std.fit_transform(X)
x_test = std.fit_transform(x_test)

# Machine Learning

## KNN

In [41]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

params={'n_neighbors':range(1,21,2),
        'weights':('uniform','distance'),
        'metric':('euclidean', 'manhattan', 'minkowski')}

knn = KNeighborsClassifier()
cv = KFold(n_splits=10)
grid_search = GridSearchCV(knn,params,n_jobs=-1,cv=cv, scoring='accuracy',error_score=0)
grid_result=grid_search.fit(X,y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.779956 using {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
0.695335 (0.036299) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.695335 (0.036299) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.738243 (0.056384) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.734313 (0.055231) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.747403 (0.059312) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.747386 (0.060254) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.755195 (0.058712) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.757792 (0.068644) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.765619 (0.050683) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.766866 (0.059084) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.779938

# Support Vector Machine

In [16]:
# example of grid searching key hyperparametres for SVC

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# define model and parameters
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
# define grid search
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.759159 using {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.735629 (0.048174) with: {'C': 50, 'gamma': 'scale', 'kernel': 'poly'}
0.743438 (0.065693) with: {'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}
0.660116 (0.065244) with: {'C': 50, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.734330 (0.044571) with: {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
0.759074 (0.068315) with: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
0.654956 (0.056018) with: {'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.736928 (0.049829) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'poly'}
0.753930 (0.049044) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
0.686193 (0.077153) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.730400 (0.039899) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}
0.752632 (0.060025) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
0.759159 (0.044855) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.671822 (0.056600) with: {'C': 0.01, 'gamma': 'sca

## Bagging Classifier

In [17]:
# example of grid searching key hyperparameters for BaggingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

# define models and parameters
model = BaggingClassifier()
n_estimators = [10, 100, 1000]
# define grid search
grid = dict(n_estimators=n_estimators)
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.762953 using {'n_estimators': 1000}
0.727768 (0.055464) with: {'n_estimators': 10}
0.760390 (0.062977) with: {'n_estimators': 100}
0.762953 (0.065402) with: {'n_estimators': 1000}


## RandomForestClassifier

In [18]:
# example of grid searching key hyperparameters for RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# define models and parameters
model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.769515 using {'max_features': 'log2', 'n_estimators': 1000}
0.751213 (0.055078) with: {'max_features': 'sqrt', 'n_estimators': 10}
0.764200 (0.057934) with: {'max_features': 'sqrt', 'n_estimators': 100}
0.764303 (0.055007) with: {'max_features': 'sqrt', 'n_estimators': 1000}
0.738209 (0.052926) with: {'max_features': 'log2', 'n_estimators': 10}
0.762970 (0.068358) with: {'max_features': 'log2', 'n_estimators': 100}
0.769515 (0.056711) with: {'max_features': 'log2', 'n_estimators': 1000}


## Logistic Regression

In [19]:
# example of grid searching key hyperparametres for logistic regression
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.773445 using {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.773428 (0.048366) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.773428 (0.048366) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.773428 (0.048366) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.773428 (0.048366) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.773428 (0.048366) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.773428 (0.048366) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.773428 (0.048366) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.773428 (0.048366) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.773428 (0.048366) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.768233 (0.049543) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.768233 (0.049543) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.773445 (0.048303) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.760407 (0.050786) wit

## Gradient Boosting

In [20]:
# # example of grid searching key hyperparameters for GradientBoostingClassifier
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import GradientBoostingClassifier

# # define models and parameters
# model = GradientBoostingClassifier()
# n_estimators = [10, 100, 1000]
# learning_rate = [0.001, 0.01, 0.1]
# subsample = [0.5, 0.7, 1.0]
# max_depth = [3, 7, 9]

# # define grid search
# grid = dict(learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample, max_depth=max_depth)
# cv = KFold(n_splits=10)
# grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# grid_result = grid_search.fit(X, y)
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

## Best Model Results KNN (0.7799%)

KNN Best: 0.779956 using {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}

- With KFold
- Standard Scalar Enabled

In [23]:
# exporting the model as pkl
joblib.dump(grid_search,'model.pkl')
# exporting the columns
model_columns = list(X.columns)
joblib.dump(model_columns, 'model_columns.pkl')

AttributeError: 'numpy.ndarray' object has no attribute 'columns'