In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import warnings
from csv import writer
warnings.filterwarnings('ignore')
np.random.seed(707)

In [2]:
data = pd.read_csv('./diabetes.csv');

In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.drop(['SkinThickness','Insulin','DiabetesPedigreeFunction'],axis=1,inplace=True);

In [5]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,Age,Outcome
0,6,148,72,33.6,50,1
1,1,85,66,26.6,31,0
2,8,183,64,23.3,32,1
3,1,89,66,28.1,21,0
4,0,137,40,43.1,33,1


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pregnancies    768 non-null    int64  
 1   Glucose        768 non-null    int64  
 2   BloodPressure  768 non-null    int64  
 3   BMI            768 non-null    float64
 4   Age            768 non-null    int64  
 5   Outcome        768 non-null    int64  
dtypes: float64(1), int64(5)
memory usage: 36.1 KB


Missing Values are represented as zeroes in the datasets

In [7]:
print('The number of missing values in Glucose are : ',data[data['Glucose'] == 0].count()[1])
print('The number of missing values in BloodPressure are : ',data[data['BloodPressure'] == 0].count()[1])
print('The number of missing values in BMI are : ',data[data['BMI'] == 0].count()[1])
# print('The number of missing values in DiabetesPedigreeFunction are : ',data[data['DiabetesPedigreeFunction'] == 0].count()[1])
print('The number of missing values in Age are : ',data[data['Age'] == 0].count()[1])

The number of missing values in Glucose are :  5
The number of missing values in BloodPressure are :  35
The number of missing values in BMI are :  11
The number of missing values in Age are :  0


In [8]:
data[['Glucose','BloodPressure','BMI']] = data[['Glucose','BloodPressure','BMI']].replace(to_replace=0,value=np.nan)

In [9]:
data['Glucose'] = data['Glucose'].replace(to_replace=np.nan,value=data['Glucose'].mean())
data['BloodPressure'] = data['BloodPressure'].replace(to_replace=np.nan,value=data['BloodPressure'].mean())
data['BMI'] = data['BMI'].replace(to_replace=np.nan,value=data['BMI'].mean())

# Feature Engineering

In [10]:
data.isna().sum()

Pregnancies      0
Glucose          0
BloodPressure    0
BMI              0
Age              0
Outcome          0
dtype: int64

In [11]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [12]:
data.to_csv('diabetes_new.csv') 

In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [14]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.fit_transform(x_test)

# Machine Learning

## KNN

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

params={'n_neighbors':range(1,21,2),
        'weights':('uniform','distance'),
        'metric':('euclidean', 'manhattan', 'minkowski')}

knn = KNeighborsClassifier()
cv = KFold(n_splits=10)
grid_search = GridSearchCV(knn,params,n_jobs=-1,cv=cv, scoring='accuracy',error_score=0)
grid_result=grid_search.fit(x_train,y_train)


# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.773585 using {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}
0.690666 (0.062658) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.690666 (0.062658) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.715151 (0.037190) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.708620 (0.035985) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.745981 (0.045566) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.752512 (0.045877) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.760656 (0.056954) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.765600 (0.050707) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.757298 (0.048565) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.765521 (0.052135) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.762322

# Support Vector Machine

In [16]:
# example of grid searching key hyperparametres for SVC

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# define model and parameters
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
# define grid search
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.763855 using {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.736198 (0.029436) with: {'C': 50, 'gamma': 'scale', 'kernel': 'poly'}
0.734770 (0.064319) with: {'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}
0.671047 (0.036299) with: {'C': 50, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.732998 (0.034266) with: {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
0.734691 (0.044412) with: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
0.682364 (0.040751) with: {'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.734558 (0.034462) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'poly'}
0.749260 (0.033593) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
0.680777 (0.040644) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.731280 (0.038926) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}
0.750767 (0.023546) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
0.763855 (0.037039) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.665997 (0.045973) with: {'C': 0.01, 'gamma': 'sca

## Bagging Classifier

In [17]:
# example of grid searching key hyperparameters for BaggingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

# define models and parameters
model = BaggingClassifier()
n_estimators = [10, 100, 1000]
# define grid search
grid = dict(n_estimators=n_estimators)
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.752353 using {'n_estimators': 1000}
0.739424 (0.038258) with: {'n_estimators': 10}
0.749233 (0.055875) with: {'n_estimators': 100}
0.752353 (0.047963) with: {'n_estimators': 1000}


## RandomForestClassifier

In [18]:
# example of grid searching key hyperparameters for RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# define models and parameters
model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.765389 using {'max_features': 'log2', 'n_estimators': 100}
0.744315 (0.045946) with: {'max_features': 'sqrt', 'n_estimators': 10}
0.765362 (0.057728) with: {'max_features': 'sqrt', 'n_estimators': 100}
0.762136 (0.058358) with: {'max_features': 'sqrt', 'n_estimators': 1000}
0.752433 (0.054043) with: {'max_features': 'log2', 'n_estimators': 10}
0.765389 (0.060626) with: {'max_features': 'log2', 'n_estimators': 100}
0.762189 (0.049000) with: {'max_features': 'log2', 'n_estimators': 1000}


## Logistic Regression

In [19]:
# example of grid searching key hyperparametres for logistic regression
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.768773 using {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.762269 (0.041481) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.762269 (0.041481) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.762269 (0.041481) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.762269 (0.041481) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.762269 (0.041481) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.762269 (0.041481) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.765521 (0.041546) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.765521 (0.041546) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.762269 (0.041481) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.768773 (0.041294) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.768773 (0.041294) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.763882 (0.043514) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.757298 (0.045752) wit

## Gradient Boosting

In [20]:
# example of grid searching key hyperparameters for GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# define models and parameters
model = GradientBoostingClassifier()
n_estimators = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
max_depth = [3, 7, 9]

# define grid search
grid = dict(learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample, max_depth=max_depth)
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.766917 using {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.651025 (0.072141) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.651025 (0.072141) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.651025 (0.072141) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.651025 (0.072141) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.651025 (0.072141) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.651025 (0.072141) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.759057 (0.064466) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.5}
0.755109 (0.065812) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.7}
0.755144 (0.060017) with: {'learning_rate': 0.001, 'ma

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(max_features='log2',n_estimators=1000)
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)

print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81        99
           1       0.66      0.64      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154

0.7532467532467533


In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

knn = KNeighborsClassifier(metric='manhattan',n_neighbors=15,weights='distance')
knn.fit(x_train,y_train)

y_pred = knn.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82        99
           1       0.69      0.64      0.66        55

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.76      0.77      0.76       154



In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

lr = LogisticRegression(C=100,penalty='l2',solver='newton-cg')
lr.fit(x_train,y_train)
y_pred = lr.predict(x_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82        99
           1       0.69      0.60      0.64        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154



In [24]:
# exporting the model as pkl
joblib.dump(knn,'model.pkl')
print('Model was dumped Successfully ... ')
# exporting the columns
model_columns = list(X.columns)
joblib.dump(model_columns, 'model_columns.pkl')
print('Columns was dumped successfully ... ')

Model was dumped Successfully ... 
Columns was dumped successfully ... 


In [30]:
model_columns

['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'Age']

In [26]:
x_test[0]

array([ 0.49124165, -0.72667791, -1.24685963,  0.17052869,  0.66661154])

In [29]:
def write_to_csv(json_input):
    with open('diabetes_new.csv', 'a') as f_object:
    
        writer_object = writer(f_object)
    
        writer_object.writerow(json_input)
    
        f_object.close()
        
write_to_csv(x_test[0])