In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [6]:
data = pd.read_csv('diabetes.csv')

In [7]:
data.shape


(768, 9)

In [8]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# check if any null value is present 
data.isnull().values.any()

False

In [10]:
#check the correlation
data.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [12]:
data['Outcome']

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [17]:
DTrue_count = len(data.loc[data['Outcome'] == True])
DFalse_count = len(data.loc[data['Outcome'] == False])

In [22]:
# diabetic count
(DTrue_count,DFalse_count)


(268, 500)

## Splitting the dataset as Train and Test dataset

In [24]:
from sklearn.model_selection import train_test_split
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness','Insulin' ,'BMI' ,
                   'DiabetesPedigreeFunction' ,'Age' ,'Outcome']
predicted_class = ['Outcome']

In [25]:
X = data[feature_columns].values
Y = data[predicted_class].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 10)

In [26]:
data.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [27]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [29]:
len(data.loc[data['Pregnancies'] == 0])

111

In [31]:
len(data.loc[data['Age'] == 0])

0

In [32]:
len(data.loc[data['Insulin'] == 0])

374

In [36]:
# filling the missing values
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values = 0, strategy = "mean")

X_train = fill_values.fit_transform(X_train)
X_test = fill_values.fit_transform(X_test)

In [37]:
## using randomforestclassifier
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(random_state = 10)

random_forest_model.fit(X_train, Y_train.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=10, verbose=0,
                       warm_start=False)

In [40]:
predict_train_value = random_forest_model.predict(X_test)

from sklearn import metrics

print("Accuracy = {0: .3f}".format(metrics.accuracy_score(Y_test, predict_train_value)))

Accuracy =  0.758


In [41]:
## Hyper parameter
params= {
    "learning_rate" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth" : [3,4,5,6,8,10,12,15],
    "min_child_weight" : [1,3,5,7],
    "gamma" : [0.0 , 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree" : [0.3 ,0.4, 0.5, 0.7]
}

In [43]:
## using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
import xgboost

In [44]:
classifier = xgboost.XGBClassifier()

In [45]:
random_search = RandomizedSearchCV(classifier, param_distributions = params, n_iter = 5, scoring='roc_auc', n_jobs = -1, cv=5, verbose = 3)

In [46]:
def timer(start_time = None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3000)
        tmin, tsec = divmod(temp_sec, 60)
        print('time taken: %i hours %i minutes and %i seconds' %(thour, tmin, round(tsec, 2)))
        

In [50]:
from datetime import datetime

start_time = timer(None)
random_search.fit(X_train, Y_train.ravel())
timer(start_time)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


time taken: 0 hours 0 minutes and 25 seconds


[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:   25.2s remaining:    7.9s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   25.2s finished


In [52]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [55]:
classifier = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=None, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [59]:
classifier.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=None, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [60]:
y_predicted = classifier.predict(X_test)

In [61]:
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(Y_test, y_predicted)
score = accuracy_score(Y_test, y_predicted)
print(cm)
print(score)

[[123  21]
 [ 40  47]]
0.7359307359307359


In [62]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(classifier, X_train, Y_train.ravel(), cv=10)
score

array([0.75925926, 0.7962963 , 0.77777778, 0.75925926, 0.68518519,
       0.72222222, 0.88888889, 0.71698113, 0.81132075, 0.83018868])

In [63]:
score.mean()

0.7747379454926624