# DATA UNDERSTANDING 

In [3]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('diabetes.csv')

In [5]:
df.shape

(768, 9)

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [9]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [10]:
df.nunique()

Pregnancies                  17
Glucose                     136
BloodPressure                47
SkinThickness                51
Insulin                     186
BMI                         248
DiabetesPedigreeFunction    517
Age                          52
Outcome                       2
dtype: int64

In [13]:
df.duplicated().sum()

0

# NOTE:-
1. THERE IS NO NULL VALUES
2. ALL FEATURES HAVE NUMERICAL VALUES
3. Now we can split the dataframe into train and test data
- FURTHER WE CAN PROCEED FOR MODELLING

# DATA PREPARATION

In [17]:
from sklearn.model_selection import train_test_split

In [19]:
x = df.drop(columns=['Outcome'])
y = df.Outcome

In [27]:
X_train,X_test,y_train,y_test = train_test_split(x,y,random_state=34)

# MODELLING

In [28]:
#importing algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [30]:
models = [LogisticRegression(),DecisionTreeClassifier(),SVC(),RandomForestClassifier()]

for model in models:
    if((model == LogisticRegression(max_iter=8000)) or ( model == SVC())):
        scaler = StandardScaler()
        scaler.fit_transform(X_train)
        model.fit(X_train,y_train)
        print(f'Accuracy Score of {model} is : {model.score(X_test,y_test)}')
    else:
        model.fit(X_train,y_train)
        print(f'Accuracy Score of {model} is : {model.score(X_test,y_test)}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score of LogisticRegression() is : 0.8125
Accuracy Score of DecisionTreeClassifier() is : 0.6614583333333334
Accuracy Score of SVC() is : 0.8020833333333334
Accuracy Score of RandomForestClassifier() is : 0.7604166666666666


In [31]:
# importing ensemble methods 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [34]:
models1 = [BaggingClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=9000,oob_score=True,random_state=32,max_features=0.99999999),GradientBoostingClassifier(n_estimators=9000,learning_rate=7500,max_depth=3),AdaBoostClassifier(n_estimators=10000,learning_rate=7500,base_estimator=LogisticRegression(max_iter=7500))]

for model in models1:
    model.fit(X_train,y_train)
    print(f'Accuracy Score of {model} : {model.score(X_test,y_test)}')
    print('*****************************')



Accuracy Score of BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  max_features=0.99999999, n_estimators=9000, oob_score=True,
                  random_state=32) : 0.7447916666666666
*****************************
Accuracy Score of GradientBoostingClassifier(learning_rate=7500, n_estimators=9000) : 0.6614583333333334
*****************************
Accuracy Score of AdaBoostClassifier(base_estimator=LogisticRegression(max_iter=7500),
                   learning_rate=7500, n_estimators=10000) : 0.7916666666666666
*****************************


  sample_weight *= np.exp(
  model.fit(X_train,y_train)


In [39]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train,y_train)

In [40]:
model.score(X_test,y_test)

0.8177083333333334

In [41]:
from sklearn.model_selection import GridSearchCV

param_grid = { 'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000] 
             }

classifier = GridSearchCV(model,param_grid,cv=5)
classifier.fit(x,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [42]:
classifier.best_score_

0.7761055937526524

In [43]:
model = LogisticRegression(max_iter=2000000)
model.fit(X_train,y_train)

In [44]:
model.score(X_test,y_test)

0.8177083333333334

# SAVING MODEL

In [45]:
import pickle

In [46]:
with open('diabetes','wb') as f:
    pickle.dump(model,f)
    f.close()