# Ada Boost Implementation

In [1]:
# Importing Necessary Libraries

import pandas as pd
import numpy as np
import pickle
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
from IPython.display import Markdown,display

In [2]:
# This is for designing purpose

def printmd(string):
    display(Markdown(string))
def printcl(string,color=None):
    colorstr="<span style='color':{}>{}</span>".format(color,string)
    display(Markdown(colorstr))

In [3]:
data=pd.read_csv('pima-indian_diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.shape

(768, 9)

In [5]:
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [6]:
cols=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']

In [7]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
# Replacing zeros with nan

for col in cols:
    data[col]=data[col].replace(0,np.nan)

In [9]:
# Checking the missing value:

data.isna().sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [10]:
# Imputing Missing values:

data['Pregnancies']=data['Pregnancies'].fillna(data['Pregnancies'].mode()[0])
data['Glucose']=data['Glucose'].fillna(data['Glucose'].mean())
data['BloodPressure']=data['BloodPressure'].fillna(data['BloodPressure'].mean())
data['SkinThickness']=data['SkinThickness'].fillna(data['SkinThickness'].mean())
data['Insulin']=data['Insulin'].fillna(data['Insulin'].mean())
data['BMI']=data['BMI'].fillna(data['BMI'].mean())

In [11]:
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [12]:
x=data.drop(columns='Outcome')
y=data['Outcome']

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.3,random_state=42)

In [16]:
train_x.shape

(537, 8)

In [17]:
test_x.shape

(231, 8)

In [18]:
train_y.shape

(537,)

In [19]:
test_y.shape

(231,)

In [20]:
# Fit DecisionTreeClassifier to training data

model=DecisionTreeClassifier()
model.fit(train_x,train_y)

DecisionTreeClassifier()

In [22]:
# Checking training accuracy

y_pred=model.predict(train_x)
prediction=[round(value) for value in y_pred]
accuracy=accuracy_score(train_y,y_pred)
printmd('****Accuracy****')
accuracy

****Accuracy****

1.0

In [25]:
# Checking Initial test accuracy

y_pred=model.predict(test_x)
prediction=[round(value) for value in y_pred]
accuracy=accuracy_score(test_y,y_pred)
printmd('****Decision Tree Accuracy****')
accuracy

****Decision Tree Accuracy****

0.6926406926406926

In [26]:
# Using Ada Boost Classifier

ada=AdaBoostClassifier(base_estimator=model)
ada.fit(train_x,train_y)
y_pred=ada.predict(test_x)
printmd('****Accuracy before tuning****')
print(accuracy_score(test_y,y_pred))

****Accuracy before tuning****

0.7142857142857143


In [30]:
# Hyperparameter tuning for Ada Boost using GridSearchCV

params={'n_estimators':[40,42,45,44,57], 'learning_rate':[0.20,0.34,0.42,0.55,0.56]}
grid_ada=GridSearchCV(AdaBoostClassifier(),param_grid=params)
grid_ada.fit(train_x,train_y)

GridSearchCV(estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.2, 0.34, 0.42, 0.55, 0.56],
                         'n_estimators': [40, 42, 45, 44, 57]})

In [31]:
grid_ada.best_estimator_

AdaBoostClassifier(learning_rate=0.42, n_estimators=42)

In [32]:
ada=AdaBoostClassifier(base_estimator=model,learning_rate=0.2345,n_estimators=44)
ada.fit(train_x,train_y)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),
                   learning_rate=0.2345, n_estimators=44)

In [34]:
y_pred=ada.predict(test_x)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0], dtype=int64)

In [36]:
printmd('****Accuracy Post Tuning****')
print(accuracy_score(test_y,y_pred))

****Accuracy Post Tuning****

0.696969696969697
