In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


from sklearn.model_selection import GridSearchCV
# Shared random seed; passed as random_state wherever the notebook needs reproducible randomness.
SEED = 888

In [50]:
# Load the liver-patient records from a CSV given by relative path.
data = pd.read_csv('indian_liver_patient.csv')

# Show column dtypes / non-null counts, then preview the first five rows.
data.info()
data.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
Age                           583 non-null int64
Gender                        583 non-null object
Total_Bilirubin               583 non-null float64
Direct_Bilirubin              583 non-null float64
Alkaline_Phosphotase          583 non-null int64
Alamine_Aminotransferase      583 non-null int64
Aspartate_Aminotransferase    583 non-null int64
Total_Protiens                583 non-null float64
Albumin                       583 non-null float64
Albumin_and_Globulin_Ratio    579 non-null float64
Dataset                       583 non-null int64
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [51]:
# Recode the target column: original label 1 becomes 0 and label 2 becomes 1.
#
# The result is assigned back to the column instead of calling
# `data.Dataset.replace(..., inplace=True)`: an in-place method on a column
# selected from the frame is a chained operation that raises a FutureWarning
# in pandas 2.x and does not update `data` at all under Copy-on-Write.
data['Dataset'] = data['Dataset'].map({1: 0, 2: 1})

# Encode Gender numerically: Male -> 1, Female -> 0.
data['Gender'] = data['Gender'].map({'Male': 1, 'Female': 0})

# map() turns any value outside the mapping into NaN. Fail loudly in that case
# (this also catches an accidental second run of this cell, which would
# otherwise corrupt the already-recoded columns silently).
assert data['Dataset'].notna().all(), 'unexpected value in Dataset column'
assert data['Gender'].notna().all(), 'unexpected value in Gender column'

data.tail()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,1
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.1,0
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.0,0
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.0,0
582,38,1,1.0,0.3,216,21,24,7.3,4.4,1.5,1


In [52]:
# Discard every record that has a missing value in at least one column,
# then report how many records remain.
data = data.dropna(axis=0, how='any')
data.shape[0]

579

In [53]:
# Feature matrix: every column except the 'Dataset' label.
X = data.drop(columns=['Dataset'])

# Target vector: the 'Dataset' label column on its own.
y = data.loc[:, 'Dataset']


In [54]:
# Hold out 30% of the rows for testing. The split is stratified on y and
# seeded with SEED so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=SEED,
    stratify=y,
)

# Report the number of rows in the training and test partitions.
for part in (X_train, X_test):
    print(len(part))

405
174


In [92]:
# Baseline model: a single decision tree with no depth or leaf-size limit.
dt = DecisionTreeClassifier(random_state=SEED)

# Bagging ensemble of 500 trees, each fitted on a sample of 90% of the
# training rows; oob_score=True additionally computes an out-of-bag accuracy
# estimate during fitting.
#
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4, whereas the first positional slot works on both.
bc = BaggingClassifier(dt,
                       max_samples=0.9,
                       n_estimators=500,
                       oob_score=True,
                       random_state=SEED)

# Fit the single tree and score it on the held-out test set.
# accuracy_score expects (y_true, y_pred) in that order.
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print('Test accuracy score of Decision Tree Model is:{:.2f}'.format(dt_accuracy))

# Fit the bagging ensemble, score it on the test set and read its OOB estimate.
bc.fit(X_train, y_train)
bc_pred = bc.predict(X_test)
bc_accuracy = accuracy_score(y_test, bc_pred)
bc_oob = bc.oob_score_
print('\nTest accuracy score of Bagging Classifier is:{:.2f}'.format(bc_accuracy))
print('\nOOB accuracy score of Bagging Classifier is :{:.2f}'.format(bc_oob))

Test accuracy score of Decision Tree Model is:0.59

Test accuracy score of Bagging Classifier is:0.70

OOB accuracy score of Bagging Classifier is :0.70


In [87]:
# AdaBoost ensemble built on the decision tree `dt`.
#
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4, whereas the first positional slot works on both.
#
# NOTE(review): `dt` has no depth limit, so each base learner can fit the
# training data almost perfectly, which usually leaves boosting little to
# improve on. Consider a shallow tree (e.g. max_depth=1 or 2) - confirm by
# experiment before changing the reported numbers.
ada = AdaBoostClassifier(dt, random_state=SEED)
ada.fit(X_train, y_train)

# Score on the held-out test set; accuracy_score expects (y_true, y_pred).
ada_pred = ada.predict(X_test)
ada_accuracy = accuracy_score(y_test, ada_pred)
print('Test accuracy score of Ada Boost Classifer is:{:.2f}'.format(ada_accuracy))

Test accuracy score of Ada Boost Classifer is:0.62


In [91]:
# ROC AUC of the AdaBoost model on the test set, scored with the
# probabilities in column 1 of predict_proba.
ada_class_proba = ada.predict_proba(X_test)
ada_pred_proba = ada_class_proba[:, 1]
ada_roc_auc = roc_auc_score(y_true=y_test, y_score=ada_pred_proba)
print('Ada Boosting Model ROC AUC score: {:.2f}'.format(ada_roc_auc))


Ada Boosting Model ROC AUC score: 0.53


In [101]:
# Hyper-parameter grid for the decision tree. Float values of
# min_samples_leaf are interpreted by scikit-learn as a fraction of the
# training samples rather than an absolute count.
param_dt = {
    'max_depth': [2, 3, 4],
    'min_samples_leaf': [0.12, 0.14, 0.16, 0.18],
}

# Exhaustive search over the grid, ranked by ROC AUC under 10-fold
# cross-validation; n_jobs=-1 uses all available cores.
grid_dt = GridSearchCV(estimator=dt,
                       param_grid=param_dt,
                       scoring='roc_auc',
                       cv=10,
                       n_jobs=-1,
                       )

grid_dt.fit(X_train, y_train)

# Best parameter combination, refitted on the whole training set by GridSearchCV.
best_model = grid_dt.best_estimator_

# Test-set accuracy of the tuned tree; accuracy_score expects (y_true, y_pred).
grid_pred = best_model.predict(X_test)
grid_accuracy = accuracy_score(y_test, grid_pred)
print('Grid Search Testing accuracy score:{:.2f} '.format(grid_accuracy))

# Test-set ROC AUC, scored with the probabilities in column 1 of predict_proba.
grid_pred_proba = best_model.predict_proba(X_test)[:, 1]
grid_roc_auc = roc_auc_score(y_test, grid_pred_proba)
print('\nGrid Search roc auc score:{:.2f} '.format(grid_roc_auc))


Grid Search Testing accuracy score:0.71 

Grid Search roc auc score:0.77 
