In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier

In [34]:
df = pd.read_csv('data/indian_liver_patient/indian_liver_patient.csv')
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [35]:
df.shape

(583, 11)

In [40]:
df.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

In [39]:
df = df.dropna()

In [41]:
df.Dataset.value_counts()

1    414
2    165
Name: Dataset, dtype: int64

In [42]:
df.Gender.value_counts()

Male      439
Female    140
Name: Gender, dtype: int64

In [43]:
df['Gender'] = np.where(df['Gender'] == 'Male', 1, 0)
df['Dataset'] = np.where(df['Dataset'] == 2, 0, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Gender'] = np.where(df['Gender'] == 'Male', 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Dataset'] = np.where(df['Dataset'] == 2, 0, 1)


In [44]:
df.Gender.value_counts()

1    439
0    140
Name: Gender, dtype: int64

In [45]:
df.Dataset.value_counts()

1    414
0    165
Name: Dataset, dtype: int64

In [46]:
X = df.drop('Dataset', axis=1).values
y = df['Dataset'].values

In [47]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [48]:
X

array([[ 1.24740264, -1.77079482, -0.42031984, ...,  0.29372174,
         0.20344649, -0.14738974],
       [ 1.06230624,  0.56471817,  1.21893587, ...,  0.93965456,
         0.07746198, -0.64846078],
       [ 1.06230624,  0.56471817,  0.64037503, ...,  0.47827397,
         0.20344649, -0.17870668],
       ...,
       [ 0.44531827,  0.56471817, -0.40424871, ..., -0.07538274,
         0.07746198,  0.16577966],
       [-0.85035649,  0.56471817, -0.32389304, ...,  0.29372174,
         0.329431  ,  0.16577966],
       [-0.4184649 ,  0.56471817, -0.37210644, ...,  0.75510233,
         1.5892761 ,  1.73162664]])

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [50]:
lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=27)
dt = DecisionTreeClassifier(min_samples_leaf=0.13)
classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('Classification Tree', dt)]

In [51]:
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)    
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred) 
    print('{:s} : {:.3f}'.format(clf_name, accuracy))

Logistic Regression : 0.759
K Nearest Neighbours : 0.718
Classification Tree : 0.730


### Voting classifier

In [53]:
vc = VotingClassifier(estimators=classifiers)
vc.fit(X_train, y_train)

In [54]:
y_pred_esmb = vc.predict(X_test)
accuracy_esmb = accuracy_score(y_test, y_pred_esmb)
accuracy_esmb

0.7586206896551724

### Bagging classifier / OOB accuracy

In [61]:
bc = BaggingClassifier(estimator=dt, n_estimators=50, oob_score=True, n_jobs=-1)
bc.fit(X_train, y_train)

In [62]:
y_pred_bag = bc.predict(X_test)
accuracy_bag = accuracy_score(y_test, y_pred_bag)
accuracy_bag

0.7413793103448276

In [63]:
oob_acc = bc.oob_score_
oob_acc

0.674074074074074

### AdaBoost classifier

In [66]:
ada = AdaBoostClassifier(estimator=dt, n_estimators=180)
ada.fit(X_train, y_train)

In [68]:
y_pred_proba = ada.predict_proba(X_test)[:,1]
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
ada_roc_auc

0.7164513350559863