In [35]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier

In [21]:
df = pd.read_csv('heart.csv')
print(df.head())

   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


Removing outliers

In [22]:
z_scores = np.abs(zscore(df.select_dtypes(include=[np.number])))
final_df = df[(z_scores < 3).all(axis=1)].copy()
final_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


Picking the caregorical columns

In [23]:
categorical_cols = df.select_dtypes(include=['object']).columns
print(categorical_cols)

Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')


Apply label encoding to them

In [None]:
LE = LabelEncoder()
for col in categorical_cols:
    final_df[col] = LE.fit_transform(final_df[col])
final_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


Preparing the training (x, y), and testing (x, y) data

In [29]:
x = final_df.drop(columns="HeartDisease")
y = final_df["HeartDisease"]
x = StandardScaler().fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Standalone SVC vs with Bagging

In [31]:
SA_SVC = SVC()
SA_SVC.fit(x_train, y_train)
SA_SVC_pred = SA_SVC.predict(x_test)

BG_SVC = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=42)
BG_SVC.fit(x_train, y_train)
BG_SVC_pred = BG_SVC.predict(x_test)

In [None]:
# SVC with and without bagging
print(f'Standalone SVC model accuracy: {accuracy_score(y_test, SA_SVC_pred)}')
print(f'Bagging SVC accuracy: {accuracy_score(y_test, BG_SVC_pred)}')

Standalone SVC model accuracy: 0.8777777777777778
Bagging SVC accuracy: 0.8888888888888888


Standalone Decision tree vs with Bagging

In [33]:
SA_DT = DecisionTreeClassifier()
SA_DT.fit(x_train, y_train)
SA_DT_pred = SA_DT.predict(x_test)

BG_DT = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)
BG_DT.fit(x_train, y_train)
BG_DT_pred = BG_DT.predict(x_test)

In [34]:
# DT with and without bagging
print(f'Standalone DT model accuracy: {accuracy_score(y_test, SA_DT_pred)}')
print(f'Bagging DT accuracy: {accuracy_score(y_test, BG_DT_pred)}')

Standalone DT model accuracy: 0.7833333333333333
Bagging DT accuracy: 0.8444444444444444


(as shown in the cell above) bagging helped improve the predictions of the DT model more than it did with the SVC one. 
That's because DT models have high variance and bagging helps lower it.

Decision tree with Adaboost

In [36]:
ADB = AdaBoostClassifier(estimator=DecisionTreeClassifier())
ADB.fit(x_train,y_train)
ADB_pred = ADB.predict(x_test)
print("Decision tree model with adaBoost accuracy:", accuracy_score(y_test, ADB_pred))

Decision tree model with adaBoost accuracy: 0.8055555555555556
