# 1. Import necessary libraries

In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): with no category given this is a blanket filter, so it also hides
# scikit-learn deprecation and data-conversion warnings raised by later cells —
# consider narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

# 2. Load Dataset

In [2]:
# Load the dataset from 'heart.csv'; the path is relative, so it is resolved
# against the kernel's current working directory.
df=pd.read_csv('heart.csv')
# Preview the first rows to sanity-check the parse.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# 3. Data Understanding

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(918, 12)

In [4]:
# Per-column dtype and non-null count; used to spot missing values and text (object) columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): the minimum of 0 reported for RestingBP and Cholesterol looks like a
# placeholder for missing measurements rather than a real reading — confirm and
# handle it before modelling.
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


# 4. Data Preprocessing

## 4.1 Data Transformation for Categorical columns

In [8]:
# Replace each text-valued feature with integer codes, overwriting the column in df.
le = LabelEncoder()

categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in categorical_cols:
    # The single encoder is refitted on every column, so afterwards `le` only
    # holds the classes of the last column in the list.
    df[col] = le.fit_transform(df[col])

In [10]:
# Confirm the encoded columns now hold integer codes instead of text.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [11]:
# Check column dtypes after encoding (no object columns are expected to remain).
df.dtypes

Age                 int64
Sex                 int32
ChestPainType       int32
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG          int32
MaxHR               int64
ExerciseAngina      int32
Oldpeak           float64
ST_Slope            int32
HeartDisease        int64
dtype: object

In [12]:
# Class balance of the target column.
df['HeartDisease'].value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

## 4.2 Scaling Data

In [14]:
# Feature matrix: every column except the target.
X = df.drop(columns='HeartDisease')
# Target as a 1-D Series (single brackets). Selecting with double brackets yields
# a one-column DataFrame, i.e. a column vector, which scikit-learn estimators
# have to flatten and report with a DataConversionWarning.
y = df['HeartDisease']

In [23]:
# Standardise every feature with StandardScaler.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the train/test
# split performed later, so held-out rows contribute to the fitted statistics.
std_scaler = StandardScaler()
std_scaler.fit(X)
X_scaled_std = std_scaler.transform(X)

# Show the first two scaled rows.
X_scaled_std[:2]

array([[-1.4331398 ,  0.51595242,  0.22903206,  0.41090889,  0.82507026,
        -0.55134134,  0.01725451,  1.38292822, -0.8235563 , -0.83243239,
         1.05211381],
       [-0.47848359, -1.93816322,  1.27505906,  1.49175234, -0.17196105,
        -0.55134134,  0.01725451,  0.75415714, -0.8235563 ,  0.10566353,
        -0.59607813]])

In [24]:
# Rescale every feature with MinMaxScaler (alternative to the standardised copy).
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the train/test
# split performed later, so held-out rows contribute to the fitted min/max.
mnx_scaler = MinMaxScaler()
mnx_scaler.fit(X)
X_scaled_mnx = mnx_scaler.transform(X)

# Show the first two scaled rows.
X_scaled_mnx[:2]

array([[0.24489796, 1.        , 0.33333333, 0.7       , 0.47927032,
        0.        , 0.5       , 0.78873239, 0.        , 0.29545455,
        1.        ],
       [0.42857143, 0.        , 0.66666667, 0.8       , 0.29850746,
        0.        , 0.5       , 0.67605634, 0.        , 0.40909091,
        0.5       ]])

# 5. Model Building & Training

In [44]:
# Hold out a test set from the standardised features. test_size is left at the
# train_test_split default; random_state pins the shuffle so the split is repeatable.
X_train,X_test,y_train,y_test = train_test_split(X_scaled_std, y, random_state=10)

In [45]:
# Bagged decision trees on the standardised split, judged by the out-of-bag (OOB) score.
bag_model = BaggingClassifier(
    # Base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, so the positional form runs on both old and new releases.
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,   # each tree is fitted on a bootstrap sample sized at 80% of X_train
    oob_score=True,    # score every training row using only the trees that did not sample it
    random_state=10,
)
# Flatten the target to 1-D so the fit does not rely on scikit-learn's
# column-vector conversion (works for a Series and for a one-column DataFrame).
bag_model.fit(X_train, y_train.to_numpy().ravel())
print(f'oob_score using StandardScaler() : {bag_model.oob_score_}')

oob_score using StandardScaler() : 0.8546511627906976


In [47]:
# Redo the split on the MinMax-scaled features with the same random_state.
# This overwrites X_train / X_test / y_train / y_test from the standardised split,
# so every later cell that reads those names gets the MinMax-scaled data.
X_train,X_test,y_train,y_test = train_test_split(X_scaled_mnx, y, random_state=10)

In [48]:
# Bagged decision trees on the MinMax-scaled split, judged by the out-of-bag (OOB) score.
# This rebinds `bag_model`, replacing the model fitted on the standardised split.
bag_model = BaggingClassifier(
    # Base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, so the positional form runs on both old and new releases.
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,   # each tree is fitted on a bootstrap sample sized at 80% of X_train
    oob_score=True,    # score every training row using only the trees that did not sample it
    random_state=10,
)
# Flatten the target to 1-D so the fit does not rely on scikit-learn's
# column-vector conversion (works for a Series and for a one-column DataFrame).
bag_model.fit(X_train, y_train.to_numpy().ravel())
print(f'oob_score using MinMaxScaler() : {bag_model.oob_score_}')

oob_score using MinMaxScaler() : 0.8531976744186046


# 6. Model Evaluation

## Decision Tree Classifier

In [41]:
# Baseline: mean 10-fold cross-validation score of a single decision tree on the
# unscaled features X.
# random_state is pinned (10, as elsewhere in this notebook) so the tree's random
# choices, and therefore the reported score, are repeatable across runs.
scores = cross_val_score(
    DecisionTreeClassifier(random_state=10),
    X,
    y.to_numpy().ravel(),  # 1-D target, whether y is a Series or a one-column DataFrame
    cv=10,
)
scores.mean()

0.7809006211180124

## Bagging Classifier

In [49]:
# Mean 10-fold cross-validation score of the bagging configuration held in
# `bag_model`, evaluated on the unscaled features X.
# The target is flattened to 1-D: a column-vector y makes the bagging fit emit a
# DataConversionWarning on every fold.
scores = cross_val_score(
    bag_model,
    X,
    y.to_numpy().ravel(),
    cv=10,
)
scores.mean()

0.8364906832298138

## Random Forest

In [50]:
# Mean 10-fold cross-validation score of a default random forest on the unscaled
# features X.
# random_state is pinned (10, as elsewhere in this notebook): an unseeded forest
# gives a slightly different score on every run, which makes the model
# comparison unrepeatable.
scores = cross_val_score(
    RandomForestClassifier(random_state=10),
    X,
    y.to_numpy().ravel(),  # 1-D target; a column vector triggers a DataConversionWarning per fold
    cv=10,
)
scores.mean()

0.8549928332537029

### Random Forest gives the highest mean 10-fold cross-validation accuracy of the three models

# 7. Model Building using Random Forest

In [73]:
# Final model: a small, shallow random forest with out-of-bag scoring enabled.
# NOTE(review): X_train / y_train are whatever the most recent train_test_split
# produced — in this notebook that is the MinMax-scaled split — so this cell
# depends on the cells above having been run in order.
rf_model = RandomForestClassifier(
    n_estimators=40,
    max_depth=4,
    random_state=10,
    oob_score=True,
)
# Flatten the target to 1-D; a column-vector y makes the forest emit a
# DataConversionWarning (works for a Series and for a one-column DataFrame).
rf_model.fit(X_train, y_train.to_numpy().ravel())
rf_model.oob_score_

0.8517441860465116

In [71]:
# Predict labels for the held-out rows with the fitted random forest.
# NOTE(review): the saved execution counts are out of order (this cell ran at 71,
# the fit cell at 73), so re-run the notebook top to bottom before trusting the
# metrics that follow.
y_pred=rf_model.predict(X_test)

In [72]:
# Confusion matrix on the test set: rows are true classes, columns are predicted classes.
confusion_matrix(y_test,y_pred)

array([[ 90,  17],
       [ 14, 109]], dtype=int64)

In [75]:
# Per-class precision, recall and F1 on the test set; print() is needed because
# classification_report returns a pre-formatted string.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.84      0.85       107
           1       0.87      0.89      0.88       123

    accuracy                           0.87       230
   macro avg       0.87      0.86      0.86       230
weighted avg       0.87      0.87      0.87       230

