In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Read the heart-disease CSV from the working directory and preview the first five rows.
data = pd.read_csv("heart.csv")
data.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# Schema check: column dtypes and non-null counts for every column of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [4]:
# Per-column count of missing values (isna is the canonical name; isnull is its alias).
data.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [6]:
# For every object-dtype (string) column, print its distinct values together with the
# column's positional index in `data`; non-object columns are skipped.
for index, col in enumerate(data.columns):
    if data[col].dtypes != "O":
        continue
    print(f"{col}: {data[col].unique()} [{index}]\n")

Sex: ['M' 'F'] [1]

ChestPainType: ['ATA' 'NAP' 'ASY' 'TA'] [2]

RestingECG: ['Normal' 'ST' 'LVH'] [6]

ExerciseAngina: ['N' 'Y'] [8]

ST_Slope: ['Up' 'Flat' 'Down'] [10]



In [7]:
# One independent OneHotEncoder per object-dtype column, keyed by column name.
# drop='first' removes the first level of each categorical; sparse_output=False
# makes each encoder return a dense array.
encoder = {
    name: OneHotEncoder(drop='first', sparse_output=False)
    for name, dtype in data.dtypes.items()
    if dtype == 'O'
}
encoder

{'Sex': OneHotEncoder(drop='first', sparse_output=False),
 'ChestPainType': OneHotEncoder(drop='first', sparse_output=False),
 'RestingECG': OneHotEncoder(drop='first', sparse_output=False),
 'ExerciseAngina': OneHotEncoder(drop='first', sparse_output=False),
 'ST_Slope': OneHotEncoder(drop='first', sparse_output=False)}

In [19]:
# Build the ColumnTransformer directly from the `encoder` dict: one (name, encoder, [column])
# entry per categorical column, in the dict's insertion order (which is the column order of
# `data`). Columns are selected by NAME rather than by hard-coded position, so the pipeline
# keeps encoding the right columns even if the CSV gains, loses or reorders columns.
# Every column not listed in `encoder` is passed through unchanged, after the encoded ones.
ct = ColumnTransformer(
    [(name, onehot, [name]) for name, onehot in encoder.items()],
    remainder='passthrough',
)
ct

In [20]:
# Fit the encoders on the full frame and transform it in one step. The result is a dense
# NumPy array: the one-hot columns come first (in transformer order), followed by the
# passthrough columns in their original order.
# NOTE(review): `data` still contains the HeartDisease target at this point, so the target is
# passed through as the last column of the array and must be split off before modelling.
encoded_data = ct.fit_transform(data)
encoded_data

array([[  1. ,   1. ,   0. , ..., 172. ,   0. ,   0. ],
       [  0. ,   0. ,   1. , ..., 156. ,   1. ,   1. ],
       [  1. ,   1. ,   0. , ...,  98. ,   0. ,   0. ],
       ...,
       [  1. ,   0. ,   0. , ..., 115. ,   1.2,   1. ],
       [  0. ,   1. ,   0. , ..., 174. ,   0. ,   1. ],
       [  1. ,   0. ,   1. , ..., 173. ,   0. ,   0. ]])

In [21]:
# Expect 16 columns: 9 one-hot columns (the first level of each categorical is dropped)
# plus the 7 passthrough columns (6 numeric features and the HeartDisease target).
encoded_data.shape

(918, 16)

In [22]:
# Wrap the encoded array in a DataFrame. No column names are supplied, so columns are
# labelled 0..15 by position: 0-8 are the one-hot outputs, 9-14 the numeric features
# (Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak) and 15 is HeartDisease.
encoded_df = pd.DataFrame(data=encoded_data)
encoded_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,40.0,140.0,289.0,0.0,172.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,49.0,160.0,180.0,0.0,156.0,1.0,1.0
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,37.0,130.0,283.0,0.0,98.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,48.0,138.0,214.0,0.0,108.0,1.5,1.0
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54.0,150.0,195.0,0.0,122.0,0.0,0.0


In [32]:
# Split the encoded frame into the feature matrix and the target vector.
# The target (HeartDisease) is the last passthrough column; resolve its label once so X and y
# are guaranteed to refer to the same column instead of mixing `columns[-1]` with a literal 15.
target_col = encoded_df.columns[-1]
X = encoded_df.drop(columns=[target_col])
# The ColumnTransformer produced a float array, so the 0/1 labels arrived as 0.0/1.0;
# cast them back to integers so the classifiers see proper integer class labels.
y = encoded_df[target_col].astype(int)

In [33]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=37,
)

In [40]:
# Default-configured base estimators, used as the starting point for the hyperparameter
# searches in the sections below. Both are seeded (same seed as the train/test split) so
# that anything fitted from them gives the same result on every Restart & Run All.
gb = GradientBoostingClassifier(random_state=37)
rf = RandomForestClassifier(random_state=37)

### Gradient Boosting

In [57]:
# Randomized hyperparameter search for gradient boosting.
# The search is expensive (10 sampled candidates x 80 CV folds = 800 fits), so it is
# guarded by a flag instead of being commented out: leave RUN_GB_SEARCH = False for a fast
# Restart & Run All, set it to True to re-run the search and re-derive the best parameters.
RUN_GB_SEARCH = False

if RUN_GB_SEARCH:
    parms_gb = {
        'loss': ['log_loss', 'exponential'],
        'learning_rate': [0.1, 0.01, 1, 0.2],
        'n_estimators': [100, 150, 200],
        'criterion': ['friedman_mse', 'squared_error'],
        'min_samples_leaf': list(np.arange(2, 5)),
    }

    # random_state fixes which 10 candidates are sampled, so a re-run is repeatable.
    # NOTE(review): cv=80 leaves only a handful of rows per validation fold, which makes
    # the per-fold accuracy very coarse -- confirm this fold count is intentional.
    gb_rs = RandomizedSearchCV(
        gb,
        param_distributions=parms_gb,
        n_jobs=-1,
        cv=80,
        scoring='accuracy',
        verbose=1,
        random_state=37,
    )
    gb_rs.fit(X_train, y_train)
    print(gb_rs.best_params_)
    print(gb_rs.best_score_)

Fitting 80 folds for each of 10 candidates, totalling 800 fits
{'n_estimators': 200, 'min_samples_leaf': 4, 'loss': 'exponential', 'learning_rate': 0.01, 'criterion': 'squared_error'}
0.8865277777777777


In [60]:
# Final gradient-boosting model, refit on the training split with the best parameters
# reported by the randomized search. random_state is fixed (same seed as the train/test
# split) so the fitted model and its test score are repeatable across runs.
gb = GradientBoostingClassifier(
    n_estimators=200,
    min_samples_leaf=4,
    loss='exponential',
    learning_rate=0.01,
    criterion='squared_error',
    random_state=37,
)
gb.fit(X_train, y_train)

In [61]:
# Mean accuracy of the fitted gradient-boosting model on the held-out 20% test split.
gb.score(X_test, y_test)

0.8641304347826086

### Random Forest

In [66]:
# Randomized hyperparameter search for the random forest.
# The search is expensive (10 sampled candidates x 50 CV folds = 500 fits), so it is
# guarded by a flag instead of being commented out: leave RUN_RF_SEARCH = False for a fast
# Restart & Run All, set it to True to re-run the search and re-derive the best parameters.
RUN_RF_SEARCH = False

if RUN_RF_SEARCH:
    parms_rf = {
        'criterion': ['gini', 'entropy'],
        # NOTE(review): np.arange(2, 3) yields the single value 2, so min_samples_leaf is
        # effectively fixed rather than searched -- confirm whether a wider range was meant.
        'min_samples_leaf': list(np.arange(2, 3)),
        'max_features': ['sqrt', 'log2', None],
        'n_estimators': [100, 150, 200],
    }

    # random_state fixes which 10 candidates are sampled, so a re-run is repeatable.
    rf_rs = RandomizedSearchCV(
        rf,
        param_distributions=parms_rf,
        n_jobs=-1,
        scoring='accuracy',
        cv=50,
        verbose=1,
        random_state=37,
    )
    rf_rs.fit(X_train, y_train)
    print(rf_rs.best_params_)
    print(rf_rs.best_score_)

Fitting 50 folds for each of 10 candidates, totalling 500 fits
{'n_estimators': 150, 'min_samples_leaf': 2, 'max_features': 'log2', 'criterion': 'gini'}
0.885142857142857


In [72]:
# Final random-forest model, refit on the training split with the best parameters reported
# by the randomized search. A random forest bootstraps rows and subsamples features, so
# random_state is fixed (same seed as the train/test split) to make the test score repeatable.
rf = RandomForestClassifier(
    n_estimators=150,
    min_samples_leaf=2,
    max_features='log2',
    criterion='gini',
    random_state=37,
)
rf.fit(X_train, y_train)

In [73]:
# Mean accuracy of the fitted random-forest model on the held-out 20% test split.
rf.score(X_test, y_test)

0.8478260869565217