## Random Forest
## Gradient Boosting
## AdaBoost

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix, make_scorer, recall_score
from scipy.stats import randint, uniform

In [3]:
# Read the heart dataset from the working directory and preview the first five rows.
df = pd.read_csv("heart.csv")
df.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ["Sex", "ST_Slope", "ExerciseAngina"]

# One encoder per column, stored in `encoders` under the column name.
encoders = {name: LabelEncoder() for name in categorical_cols}

# Replace each string column in place with its integer codes.
for name, encoder in encoders.items():
    df[name] = encoder.fit_transform(df[name])

df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,2,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,1,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,2,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,1,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,TA,110,264,0,Normal,132,0,1.2,1,1
914,68,1,ASY,144,193,1,Normal,141,0,3.4,1,1
915,57,1,ASY,130,131,0,Normal,115,1,1.2,1,1
916,57,0,ATA,130,236,0,LVH,174,0,0.0,1,1


In [5]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ["ChestPainType", "RestingECG"]

# Dense (non-sparse) output so the indicator matrix can be wrapped in a DataFrame.
ohe_encoder = OneHotEncoder(sparse_output=False)
encoded = ohe_encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(
    encoded,
    columns=ohe_encoder.get_feature_names_out(categorical_cols),
)

# Swap the raw categorical columns for their indicator columns. Both pieces are
# re-indexed from 0 so the column-wise concat lines the rows up positionally.
df = pd.concat(
    [
        df.drop(columns=categorical_cols).reset_index(drop=True),
        encoded_df.reset_index(drop=True),
    ],
    axis="columns",
)

In [6]:
# Display the frame after the categorical columns were replaced by indicator columns
# (pandas elides the middle rows in the rendered table).
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
0,40,1,140,289,0,172,0,0.0,2,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,49,0,160,180,0,156,0,1.0,1,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,1,130,283,0,98,0,0.0,2,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,48,0,138,214,0,108,1,1.5,1,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,54,1,150,195,0,122,0,0.0,2,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,1,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0
914,68,1,144,193,1,141,0,3.4,1,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
915,57,1,130,131,0,115,1,1.2,1,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
916,57,0,130,236,0,174,0,0.0,1,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [7]:
# Print each column's dtype and non-null count to confirm everything is numeric and complete.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   Sex                918 non-null    int32  
 2   RestingBP          918 non-null    int64  
 3   Cholesterol        918 non-null    int64  
 4   FastingBS          918 non-null    int64  
 5   MaxHR              918 non-null    int64  
 6   ExerciseAngina     918 non-null    int32  
 7   Oldpeak            918 non-null    float64
 8   ST_Slope           918 non-null    int32  
 9   HeartDisease       918 non-null    int64  
 10  ChestPainType_ASY  918 non-null    float64
 11  ChestPainType_ATA  918 non-null    float64
 12  ChestPainType_NAP  918 non-null    float64
 13  ChestPainType_TA   918 non-null    float64
 14  RestingECG_LVH     918 non-null    float64
 15  RestingECG_Normal  918 non-null    float64
 16  RestingECG_ST      918 non

In [8]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the HeartDisease column.
X = df.drop(columns="HeartDisease")
Y = df["HeartDisease"]

# Shuffled 80/20 hold-out split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [9]:
# Shapes of the four split pieces, in order: train X, test X, train y, test y.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((734, 16), (184, 16), (734,), (184,))

In [18]:
# Random forest with class 1 weighted five times as heavily as class 0.
# The n_estimators value given here is only a placeholder: the search below
# overrides it with values drawn from `param_dist`.
rf = RandomForestClassifier(
    class_weight={0: 1, 1: 5},
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)

# Discrete candidate values for every tuned hyper-parameter.
param_dist = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [None, 5, 10, 20, 30],
    "max_features": ["sqrt", "log2", None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5, 10],
    "bootstrap": [True],
    # Share of training rows drawn for each tree; None keeps the full sample size.
    "max_samples": [0.5, 0.7, 0.9, None],
}

# 20 sampled configurations x 5 folds = 100 fits, ranked by ROC AUC.
rs = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    scoring="roc_auc",
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
rs.fit(x_train, y_train)

# Keep the refit winner under a short name for the evaluation cells.
model = rs.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# Training-set check: `rs.predict` delegates to the best estimator refit by the search.
y_pred = rs.predict(x_train)
cm = confusion_matrix(y_train, y_pred)

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

print(accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Hold-out check for the estimator currently bound to `model`.
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Second random-forest search, this time restricted to shallow trees.
# Class 1 is again weighted 5x; the constructor's n_estimators is overridden
# by whatever the search samples.
rf = RandomForestClassifier(
    class_weight={0: 1, 1: 5},
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)

param_dist = {
    # scipy's randint excludes the upper bound, so depths are drawn from 1..4.
    "max_depth": randint(1, 5),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    # Any integer tree count from 50 up to 999.
    "n_estimators": randint(50, 1000),
}

# 20 sampled configurations, 5-fold CV, ranked by ROC AUC.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    scoring="roc_auc",
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(x_train, y_train)

# Rebind `model` to the winner of this search for the evaluation cells.
model = random_search.best_estimator_

In [None]:
# Training-set check for the search fitted in the preceding cell (`random_search`).
# The earlier `rs` object belongs to a different search, so predicting with it here
# would report another model's training scores next to this model's test scores.
y_pred = random_search.predict(x_train)
cm = confusion_matrix(y_train, y_pred)

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

print(accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Hold-out check for the estimator currently bound to `model`.
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Gradient boosting: `model` is the untuned base estimator handed to the search.
model = GradientBoostingClassifier(random_state=42)

# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers in [low, high).
param_distributions = {
    "n_estimators": randint(50, 1000),
    "learning_rate": uniform(0.01, 0.59),   # 0.01 .. 0.60
    "max_depth": randint(1, 5),             # 1 .. 4
    "subsample": uniform(0.5, 0.5),         # 0.5 .. 1.0
    "max_features": ["sqrt", "log2", None],
}

# Rank candidates by recall on class 1.
recall_pos_scorer = make_scorer(recall_score, pos_label=1)

# 50 sampled configurations, 5-fold CV. Note that this rebinds `rs`.
rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    scoring=recall_pos_scorer,
    n_iter=50,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
rs.fit(x_train, y_train)

In [None]:
# Training-set check: `rs.predict` delegates to the best estimator refit by the search.
y_pred = rs.predict(x_train)
cm = confusion_matrix(y_train, y_pred)

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

print(accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Hold-out check using the search object currently bound to `rs`.
y_pred = rs.predict(x_test)
cm = confusion_matrix(y_test, y_pred)

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Weak learner for boosting. Its depth is not fixed here; the search sets it
# through the `estimator__max_depth` entry below.
base_tree = DecisionTreeClassifier(random_state=42)

# AdaBoost ensemble built on top of that tree.
ada = AdaBoostClassifier(estimator=base_tree, random_state=42)

# Search space: `estimator__*` keys tune the inner tree, the rest tune AdaBoost itself.
param_dist = {
    "estimator__max_depth": randint(1, 5),  # integers 1..4 (upper bound excluded)
    "estimator__min_samples_split": [2, 5, 10],
    "estimator__min_samples_leaf": [1, 2, 4],
    "estimator__max_features": ["sqrt", "log2", None],
    "n_estimators": randint(50, 1000),
    "learning_rate": [0.01, 0.05, 0.1, 0.5, 1.0],
}

# 40 sampled configurations, 5-fold CV, ranked by recall.
search = RandomizedSearchCV(
    ada,
    param_distributions=param_dist,
    scoring="recall",
    n_iter=40,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

search.fit(x_train, y_train)

In [None]:
# Training-set check: `search.predict` delegates to the best estimator refit by the search.
y_pred = search.predict(x_train)
cm = confusion_matrix(y_train, y_pred)

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

print(accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Hold-out check using the AdaBoost search object.
y_pred = search.predict(x_test)
cm = confusion_matrix(y_test, y_pred)

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))