# Ensemble Learning

In [None]:
import pandas as pd
import numpy as nu
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Data Loading

In [None]:
!kaggle datasets download -d fedesoriano/heart-failure-prediction

!unzip heart-failure-prediction.zip

In [None]:
# Load the extracted CSV into a DataFrame and preview its first rows.
csv_path = 'heart.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()

# Feature Engineering

In [None]:
# Collect the labels of every column stored as object or category dtype;
# these are the features treated as categorical below.
categorical_dtypes = ['object', 'category']
cat_col = df.select_dtypes(include=categorical_dtypes).columns
cat_col

In [None]:
# Number of distinct values in each categorical column.
df.loc[:, cat_col].nunique()

The categorical features are transformed with one-hot encoding (`pd.get_dummies`), dropping the first level of each feature so that no redundant column is created.

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# Only columns that are still present in `df` are encoded: once this cell has
# run, the original categorical columns no longer exist, and passing them again
# would raise a KeyError. With nothing left to encode, the frame is returned
# unchanged, so the cell can safely be re-run.
cols_to_encode = [col for col in cat_col if col in df.columns]
df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True)

In [None]:
# Preview the frame after the encoding step above.
df.head()

# Explore Data

## Checking Target Variable Balance

In [None]:
# Count the rows per value of the target column to check class balance.
df['HeartDisease'].value_counts()

## Distribution of Numerical Features

In [None]:
# Columns with more than 10 distinct values are treated as numerical features.
numeric_cols = [name for name in df.columns if df[name].nunique() > 10]

# One histogram (with KDE overlay) per numerical column.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(df[col], kde=True, bins=30, color='blue', edgecolor='black', ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

## Correlation Matrix

In [None]:
# Pairwise correlation of all columns (features and target).
corr_matrix = df.corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.3,
    cbar=True,
    ax=ax,
)
ax.set_title('Correlation Matrix Heatmap')

plt.show()

# Splitting the Dataset into Training and Testing Subsets in an 80/20 Ratio

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the feature matrix from the target column.
target_col = 'HeartDisease'
x = df.drop(columns=[target_col])
y = df[target_col]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# Model Creation

##  Decision Tree Classifier and Random Forest Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Decision Tree Classifier: fit on the training split (fit returns the
# estimator itself), then predict the held-out test rows.
dtc = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)
y_pred_dtc = dtc.predict(x_test)

In [None]:
# Random Forest Classifier: fit on the training split (fit returns the
# estimator itself), then predict the held-out test rows.
rfc = RandomForestClassifier(random_state=1).fit(x_train, y_train)
y_pred_rfc = rfc.predict(x_test)

### Model Metrics

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Decision Tree metrics on the test split
report_dtc = classification_report(y_true=y_test, y_pred=y_pred_dtc)
print(report_dtc)

In [None]:
# Random Forest metrics on the test split
report_rfc = classification_report(y_true=y_test, y_pred=y_pred_rfc)
print(report_rfc)

### Feature importances  Decision Tree

In [None]:
# Feature importances Decision Tree
importances_dtc = dtc.feature_importances_

# Pair each training column with its importance, most important first.
feature_importance_dtc = pd.DataFrame({
    'Feature': x_train.columns,
    'Importance': importances_dtc
}).sort_values(by='Importance', ascending=False)

# Horizontal bar plot. The title names the model so this figure cannot be
# confused with the Random Forest importance plot when skimming the notebook.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature',
            data=feature_importance_dtc,
            palette='viridis',
            ax=ax)

ax.set_title('Feature Importances (Decision Tree)')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')

plt.show()

### Feature importances Random Forest

In [None]:
# Feature importances Random Forest
importances_rfc = rfc.feature_importances_

# Pair each training column with its importance, most important first.
feature_importances_rfc = pd.DataFrame({
    'Feature': x_train.columns,
    'Importance': importances_rfc
}).sort_values(by='Importance', ascending=False)

# Horizontal bar plot. The title names the model so this figure cannot be
# confused with the Decision Tree importance plot when skimming the notebook.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature',
            data=feature_importances_rfc,
            palette='viridis',
            ax=ax)

ax.set_title('Feature Importances (Random Forest)')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')

plt.show()

## Bagging Classifier

### Metrics Bagging Classifier

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging Classifier with its default base estimator: fit on the training
# split (fit returns the estimator itself), then predict the test rows.
bc = BaggingClassifier(random_state=1).fit(x_train, y_train)
y_pred_bc = bc.predict(x_test)

In [None]:
# Bagging Classifier metrics on the test split
report_bc = classification_report(y_true=y_test, y_pred=y_pred_bc)
print(report_bc)

## LinearSVC

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Linear support vector classifier: fit on the training split (fit returns the
# estimator itself), then predict the held-out test rows.
# NOTE(review): the features are passed in unscaled as far as this notebook
# shows, and the global warnings filter set at the top would hide any
# ConvergenceWarning. Consider scaling the inputs (e.g. StandardScaler) and
# re-checking the scores.
lsvc = LinearSVC(random_state=1).fit(x_train, y_train)
y_pred_lsvc = lsvc.predict(x_test)

### Metrics LinearSVC

In [None]:
# LinearSVC metrics on the test split
report_lsvc = classification_report(y_true=y_test, y_pred=y_pred_lsvc)
print(report_lsvc)

## **Stacking** Decision Tree Classifier + Random Forest Classifier + LinearSVC



In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
# Base models of the stack, keyed by the short name the ensemble uses for each.
base_models = {
    'dt': dtc,
    'rfc': rfc,
    'lsvc': lsvc,
}

# StackingClassifier expects a list of (name, estimator) pairs.
estimators = list(base_models.items())

stack_c = StackingClassifier(estimators=estimators)

In [None]:
# Fit the stacking ensemble on the training split.
stack_c.fit(x_train, y_train)

In [None]:
# Predict the held-out test rows with the fitted stacking ensemble.
y_pred_stack = stack_c.predict(x_test)

### Metrics Stacking Classifier

In [None]:
# Stacking Classifier metrics on the test split
report_stack = classification_report(y_true=y_test, y_pred=y_pred_stack)
print(report_stack)

# Comparing Results

In [None]:
# Compute the scores from the actual predictions instead of hand-copying them
# from the printed reports, so the table stays correct whenever the data, the
# split or the models change.
from sklearn.metrics import f1_score

predictions = {
    "Decision Tree Classifier": y_pred_dtc,
    "Random Forest": y_pred_rfc,
    "Bagging Classifier": y_pred_bc,
    "LinearSVC": y_pred_lsvc,
    "StackingClassifier": y_pred_stack,
}

# Weighted-average F1 on the test split (the "weighted avg" row of
# classification_report), rounded to two decimals for display.
model_scores = {
    name: round(f1_score(y_test, y_pred, average="weighted"), 2)
    for name, y_pred in predictions.items()
}

df_scores = pd.DataFrame(list(model_scores.items()), columns=["Model", "F1 Score"])

# Best model first.
df_scores.sort_values(by="F1 Score", ascending=False)

# Conclusion

In this experiment, the best result was obtained with Stacking. Stacking feeds the predictions of several base models into a final estimator, so it can combine models that each excel in different aspects:

* Decision Tree captures non-linear patterns and feature interactions through its branching rules.
* Random Forest reduces overfitting by averaging the results of multiple trees.
* LinearSVC effectively identifies linear relationships.

Combining these models also helps balance bias and variance.