<a href="https://colab.research.google.com/github/Chrisolande/Machine-Learning-and-Data-Science-Projects/blob/main/Heart_Disease_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**About Dataset**

**Context**

This data set dates from 1988 and consists of four databases: Cleveland, Hungary, Switzerland, and Long Beach V. It contains 76 attributes, including the predicted attribute, but all published experiments refer to using a subset of 14 of them. The "target" field refers to the presence of heart disease in the patient. It is integer valued 0 = no disease and 1 = disease.

**Content**

Attribute Information:

* age
* sex
* chest pain type (4 values)
* resting blood pressure
* serum cholesterol in mg/dl
* fasting blood sugar > 120 mg/dl
* resting electrocardiographic results (values 0,1,2)
* maximum heart rate achieved
* exercise induced angina
* oldpeak = ST depression induced by exercise relative to rest
* the slope of the peak exercise ST segment
* number of major vessels (0-3) colored by fluoroscopy
* thal: 0 = normal; 1 = fixed defect; 2 = reversible defect

The names and social security numbers of the patients were recently removed from the database, replaced with dummy values.

In [None]:
import warnings

import graphviz
import matplotlib.pyplot as plt
import missingno as msno
import pandas as pd
import seaborn as sns
from google.colab import files
from scipy import stats
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz

warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Prompt for a file upload, then read the first entry of the returned mapping
# as a CSV (presumably the uploaded file's name — confirm against the Colab API).
uploaded = files.upload()
df = pd.read_csv(next(iter(uploaded)))

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each feature is a row.
df.describe().T

In [None]:
# First rows of the raw data.
df.head()

In [None]:
# Last rows of the raw data.
df.tail()

# Data Preprocessing

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Visual check of missingness across the whole frame.
msno.matrix(df)

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Percentage of rows that exactly repeat an earlier row.
# `duplicated()` flags whole rows, so the denominator must be the number of
# rows; dividing by rows * columns understated the share by a factor equal to
# the column count.
perc_dupl = round(df.duplicated().sum() * 100 / df.shape[0], 2)

# The former "{perc_dupl}.d%" printed a literal ".d" after the number.
print(f"Duplicated Values Account for {perc_dupl}% of the data")

**Observation**

* There are no null values in the dataset as shown in the matrix

* There are duplicated rows in the dataset, accounting for 4.15% of the entire data

## Handling the duplicated values

In [None]:
# Keep only the first occurrence of each duplicated row.
df = df.drop_duplicates()

# Exploratory Data Analysis

## Distribution of continuous variables in the dataset

In [None]:
# Shapiro-Wilk normality test plus a Q-Q plot for each continuous feature.
features = ["age", "trestbps", "chol", "thalach"]
num_features = len(features)
num_cols = 3
num_rows = (num_features + num_cols - 1) // num_cols  # ceiling division

plt.figure(figsize=(20, 10))

for i, feature in enumerate(features):
    plt.subplot(num_rows, num_cols, i + 1)

    data = df[feature]

    # shapiro() returns the W statistic and its p-value; the statistic was
    # previously mislabelled as a t-statistic.
    stat, p = stats.shapiro(data)

    print(f"{feature}")
    print("W-Statistic:", stat)
    print("P Value:", p)

    # Fail to reject normality at the 5% significance level.
    if p > 0.05:
        print("The data appears to be normally distributed")
    else:
        print("The data doesn't appear to be normally distributed")

    print("--------------------------------")

    # Q-Q plot against the normal distribution, drawn on the current subplot.
    stats.probplot(data, plot=plt)
    plt.title(f"{feature}")

plt.tight_layout()
plt.show()

In [None]:
# One histogram (with KDE) and one boxplot per continuous feature, laid out
# side by side on a 2 x 4 grid.
panels = [
    ("age", "Blue", "Age Distribution"),
    ("trestbps", "Green", "Resting Blood Pressure Distribution"),
    ("chol", "DeepPink", "Cholesterol Distribution"),
    ("thalach", "Maroon", "Maximum Heart Rate Distribution"),
]

plt.figure(figsize=(15, 8))

for idx, (column, colour, title) in enumerate(panels):
    # Odd slots hold the histogram, the following even slot the boxplot.
    plt.subplot(2, 4, 2 * idx + 1)
    sns.histplot(data=df, x=column, color=colour, kde=True)
    plt.title(title)

    plt.subplot(2, 4, 2 * idx + 2)
    sns.boxplot(data=df, x=column, color=colour)
    plt.title(title)

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(15, 8))

# Left panel: absolute counts per value of `sex`.
plt.subplot(2, 2, 1)
sns.countplot(x="sex", data=df)
plt.title("Number of interviewees by Gender")

# Right panel: the same counts expressed as proportions.
plt.subplot(2, 2, 2)
sex_counts = df["sex"].value_counts()
sex_counts.plot(kind="pie", explode=[0.01, 0.01], autopct="%1.1f%%")
plt.title("Proportion of Persons interviewed based on Gender")

plt.tight_layout()
plt.show()

In [None]:
# Age histogram, one facet per value of `target`.
age_hist_grid = sns.FacetGrid(data=df, col="target", height=8)
age_hist_grid.map(sns.histplot, "age")
plt.show()

In [None]:
# Age density estimate, one facet per value of `target`.
age_kde_grid = sns.FacetGrid(data=df, col="target", height=8)
age_kde_grid.map(sns.kdeplot, "age")
plt.show()

In [None]:
# Overlay the age distribution of both target classes on one axis.
# `sns.distplot` is deprecated; `histplot` with a density-scaled histogram and
# a KDE curve is its documented replacement.
sns.FacetGrid(data=df, hue="target", height=6).map(
    sns.histplot, "age", kde=True, stat="density"
).add_legend()
plt.show()

In [None]:
# Age spread for each target class.
# The x-axis is `target`, not sex, so the title now names the variable that is
# plotted. The bare plt.legend() call was dropped: no artist carries a label,
# so it only produced a warning and an empty legend.
sns.boxplot(data=df, x="target", y="age")
plt.title("Distribution of Age Based on Heart Disease Status")
plt.show()

In [None]:
# Point plots of the mean of each continuous feature per target class.
cols = ["age", "trestbps", "chol", "thalach"]
# Derive the count from the list; the hard-coded 3 did not match the four
# columns and only produced enough subplot slots by coincidence.
num_features = len(cols)
num_cols = 2
num_rows = (num_features + num_cols - 1) // num_cols  # ceiling division

plt.figure(figsize=(15, 8))

for i, col in enumerate(cols):
    plt.subplot(num_rows, num_cols, i + 1)
    sns.pointplot(data=df, x="target", y=col)
    plt.title(f"Average {col} based on whether a patient has heart disease or not")

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(15, 6))

# Left: overall class balance of the target.
plt.subplot(2, 2, 1)
sns.countplot(x="target", data=df)
plt.title("Number of People with Heart Disease")

# Right: the same counts split by `sex`.
plt.subplot(2, 2, 2)
sns.countplot(x="target", hue="sex", data=df, palette="Set1")
plt.title("Number of people with heart disease based on Gender")

plt.show()

## Observation

* There are outliers in some of the numerical features of the dataset; however, since this is a medical dataset, we aren't going to eliminate or cap them

* Resting Blood Pressure,cholesterol are skewed to the right indicating presence of potential outliers

* Maximum heart rate is skewed to the left, which is also an indication of outliers

* The number of Females were more than the number of Males that the data were collected From

* The KDE plot of age for people without heart disease is leptokurtic, implying the presence of distinct subpopulations with the same age

* The median age of patients without heart disease is higher than the median age of those with heart disease

* The average age of those with heart disease tend to be lower than those without heart disease

* The average resting blood pressure and cholesterol levels also show the same relationship as of the age

* The average maximum heart rate of people with heart disease tends to be higher than that of their counterparts



# Feature Engineering

In [None]:
# Range of ages present in the data; used to choose the age-group cut-offs.
min_age = df["age"].min()
max_age = df["age"].max()
print(f"Minimum Age:{min_age}")
print(f"Maximum Age:{max_age}")

In [None]:
def age_group(age):
    """Bucket an age in years into a coarse life-stage label.

    Args:
        age: Age in years.

    Returns:
        "Young Adults" for ages up to and including 39,
        "Middle Aged Adults" for ages above 39 up to and including 59,
        "Elderly Adults" otherwise.
    """
    # The former lower bound (age >= 29) sent anyone younger than 29 to the
    # final fallback and labelled them "Elderly Adults".
    if age <= 39:
        return "Young Adults"
    if age <= 59:
        return "Middle Aged Adults"
    return "Elderly Adults"

In [None]:
# Derive the categorical age bucket for every row.
df["age_group"] = df["age"].apply(age_group)

In [None]:
# Spot-check the new column.
df.head(3)

In [None]:
# Range of serum cholesterol values present in the data.
min_chol, max_chol = df["chol"].min(), df["chol"].max()
# The labels previously said "Age" although the values are cholesterol readings.
print(f"Minimum Cholesterol:{min_chol}")
print(f"Maximum Cholesterol:{max_chol}")

Based on the article I found from the web via the following link,
https://www.medicalnewstoday.com/articles/315900#recommended-levels, I determined that the categories of cholesterol levels depend on age and amount of serum cholesterol. The Criteria to categorize the cholesterol levels is:

* for persons younger than 19 with cholesterol levels lower than or equal to 120 mg/dl, they are categorized as Non-HDL

* for persons younger than 19 with cholesterol levels above 120 mg/dl but lower than 170 mg/dl, they are categorized as Total Cholesterol

* for persons aged 20 or older with cholesterol levels lower than 130 mg/dl, they are categorized as Non-HDL

* for persons aged 20 or older with cholesterol levels greater than or equal to 130 mg/dl, they are categorized as Total Cholesterol

In [None]:
def cholesterol_levels(age, cholesterol):
    """Assign a cholesterol category from age and serum cholesterol (mg/dl).

    Args:
        age: Age in years.
        cholesterol: Serum cholesterol reading.

    Returns:
        "Non-HDL" when the reading is at or below 120 for patients under 20,
        or below 130 for patients aged 20 and over; "Total Cholesterol"
        otherwise. None when either input is NaN.
    """
    # NaN never equals itself; keep the "no category" result for missing data.
    if age != age or cholesterol != cholesterol:
        return None

    # The former conditions left two gaps that silently returned None:
    # ages in [19, 20) and under-19 patients with readings of 170 or more.
    # Both are now covered; every input that previously got a label keeps it.
    if age < 20:
        is_low = cholesterol <= 120
    else:
        is_low = cholesterol < 130
    return "Non-HDL" if is_low else "Total Cholesterol"

In [None]:
# Row-wise application because the category depends on two columns.
df["cholesterol_category"] = df.apply(
    lambda x: cholesterol_levels(x["age"], x["chol"]), axis=1
)

In [None]:
# Distinct categories produced by cholesterol_levels.
df["cholesterol_category"].unique()

In [None]:
# Spot-check the new column.
df.head()

In [None]:
# Range of resting blood pressure values present in the data.
min_bps = df["trestbps"].min()
max_bps = df["trestbps"].max()
print(f"Minimum Blood Pressure:{min_bps}")
print(f"Maximum Blood Pressure:{max_bps}")

From the article provided in the link below, I determined how to categorize Blood pressure given the systolic values.
https://www.webmd.com/hypertension-high-blood-pressure/diastolic-and-systolic-blood-pressure-know-your-numbers

* If the Systolic pressure is less than 120, the person is categorized as having normal pressure

* if Systolic pressure is in the range 120-129, one is categorized as having an Elevated Blood pressure

* If systolic pressure is in the range 130-139, one is categorized as having Stage 1 Hypertension

* If systolic pressure is in the range 140-180, one is categorized as having Stage 2 Hypertension

* If systolic pressure is beyond 180, one is categorized as having a Hypertensive Crisis and should seek help as soon as possible

In [None]:
def blood_pressure(bps):
    """Map a systolic blood pressure reading to a hypertension category.

    Args:
        bps: Systolic blood pressure.

    Returns:
        "Normal" below 120, "Elevated" below 130, "Stage 1 Hypertension"
        below 140, "Stage 2 Hypertension" up to and including 180,
        "Hypertensive Crisis" otherwise.
    """
    # Cascading upper bounds leave no gaps between bands. The former closed
    # ranges (120-129, 130-139) dropped readings such as 129.5 or 139.5 into
    # the final fallback and labelled them "Hypertensive Crisis".
    if bps < 120:
        return "Normal"
    if bps < 130:
        return "Elevated"
    if bps < 140:
        return "Stage 1 Hypertension"
    if bps <= 180:
        return "Stage 2 Hypertension"
    return "Hypertensive Crisis"

In [None]:
# Derive the blood pressure category for every row.
df["blood_pressure"] = df["trestbps"].apply(blood_pressure)

In [None]:
# Spot-check the new column.
df.head()

In [None]:
# Range of maximum heart rate values present in the data.
min_thalach = df["thalach"].min()
max_thalach = df["thalach"].max()
print(f"Least Heart Rate:{min_thalach}")
print(f"Maximum Heart Rate:{max_thalach}")

In [None]:
# Mean age within each engineered category, one panel per category column.
cols = ["age_group", "cholesterol_category", "blood_pressure"]

num_features = len(cols)
num_cols = 2
num_rows = (num_features + num_cols - 1) // num_cols  # ceiling division

plt.figure(figsize=(15, 13))

for i, feature in enumerate(cols):
    plt.subplot(num_rows, num_cols, i + 1)
    sns.pointplot(x=feature, y="age", data=df)
    plt.xticks(rotation=90)  # category labels are long
    plt.title(f"Average age based on {feature}")

plt.tight_layout()
plt.show()

In [None]:
# Long-format table: one row per (age_group, target) pair with its row count.
target_by_age = df.groupby("age_group")["target"]
age_grouped = target_by_age.value_counts().reset_index(name="count")

In [None]:
# Display the per-group counts.
age_grouped

In [None]:
# Counts of each target class within every age group.
sns.barplot(
    x="age_group",
    y="count",
    hue="target",
    data=age_grouped,
    palette="Set2",
)
plt.title("Presence of Heart Disease based on the age group")
plt.show()

In [None]:
# Target class counts within each cholesterol category.
sns.countplot(
    x="cholesterol_category",
    hue="target",
    data=df,
    palette="hls",
)
plt.title("Presence/Absence of Heart Disease based on the cholesterol category")
plt.show()

In [None]:
# Target class counts within each blood pressure category.
sns.countplot(
    x="blood_pressure",
    hue="target",
    data=df,
    palette="rocket",
)
plt.title("Presence/Absence of Heart Disease based on the Blood Pressure")
plt.xticks(rotation=90)  # category labels are long
plt.show()

In [None]:
# Reminder of the available columns.
df.head(1)

In [None]:
# Pairwise relationships between the continuous features, coloured by target:
# histograms on the diagonal, scatter plots above it, density contours below.
pair_vars = ["age", "trestbps", "chol", "thalach"]
fig = sns.PairGrid(data=df, vars=pair_vars, hue="target")
fig.map_diag(sns.histplot)
fig.map_upper(sns.scatterplot)
fig.map_lower(sns.kdeplot)

plt.show()

In [None]:
# Pairwise correlation of the numeric columns only; the engineered string
# categories are excluded by the dtype filter.
numeric = df.select_dtypes(include="number")
corr = numeric.corr()

heatmap_style = {
    "vmin": -1,
    "vmax": 1,
    "cmap": "Purples",
    "linecolor": "black",
    "linewidth": 0.1,
    "annot": True,
}

plt.figure(figsize=(15, 10))
sns.heatmap(corr, **heatmap_style)
plt.title("Correlation between Numeric Features")
plt.show()

**Observation**

* The average age of persons with stage 2 hypertension is higher than that of all the other blood pressure categories

* The average age of persons with cholesterol levels categorized as Non-HDL is higher than those categorized as Total Cholesterol

* Middle-aged people account for a large number of persons with heart disease

* A good chunk of those with cholesterol levels categorized as Total Cholesterol have heart disease

* People with Stage 1 Hypertension account for the largest number of people with heart disease followed by people with stage 2 hypertension then those with elevated blood pressure and people with normal, People with Hypertension crisis account for little to no cases of heart diseases

# Data Preprocessing II

In [None]:
# Work on a copy so the EDA frame stays intact.
df1 = df.copy()

In [None]:
# Drop the raw columns that were replaced by their categorical versions.
# Building the result from `df` instead of mutating `df1` in place keeps the
# cell idempotent: re-running it no longer raises KeyError on columns that
# were already removed.
df1 = df.drop(columns=["age", "trestbps", "chol"])

In [None]:
# Replace each engineered string category with integer codes. The single
# encoder is refitted for every column, so afterwards it only holds the
# classes of the last one.
encoder = LabelEncoder()
for category_col in ("cholesterol_category", "blood_pressure", "age_group"):
    df1[category_col] = encoder.fit_transform(df1[category_col])

In [None]:
# Separate the features from the label and hold out 20% of rows for testing.
X = df1.drop(columns="target")
y = df1["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0
)

print("X-Train Shape:", X_train.shape)
print("X-Test Shape:", X_test.shape)
print("y-Train Shape:", y_train.shape)
# Previously printed X_test.shape under the y-Test label.
print("y-Test Shape:", y_test.shape)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Data Modelling

## Logistic Regression

In [None]:
# Logistic regression is fitted on the standardised features. The scaled
# arrays were computed earlier but never used, so the solver ran on
# differently scaled raw columns.
lr = LogisticRegression()

lr.fit(X_train_scaled, y_train)
y_pred_train = lr.predict(X_train_scaled)
y_pred_test = lr.predict(X_test_scaled)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_test))
print(
    "Logistic Regression Classification report:\n",
    classification_report(y_test, y_pred_test),
)
cm = confusion_matrix(y_test, y_pred_test)

# fmt="d" prints the cell counts as integers instead of scientific notation.
sns.heatmap(
    cm, annot=True, fmt="d", linecolor="black", linewidth=0.01, cmap="viridis"
)

# Probability of the positive class, needed for the ROC curve.
y_prob = lr.predict_proba(X_test_scaled)[:, 1]

fpr, tpr, threshold = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (AUC = %0.2f)" % roc_auc)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.legend(loc="lower right")
plt.show()

print("AUC:", roc_auc)

## Multi-Layer Perceptron

In [None]:
# SGD-trained multi-layer perceptron. It is fitted and evaluated on the
# standardised features; the scaled arrays were computed earlier but never
# used, and gradient-based training is sensitive to feature scale.
ann = MLPClassifier(
    solver="sgd",
    hidden_layer_sizes=100,
    max_iter=1000,
    random_state=1,
    learning_rate_init=0.01,
)

ann.fit(X_train_scaled, y_train)

ann_ypred = ann.predict(X_test_scaled)

print(
    "Artificial Neural Network Training Set score: \n",
    ann.score(X_train_scaled, y_train),
)

print(
    "Artificial Neural Network Testing Set score: \n",
    ann.score(X_test_scaled, y_test),
)

print("Artificial Neural Network Accuracy score:\n ", accuracy_score(y_test, ann_ypred))

print(
    "Artificial Neural Network Classification Report:\n ",
    classification_report(y_test, ann_ypred),
)

conf_mat = confusion_matrix(y_test, ann_ypred)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

## K-Nearest Neighbors Classifier

In [None]:
# k-NN is distance based, so it is fitted on the standardised features;
# on the raw columns the features with the largest numeric range dominate
# the distance.
model = knn(n_neighbors=3)

model.fit(X_train_scaled, y_train)

knn_ypred = model.predict(X_test_scaled)

print("KNN Classifier Training Set score: \n", model.score(X_train_scaled, y_train))

print("KNN Classifier Testing Set score: \n", model.score(X_test_scaled, y_test))

print("KNN Classifier Accuracy score:\n ", accuracy_score(y_test, knn_ypred))

print(
    "KNN Classifier Classification Report:\n ", classification_report(y_test, knn_ypred)
)

conf_mat = confusion_matrix(y_test, knn_ypred)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Purples")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

## Decision Tree Classifier

In [None]:
# Baseline decision tree with default hyperparameters.
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_ypred = dt.predict(X_test)

# Each metric is printed with its label, followed by a blank line.
for label, value in (
    ("Decision Tree Classifier Training Set score: \n", dt.score(X_train, y_train)),
    ("Decision Tree Classifier Testing Set score: \n", dt.score(X_test, y_test)),
    ("Decision Tree Classifier Accuracy score:\n ", accuracy_score(y_test, dt_ypred)),
    (
        "Decision Tree Classifier Classification Report:\n ",
        classification_report(y_test, dt_ypred),
    ),
):
    print(label, value)

conf_mat = confusion_matrix(y_test, dt_ypred)

plt.figure(figsize=(8, 6))
ax = sns.heatmap(conf_mat, annot=True, fmt="d", cmap="viridis")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Export the fitted tree as DOT text (out_file=None returns it as a string).
dot_data = export_graphviz(dt, out_file=None)

# Wrap the DOT source so the notebook can render it.
graph = graphviz.Source(dot_data)

graph

### HyperParameter Tuning the Decision Tree Classifier

In [None]:
# Exhaustive search over decision tree hyperparameters, optimising recall.
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "splitter": ["random", "best"],
    "max_depth": (list(range(1, 21))),
    "min_samples_leaf": (list(range(1, 21))),
    # "auto" was an alias of "sqrt" for classifiers and has been removed from
    # scikit-learn; fits using it fail. None (use all features) replaces it so
    # the grid still offers three distinct choices.
    "max_features": [None, "sqrt", "log2"],
    "min_samples_split": [2, 3, 4, 5],
}

dt_grid = GridSearchCV(
    estimator=dt, param_grid=param_grid, scoring="recall", cv=5, n_jobs=-1
)

dt_grid.fit(X_train, y_train)

print(dt_grid.best_params_)

### Implementation of the HyperParameters from GridSearchCV

In [None]:
# Decision tree rebuilt with hard-coded hyperparameters
# (presumably copied from a GridSearchCV run — confirm against its output).
tuned_params = {
    "criterion": "log_loss",
    "max_depth": 12,
    "max_features": "sqrt",
    "min_samples_leaf": 16,
    "min_samples_split": 3,
    "splitter": "random",
}
dt = DecisionTreeClassifier(**tuned_params)
dt.fit(X_train, y_train)
dt_ypred = dt.predict(X_test)

for label, value in (
    ("Decision Tree Classifier Training Set score: \n", dt.score(X_train, y_train)),
    ("Decision Tree Classifier Testing Set score: \n", dt.score(X_test, y_test)),
    ("Decision Tree Classifier Accuracy score:\n ", accuracy_score(y_test, dt_ypred)),
    (
        "Decision Tree Classifier Classification Report:\n ",
        classification_report(y_test, dt_ypred),
    ),
):
    print(label, value)

conf_mat = confusion_matrix(y_test, dt_ypred)

plt.figure(figsize=(8, 6))
ax = sns.heatmap(conf_mat, annot=True, fmt="d", cmap="viridis")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix")
plt.show()

### Hyperparameter Tuning for Decision Tree Classifier using Randomized Search CV

In [None]:
# Randomised search over the same hyperparameter space, optimising recall.
# A fixed random_state makes the sampled candidates, and therefore the
# reported best parameters, reproducible across runs.
dt_rand = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_grid,
    n_jobs=-1,
    refit=True,
    cv=5,
    scoring="recall",
    random_state=0,
)

dt_rand.fit(X_train, y_train)

print(dt_rand.best_params_)

### Implementation of the HyperParameters from RandomizedSearchCV

In [None]:
# Decision tree rebuilt with hard-coded hyperparameters
# (presumably copied from a RandomizedSearchCV run — confirm against its output).
dt = DecisionTreeClassifier(
    criterion="gini",
    max_depth=4,
    # "auto" meant sqrt(n_features) for classifiers and has been removed from
    # scikit-learn; "sqrt" is the equivalent supported spelling.
    max_features="sqrt",
    min_samples_leaf=18,
    min_samples_split=3,
    splitter="best",
)

dt.fit(X_train, y_train)

dt_ypred = dt.predict(X_test)

print("Decision Tree Classifier Training Set score: \n", dt.score(X_train, y_train))

print("Decision Tree Classifier Testing Set score: \n", dt.score(X_test, y_test))

print("Decision Tree Classifier Accuracy score:\n ", accuracy_score(y_test, dt_ypred))

print(
    "Decision Tree Classifier Classification Report:\n ",
    classification_report(y_test, dt_ypred),
)

conf_mat = confusion_matrix(y_test, dt_ypred)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="viridis")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

## Random Forest Classifier

In [None]:
# Random forest with default hyperparameters.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_ypred = rf.predict(X_test)

for label, value in (
    ("Random Forest Classifier Training Set score: \n", rf.score(X_train, y_train)),
    ("Random Forest Classifier Testing Set score: \n", rf.score(X_test, y_test)),
    ("Random Forest Classifier Accuracy score:\n ", accuracy_score(y_test, rf_ypred)),
    (
        "Random Forest Classifier Classification Report:\n ",
        classification_report(y_test, rf_ypred),
    ),
):
    print(label, value)

conf_mat = confusion_matrix(y_test, rf_ypred)

plt.figure(figsize=(8, 6))
ax = sns.heatmap(conf_mat, annot=True, fmt="d", cmap="viridis")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix")
plt.show()

## Bagging Classifier

In [None]:
# Bagging ensemble that uses a random forest as the base model.

rf = RandomForestClassifier()

bc_params = {
    # `base_estimator` was renamed to `estimator` (scikit-learn 1.2) and the
    # old keyword has since been removed.
    "estimator": rf,
    "n_estimators": 50,
    "max_samples": 0.5,  # each member is trained on half of the training rows
    "random_state": 11,
    "n_jobs": -1,
}

bc = BaggingClassifier(**bc_params)

bc.fit(X_train, y_train)

bc_ypreds_train = bc.predict(X_train)

bc_ypreds_test = bc.predict(X_test)

print(
    "Bagging Classifier:\n> Accuracy on training data = {:.4f}"
    "\n> Accuracy on Testing data = {:.4f}".format(
        accuracy_score(y_true=y_train, y_pred=bc_ypreds_train),
        accuracy_score(y_true=y_test, y_pred=bc_ypreds_test),
    )
)

print("Bagging Classifier Accuracy score:\n ", accuracy_score(y_test, bc_ypreds_test))

print(
    "Bagging Classifier Classification Report:\n ",
    classification_report(y_test, bc_ypreds_test),
)

conf_mat = confusion_matrix(y_test, bc_ypreds_test)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="coolwarm")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

## Gradient Boosting Classifier

In [None]:
# Gradient boosting with default hyperparameters.
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

gb_ypred_train = gb.predict(X_train)
gb_ypred_test = gb.predict(X_test)

gb_train_acc = accuracy_score(y_true=y_train, y_pred=gb_ypred_train)
gb_test_acc = accuracy_score(y_true=y_test, y_pred=gb_ypred_test)

print(
    "Gradient Boosting Classifier:"
    f"\n> Accuracy on training data = {gb_train_acc:.4f}"
    f"\n> Accuracy on testing data = {gb_test_acc:.4f}"
)

print(
    "Gradient Boosting Classifier Accuracy score:\n ",
    accuracy_score(y_test, gb_ypred_test),
)

print(
    "Gradient Boosting  Classifier Classification Report:\n ",
    classification_report(y_test, gb_ypred_test),
)

conf_mat = confusion_matrix(y_test, gb_ypred_test)

plt.figure(figsize=(8, 6))
ax = sns.heatmap(conf_mat, annot=True, fmt="d", cmap="viridis")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix")
plt.show()

## AdaBoost Classifier

In [None]:
# AdaBoost with default hyperparameters.
ad = AdaBoostClassifier()
ad.fit(X_train, y_train)

ad_ypred_test = ad.predict(X_test)
ad_ypred_train = ad.predict(X_train)

for label, value in (
    (
        "AdaBoost Classifier Accuracy score on Training Set:",
        accuracy_score(y_true=y_train, y_pred=ad_ypred_train),
    ),
    (
        "AdaBoost Classifier Accuracy score on Testing Set:",
        accuracy_score(y_true=y_test, y_pred=ad_ypred_test),
    ),
    ("AdaBoost Classifier Accuracy score:\n ", accuracy_score(y_test, ad_ypred_test)),
    (
        "AdaBoost Classifier Classification Report:\n ",
        classification_report(y_test, ad_ypred_test),
    ),
):
    print(label, value)

conf_mat = confusion_matrix(y_test, ad_ypred_test)

plt.figure(figsize=(8, 6))
ax = sns.heatmap(conf_mat, annot=True, fmt="d", cmap="viridis")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix")
plt.show()

## Bagging Random Forest with adaboost classifier

In [None]:
# AdaBoost ensemble that uses a random forest as the base model.

rf = RandomForestClassifier()

# `base_estimator` was renamed to `estimator` (scikit-learn 1.2) and the old
# keyword has since been removed.
ad_params = {"n_estimators": 100, "estimator": rf, "random_state": 11}

ad = AdaBoostClassifier(**ad_params)

ad.fit(X_train, y_train)

ad_ypreds_train = ad.predict(X_train)

ad_ypreds_test = ad.predict(X_test)

print(
    "Bagging AdaBoost Classifier:\n> Accuracy on training data = {:.4f}"
    "\n> Accuracy on Testing data = {:.4f}".format(
        accuracy_score(y_true=y_train, y_pred=ad_ypreds_train),
        accuracy_score(y_true=y_test, y_pred=ad_ypreds_test),
    )
)

print(
    "Bagging AdaBoost Classifier Accuracy score:\n ",
    accuracy_score(y_test, ad_ypreds_test),
)

print(
    "Bagging AdaBoost Classifier Classification Report:\n ",
    classification_report(y_test, ad_ypreds_test),
)

conf_mat = confusion_matrix(y_test, ad_ypreds_test)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="viridis")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()