In [2]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt


RANDOM_STATE = 55 ## WE WILL PASS IT TO EVERY SKLEARN CALL SO WE ENSURE REPRODUCIBILITY



# Data Loading and Overview

In [32]:
# load the data set
df = pd.read_csv("/Users/berekettesfaye/Desktop/ML_projects/decsion_boosted_tree/heart.csv")
# Display the first few rows of the dataset to understand its structure
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


* Here, we're loading the dataset which contains features related to heart disease. Each row represents a patient, and columns include various health metrics and the target variable HeartDisease.

## Data Preprocessing

* We must perform some data engineering before working with the models. There are 5 categorical features, so we will use Pandas to one-hot encode them.

In [33]:
# Identify categorical variables for one-hot encoding
cat_variables = ['Sex',
                'ChestPainType',
                'RestingECG',
                'ExerciseAngina',
                'ST_Slope'
                ]

In [34]:
# Perform one-hot encoding on categorical variables
df = pd.get_dummies(data = df,
                   prefix = cat_variables,
                   columns = cat_variables)

In [36]:
# Convert boolean values to integers for consistency
df = df.astype(int)

# Display the first few rows of the modified dataset
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1


## One-hot encoding

* One-hot encoding is used to convert categorical variables into a format that can be provided to ML algorithms to do a better job in prediction. Here, boolean values are forced to integers for clarity in the dataset.

* We started with 11 features. Let's see how many feature variables we have after one-hot encoding.

In [37]:
# Identify the target variable and separate it from the features
features = [x for x in df.columns if x != 'HeartDisease']

# Display the number of features after one-hot encoding
print(len(features))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df['HeartDisease'], test_size=0.2, random_state=RANDOM_STATE)

# Output the number of samples in the training and testing sets
print(f'train samples: {len(X_train)}')
print(f'validation samples: {len(X_test)}')

20
train samples: 734
validation samples: 184


* After preprocessing, the dataset is split into training and testing sets to evaluate the performance of different models. Here, 80% of the data is used for training, and 20% is set aside for testing.

## Decision Tree Model
### Model Initialization and Hyperparameter Tuning

In [None]:
# Initialize the Decision Tree model
decision_model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Define the hyperparameter search space
search_values = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10, 30, 50, 100, 200, 300, 700],
    'max_depth': [1, 2, 3, 4, 5, 16, 32, 64, None]  # None means no limit on depth
}

# Use GridSearchCV to find the best hyperparameters
gs_decision_tree = GridSearchCV(
    estimator=decision_model,
    param_grid=search_values,
    cv=5,  # Use 5-fold cross-validation
    scoring='accuracy',
    verbose=3
)

# Fit the model to the training data
gs_decision_tree.fit(X_train, y_train)

# Output the best estimator found by GridSearchCV
print(gs_decision_tree.best_estimator_)


## Evaluation

In [39]:
# Evaluate the model on the training and testing sets
print(f"Metrics train:\n\tAccuracy score: {accuracy_score(gs_decision_tree.best_estimator_.predict(X_train), y_train):.4f}")
print(f"Metrics validation:\n\tAccuracy score: {accuracy_score(gs_decision_tree.best_estimator_.predict(X_test), y_test):.4f}")


Metrics train:
	Accuracy score: 0.8815
Metrics validation:
	Accuracy score: 0.8641


* In this section, a Decision Tree model is trained using GridSearchCV to find the optimal hyperparameters. The performance is evaluated using accuracy on both the training and testing sets. From the accuracy i got, it shows that the model is not overfitting but it has high bias , this result could be because am descision tree next i will try using random forest to try if i can fix the underfitting part since i have an option to train ensemble of trees

## Random Forest Model

### Model Initialization and Hyperparameter Tuning

In [None]:
# Initialize the Random Forest model
random_forest_model = RandomForestClassifier(random_state=RANDOM_STATE)

# Define the hyperparameter search space
search_values = {
    'criterion': ['entropy'],
    'min_samples_split': [2, 10, 30, 50, 100, 200, 300, 700],
    'max_depth': [2, 4, 8, 16, 32, 64, None],
    'n_estimators': [10, 50, 100, 500]
}

# Use GridSearchCV to find the best hyperparameters
gs_random_forest = GridSearchCV(
    estimator=random_forest_model,
    param_grid=search_values,
    scoring='accuracy',
    cv=5,
    verbose=3
)

# Fit the model to the training data
gs_random_forest.fit(X_train, y_train)

# Output the best estimator found by GridSearchCV
print(gs_random_forest.best_estimator_)


## Evaluation

In [41]:
# Evaluate the model on the training and testing sets
best_model = gs_random_forest.best_estimator_
print(f"Metrics train:\n\tAccuracy score: {accuracy_score(best_model.predict(X_train), y_train):.4f}")
print(f"Metrics validation:\n\tAccuracy score: {accuracy_score(best_model.predict(X_test), y_test):.4f}")


Metrics train:
	Accuracy score: 0.8869
Metrics validation:
	Accuracy score: 0.8859


* The Random Forest model is also optimized using 'GridSearchCV' . The accuracy of the model on both training and testing sets is good even it does better on the validation set but still on the train set its not doing good .my aim is to get move than 90 % accuracy so next i will try XGboostclassifier

## XGBoost Model

### Data Splitting for XGBoost

In [43]:
# Split the training data for XGBoost
n = int(len(X_train) * 0.8)  # 80% for training and 20% for validation
X_train_fit, X_train_val, y_train_fit, y_train_val = X_train[:n], X_train[n:], y_train[:n], y_train[n:]


* XGBoost requires a validation set during training to evaluate performance and stop early if no improvement is observed. Here, the training set is further split into training and validation sets.

### Model Initialization and Training

In [None]:
# Initialize the XGBoost model
xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.1, verbosity=1, random_state=RANDOM_STATE, early_stopping_rounds=10)

# Output the XGBoost model configuration
print(xgb_model)

# Fit the model to the training data with early stopping
xgb_model.fit(X_train_fit, y_train_fit, eval_set=[(X_train_val, y_train_val)])


## Evaluation

In [46]:
# Evaluate the model on the training and testing sets
print(f"Metrics train:\n\tAccuracy score: {accuracy_score(xgb_model.predict(X_train), y_train):.4f}\nMetrics test:\n\tAccuracy score: {accuracy_score(xgb_model.predict(X_test), y_test):.4f}")


Metrics train:
	Accuracy score: 0.9292
Metrics test:
	Accuracy score: 0.8859


* XGBoost is a powerful boosting algorithm that often performs well on a variety of tasks. The accuracy on the training and testing sets is is really good as i expected i got more than 90 % accuracy on the training data and it also have the same amount of accuracy on the validation set



## Conclusion

* By applying Decision Tree, Random Forest, and XGBoost models to the dataset, we have explored different ways to predict heart disease. Each model's performance is evaluated based on accuracy, and the results suggest that XGBoost is the model which is more effective for this specific task. XGBoost, with its advanced boosting techniques, generally shows improved accuracy and generalization.