In [2]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt


RANDOM_STATE = 55 ## WE WILL PASS IT TO EVERY SKLEARN CALL SO WE ENSURE REPRODUCIBILITY



# Data Loading and Overview

In [32]:
# load the data set
df = pd.read_csv("/Users/berekettesfaye/Desktop/ML_projects/decsion_boosted_tree/heart.csv")
# Display the first few rows of the dataset to understand its structure
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


* Here, we're loading the dataset which contains features related to heart disease. Each row represents a patient, and columns include various health metrics and the target variable HeartDisease.

* We must perform some data engineering before working with the models. There are 5 categorical features, so we will use Pandas to one-hot encode them.

In [33]:
# Identify categorical variables for one-hot encoding
cat_variables = ['Sex',
                'ChestPainType',
                'RestingECG',
                'ExerciseAngina',
                'ST_Slope'
                ]

In [34]:
# Perform one-hot encoding on categorical variables
df = pd.get_dummies(data = df,
                   prefix = cat_variables,
                   columns = cat_variables)

In [36]:
# Convert boolean values to integers for consistency
df = df.astype(int)

# Display the first few rows of the modified dataset
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1


* One-hot encoding is used to convert categorical variables into a format that can be provided to ML algorithms to do a better job in prediction. Here, boolean values are forced to integers for clarity in the dataset.

In [8]:
# the target value is heartDisease so we need to separet the the features fromn the target value
features = [x for x in df.columns if x not in 'HeartDisease']

* We started with 11 features. Let's see how many feature variables we have after one-hot encoding.

In [9]:
print(len(features))

20


In [10]:
# split the data set into train and test and cross validation set
X_train,X_test,y_train,y_test = train_test_split(df[features],df['HeartDisease'], test_size = 0.2, random_state = RANDOM_STATE)





In [11]:
print(f'train samples: {len(X_train)}')
print(f'validation samples: {len(X_test)}')


train samples: 734
validation samples: 184


* The first alorithm i will perform on this data set is Decision tree. i will use GridSearchCV to find the best hyperparameter for our model 

In [12]:
# i will just creat an object of my model and then use gridsearchcv to try out the best hyperparameter
decision_model = DecisionTreeClassifier(random_state = RANDOM_STATE)

In [13]:
# make a dictionary of hyperparameter values to search
search_values = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2,10,30,50,100,200,300,700],
    'max_depth': [1,2,3,4,5,16,32,64,None]# None means there is no depth limit
}

In [14]:
# create the gridsearchcv object and pass in the arguments
gs_decision_tree = GridSearchCV(
    estimator = decision_model,
    param_grid = search_values,
    cv = 2, # Use 5-fold cross-validation
    scoring = 'accuracy', # Use accuracy as the metric
    verbose = 3 # Use accuracy as the metric
)

In [15]:
gs_decision_tree.fit(X_train,y_train)

Fitting 2 folds for each of 144 candidates, totalling 288 fits
[CV 1/2] END criterion=gini, max_depth=1, min_samples_split=2;, score=0.809 total time=   0.0s
[CV 2/2] END criterion=gini, max_depth=1, min_samples_split=2;, score=0.812 total time=   0.0s
[CV 1/2] END criterion=gini, max_depth=1, min_samples_split=10;, score=0.809 total time=   0.0s
[CV 2/2] END criterion=gini, max_depth=1, min_samples_split=10;, score=0.812 total time=   0.0s
[CV 1/2] END criterion=gini, max_depth=1, min_samples_split=30;, score=0.809 total time=   0.0s
[CV 2/2] END criterion=gini, max_depth=1, min_samples_split=30;, score=0.812 total time=   0.0s
[CV 1/2] END criterion=gini, max_depth=1, min_samples_split=50;, score=0.809 total time=   0.0s
[CV 2/2] END criterion=gini, max_depth=1, min_samples_split=50;, score=0.812 total time=   0.0s
[CV 1/2] END criterion=gini, max_depth=1, min_samples_split=100;, score=0.809 total time=   0.0s
[CV 2/2] END criterion=gini, max_depth=1, min_samples_split=100;, score=0.

In [17]:
# lets printt he best estimator
print(gs_decision_tree.best_estimator_)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=30,
                       random_state=55)


In [18]:
# now lets use the best estimator on a  new data set and compare the accuracy with the training data to see if it generalizes
best_model = gs_decision_tree.best_estimator_

print(f"Metrics train:\n\tAccuracy score: {accuracy_score(best_model.predict(X_train),y_train):.4f}")
print(f"Metrics validation:\n\tAccuracy score: {accuracy_score(best_model.predict(X_test),y_test):.4f}")

Metrics train:
	Accuracy score: 0.8787
Metrics validation:
	Accuracy score: 0.8641


* even though the model generalizes whic means it does good in training set and test set . the accuracy is very low 84 % and this happens because descision tree train using one tree next i will use randomforest and xgboostclassifier to see if can get a better accurcy by using ensamble of trees 

In [19]:
# now i will use randomforest .. its the same steps as the previous one 
random_forest_model = RandomForestClassifier(random_state = RANDOM_STATE)


In [20]:
# make a dictionary of hyperparameter values to search
# the additional parameter is n_estimators this is because random forest trains by creating multiple trees
search_values ={
    
    'criterion': [ 'entropy'],
    'min_samples_split' : [2,10,30,50,100,200,300,700],
    'max_depth' :  [2,4,8,16,32,64, None],
    'n_estimators' : [10,50,100,500]

}

In [21]:
# create the gridsearchcv object and pass in the arguments
gs_random_forest = GridSearchCV(
    estimator = random_forest_model,
    param_grid = search_values,
    scoring = 'accuracy',
    cv = 5,
    verbose = 3
)

In [22]:
gs_random_forest.fit(X_train,y_train)

Fitting 5 folds for each of 224 candidates, totalling 1120 fits
[CV 1/5] END criterion=entropy, max_depth=2, min_samples_split=2, n_estimators=10;, score=0.837 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=2, min_samples_split=2, n_estimators=10;, score=0.816 total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=2, min_samples_split=2, n_estimators=10;, score=0.850 total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=2, min_samples_split=2, n_estimators=10;, score=0.844 total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=2, min_samples_split=2, n_estimators=10;, score=0.801 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=2, min_samples_split=2, n_estimators=50;, score=0.857 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=2, min_samples_split=2, n_estimators=50;, score=0.844 total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=2, min_samples_split=2, n_estimators=50;, score=0.871 total time=   0.0s
[CV 4/5] END cri

In [23]:
print(gs_random_forest.best_estimator_)


RandomForestClassifier(criterion='entropy', max_depth=4, n_estimators=50,
                       random_state=55)


In [24]:
# now lets use the best estimator on a  new data set and compare the accuracy with the training data to see if it generalizes
best_model = gs_random_forest.best_estimator_
print(f"Metrics train:\n\tAccuracy score: {accuracy_score(best_model.predict(X_train),y_train):.4f}")
print(f"Metrics validation:\n\tAccuracy score: {accuracy_score(best_model.predict(X_test),y_test):.4f}")

Metrics train:
	Accuracy score: 0.8869
Metrics validation:
	Accuracy score: 0.8859


* after using random forest the accuracy is is still low it generlizes but the acuracy would be it its above 90 % at list, so i will build xgboostclassifier model to see if it makes any difference

In [25]:
# xgb_boost needs valiadtion set while training to compute cost we will devide the traing set into train and val set
n = int(len(X_train)* 0.8)# 80 % for train and 20 % for validation
X_train_fit, X_train_val,y_train_fit,y_train_val = X_train[:n],X_train[n:],y_train[:n],y_train[n:]

In [28]:

xgb_model = XGBClassifier(n_estimators = 500, learning_rate = 0.1, verbosity = 1, random_state = RANDOM_STATE, early_stopping_rounds = 10)
print(xgb_model)

xgb_model.fit(X_train_fit,y_train_fit, eval_set = [(X_train_val,y_train_val)])

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=10,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=500, n_jobs=None,
              num_parallel_tree=None, random_state=55, ...)
[0]	validation_0-logloss:0.63278
[1]	validation_0-logloss:0.59892
[2]	validation_0-logloss:0.57315
[3]	validation_0-logloss:0.55187
[4]	validation_0-logloss:0.53284
[5]	validation_0-logloss:0.51712
[6]	validation_0-logloss:0.50139
[7]	validation_0-

In [30]:
print(f"Metrics train:\n\tAccuracy score: {accuracy_score(xgb_model.predict(X_train),y_train):.4f}\nMetrics test:\n\tAccuracy score: {accuracy_score(xgb_model.predict(X_test),y_test):.4f}")

Metrics train:
	Accuracy score: 0.9292
Metrics test:
	Accuracy score: 0.8859
