### Required Libraries

In [1]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import accuracy_score 

### Import Dataset

In [2]:
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 
  
# metadata 
print(heart_disease.metadata)   
  
# variable information 
print(heart_disease.variables) 

{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. JÃ¡nosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. S

### Train Test Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NOTE: There is no need to impute missing values, becuase XGBoost handles missing values, too.

In [4]:
# imputer = SimpleImputer(strategy='mean')
# X_train = imputer.fit_transform(X_train)
# X_test = imputer.transform(X_test)

### List of hyper-parameters values

In [5]:
max_depth_values = [3, 6, 8, 10, 12, 15, 20]
learning_rate_values = [0.25, 0.1, 0.01, 0.05]
n_estimators_values = [100, 200, 250, 300]

### XGBoost Model Classifier

In [6]:
max_accuracy = 0.0
best_hyperparameters = None

for max_depth in max_depth_values:
    for learning_rate in learning_rate_values:
        for n_estimators in n_estimators_values:
            print(f"Testing Hyperparameters: max_depth={max_depth}, learning_rate={learning_rate}, n_estimators={n_estimators}")
            
            params = {
                'max_depth': max_depth,
                'learning_rate': learning_rate,
                'n_estimators': n_estimators,
            }
            
            model = xgb.XGBClassifier(**params)
            model.fit(X_train, y_train)
            
            y_pred = model.predict(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            print("Accuracy:", accuracy)
            print("------------------------")
            
            if accuracy > max_accuracy:
                max_accuracy = accuracy
                best_hyperparameters = params

print("Best Hyperparameters:", best_hyperparameters)
print("Max Accuracy:", max_accuracy)

Testing Hyperparameters: max_depth=3, learning_rate=0.25, n_estimators=100


Accuracy: 0.47540983606557374
------------------------
Testing Hyperparameters: max_depth=3, learning_rate=0.25, n_estimators=200
Accuracy: 0.45901639344262296
------------------------
Testing Hyperparameters: max_depth=3, learning_rate=0.25, n_estimators=250
Accuracy: 0.45901639344262296
------------------------
Testing Hyperparameters: max_depth=3, learning_rate=0.25, n_estimators=300
Accuracy: 0.47540983606557374
------------------------
Testing Hyperparameters: max_depth=3, learning_rate=0.1, n_estimators=100
Accuracy: 0.5081967213114754
------------------------
Testing Hyperparameters: max_depth=3, learning_rate=0.1, n_estimators=200
Accuracy: 0.4918032786885246
------------------------
Testing Hyperparameters: max_depth=3, learning_rate=0.1, n_estimators=250
Accuracy: 0.4918032786885246
------------------------
Testing Hyperparameters: max_depth=3, learning_rate=0.1, n_estimators=300
Accuracy: 0.4918032786885246
------------------------
Testing Hyperparameters: max_depth=3, learn

### Discussion

### 1. max_depth=3
This hyperparameter represents the maximum depth of each tree in the XGBoost classifier. A lower value, such as 3, indicates a shallow tree structure, which can help prevent overfitting and improve generalization.

### 2. learning_rate=0.01
The learning rate determines the step size at each boosting iteration. A lower learning rate, such as 0.01, means the model makes smaller updates to the weights of the trees in each iteration, which can lead to more accurate predictions but may require more iterations to converge.

### 3. n_estimators=250
This hyperparameter defines the number of trees in the XGBoost model, or the number of boosting iterations. More trees can lead to a more complex model, but there is a trade-off between model complexity and computational efficiency. In this case, 250 trees were used.

## XGB Code Found In Kaggle
Link: https://www.kaggle.com/code/adepvenugopal/predicting-heart-disease-using-ml-xgboost

In [8]:
from sklearn.model_selection import RandomizedSearchCV

xgb_classifier = xgb.XGBClassifier()

gbm_param_grid = {
    'n_estimators': range(1,1000, 100),
    'max_depth': range(1, 10),
    'learning_rate': [.1,.4, .45, .5, .55, .6],
    'colsample_bytree': [.6, .7, .8, .9, 1],
    'booster':["gbtree"],
     'min_child_weight': [0.001,0.003,0.01],
}

xgb_random = RandomizedSearchCV(param_distributions=gbm_param_grid, 
                                    estimator = xgb_classifier, scoring = "accuracy", 
                                    verbose = 0, n_iter = 100, cv = 4)

xgb_random.fit(X_train,y_train)

xgb_bp = xgb_random.best_params_

xgb_model1=xgb.XGBClassifier(n_estimators=xgb_bp["n_estimators"],
                            min_child_weight=xgb_bp["min_child_weight"],
                            max_depth=xgb_bp["max_depth"],
                            learning_rate=xgb_bp["learning_rate"],
                            colsample_bytree=xgb_bp["colsample_bytree"],
                            booster=xgb_bp["booster"])


xgb_model1.fit(X_train,y_train)
y_pred1 = xgb_model1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred1)
print("Accuracy:", accuracy)

Accuracy: 0.5409836065573771


In [9]:
print(xgb_bp)

{'n_estimators': 61, 'min_child_weight': 0.001, 'max_depth': 9, 'learning_rate': 0.55, 'colsample_bytree': 0.6, 'booster': 'gbtree'}
