In [1]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from collections import Counter

In [2]:
# Read the source CSV (framinghamid.csv) into the working DataFrame.
csv_path = "../Data/framinghamid.csv"
heart_df = pd.read_csv(csv_path)
# Non-null count per column; any column below the row total has missing values.
heart_df.count()

patientid          4119
male               4119
age                4119
education          4016
currentsmoker      4119
cigsperday         4090
bpmeds             4119
prevalentstroke    4119
prevalenthyp       4119
diabetes           4119
totchol            4119
sysbp              4119
diabp              4119
bmi                4119
heartrate          4119
glucose            3776
tenyearchd         4119
dtype: int64

In [3]:
# Clean the imported frame for machine learning.
# "education" and "glucose" show missing entries in the per-column counts
# printed by the load cell; "patientid" is also excluded from the model inputs
# (presumably a row identifier -- confirm against the data dictionary).
# errors="ignore" keeps this cell safe to re-run: heart_df is reassigned in
# place, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
heart_df = (
    heart_df
    .drop(columns=["education", "glucose", "patientid"], errors="ignore")
    # Discard every row that still contains a missing value.
    .dropna()
)
# Every remaining column should now report the same non-null count.
heart_df.count()

male               4090
age                4090
currentsmoker      4090
cigsperday         4090
bpmeds             4090
prevalentstroke    4090
prevalenthyp       4090
diabetes           4090
totchol            4090
sysbp              4090
diabp              4090
bmi                4090
heartrate          4090
tenyearchd         4090
dtype: int64

In [4]:
# Separate the target (y) from the features (X).
# y is the "tenyearchd" column; X is every other column of heart_df.
y = heart_df["tenyearchd"]
X = heart_df.drop(columns="tenyearchd")

In [5]:
# Split features and target into train and test partitions (seeded so the
# split is repeatable), then tally the class labels in the training target.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Class counts of y_train; a lopsided tally means plain accuracy will be
# dominated by the majority class.
Counter(y_train)

Counter({1: 465, 0: 2602})

In [6]:
# Build the StandardScaler and fit it on the training features only, so no
# statistics from the test partition leak into the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

In [7]:
# Compare gradient-boosting accuracy across several learning rates while the
# other hyperparameters stay fixed.
# NOTE(review): the score labelled "validation" is computed on
# X_test_scaled / y_test, i.e. the same hold-out partition as the test split
# created earlier -- consider a separate validation split or cross-validation.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,
        random_state=0,
    )
    classifier.fit(X_train_scaled, y_train)

    # Mean accuracy on the data the model was fitted on vs. the hold-out data.
    train_acc = classifier.score(X_train_scaled, y_train)
    holdout_acc = classifier.score(X_test_scaled, y_test)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_acc))
    print("Accuracy score (validation): {0:.3f}".format(holdout_acc))

Learning rate:  0.05
Accuracy score (training): 0.849
Accuracy score (validation): 0.856
Learning rate:  0.1
Accuracy score (training): 0.854
Accuracy score (validation): 0.859
Learning rate:  0.25
Accuracy score (training): 0.868
Accuracy score (validation): 0.848
Learning rate:  0.5
Accuracy score (training): 0.877
Accuracy score (validation): 0.844
Learning rate:  0.75
Accuracy score (training): 0.881
Accuracy score (validation): 0.837
Learning rate:  1
Accuracy score (training): 0.882
Accuracy score (validation): 0.836


In [17]:
# Refit with learning_rate=0.1, the value with the highest hold-out accuracy
# in the sweep output, then predict labels for the scaled test features.
classifier = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.1,
    max_features=5,
    max_depth=3,
    random_state=0,
)
classifier.fit(X_train_scaled, y_train)

predictions = classifier.predict(X_test_scaled)

In [18]:
# Fraction of test rows whose predicted label matches the true label.
# Read this against the majority-class baseline: the recorded report lists
# 877 of 1023 test rows as class 0, so always predicting 0 scores about 0.857.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8592375366568915


In [19]:
# Build the confusion matrix (rows = actual class, columns = predicted class)
# and wrap it in a labelled DataFrame for display.
cm = confusion_matrix(y_test, predictions)
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=col_labels)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,875,2
Actual 1,142,4


In [20]:
# Print per-class precision, recall, f1-score and support for the test
# predictions, under a heading line.
print("Classification Report")
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

Classification Report
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       877
           1       0.67      0.03      0.05       146

    accuracy                           0.86      1023
   macro avg       0.76      0.51      0.49      1023
weighted avg       0.83      0.86      0.80      1023



In [21]:
# Pair each fitted importance with its feature name, then order the pairs
# from most to least important (ties fall back to the name, also descending).
importance_pairs = list(zip(classifier.feature_importances_, X.columns))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.31432223286244243, 'age'),
 (0.20676131242209264, 'sysbp'),
 (0.1002704434239728, 'diabp'),
 (0.08703375874049148, 'prevalenthyp'),
 (0.08211336565431708, 'cigsperday'),
 (0.07841764802822451, 'male'),
 (0.033766095131217526, 'totchol'),
 (0.03144090747051599, 'bmi'),
 (0.02916540975826362, 'heartrate'),
 (0.02107051817012969, 'diabetes'),
 (0.012034665048505406, 'bpmeds'),
 (0.0036036432898266433, 'currentsmoker'),
 (0.0, 'prevalentstroke')]