In [1]:
# import dependencies for training, testing, scaling, and model creation
from path import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

# import dependencies for model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# import cleaned and preprocessed dataset
path = Path("../resources/clean_stroke_df.csv")

clean_stroke_df = pd.read_csv(path)
clean_stroke_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67,0,1,228,36,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
1,80,0,1,105,32,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
2,49,0,0,171,34,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
3,79,1,0,174,24,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
4,81,0,0,186,29,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0


In [3]:
clean_stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4909 entries, 0 to 4908
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   age                             4909 non-null   int64
 1   hypertension                    4909 non-null   int64
 2   heart_disease                   4909 non-null   int64
 3   avg_glucose_level               4909 non-null   int64
 4   bmi                             4909 non-null   int64
 5   stroke                          4909 non-null   int64
 6   gender_Female                   4909 non-null   int64
 7   gender_Male                     4909 non-null   int64
 8   gender_Other                    4909 non-null   int64
 9   ever_married_No                 4909 non-null   int64
 10  ever_married_Yes                4909 non-null   int64
 11  work_type_Govt_job              4909 non-null   int64
 12  work_type_Never_worked          4909 non-null   int64
 13  wor

In [4]:
# check distribution of "stroke" target vector, we will over-sample data since there is a discrepancy between the two values
clean_stroke_df["stroke"].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [5]:
# create feature and target vector objects, check shapes of both
X = clean_stroke_df.drop(columns=["stroke"]).values
y = clean_stroke_df["stroke"].values
print(X.shape, y.shape)

(4909, 21) (4909,)


In [6]:
X[:5]

array([[ 67,   0,   1, 228,  36,   0,   1,   0,   0,   1,   0,   0,   1,
          0,   0,   0,   1,   0,   1,   0,   0],
       [ 80,   0,   1, 105,  32,   0,   1,   0,   0,   1,   0,   0,   1,
          0,   0,   1,   0,   0,   0,   1,   0],
       [ 49,   0,   0, 171,  34,   1,   0,   0,   0,   1,   0,   0,   1,
          0,   0,   0,   1,   0,   0,   0,   1],
       [ 79,   1,   0, 174,  24,   1,   0,   0,   0,   1,   0,   0,   0,
          1,   0,   1,   0,   0,   0,   1,   0],
       [ 81,   0,   0, 186,  29,   0,   1,   0,   0,   1,   0,   0,   1,
          0,   0,   0,   1,   0,   1,   0,   0]], dtype=int64)

In [7]:
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [8]:
# split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state=42, stratify=y
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3681, 21) (1228, 21) (3681,) (1228,)


In [9]:
# over-sample data since the patients with no stroke cases far out-weighs the number of stroke cases
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
ros = RandomOverSampler(random_state=1)

# NOTE: only oversample training data
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)


print(Counter(y_train_resampled))

Counter({0: 3524, 1: 3524})


In [10]:
# create an instance of the StandardScaler
scaler = StandardScaler()

X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

#print first two rows of scaled training and testing features
print(
f'{X_train_scaled[:2]}\n \
----------\n \
{X_test_scaled[:2]}'
)

[[ 0.11953641 -0.51996894  2.82865288  1.82993353 -0.45444952 -1.19597025
   1.19632068 -0.01191236 -0.56227496  0.56227496 -0.36660008 -0.04461314
   0.82225293 -0.51864759 -0.27633167  1.01199015 -1.01199015 -0.52436993
  -0.5316207  -0.80970737  2.22546685]
 [-1.23227286 -0.51996894 -0.35352517 -0.23775606  1.22814569  0.8361412
  -0.83589628 -0.01191236  1.77848931 -1.77848931 -0.36660008 -0.04461314
   0.82225293 -0.51864759 -0.27633167 -0.98815191  0.98815191 -0.52436993
  -0.5316207   1.23501407 -0.44934392]]
 ----------
 [[-1.14215225 -0.51996894 -0.35352517 -0.65830309 -0.87509832  0.8361412
  -0.83589628 -0.01191236 -0.56227496  0.56227496  2.72776806 -0.04461314
  -1.2161708  -0.51864759 -0.27633167 -0.98815191  0.98815191  1.90705063
  -0.5316207  -0.80970737 -0.44934392]
 [-0.28600637 -0.51996894 -0.35352517 -0.57068913  0.38684809 -1.19597025
   1.19632068 -0.01191236 -0.56227496  0.56227496 -0.36660008 -0.04461314
   0.82225293 -0.51864759 -0.27633167  1.01199015 -1.0119

In [11]:
# create a function that takes implements the X_train_scaled and y_train data, and tests for the best learning rate for the
# gradient boosting model.

# model parameter selection e.g. max_depth, max_features, random_state: 
# https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

def gb_learning_rates(X_train_scaled, y_train_resampled, X_test_scaled, y_test):
    
    learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
    
    for learning_rate in learning_rates:
        
        # create instance of model
        gb_model = GradientBoostingClassifier(n_estimators=21,
                                             learning_rate=learning_rate,
                                             max_depth=3,
                                             max_features=6,
                                             random_state=None)
                                              
        # train model with training data
        gb_model.fit(X_train_scaled, y_train_resampled)
        
        print(f'Learning Rate: {learning_rate}')
        
        print(f'Mean Accuracy for Training Data: {gb_model.score(X_train_scaled, y_train_resampled):.3f}')
        
        print(f'Mean Accuracy for Testing Data: {gb_model.score(X_test_scaled, y_test):.3f}')
        
        print('--------------------------------')
    
    return

In [12]:
# call function and input arguments which is the intended training and testing data

gb_learning_rates(X_train_scaled, y_train_resampled, X_test_scaled, y_test)

Learning Rate: 0.05
Mean Accuracy for Training Data: 0.811
Mean Accuracy for Testing Data: 0.749
--------------------------------
Learning Rate: 0.1
Mean Accuracy for Training Data: 0.816
Mean Accuracy for Testing Data: 0.761
--------------------------------
Learning Rate: 0.25
Mean Accuracy for Training Data: 0.839
Mean Accuracy for Testing Data: 0.778
--------------------------------
Learning Rate: 0.5
Mean Accuracy for Training Data: 0.885
Mean Accuracy for Testing Data: 0.803
--------------------------------
Learning Rate: 0.75
Mean Accuracy for Training Data: 0.872
Mean Accuracy for Testing Data: 0.803
--------------------------------
Learning Rate: 1
Mean Accuracy for Training Data: 0.914
Mean Accuracy for Testing Data: 0.828
--------------------------------


In [13]:
# create a new model with the highest learning accuracy

# create instance of model
classifier = GradientBoostingClassifier(n_estimators=21,
                                             learning_rate=1,
                                             max_depth=3,
                                             max_features=6,
                                             random_state=None)

#train model with training features and target
classifier.fit(X_train_scaled, y_train_resampled)

# predict target using testing features and make dataframe to compare predictions with actual results
predictions = classifier.predict(X_test_scaled)

predictions_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
predictions_df.sample(50)

Unnamed: 0,Prediction,Actual
1076,1,0
1116,1,0
758,0,0
52,0,0
1089,1,0
599,0,0
672,0,0
990,0,0
177,0,0
747,0,0


In [14]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8118892508143323


In [15]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual No Stroke", "Actual Stroke"],
    columns=["Predicted No Stroke", "Predicted Stroke"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,975,201
Actual Stroke,30,22


In [16]:
# Generate classification report
target_names = ["No stroke (0)", "Stroke (1)"]
classification_report = classification_report(y_test, predictions, target_names=target_names)
print(classification_report) 

               precision    recall  f1-score   support

No stroke (0)       0.97      0.83      0.89      1176
   Stroke (1)       0.10      0.42      0.16        52

     accuracy                           0.81      1228
    macro avg       0.53      0.63      0.53      1228
 weighted avg       0.93      0.81      0.86      1228

