In [1]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split

In [2]:
# Read the heart-disease CSV into a Pandas DataFrame.
# The path is relative to the current working directory.
data = Path('./heart_2020_cleaned.csv')
df = pd.read_csv(data)
# Preview the first five rows
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
# One-hot encode the categorical columns; drop_first=True keeps k-1 indicator
# columns for a k-level category (the first level is dropped).
# dtype is pinned to uint8 because pandas >= 2.0 defaults to bool, which would
# no longer match the 0/1 integer columns shown in this notebook's outputs.
df = pd.get_dummies(df, drop_first=True, dtype=np.uint8)
df.columns

Index(['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime',
       'HeartDisease_Yes', 'Smoking_Yes', 'AlcoholDrinking_Yes', 'Stroke_Yes',
       'DiffWalking_Yes', 'Sex_Male', 'AgeCategory_25-29', 'AgeCategory_30-34',
       'AgeCategory_35-39', 'AgeCategory_40-44', 'AgeCategory_45-49',
       'AgeCategory_50-54', 'AgeCategory_55-59', 'AgeCategory_60-64',
       'AgeCategory_65-69', 'AgeCategory_70-74', 'AgeCategory_75-79',
       'AgeCategory_80 or older', 'Race_Asian', 'Race_Black', 'Race_Hispanic',
       'Race_Other', 'Race_White', 'Diabetic_No, borderline diabetes',
       'Diabetic_Yes', 'Diabetic_Yes (during pregnancy)',
       'PhysicalActivity_Yes', 'GenHealth_Fair', 'GenHealth_Good',
       'GenHealth_Poor', 'GenHealth_Very good', 'Asthma_Yes',
       'KidneyDisease_Yes', 'SkinCancer_Yes'],
      dtype='object')

In [4]:
# Preview the first five rows of the DataFrame after one-hot encoding
df.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Male,...,Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_Yes,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,0,1,0,0,0,0,...,1,0,1,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
2,26.58,20.0,30.0,8.0,0,1,0,0,0,1,...,1,0,1,1,0,0,0,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0


In [5]:
# Split the encoded frame into features (X) and labels (y)

# Features: every column except the target indicator
X = df.drop(columns="HeartDisease_Yes")

# Labels: the target indicator column on its own
y = df.loc[:, "HeartDisease_Yes"]

In [6]:
# Preview the first five entries of the label Series y
y.head()

0    0
1    0
2    0
3    0
4    0
Name: HeartDisease_Yes, dtype: uint8

In [7]:
# Preview the first five rows of the feature DataFrame X
X.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Male,AgeCategory_25-29,...,Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_Yes,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,1,0,0,0,0,0,...,1,0,1,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,26.58,20.0,30.0,8.0,1,0,0,0,1,0,...,1,0,1,1,0,0,0,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0


In [8]:
# Split the data into training and testing sets using train_test_split
# (default split sizes). random_state=42 makes the shuffle reproducible.
# NOTE(review): the classes are imbalanced (see the support counts in the
# classification reports); consider stratify=y so both splits keep the same
# class ratio. That would change every downstream result, so it is not applied.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# Instantiate the Logistic Regression model
# max_iter is raised to 1000 (library default is 100) and random_state is fixed at 42
classifier = LogisticRegression(max_iter= 1000, random_state=42)

# Fit the model using training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [10]:
# Predict class labels for the testing features with the fitted model
predictions = classifier.predict(X_test)

In [11]:
# Print the plain accuracy on the training and testing splits,
# then the balanced accuracy of the test-set predictions
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

print(f"balanced_accuracy_score: {balanced_accuracy_score(y_test, predictions)}")

Training Data Score: 0.9165256039291879
Testing Data Score: 0.9142328234249334
balanced_accuracy_score: 0.5457484000699154


In [12]:
# Generate a confusion matrix for the model
# (rows are the true classes, columns are the predicted classes)
confusion_matrix(y_test, predictions)

array([[72399,   605],
       [ 6252,   693]], dtype=int64)

In [13]:
# Print the classification report for the model,
# labelling class 0 and class 1 with readable names
target_names = ["Heart Risk 0", "Heart Risk 1"]
report = classification_report(y_test, predictions, target_names=target_names)

print(report)

              precision    recall  f1-score   support

Heart Risk 0       0.92      0.99      0.95     73004
Heart Risk 1       0.53      0.10      0.17      6945

    accuracy                           0.91     79949
   macro avg       0.73      0.55      0.56     79949
weighted avg       0.89      0.91      0.89     79949



In [14]:

# Instantiate the random oversampler from imbalanced-learn
# random_state=42 keeps the resampling reproducible
random_sampler = RandomOverSampler(random_state=42)

# Resample the original training data only; the testing split is left as-is
X_resampled, y_resampled = random_sampler.fit_resample(X_train, y_train)

In [15]:
# Count how many rows each class has in the resampled labels
y_resampled.value_counts()

0    219418
1    219418
Name: HeartDisease_Yes, dtype: int64

In [17]:
# Instantiate a Logistic Regression model for the resampled data
# random_state is fixed at 42; 'lbfgs' is spelled out although it is the library default solver
re_classifier = LogisticRegression(max_iter = 1000, solver = 'lbfgs', random_state=42)

# Fit the model using the resampled training data
re_classifier.fit(X_resampled, y_resampled)

# Make a prediction using the (untouched) testing data
y_prediction = re_classifier.predict(X_test)

In [18]:
# Print the accuracy of the resampled model on the original (non-resampled)
# training split and on the testing split, then the balanced accuracy of the
# test-set predictions. The second line scores X_test, so it is labelled "Testing".
print(f"Training Data Score: {re_classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {re_classifier.score(X_test, y_test)}")

print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_prediction)}")

Training Data Score: 0.7510110654336533
Training Data Score: 0.7489399492176262
Balanced Accuracy Score: 0.7620088817090567


In [19]:
# Generate a confusion matrix for the model trained on resampled data
# (rows are the true classes, columns are the predicted classes)
confusion_matrix(y_test, y_prediction)

array([[54475, 18529],
       [ 1543,  5402]], dtype=int64)

In [20]:
# Print the classification report for the model trained on resampled data,
# labelling class 0 and class 1 with readable names
target_name = ["heart risk 0", "heart risk 1"]
class_report = classification_report(y_test, y_prediction, target_names = target_name)

print(class_report)

              precision    recall  f1-score   support

heart risk 0       0.97      0.75      0.84     73004
heart risk 1       0.23      0.78      0.35      6945

    accuracy                           0.75     79949
   macro avg       0.60      0.76      0.60     79949
weighted avg       0.91      0.75      0.80     79949

