In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv('heart_2020_cleaned.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [4]:
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [5]:
# Perform data preprocessing
# Convert categorical features using get_dummies
df = pd.get_dummies(df, drop_first=True)

# Separate minority and majority class samples
minority_samples = df[df['HeartDisease_Yes'] == 1]
majority_samples = df[df['HeartDisease_Yes'] == 0]

# Determine the size of the minority class
minority_size = len(minority_samples)

# Perform bootstrapping on minority class samples
bootstrapped_minority_samples = minority_samples.sample(n=minority_size, replace=True, random_state=42)

# Concatenate the bootstrapped minority class samples with majority class samples
bootstrapped_df = pd.concat([bootstrapped_minority_samples, majority_samples], axis=0)

# Shuffle the bootstrapped dataset
bootstrapped_df = bootstrapped_df.sample(frac=1, random_state=42).reset_index(drop=True)

X = bootstrapped_df.drop('HeartDisease_Yes', axis=1)  # Predictor variables
y = bootstrapped_df['HeartDisease_Yes']  # Target variable


In [6]:
# Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the input data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(max_iter=1000, class_weight='balanced')  # Create an instance of the Logistic Regression Classifier
clf = clf.fit(X_train_scaled, y_train)  # Fit the model to the training data

# Model Evaluation
y_pred = clf.predict(X_test_scaled)  # Predict the target variable for the test data

In [7]:
# Evaluate the performance of the trained model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1_score_value = f1_score(y_test, y_pred, average='weighted')
confusion_matrix = confusion_matrix(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1_score_value}')
print(f'Confusion Matrix: \n{confusion_matrix}')
print(f'Classification Report: {classification_report}')

Accuracy: 0.750418236682875
Precision: 0.908666287933293
Recall: 0.750418236682875
F1-score: 0.80295876785079
Confusion Matrix: 
[[43720 14746]
 [ 1217  4276]]
Classification Report:               precision    recall  f1-score   support

           0       0.97      0.75      0.85     58466
           1       0.22      0.78      0.35      5493

    accuracy                           0.75     63959
   macro avg       0.60      0.76      0.60     63959
weighted avg       0.91      0.75      0.80     63959



In [8]:
# Save the trained logistic regression model and scaler
joblib.dump(clf, 'app/logistic_regression_model.pkl')
joblib.dump(scaler, 'app/scaler.pkl')

['app/scaler.pkl']