In [1]:
# Import the dependencies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# Read in the CSV
data = Path("Resources/heart_attack_prediction_dataset.csv")
data_df = pd.read_csv(data)
data_df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [3]:
# Drop the un-necessary columns - "Patient ID" and "BMI" 
updated_data_df = data_df.drop(columns =['Patient ID','BMI'])
updated_data_df

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Stress Level,Sedentary Hours Per Day,Income,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,67,Male,208,158/88,72,0,0,1,0,0,...,9,6.615001,261404,286,0,6,Argentina,South America,Southern Hemisphere,0
1,21,Male,389,165/93,98,1,1,1,1,1,...,1,4.963459,285768,235,1,7,Canada,North America,Northern Hemisphere,0
2,21,Female,324,174/99,72,1,0,0,0,0,...,9,9.463426,235282,587,4,4,France,Europe,Northern Hemisphere,0
3,84,Male,383,163/100,73,1,1,1,0,1,...,9,7.648981,125640,378,3,4,Canada,North America,Northern Hemisphere,0
4,66,Male,318,91/88,93,1,1,1,1,0,...,6,1.514821,160555,231,1,5,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,Male,121,94/76,61,1,1,1,0,1,...,8,10.806373,235420,67,7,7,Thailand,Asia,Northern Hemisphere,0
8759,28,Female,120,157/102,73,1,0,0,1,0,...,8,3.833038,217881,617,4,9,Canada,North America,Northern Hemisphere,0
8760,47,Male,250,161/75,105,0,1,1,1,1,...,5,2.375214,36998,527,4,4,Brazil,South America,Southern Hemisphere,1
8761,36,Male,178,119/67,60,1,0,1,0,0,...,5,0.029104,209943,114,2,8,Brazil,South America,Southern Hemisphere,0


In [4]:
# Split blood pressure
updated_data_df[['systole','diastole']] = updated_data_df['Blood Pressure'].str.split("/",expand=True)
updated_data_df.head()

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Income,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,systole,diastole
0,67,Male,208,158/88,72,0,0,1,0,0,...,261404,286,0,6,Argentina,South America,Southern Hemisphere,0,158,88
1,21,Male,389,165/93,98,1,1,1,1,1,...,285768,235,1,7,Canada,North America,Northern Hemisphere,0,165,93
2,21,Female,324,174/99,72,1,0,0,0,0,...,235282,587,4,4,France,Europe,Northern Hemisphere,0,174,99
3,84,Male,383,163/100,73,1,1,1,0,1,...,125640,378,3,4,Canada,North America,Northern Hemisphere,0,163,100
4,66,Male,318,91/88,93,1,1,1,1,0,...,160555,231,1,5,Thailand,Asia,Northern Hemisphere,0,91,88


In [5]:
# Drop original Blood Pressure column
updated_data_df = updated_data_df.drop(columns =['Blood Pressure'])
updated_data_df

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Income,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,systole,diastole
0,67,Male,208,72,0,0,1,0,0,4.168189,...,261404,286,0,6,Argentina,South America,Southern Hemisphere,0,158,88
1,21,Male,389,98,1,1,1,1,1,1.813242,...,285768,235,1,7,Canada,North America,Northern Hemisphere,0,165,93
2,21,Female,324,72,1,0,0,0,0,2.078353,...,235282,587,4,4,France,Europe,Northern Hemisphere,0,174,99
3,84,Male,383,73,1,1,1,0,1,9.828130,...,125640,378,3,4,Canada,North America,Northern Hemisphere,0,163,100
4,66,Male,318,93,1,1,1,1,0,5.804299,...,160555,231,1,5,Thailand,Asia,Northern Hemisphere,0,91,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,Male,121,61,1,1,1,0,1,7.917342,...,235420,67,7,7,Thailand,Asia,Northern Hemisphere,0,94,76
8759,28,Female,120,73,1,0,0,1,0,16.558426,...,217881,617,4,9,Canada,North America,Northern Hemisphere,0,157,102
8760,47,Male,250,105,0,1,1,1,1,3.148438,...,36998,527,4,4,Brazil,South America,Southern Hemisphere,1,161,75
8761,36,Male,178,60,1,0,1,0,0,3.789950,...,209943,114,2,8,Brazil,South America,Southern Hemisphere,0,119,67


In [6]:
updated_data_df[['systole','diastole']] = updated_data_df[['systole','diastole']].astype(int)

In [7]:
bp = []
for index, row in updated_data_df.iterrows():

    if((row["systole"] < 120) & (row["diastole"] < 80)):
     bp.append("Normal Blood Pressure")

    elif ((row["systole"] >= 120 & row["systole"] < 130) & (row["diastole"] < 80)):
     bp.append("Elevated Blood Pressure")

    elif ((row["systole"] >= 130 & row["systole"] < 140) | (row["diastole"]>=80 & row["diastole"]<=89)):
     bp.append("First Stage High Blood Pressure")

    else:
     bp.append("Second Stage High Blood Pressure")

bp_updated_data_df = updated_data_df.assign (bp = bp)
bp_updated_data_df.head()

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,systole,diastole,bp
0,67,Male,208,72,0,0,1,0,0,4.168189,...,286,0,6,Argentina,South America,Southern Hemisphere,0,158,88,First Stage High Blood Pressure
1,21,Male,389,98,1,1,1,1,1,1.813242,...,235,1,7,Canada,North America,Northern Hemisphere,0,165,93,First Stage High Blood Pressure
2,21,Female,324,72,1,0,0,0,0,2.078353,...,587,4,4,France,Europe,Northern Hemisphere,0,174,99,First Stage High Blood Pressure
3,84,Male,383,73,1,1,1,0,1,9.82813,...,378,3,4,Canada,North America,Northern Hemisphere,0,163,100,First Stage High Blood Pressure
4,66,Male,318,93,1,1,1,1,0,5.804299,...,231,1,5,Thailand,Asia,Northern Hemisphere,0,91,88,First Stage High Blood Pressure


In [8]:
# Drop the un-necessary columns - "systole" and "diastole"
bp_updated_data_df = bp_updated_data_df.drop(columns =["systole","diastole"])
bp_updated_data_df

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Sedentary Hours Per Day,Income,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,bp
0,67,Male,208,72,0,0,1,0,0,4.168189,...,6.615001,261404,286,0,6,Argentina,South America,Southern Hemisphere,0,First Stage High Blood Pressure
1,21,Male,389,98,1,1,1,1,1,1.813242,...,4.963459,285768,235,1,7,Canada,North America,Northern Hemisphere,0,First Stage High Blood Pressure
2,21,Female,324,72,1,0,0,0,0,2.078353,...,9.463426,235282,587,4,4,France,Europe,Northern Hemisphere,0,First Stage High Blood Pressure
3,84,Male,383,73,1,1,1,0,1,9.828130,...,7.648981,125640,378,3,4,Canada,North America,Northern Hemisphere,0,First Stage High Blood Pressure
4,66,Male,318,93,1,1,1,1,0,5.804299,...,1.514821,160555,231,1,5,Thailand,Asia,Northern Hemisphere,0,First Stage High Blood Pressure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,Male,121,61,1,1,1,0,1,7.917342,...,10.806373,235420,67,7,7,Thailand,Asia,Northern Hemisphere,0,Normal Blood Pressure
8759,28,Female,120,73,1,0,0,1,0,16.558426,...,3.833038,217881,617,4,9,Canada,North America,Northern Hemisphere,0,First Stage High Blood Pressure
8760,47,Male,250,105,0,1,1,1,1,3.148438,...,2.375214,36998,527,4,4,Brazil,South America,Southern Hemisphere,1,Elevated Blood Pressure
8761,36,Male,178,60,1,0,1,0,0,3.789950,...,0.029104,209943,114,2,8,Brazil,South America,Southern Hemisphere,0,Normal Blood Pressure


In [9]:
# Change the categorical data types to numbers
final_data_df = pd.get_dummies(bp_updated_data_df)
final_data_df

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,...,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Hemisphere_Northern Hemisphere,Hemisphere_Southern Hemisphere,bp_Elevated Blood Pressure,bp_First Stage High Blood Pressure,bp_Normal Blood Pressure
0,67,208,72,0,0,1,0,0,4.168189,0,...,0,0,0,0,1,0,1,0,1,0
1,21,389,98,1,1,1,1,1,1.813242,1,...,0,0,0,1,0,1,0,0,1,0
2,21,324,72,1,0,0,0,0,2.078353,1,...,0,0,1,0,0,1,0,0,1,0
3,84,383,73,1,1,1,0,1,9.828130,1,...,0,0,0,1,0,1,0,0,1,0
4,66,318,93,1,1,1,1,0,5.804299,1,...,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,121,61,1,1,1,0,1,7.917342,1,...,1,0,0,0,0,1,0,0,0,1
8759,28,120,73,1,0,0,1,0,16.558426,0,...,0,0,0,1,0,1,0,0,1,0
8760,47,250,105,0,1,1,1,1,3.148438,1,...,0,0,0,0,1,0,1,1,0,0
8761,36,178,60,1,0,1,0,0,3.789950,1,...,0,0,0,0,1,0,1,0,0,1


In [10]:
y = final_data_df["Heart Attack Risk"]
X = final_data_df.drop(columns="Heart Attack Risk")

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(6572, 53)

 ## Create a Logistic Regression Model

In [12]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=100,
                                random_state=1)
classifier

 ## Fit (train) or model using the training data

In [13]:
classifier.fit(X_train, y_train)

 ## Score the model using the test data

In [14]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6418137553256239
Testing Data Score: 0.6417161113646737


 ## Make predictions

In [15]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,0,0
4,0,1
5,0,0
6,0,1
7,0,0
8,0,1
9,0,0


## Calculate the Accuracy Score

In [16]:
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.6417161113646737