In [56]:
# Import the dependencies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np 

In [57]:
# Read in the CSV
data = Path("Resources/heart_attack_prediction_dataset.csv")
data_df = pd.read_csv(data)
data_df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [58]:
# Drop the un-necessary columns - "Patient ID" and "BMI"
updated_data_df = data_df.drop(columns =['Patient ID','BMI'])
updated_data_df.head()

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Stress Level,Sedentary Hours Per Day,Income,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,67,Male,208,158/88,72,0,0,1,0,0,...,9,6.615001,261404,286,0,6,Argentina,South America,Southern Hemisphere,0
1,21,Male,389,165/93,98,1,1,1,1,1,...,1,4.963459,285768,235,1,7,Canada,North America,Northern Hemisphere,0
2,21,Female,324,174/99,72,1,0,0,0,0,...,9,9.463426,235282,587,4,4,France,Europe,Northern Hemisphere,0
3,84,Male,383,163/100,73,1,1,1,0,1,...,9,7.648981,125640,378,3,4,Canada,North America,Northern Hemisphere,0
4,66,Male,318,91/88,93,1,1,1,1,0,...,6,1.514821,160555,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [59]:
# Create the dataset based on all of the controlled variables
controlled_data_df = updated_data_df[['Cholesterol', 'Blood Pressure', 'Heart Rate', 'Diabetes', 'Smoking', 'Obesity','Alcohol Consumption', 'Diet', 'Stress Level', 'Income', 'Heart Attack Risk']].copy()
controlled_data_df.head()

Unnamed: 0,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Diet,Stress Level,Income,Heart Attack Risk
0,208,158/88,72,0,1,0,0,Average,9,261404,0
1,389,165/93,98,1,1,1,1,Unhealthy,1,285768,0
2,324,174/99,72,1,0,0,0,Healthy,9,235282,0
3,383,163/100,73,1,1,0,1,Average,9,125640,0
4,318,91/88,93,1,1,1,0,Unhealthy,6,160555,0


In [60]:
# Rename variables with spaces
controlled_data_df_2 = controlled_data_df.rename(columns={'Blood Pressure': 'BloodPressure'})
controlled_data_df_2.head()

Unnamed: 0,Cholesterol,BloodPressure,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Diet,Stress Level,Income,Heart Attack Risk
0,208,158/88,72,0,1,0,0,Average,9,261404,0
1,389,165/93,98,1,1,1,1,Unhealthy,1,285768,0
2,324,174/99,72,1,0,0,0,Healthy,9,235282,0
3,383,163/100,73,1,1,0,1,Average,9,125640,0
4,318,91/88,93,1,1,1,0,Unhealthy,6,160555,0


In [61]:
# Split the Blood Pressure column into 2 new columns called systole and diastole
controlled_data_df_2[["systole", "diastole"]] = controlled_data_df_2["BloodPressure"].str.split("/", expand = True)
controlled_data_df_2.head()

Unnamed: 0,Cholesterol,BloodPressure,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Diet,Stress Level,Income,Heart Attack Risk,systole,diastole
0,208,158/88,72,0,1,0,0,Average,9,261404,0,158,88
1,389,165/93,98,1,1,1,1,Unhealthy,1,285768,0,165,93
2,324,174/99,72,1,0,0,0,Healthy,9,235282,0,174,99
3,383,163/100,73,1,1,0,1,Average,9,125640,0,163,100
4,318,91/88,93,1,1,1,0,Unhealthy,6,160555,0,91,88


In [62]:
# Drop the original Blood Pressure column
data_df_2 = controlled_data_df_2.drop(["BloodPressure"], axis = 1)
data_df_2.head()

Unnamed: 0,Cholesterol,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Diet,Stress Level,Income,Heart Attack Risk,systole,diastole
0,208,72,0,1,0,0,Average,9,261404,0,158,88
1,389,98,1,1,1,1,Unhealthy,1,285768,0,165,93
2,324,72,1,0,0,0,Healthy,9,235282,0,174,99
3,383,73,1,1,0,1,Average,9,125640,0,163,100
4,318,93,1,1,1,0,Unhealthy,6,160555,0,91,88


In [63]:
# Check the data types of the controlled dataframe
data_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Cholesterol          8763 non-null   int64 
 1   Heart Rate           8763 non-null   int64 
 2   Diabetes             8763 non-null   int64 
 3   Smoking              8763 non-null   int64 
 4   Obesity              8763 non-null   int64 
 5   Alcohol Consumption  8763 non-null   int64 
 6   Diet                 8763 non-null   object
 7   Stress Level         8763 non-null   int64 
 8   Income               8763 non-null   int64 
 9   Heart Attack Risk    8763 non-null   int64 
 10  systole              8763 non-null   object
 11  diastole             8763 non-null   object
dtypes: int64(9), object(3)
memory usage: 821.7+ KB


In [64]:
# Create a new column to hold new blood pressure categories
data_df_2["bp"] = " "
data_df_2.head()

Unnamed: 0,Cholesterol,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Diet,Stress Level,Income,Heart Attack Risk,systole,diastole,bp
0,208,72,0,1,0,0,Average,9,261404,0,158,88,
1,389,98,1,1,1,1,Unhealthy,1,285768,0,165,93,
2,324,72,1,0,0,0,Healthy,9,235282,0,174,99,
3,383,73,1,1,0,1,Average,9,125640,0,163,100,
4,318,93,1,1,1,0,Unhealthy,6,160555,0,91,88,


In [65]:
# Check the details of the updated dataframe
data_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Cholesterol          8763 non-null   int64 
 1   Heart Rate           8763 non-null   int64 
 2   Diabetes             8763 non-null   int64 
 3   Smoking              8763 non-null   int64 
 4   Obesity              8763 non-null   int64 
 5   Alcohol Consumption  8763 non-null   int64 
 6   Diet                 8763 non-null   object
 7   Stress Level         8763 non-null   int64 
 8   Income               8763 non-null   int64 
 9   Heart Attack Risk    8763 non-null   int64 
 10  systole              8763 non-null   object
 11  diastole             8763 non-null   object
 12  bp                   8763 non-null   object
dtypes: int64(9), object(4)
memory usage: 890.1+ KB


In [66]:
# Change the data type of the Systole and Diastole columns for the new categorization
data_df_2[["systole", "diastole"]] = data_df_2[["systole", "diastole"]].astype(int)

In [67]:
# Build a loop to read blood pressure values, bin them into assigned categories and save them in a list called "bp"
bp = []
for index, row in data_df_2.iterrows(): 
    
    if((row["systole"] < 120) & (row["diastole"] < 80)): 
     bp.append("Normal Blood Pressure") 
  
    elif ((row["systole"] >= 120 & row["systole"] < 130) & (row["diastole"] < 80)): 
     bp.append("Elevated Blood Pressure") 
  
    elif ((row["systole"] >= 130 & row["systole"] < 140) | (row["diastole"]>=80 & row["diastole"]<=89)): 
     bp.append("First Stage High Blood Pressure") 

    else: 
     bp.append("Second Stage High Blood Pressure") 

data_df_2 = data_df_2.assign (bp = bp)
data_df_2.head()

Unnamed: 0,Cholesterol,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Diet,Stress Level,Income,Heart Attack Risk,systole,diastole,bp
0,208,72,0,1,0,0,Average,9,261404,0,158,88,First Stage High Blood Pressure
1,389,98,1,1,1,1,Unhealthy,1,285768,0,165,93,First Stage High Blood Pressure
2,324,72,1,0,0,0,Healthy,9,235282,0,174,99,First Stage High Blood Pressure
3,383,73,1,1,0,1,Average,9,125640,0,163,100,First Stage High Blood Pressure
4,318,93,1,1,1,0,Unhealthy,6,160555,0,91,88,First Stage High Blood Pressure


In [68]:
# Drop the now unnecessary columns, systole and disatole
data_df_2.drop(columns=['systole','diastole'], inplace=True)
data_df_2.head()

Unnamed: 0,Cholesterol,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Diet,Stress Level,Income,Heart Attack Risk,bp
0,208,72,0,1,0,0,Average,9,261404,0,First Stage High Blood Pressure
1,389,98,1,1,1,1,Unhealthy,1,285768,0,First Stage High Blood Pressure
2,324,72,1,0,0,0,Healthy,9,235282,0,First Stage High Blood Pressure
3,383,73,1,1,0,1,Average,9,125640,0,First Stage High Blood Pressure
4,318,93,1,1,1,0,Unhealthy,6,160555,0,First Stage High Blood Pressure


In [69]:
# Check the number of Diet variables and their counts
data_df_2["Diet"].value_counts()

Healthy      2960
Average      2912
Unhealthy    2891
Name: Diet, dtype: int64

In [70]:
# Change the categorical data types to numbers
final_controlled_data_df = pd.get_dummies(data_df_2)
final_controlled_data_df

Unnamed: 0,Cholesterol,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Stress Level,Income,Heart Attack Risk,Diet_Average,Diet_Healthy,Diet_Unhealthy,bp_Elevated Blood Pressure,bp_First Stage High Blood Pressure,bp_Normal Blood Pressure
0,208,72,0,1,0,0,9,261404,0,1,0,0,0,1,0
1,389,98,1,1,1,1,1,285768,0,0,0,1,0,1,0
2,324,72,1,0,0,0,9,235282,0,0,1,0,0,1,0
3,383,73,1,1,0,1,9,125640,0,1,0,0,0,1,0
4,318,93,1,1,1,0,6,160555,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,121,61,1,1,0,1,8,235420,0,0,1,0,0,0,1
8759,120,73,1,0,1,0,8,217881,0,0,1,0,0,1,0
8760,250,105,0,1,1,1,5,36998,1,1,0,0,1,0,0
8761,178,60,1,1,0,0,5,209943,0,0,0,1,0,0,1


In [71]:
# Save this CSV file to the Resources folder
final_controlled_data_df.to_csv("Resources/controlled_data.csv")