In [1]:
# Import the dependencies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read in the CSV
data = Path("Resources/heart_attack_prediction_dataset.csv")
data_df = pd.read_csv(data)
data_df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [3]:
# Drop the un-necessary columns - "Patient ID" and "BMI"
updated_data_df = data_df.drop(columns =['Patient ID','BMI'])
updated_data_df

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Stress Level,Sedentary Hours Per Day,Income,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,67,Male,208,158/88,72,0,0,1,0,0,...,9,6.615001,261404,286,0,6,Argentina,South America,Southern Hemisphere,0
1,21,Male,389,165/93,98,1,1,1,1,1,...,1,4.963459,285768,235,1,7,Canada,North America,Northern Hemisphere,0
2,21,Female,324,174/99,72,1,0,0,0,0,...,9,9.463426,235282,587,4,4,France,Europe,Northern Hemisphere,0
3,84,Male,383,163/100,73,1,1,1,0,1,...,9,7.648981,125640,378,3,4,Canada,North America,Northern Hemisphere,0
4,66,Male,318,91/88,93,1,1,1,1,0,...,6,1.514821,160555,231,1,5,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,Male,121,94/76,61,1,1,1,0,1,...,8,10.806373,235420,67,7,7,Thailand,Asia,Northern Hemisphere,0
8759,28,Female,120,157/102,73,1,0,0,1,0,...,8,3.833038,217881,617,4,9,Canada,North America,Northern Hemisphere,0
8760,47,Male,250,161/75,105,0,1,1,1,1,...,5,2.375214,36998,527,4,4,Brazil,South America,Southern Hemisphere,1
8761,36,Male,178,119/67,60,1,0,1,0,0,...,5,0.029104,209943,114,2,8,Brazil,South America,Southern Hemisphere,0


In [4]:
# Create the dataset based on all of the controlled variables
controlled_data_df = updated_data_df[['Cholesterol', 'Blood Pressure', 'Heart Rate', 'Diabetes', 'Smoking', 'Obesity','Alcohol Consumption', 'Diet', 'Stress Level', 'Income', 'Heart Attack Risk']].copy()
controlled_data_df

Unnamed: 0,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Diet,Stress Level,Income,Heart Attack Risk
0,208,158/88,72,0,1,0,0,Average,9,261404,0
1,389,165/93,98,1,1,1,1,Unhealthy,1,285768,0
2,324,174/99,72,1,0,0,0,Healthy,9,235282,0
3,383,163/100,73,1,1,0,1,Average,9,125640,0
4,318,91/88,93,1,1,1,0,Unhealthy,6,160555,0
...,...,...,...,...,...,...,...,...,...,...,...
8758,121,94/76,61,1,1,0,1,Healthy,8,235420,0
8759,120,157/102,73,1,0,1,0,Healthy,8,217881,0
8760,250,161/75,105,0,1,1,1,Average,5,36998,1
8761,178,119/67,60,1,1,0,0,Unhealthy,5,209943,0


In [5]:
#Rename variables with spaces
controlled_data_df_2 = controlled_data_df.rename(columns={'Blood Pressure': 'BloodPressure'})
controlled_data_df_2.head()

Unnamed: 0,Cholesterol,BloodPressure,Heart Rate,Diabetes,Smoking,Obesity,Alcohol Consumption,Diet,Stress Level,Income,Heart Attack Risk
0,208,158/88,72,0,1,0,0,Average,9,261404,0
1,389,165/93,98,1,1,1,1,Unhealthy,1,285768,0
2,324,174/99,72,1,0,0,0,Healthy,9,235282,0
3,383,163/100,73,1,1,0,1,Average,9,125640,0
4,318,91/88,93,1,1,1,0,Unhealthy,6,160555,0


In [6]:
# Check the data types of the controlled dataframe
controlled_data_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Cholesterol          8763 non-null   int64 
 1   BloodPressure        8763 non-null   object
 2   Heart Rate           8763 non-null   int64 
 3   Diabetes             8763 non-null   int64 
 4   Smoking              8763 non-null   int64 
 5   Obesity              8763 non-null   int64 
 6   Alcohol Consumption  8763 non-null   int64 
 7   Diet                 8763 non-null   object
 8   Stress Level         8763 non-null   int64 
 9   Income               8763 non-null   int64 
 10  Heart Attack Risk    8763 non-null   int64 
dtypes: int64(9), object(2)
memory usage: 753.2+ KB


In [7]:
# Change the BloodPressure data to integer ??
#(find a way to change blood pressure to a numerical variable)
controlled_data_df_2.BloodPressure.astype(int) 

ValueError: invalid literal for int() with base 10: '158/88'

In [8]:
# Check the number of Diet variables and their counts
controlled_data_df_2["Diet"].value_counts()

Healthy      2960
Average      2912
Unhealthy    2891
Name: Diet, dtype: int64

In [None]:
# # Change the categorical data types to numbers
# final_controlled_data_df = pd.get_dummies(controlled_data_df_2)
# final_controlled_data_df

In [None]:
# #Split the preprocessed data into features and target arrays
# y= final_controlled_data_df["Heart Attack Risk"].values
# X= final_controlled_data_df.drop(["Heart Attack Risk"], axis = 1)

# #Split the preprocssed data into a training and testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# # Create a StandardScaler instance
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)