In [32]:
# Import the dependencies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [33]:
# Load the heart attack prediction dataset into a DataFrame
data = Path("Resources/heart_attack_prediction_dataset.csv")
data_df = pd.read_csv(filepath_or_buffer=data)

# Preview the first five rows
data_df.head(n=5)

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


Clean the data for the Uncontrolled Variables DataFrame

In [34]:
# Remove the columns that are not needed here: "Patient ID" and "BMI"
updated_data_df = data_df.drop(labels=['Patient ID', 'BMI'], axis="columns")
updated_data_df

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Stress Level,Sedentary Hours Per Day,Income,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,67,Male,208,158/88,72,0,0,1,0,0,...,9,6.615001,261404,286,0,6,Argentina,South America,Southern Hemisphere,0
1,21,Male,389,165/93,98,1,1,1,1,1,...,1,4.963459,285768,235,1,7,Canada,North America,Northern Hemisphere,0
2,21,Female,324,174/99,72,1,0,0,0,0,...,9,9.463426,235282,587,4,4,France,Europe,Northern Hemisphere,0
3,84,Male,383,163/100,73,1,1,1,0,1,...,9,7.648981,125640,378,3,4,Canada,North America,Northern Hemisphere,0
4,66,Male,318,91/88,93,1,1,1,1,0,...,6,1.514821,160555,231,1,5,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,Male,121,94/76,61,1,1,1,0,1,...,8,10.806373,235420,67,7,7,Thailand,Asia,Northern Hemisphere,0
8759,28,Female,120,157/102,73,1,0,0,1,0,...,8,3.833038,217881,617,4,9,Canada,North America,Northern Hemisphere,0
8760,47,Male,250,161/75,105,0,1,1,1,1,...,5,2.375214,36998,527,4,4,Brazil,South America,Southern Hemisphere,1
8761,36,Male,178,119/67,60,1,0,1,0,0,...,5,0.029104,209943,114,2,8,Brazil,South America,Southern Hemisphere,0


In [35]:
# Keep only the uncontrolled variables plus the target column.
# .copy() gives an independent frame, so columns added later do not touch updated_data_df.
uncontrolled_data_df = updated_data_df.loc[
    :,
    ['Age', 'Sex', 'Family History', 'Country', 'Continent', 'Hemisphere', "Heart Attack Risk"],
].copy()
uncontrolled_data_df

Unnamed: 0,Age,Sex,Family History,Country,Continent,Hemisphere,Heart Attack Risk
0,67,Male,0,Argentina,South America,Southern Hemisphere,0
1,21,Male,1,Canada,North America,Northern Hemisphere,0
2,21,Female,0,France,Europe,Northern Hemisphere,0
3,84,Male,1,Canada,North America,Northern Hemisphere,0
4,66,Male,1,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...
8758,60,Male,1,Thailand,Asia,Northern Hemisphere,0
8759,28,Female,0,Canada,North America,Northern Hemisphere,0
8760,47,Male,1,Brazil,South America,Southern Hemisphere,1
8761,36,Male,0,Brazil,South America,Southern Hemisphere,0


In [36]:
# Check the data types and non-null counts of the uncontrolled dataframe
uncontrolled_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                8763 non-null   int64 
 1   Sex                8763 non-null   object
 2   Family History     8763 non-null   int64 
 3   Country            8763 non-null   object
 4   Continent          8763 non-null   object
 5   Hemisphere         8763 non-null   object
 6   Heart Attack Risk  8763 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 479.4+ KB


In [37]:
# Check the number of distinct values in each column
uncontrolled_data_df.nunique()

Age                  73
Sex                   2
Family History        2
Country              20
Continent             6
Hemisphere            2
Heart Attack Risk     2
dtype: int64

In [38]:
# Check how often each distinct Age value occurs (most frequent first)
uncontrolled_data_df["Age"].value_counts()

90    152
42    150
33    147
59    147
29    137
     ... 
75    102
72    101
39    100
47     99
51     82
Name: Age, Length: 73, dtype: int64

In [39]:
# Find the oldest and the youngest age present in the dataset
max_age, min_age = (
    uncontrolled_data_df["Age"].max(),
    uncontrolled_data_df["Age"].min(),
)

# Report the observed age range
print(f'max age = {max_age}, min age = {min_age}')

max age = 90, min age = 18


In [40]:
# Bin the Age column into deciles with qcut.
# Quantile bins hold roughly (not exactly) the same number of rows, because rows that
# share an age always land in the same bin.
#   labels=False      -> store the integer bin code (0 = youngest bin) instead of an interval
#   duplicates='drop' -> merge bins whose edges coincide, so fewer than 10 codes is possible
#   retbins=True      -> also return the actual bin edges, so the legend is computed from
#                        the data instead of being maintained by hand
# Bin code i covers (age_bin_edges[i], age_bin_edges[i + 1]]; the minimum age is included
# in bin 0.
# NOTE(review): the previous hand-written legend listed the edges as
# 18, 25, 32, 39, 46, 54, 61, 68, 76, 83, 90 - confirm against the printed edges.
age_bin_codes, age_bin_edges = pd.qcut(
    uncontrolled_data_df["Age"],
    q=10,
    labels=False,
    retbins=True,
    duplicates='drop',
)
uncontrolled_data_df["Age_bin"] = age_bin_codes

# Show the real edges so each Age_bin code can be mapped back to an age range
print("Age_bin edges:", age_bin_edges)
uncontrolled_data_df.head(20)

Unnamed: 0,Age,Sex,Family History,Country,Continent,Hemisphere,Heart Attack Risk,Age_bin
0,67,Male,0,Argentina,South America,Southern Hemisphere,0,6
1,21,Male,1,Canada,North America,Northern Hemisphere,0,0
2,21,Female,0,France,Europe,Northern Hemisphere,0,0
3,84,Male,1,Canada,North America,Northern Hemisphere,0,9
4,66,Male,1,Thailand,Asia,Northern Hemisphere,0,6
5,54,Female,1,Germany,Europe,Northern Hemisphere,1,4
6,90,Male,0,Canada,North America,Northern Hemisphere,1,9
7,84,Male,0,Japan,Asia,Northern Hemisphere,1,9
8,20,Male,0,Brazil,South America,Southern Hemisphere,0,0
9,43,Female,1,Japan,Asia,Northern Hemisphere,0,3


In [41]:
# Check how many rows fall into each bin of the new Age_bin column
uncontrolled_data_df["Age_bin"].value_counts()

0    992
7    912
4    892
2    884
3    868
9    867
1    851
6    847
5    834
8    816
Name: Age_bin, dtype: int64

In [42]:
# Drop the original Age column now that Age_bin is available.
# This cell reassigns the same name it reads, so without errors="ignore" a second run
# would raise KeyError because "Age" has already been removed. Ignoring a missing label
# makes the cell safe to re-run.
uncontrolled_data_df = uncontrolled_data_df.drop(columns=["Age"], errors="ignore")
uncontrolled_data_df

Unnamed: 0,Sex,Family History,Country,Continent,Hemisphere,Heart Attack Risk,Age_bin
0,Male,0,Argentina,South America,Southern Hemisphere,0,6
1,Male,1,Canada,North America,Northern Hemisphere,0,0
2,Female,0,France,Europe,Northern Hemisphere,0,0
3,Male,1,Canada,North America,Northern Hemisphere,0,9
4,Male,1,Thailand,Asia,Northern Hemisphere,0,6
...,...,...,...,...,...,...,...
8758,Male,1,Thailand,Asia,Northern Hemisphere,0,5
8759,Female,0,Canada,North America,Northern Hemisphere,0,1
8760,Male,1,Brazil,South America,Southern Hemisphere,1,4
8761,Male,0,Brazil,South America,Southern Hemisphere,0,2


In [43]:
# One-hot encode the categorical (object) columns so that every feature is numeric.
# The dtype is pinned so the indicator columns are 0/1 integers on every pandas version:
# pandas 1.x defaulted to uint8, while pandas 2.x defaults to bool (True/False).
final_uncontrolled_data_df = pd.get_dummies(uncontrolled_data_df, dtype="uint8")
final_uncontrolled_data_df

Unnamed: 0,Family History,Heart Attack Risk,Age_bin,Sex_Female,Sex_Male,Country_Argentina,Country_Australia,Country_Brazil,Country_Canada,Country_China,...,Country_United States,Country_Vietnam,Continent_Africa,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Hemisphere_Northern Hemisphere,Hemisphere_Southern Hemisphere
0,0,0,6,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,1,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,1,0,9,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
4,1,0,6,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,1,0,5,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
8759,0,0,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
8760,1,1,4,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
8761,0,0,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1


In [44]:
# Separate the preprocessed data into the feature matrix (X) and the target array (y)
X = final_uncontrolled_data_df.drop(columns=["Heart Attack Risk"])
y = final_uncontrolled_data_df["Heart Attack Risk"].to_numpy()

# Split the preprocessed data into training and testing sets.
# random_state=1 fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified on the target - consider stratify=y if the
# class balance of the train and test sets matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [45]:
# Build the scaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)