In [113]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from pprint import pprint


#Spark imports
!wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j
import os
import sys


# Start a SparkSession
import findspark
findspark.init()


# Amazon SageMaker and related imports
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer
from sagemaker import get_execution_role
from sagemaker import image_uris
import boto3  # AWS Python sdk

%matplotlib inline



In [114]:
# Import packages
from pyspark.sql import SparkSession
# Import the time module so we can time our queries.
import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").config("spark.driver.memory", "2g").getOrCreate()

In [115]:
# Read in data from S3 Bucket
from pyspark import SparkFiles
url = "Resources/heart_attack_prediction_dataset.csv"
#spark.sparkContext.addFile(url)
#df = spark.read.csv(SparkFiles.get("heart_attack_prediction_dataset.csv"), sep=",", header=True)

df = spark.read.csv(url, sep=",", header=True)

df.show()

+----------+---+------+-----------+--------------+----------+--------+--------------+-------+-------+-------------------+-----------------------+---------+-----------------------+--------------+------------+-----------------------+------+------------------+-------------+-------------------------------+-------------------+-------------+-------------+-------------------+-----------------+
|Patient ID|Age|   Sex|Cholesterol|Blood Pressure|Heart Rate|Diabetes|Family History|Smoking|Obesity|Alcohol Consumption|Exercise Hours Per Week|     Diet|Previous Heart Problems|Medication Use|Stress Level|Sedentary Hours Per Day|Income|               BMI|Triglycerides|Physical Activity Days Per Week|Sleep Hours Per Day|      Country|    Continent|         Hemisphere|Heart Attack Risk|
+----------+---+------+-----------+--------------+----------+--------+--------------+-------+-------+-------------------+-----------------------+---------+-----------------------+--------------+------------+-------------

In [116]:
# Convert PySpark DataFrame to Pandas DataFrame for display options
pd_df = df.toPandas()

# Set display options
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.precision", 2)

# Display the DataFrame
pd_df.head(10)

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Diet,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,0,4.168188835442079,Average,0,0,9,6.61500145291406,261404,31.2512327252954,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,1,1.8132416178634456,Unhealthy,1,0,1,4.963458839757678,285768,27.1949733519874,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,0,2.0783529861178884,Healthy,1,1,9,9.463425838029828,235282,28.176570683909876,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,1,9.82812959348533,Average,1,0,9,7.648980824461007,125640,36.464704293082846,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,0,5.804298820315434,Unhealthy,1,0,6,1.5148209264291386,160555,21.809144180619757,231,1,5,Thailand,Asia,Northern Hemisphere,0
5,ZOO7941,54,Female,297,172/86,48,1,1,1,0,1,0.6250080237057354,Unhealthy,1,1,2,7.798752408582432,241339,20.14683950301005,795,5,10,Germany,Europe,Northern Hemisphere,1
6,WYV0966,90,Male,358,102/73,84,0,0,1,0,1,4.098177090985471,Healthy,0,0,7,0.6273560009569512,190450,28.885810606590454,284,4,10,Canada,North America,Northern Hemisphere,1
7,XXM0972,84,Male,220,131/68,107,0,0,1,1,1,3.427928754300873,Average,0,1,4,10.54378023926684,122093,22.2218617394038,370,6,7,Japan,Asia,Northern Hemisphere,1
8,XCQ5937,20,Male,145,144/105,68,1,0,1,1,0,16.868302239450017,Average,0,0,5,11.348786873498993,25086,35.80990131909641,790,7,4,Brazil,South America,Southern Hemisphere,0
9,FTJ5456,43,Female,248,160/70,55,0,1,1,1,1,0.1945150606299495,Unhealthy,0,0,4,4.055114781794609,209703,22.55891675229818,232,7,7,Japan,Asia,Northern Hemisphere,0


In [117]:
pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Patient ID                       8763 non-null   object
 1   Age                              8763 non-null   object
 2   Sex                              8763 non-null   object
 3   Cholesterol                      8763 non-null   object
 4   Blood Pressure                   8763 non-null   object
 5   Heart Rate                       8763 non-null   object
 6   Diabetes                         8763 non-null   object
 7   Family History                   8763 non-null   object
 8   Smoking                          8763 non-null   object
 9   Obesity                          8763 non-null   object
 10  Alcohol Consumption              8763 non-null   object
 11  Exercise Hours Per Week          8763 non-null   object
 12  Diet                             8

In [118]:
# Split the "Blood Pressure" column into two separate columns for systolic and diastolic pressure
pd_df[['Systolic Pressure', 'Diastolic Pressure']] = pd_df['Blood Pressure'].str.split('/', expand=True)

# Convert the new columns to numeric
pd_df['Systolic Pressure'] = pd.to_numeric(pd_df['Systolic Pressure'])
pd_df['Diastolic Pressure'] = pd.to_numeric(pd_df['Diastolic Pressure'])

# Drop the original "Blood Pressure" column
pd_df.drop('Blood Pressure', axis=1, inplace=True)

pd_df.columns

Index(['Patient ID', 'Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes',
       'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption',
       'Exercise Hours Per Week', 'Diet', 'Previous Heart Problems',
       'Medication Use', 'Stress Level', 'Sedentary Hours Per Day', 'Income',
       'BMI', 'Triglycerides', 'Physical Activity Days Per Week',
       'Sleep Hours Per Day', 'Country', 'Continent', 'Hemisphere',
       'Heart Attack Risk', 'Systolic Pressure', 'Diastolic Pressure'],
      dtype='object')

In [119]:
# Dictionary mapping values in the 'gender' column to new values
mapping = {'Female':0,'Male':1,
           'Northern Hemisphere':0, 'Southern Hemisphere':1}

# Use the 'map' method to change the values in the 'gender' column
pd_df['Sex'] = pd_df['Sex'].map(mapping)
pd_df['Hemisphere'] = pd_df['Hemisphere'].map(mapping)

# Show DataFrame
pd_df[:5]

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Diet,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,Systolic Pressure,Diastolic Pressure
0,BMW7812,67,1,208,72,0,0,1,0,0,4.168188835442079,Average,0,0,9,6.61500145291406,261404,31.2512327252954,286,0,6,Argentina,South America,1,0,158,88
1,CZE1114,21,1,389,98,1,1,1,1,1,1.8132416178634456,Unhealthy,1,0,1,4.963458839757678,285768,27.1949733519874,235,1,7,Canada,North America,0,0,165,93
2,BNI9906,21,0,324,72,1,0,0,0,0,2.0783529861178884,Healthy,1,1,9,9.463425838029828,235282,28.176570683909876,587,4,4,France,Europe,0,0,174,99
3,JLN3497,84,1,383,73,1,1,1,0,1,9.82812959348533,Average,1,0,9,7.648980824461007,125640,36.464704293082846,378,3,4,Canada,North America,0,0,163,100
4,GFO8847,66,1,318,93,1,1,1,1,0,5.804298820315434,Unhealthy,1,0,6,1.5148209264291386,160555,21.809144180619757,231,1,5,Thailand,Asia,0,0,91,88


In [120]:
# Define categorical features
categorical_features = ['Country', 'Continent', 'Diet'] 
categorical_dummies = pd.get_dummies(pd_df[categorical_features])
categorical_dummies[:5]

Unnamed: 0,Country_Argentina,Country_Australia,Country_Brazil,Country_Canada,Country_China,Country_Colombia,Country_France,Country_Germany,Country_India,Country_Italy,Country_Japan,Country_New Zealand,Country_Nigeria,Country_South Africa,Country_South Korea,Country_Spain,Country_Thailand,Country_United Kingdom,Country_United States,Country_Vietnam,Continent_Africa,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Diet_Average,Diet_Healthy,Diet_Unhealthy
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1


In [121]:
# Check unique values for each column
for col in pd_df.columns:
    print(f"Unique values for column '{col}': {pd_df[col].unique()}")


Unique values for column 'Patient ID': ['BMW7812' 'CZE1114' 'BNI9906' ... 'XKA5925' 'EPE6801' 'ZWN9666']
Unique values for column 'Age': ['67' '21' '84' '66' '54' '90' '20' '43' '73' '71' '77' '60' '88' '69'
 '38' '50' '45' '36' '48' '40' '79' '63' '27' '25' '86' '42' '52' '29'
 '30' '47' '44' '33' '51' '70' '85' '31' '56' '24' '74' '72' '55' '26'
 '53' '46' '57' '22' '35' '39' '80' '65' '83' '82' '28' '19' '75' '18'
 '34' '37' '89' '32' '49' '23' '59' '62' '64' '61' '76' '41' '87' '81'
 '58' '78' '68']
Unique values for column 'Sex': [1 0]
Unique values for column 'Cholesterol': ['208' '389' '324' '383' '318' '297' '358' '220' '145' '248' '373' '374'
 '228' '259' '122' '379' '166' '303' '340' '294' '359' '202' '133' '159'
 '271' '273' '328' '154' '135' '197' '321' '375' '360' '263' '201' '347'
 '129' '229' '251' '121' '190' '185' '279' '336' '192' '180' '203' '368'
 '222' '243' '218' '120' '285' '377' '369' '311' '139' '266' '153' '339'
 '329' '333' '398' '124' '183' '163' '362' '390'

In [122]:
# Define numerical features
numerical_features = [col for col in pd_df.columns if col not in ['Patient ID','Heart Attack Risk'] + categorical_features]

In [123]:
# Save encoded_df as a JSON file with records orientation
#encoded_df.to_json("Resources/encoded_df.json")

In [124]:
encoded_df = pd.concat([pd_df.drop(categorical_features, axis=1), categorical_dummies], axis=1)
encoded_df['Patient ID'] = pd_df['Patient ID']
encoded_df = encoded_df.set_index('Patient ID')
encoded_df[:5]

Unnamed: 0_level_0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Hemisphere,Heart Attack Risk,Systolic Pressure,Diastolic Pressure,Country_Argentina,Country_Australia,Country_Brazil,Country_Canada,Country_China,Country_Colombia,Country_France,Country_Germany,Country_India,Country_Italy,Country_Japan,Country_New Zealand,Country_Nigeria,Country_South Africa,Country_South Korea,Country_Spain,Country_Thailand,Country_United Kingdom,Country_United States,Country_Vietnam,Continent_Africa,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Diet_Average,Diet_Healthy,Diet_Unhealthy
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
BMW7812,67,1,208,72,0,0,1,0,0,4.168188835442079,0,0,9,6.61500145291406,261404,31.2512327252954,286,0,6,1,0,158,88,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
CZE1114,21,1,389,98,1,1,1,1,1,1.8132416178634456,1,0,1,4.963458839757678,285768,27.1949733519874,235,1,7,0,0,165,93,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
BNI9906,21,0,324,72,1,0,0,0,0,2.0783529861178884,1,1,9,9.463425838029828,235282,28.176570683909876,587,4,4,0,0,174,99,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
JLN3497,84,1,383,73,1,1,1,0,1,9.82812959348533,1,0,9,7.648980824461007,125640,36.464704293082846,378,3,4,0,0,163,100,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
GFO8847,66,1,318,93,1,1,1,1,0,5.804298820315434,1,0,6,1.5148209264291386,160555,21.809144180619757,231,1,5,0,0,91,88,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1


In [125]:
encoded_df.shape

(8763, 52)

# Separate features and target variable

In [126]:
X = encoded_df.drop(['Heart Attack Risk'], axis=1)
y = encoded_df['Heart Attack Risk']

# Split the data into training and testing sets

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [128]:
# Define features for scale
scaled_features = ['Age',
 'Cholesterol',
 'Heart Rate',
 'Exercise Hours Per Week',
 'Stress Level',
 'Sedentary Hours Per Day',
 'Income',
 'BMI',
 'Triglycerides',
 'Physical Activity Days Per Week',
 'Sleep Hours Per Day',
 'Systolic Pressure',
 'Diastolic Pressure']


In [129]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [130]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [131]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [132]:
# scaled_data = StandardScaler().fit_transform(encoded_df[scaled_features])
# scaled_df = pd.DataFrame(scaled_data, columns= scaled_features)
# scaled_df[:5]

In [133]:
# #columns = [col for col in encoded_df.columns if col not in scaled_features]
# scaled_df.reset_index(drop=True, inplace=True)
# encoded_df.reset_index(drop=True, inplace=True)
# scaled_df = pd.concat([encoded_df.drop(scaled_features, axis=1), scaled_df], axis=1)

In [134]:
# scaled_df[:5]

In [135]:
# scaled_df.shape

In [136]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [137]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train)

In [138]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [139]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, rf_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, rf_predictions)

In [140]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1121,4
Actual 1,627,1


Accuracy Score : 0.6400456360524814
Classification Report
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1125
           1       0.20      0.00      0.00       628

    accuracy                           0.64      1753
   macro avg       0.42      0.50      0.39      1753
weighted avg       0.48      0.64      0.50      1753



In [141]:
# Get the feature importance array
importances = rf_model.feature_importances_

# List the top 10 most important features
importances_top_10 = sorted(zip(importances, X.columns), reverse=True)[:10]
print("Top 10 most important features:")
pprint(importances_top_10)

Top 10 most important features:
[(0.07036996442075587, 'Sedentary Hours Per Day'),
 (0.06999042024707497, 'BMI'),
 (0.06975033182782396, 'Income'),
 (0.06892273733769666, 'Triglycerides'),
 (0.0688889990047464, 'Exercise Hours Per Week'),
 (0.06626658633635532, 'Cholesterol'),
 (0.06288440415327652, 'Systolic Pressure'),
 (0.06099525729567608, 'Heart Rate'),
 (0.060252105048378204, 'Age'),
 (0.05836195325116488, 'Diastolic Pressure')]
