In [1]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from pprint import pprint
import psycopg2
from psycopg2 import sql

#Spark imports
!wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j
import os
import sys


# Start a SparkSession
import findspark
findspark.init()


%matplotlib inline

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=79e55a1ca97f6ca92fae6a9f46725f595bd26c643c70a205d88af37bb0a2d26b
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
# Import packages
from pyspark.sql import SparkSession
# Import the time module so we can time our queries.
import time
# Create a SparkSession
spark = SparkSession.builder \
    .appName("SqlSpark") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

In [3]:
# Read in data from S3 Bucket
from pyspark import SparkFiles
url = "https://aws-project-4.s3.ca-central-1.amazonaws.com/encoded_df.csv"
spark.sparkContext.addFile(url)
encoded_df = spark.read.csv(SparkFiles.get("encoded_df.csv"), sep=",", header=True)

# Display the DataFrame
encoded_df.show(5)

+----------+---------+------------+---+---+-----------+----------+--------+--------------+-------+-------+-------------------+-----------------------+-----------------------+--------------+------------+-----------------------+------+------------------+-------------+-------------------------------+-------------------+-------------+----------+-----------------+-----------------+------------------+-----+-----+------------+------------+--------------+
|Patient ID|  Country|     Capital|Age|Sex|Cholesterol|Heart Rate|Diabetes|Family History|Smoking|Obesity|Alcohol Consumption|Exercise Hours Per Week|Previous Heart Problems|Medication Use|Stress Level|Sedentary Hours Per Day|Income|               BMI|Triglycerides|Physical Activity Days Per Week|Sleep Hours Per Day|    Continent|Hemisphere|Heart Attack Risk|Systolic Pressure|Diastolic Pressure|  lat| long|Diet_Average|Diet_Healthy|Diet_Unhealthy|
+----------+---------+------------+---+---+-----------+----------+--------+--------------+------

In [4]:
encoded_df = encoded_df.toPandas()
encoded_df.dtypes

Patient ID                         object
Country                            object
Capital                            object
Age                                object
Sex                                object
Cholesterol                        object
Heart Rate                         object
Diabetes                           object
Family History                     object
Smoking                            object
Obesity                            object
Alcohol Consumption                object
Exercise Hours Per Week            object
Previous Heart Problems            object
Medication Use                     object
Stress Level                       object
Sedentary Hours Per Day            object
Income                             object
BMI                                object
Triglycerides                      object
Physical Activity Days Per Week    object
Sleep Hours Per Day                object
Continent                          object
Hemisphere                        

In [5]:
# Convert columns to the desired data types
encoded_df['Age'] = encoded_df['Age'].astype(int)
encoded_df['Cholesterol'] = encoded_df['Cholesterol'].astype(int)
encoded_df['Heart Rate'] = encoded_df['Heart Rate'].astype(int)
encoded_df['Diabetes'] = encoded_df['Diabetes'].astype(int)
encoded_df['Family History'] = encoded_df['Family History'].astype(int)
encoded_df['Smoking'] = encoded_df['Smoking'].astype(int)
encoded_df['Obesity'] = encoded_df['Obesity'].astype(int)
encoded_df['Alcohol Consumption'] = encoded_df['Alcohol Consumption'].astype(int)
encoded_df['Exercise Hours Per Week'] = encoded_df['Exercise Hours Per Week'].astype(float)
encoded_df['Previous Heart Problems'] = encoded_df['Previous Heart Problems'].astype(int)
encoded_df['Medication Use'] = encoded_df['Medication Use'].astype(int)
encoded_df['Stress Level'] = encoded_df['Stress Level'].astype(int)
encoded_df['Sedentary Hours Per Day'] = encoded_df['Sedentary Hours Per Day'].astype(float)
encoded_df['Income'] = encoded_df['Income'].astype(int)
encoded_df['BMI'] = encoded_df['BMI'].astype(float)
encoded_df['Triglycerides'] = encoded_df['Triglycerides'].astype(int)
encoded_df['Physical Activity Days Per Week'] = encoded_df['Physical Activity Days Per Week'].astype(int)
encoded_df['Sleep Hours Per Day'] = encoded_df['Sleep Hours Per Day'].astype(int)
encoded_df['Systolic Pressure'] = encoded_df['Systolic Pressure'].astype(int)
encoded_df['Diastolic Pressure'] = encoded_df['Diastolic Pressure'].astype(int)
encoded_df['lat'] = encoded_df['lat'].astype(float)
encoded_df['long'] = encoded_df['long'].astype(float)
encoded_df['Diet_Average'] = encoded_df['Diet_Average'].astype(int)
encoded_df['Diet_Healthy'] = encoded_df['Diet_Healthy'].astype(int)
encoded_df['Diet_Unhealthy'] = encoded_df['Diet_Unhealthy'].astype(int)

# Print the data types of all columns
encoded_df.dtypes


Patient ID                          object
Country                             object
Capital                             object
Age                                  int64
Sex                                 object
Cholesterol                          int64
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Continent                           object
Hemisphere 

# Separate features and target variable

In [24]:
# Dropping specified columns to create feature matrix X and target vector y
X = encoded_df.drop(['Exercise Hours Per Week',
                     'Stress Level',
                     'Sedentary Hours Per Day',
                     'Income',
                     'Physical Activity Days Per Week',
                     'Sleep Hours Per Day',
                     'Heart Attack Risk',
                     'Diet_Average',
                     'Diet_Healthy',
                     'Diet_Unhealthy',
                     'Country',
                     'Capital',
                     'lat',
                     'long',
                     'Continent',
                     'Patient ID',
                     'Hemisphere'], axis=1)

y = encoded_df['Heart Attack Risk']

# Printing the columns in X
print(X.columns)


Index(['Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History',
       'Smoking', 'Obesity', 'Alcohol Consumption', 'Previous Heart Problems',
       'Medication Use', 'BMI', 'Triglycerides', 'Systolic Pressure',
       'Diastolic Pressure'],
      dtype='object')


# Split the data into training and testing sets

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [27]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [28]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [29]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=78)

In [30]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [31]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [32]:
print(f"Training Data Score: {rf_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf_model.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.6252139189960069


In [33]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, rf_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, rf_predictions)

In [34]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1076,49
Actual 1,608,20


Accuracy Score : 0.6252139189960069
Classification Report
              precision    recall  f1-score   support

           0       0.64      0.96      0.77      1125
           1       0.29      0.03      0.06       628

    accuracy                           0.63      1753
   macro avg       0.46      0.49      0.41      1753
weighted avg       0.51      0.63      0.51      1753



In [35]:
# Get the feature importance array
importances = rf_model.feature_importances_

# List the top 10 most important features
importances_top_10 = sorted(zip(importances, X.columns), reverse=True)[:10]
print("Top 10 most important features:")
pprint(importances_top_10)

Top 10 most important features:
[(0.13668777034555477, 'BMI'),
 (0.1330755039509209, 'Triglycerides'),
 (0.1296822901907914, 'Cholesterol'),
 (0.12061988933981975, 'Systolic Pressure'),
 (0.11680282141392462, 'Age'),
 (0.1149790413745522, 'Heart Rate'),
 (0.11117956690373326, 'Diastolic Pressure'),
 (0.020655081991957324, 'Medication Use'),
 (0.020547867738163843, 'Previous Heart Problems'),
 (0.019538792127817776, 'Alcohol Consumption')]


# KNN model

In [36]:
# Dropping specified columns to create feature matrix X and target vector y
X = encoded_df.drop(['Exercise Hours Per Week',
                     'Stress Level',
                     'Sedentary Hours Per Day',
                     'Income',
                     'Physical Activity Days Per Week',
                     'Sleep Hours Per Day',
                     'Heart Attack Risk',
                     'Diet_Average',
                     'Diet_Healthy',
                     'Diet_Unhealthy',
                     'Country',
                     'Capital',
                     'lat',
                     'long',
                     'Continent',
                     'Patient ID',
                     'Hemisphere'], axis=1)

y = encoded_df['Heart Attack Risk']
print(X.columns)

Index(['Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History',
       'Smoking', 'Obesity', 'Alcohol Consumption', 'Previous Heart Problems',
       'Medication Use', 'BMI', 'Triglycerides', 'Systolic Pressure',
       'Diastolic Pressure'],
      dtype='object')


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [39]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [40]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [41]:
from sklearn.neighbors import KNeighborsClassifier
# Instantiate the model with k = 3 neighbors
knn_model = KNeighborsClassifier(n_neighbors=20)

# Train the model
knn_model.fit(X_train_scaled, y_train)
# Create predictions
knn_y_pred = knn_model.predict(X_test_scaled)

In [42]:
print(f"Training Data Score: {knn_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {knn_model.score(X_test_scaled, y_test)}")

Training Data Score: 0.6507845934379458
Testing Data Score: 0.6269252709640616


In [43]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, knn_y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, knn_y_pred)

In [44]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test,  knn_y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1066,59
Actual 1,595,33


Accuracy Score : 0.6269252709640616
Classification Report
              precision    recall  f1-score   support

           0       0.64      0.95      0.77      1125
           1       0.36      0.05      0.09       628

    accuracy                           0.63      1753
   macro avg       0.50      0.50      0.43      1753
weighted avg       0.54      0.63      0.52      1753



# SVC model

In [45]:
# Dropping specified columns to create feature matrix X and target vector y
X = encoded_df.drop(['Exercise Hours Per Week',
                     'Stress Level',
                     'Sedentary Hours Per Day',
                     'Income',
                     'Physical Activity Days Per Week',
                     'Sleep Hours Per Day',
                     'Heart Attack Risk',
                     'Diet_Average',
                     'Diet_Healthy',
                     'Diet_Unhealthy',
                     'Country',
                     'Capital',
                     'lat',
                     'long',
                     'Continent',
                     'Patient ID',
                     'Hemisphere'], axis=1)

y = encoded_df['Heart Attack Risk']
print(X.columns)

Index(['Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History',
       'Smoking', 'Obesity', 'Alcohol Consumption', 'Previous Heart Problems',
       'Medication Use', 'BMI', 'Triglycerides', 'Systolic Pressure',
       'Diastolic Pressure'],
      dtype='object')


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.svm import SVC
SVC_model = SVC(kernel='linear')
SVC_model.fit(X_train, y_train)

In [None]:
SVC_predictions = SVC_model.predict(X_test)

In [None]:
print(f"Training Data Score: {SVC_model.score(X_train, y_train)}")
print(f"Testing Data Score: {SVC_model.score(X_test, y_test)}")

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, SVC_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, SVC_predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test,  SVC_predictions))


# LogisticRegression

In [None]:
# Dropping specified columns to create feature matrix X and target vector y
X = encoded_df.drop(['Exercise Hours Per Week',
                     'Stress Level',
                     'Sedentary Hours Per Day',
                     'Income',
                     'Physical Activity Days Per Week',
                     'Sleep Hours Per Day',
                     'Heart Attack Risk',
                     'Diet_Average',
                     'Diet_Healthy',
                     'Diet_Unhealthy',
                     'Country',
                     'Capital',
                     'lat',
                     'long',
                     'Continent',
                     'Patient ID',
                     'Hemisphere'], axis=1)

y = encoded_df['Heart Attack Risk']
print(X.columns)

In [None]:
#balance labels y
balance_labels = y.value_counts()
balance_labels

There seems to be a bit of imbalance, with more instances of
class 0 compared to class 1. Depending on our model's sensitivity to class
imbalances, we may want to consider techniques like (1) oversampling,
(2) undersampling, or (3) using class weights during model training to handle this imbalance.

In [None]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train.shape

7010: This is the number of samples or instances in your training set. Each row represents a separate observation or data point in your dataset.

26: This is the number of features or variables in your dataset. Each column in your dataset corresponds to a different feature or attribute that is used for training your machine learning model.

So, in summary, you have 7010 training samples, and each sample has 26 features. This information is crucial for understanding the dimensions of your training data, especially when you're working with machine learning models that require input of specific shapes.

In [None]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_model = LogisticRegression(solver="lbfgs",max_iter=200, random_state=1)

In [None]:
# Fit the model using training data
logistic_model.fit(X_train, y_train)

In [None]:
print(f"Training Data Score: {logistic_model.score(X_train, y_train)}")
print(f"Testing Data Score: {logistic_model.score(X_test, y_test)}")

In [None]:
# Make a prediction using the testing data
lr_predictions = logistic_model.predict(X_test)
results = pd.DataFrame({"Prediction": lr_predictions, "Actual": y_test}).reset_index(drop = True)
results.head(10)

In [None]:
accuracy_score(y_test, lr_predictions)

### Evaluation of the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
balanced_accuracy = accuracy_score(y_test, lr_predictions)
balanced_accuracy

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, lr_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, lr_predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test,  lr_predictions))

Let's break down what each of these means:

True Negative (TN): 18673 instances were correctly predicted as class 0 (negative class).

False Positive (FP): 86 instances were incorrectly predicted as class 1 (positive class) when they were actually class 0.

False Negative (FN): 32 instances were incorrectly predicted as class 0 when they were actually class 1.

True Positive (TP): 593 instances were correctly predicted as class 1.

Precision: Precision is the ratio of correctly predicted positive observations to the total predicted positives. In your case:

Precision for class 0: 0.65 (65% of instances predicted as class 0 were actually class 0)
Precision for class 1: 0.00 (None of the instances predicted as class 1 were actually class 1)
Recall (Sensitivity): Recall is the ratio of correctly predicted positive observations to the all observations in actual class. In your case:

Recall for class 0: 1.00 (All instances of class 0 were correctly predicted)
Recall for class 1: 0.00 (None of the instances of class 1 were correctly predicted)
F1-Score: F1-Score is the weighted average of Precision and Recall. It is a balance between precision and recall. In your case:

F1-Score for class 0: 0.79 (Weighted average of precision and recall for class 0)
F1-Score for class 1: 0.00 (Weighted average of precision and recall for class 1)
Support: The number of actual occurrences of each class in the specified dataset. In your case:

Support for class 0: 1142 instances
Support for class 1: 611 instances
Accuracy: Overall accuracy of the model on the testing data. In your case: 0.65 (65%)

Macro Average: The average of precision, recall, and F1-Score for both classes, without considering class imbalance. In your case:

Macro Average Precision: 0.33
Macro Average Recall: 0.50
Macro Average F1-Score: 0.39
Weighted Average: The average of precision, recall, and F1-Score, weighted by the number of samples in each class. In your case:

Weighted Average Precision: 0.42
Weighted Average Recall: 0.65
Weighted Average F1-Score: 0.51
These metrics provide a more nuanced understanding of your model's performance for each class and overall. In this case, it seems that the model is performing well in predicting class 0 but struggling to predict class 1, as indicated by the low recall and F1-Score for class 1. Depending on the specific goals of your model, you may need to adjust the model or data preprocessing to improve its performance.

In [None]:
#Handling Imbalanced Data:

logistic_model = LogisticRegression(class_weight='balanced', random_state=1)

In [None]:
!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=1)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Instantiate a new logistic regression model
logistic_model_resampled = LogisticRegression(random_state=1)

# Fit the model with the resampled data
logistic_model_resampled.fit(X_resampled, y_resampled)

# Make predictions on the testing data
predictions_resampled = logistic_model_resampled.predict(X_test)

In [None]:
#Evaluate the performance
accuracy_resampled = accuracy_score(y_test, predictions_resampled)
classification_report_resampled = classification_report(y_test, predictions_resampled)
confusion_matrix_resampled = confusion_matrix(y_test, predictions_resampled)

print("Accuracy (Resampled):", accuracy_resampled)
print("Classification Report (Resampled):\n", classification_report_resampled)
print("Confusion Matrix (Resampled):\n", confusion_matrix_resampled)

# GridSearchCV

In [None]:
# Dropping specified columns to create feature matrix X and target vector y
X = encoded_df.drop(['Exercise Hours Per Week',
                     'Stress Level',
                     'Sedentary Hours Per Day',
                     'Income',
                     'Physical Activity Days Per Week',
                     'Sleep Hours Per Day',
                     'Heart Attack Risk',
                     'Diet_Average',
                     'Diet_Healthy',
                     'Diet_Unhealthy',
                     'Country',
                     'Capital',
                     'lat',
                     'long',
                     'Continent',
                     'Patient ID',
                     'Hemisphere'], axis=1)

y = encoded_df['Heart Attack Risk']
print(X.columns)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Assuming you already have X and y defined

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating StandardScaler instance
scaler = MinMaxScaler()

# Fitting StandardScaler
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the pipeline including MinMaxScaler and KNeighborsClassifier
knn_pipe = Pipeline([
    ('mms', MinMaxScaler()),
    ('knn', KNeighborsClassifier())
])

# Define the parameters grid for GridSearchCV
params = [{
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance'],
    'knn__leaf_size': [15, 20]
}]

# Initialize GridSearchCV with the pipeline, parameters, scoring, and cross-validation
gs_knn = GridSearchCV(
    knn_pipe,
    param_grid=params,
    scoring='accuracy',
    cv=5
)

# Fit GridSearchCV on the training data
gs_knn.fit(X_train_scaled, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", gs_knn.best_params_)

# Find the score of the best model on the training data
print("Best Model Score on Training Data:", gs_knn.score(X_train_scaled, y_train))
print(f"Training Data Score: {gs_knn.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {gs_knn.score(X_test_scaled, y_test)}")

In [None]:
gs_knn_y_pred = gs_knn.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, gs_knn_y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, gs_knn_y_pred)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test,  gs_knn_y_pred))

# Grid Search with Logistic Regression

In [None]:
# Dropping specified columns to create feature matrix X and target vector y
X = encoded_df.drop(['Exercise Hours Per Week',
                     'Stress Level',
                     'Sedentary Hours Per Day',
                     'Income',
                     'Physical Activity Days Per Week',
                     'Sleep Hours Per Day',
                     'Heart Attack Risk',
                     'Diet_Average',
                     'Diet_Healthy',
                     'Diet_Unhealthy',
                     'Country',
                     'Capital',
                     'lat',
                     'long',
                     'Continent',
                     'Patient ID',
                     'Hemisphere'], axis=1)

y = encoded_df['Heart Attack Risk']
print(X.columns)

In [None]:
X = (X-np.min(X))/(np.max(X)-np.min(X))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Grid search cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [None]:
logreg2=LogisticRegression(C=1,penalty="l2")
logreg2.fit(X_train,y_train)
print("score",logreg2.score(X_test,y_test))

In [None]:
!pip install keras-tuner

In [None]:
# Import our dependencies
import sklearn as skl
import tensorflow as tf

# Dropping specified columns to create feature matrix X and target vector y
X = encoded_df.drop(['Exercise Hours Per Week',
                     'Stress Level',
                     'Sedentary Hours Per Day',
                     'Income',
                     'Physical Activity Days Per Week',
                     'Sleep Hours Per Day',
                     'Heart Attack Risk',
                     'Diet_Average',
                     'Diet_Healthy',
                     'Diet_Unhealthy',
                     'Country',
                     'Capital',
                     'lat',
                     'long',
                     'Continent',
                     'Patient ID',
                     'Hemisphere'], axis=1)

y = encoded_df['Heart Attack Risk']

# Use sklearn to split dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=2))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [None]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

In [None]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

In [None]:
# Get top 3 best model hyperparameters
best_hyperparameters = tuner.get_best_hyperparameters(3)

# Print the values of the top 3 best hyperparameters
for idx, best_hyper in enumerate(best_hyperparameters):
    print(f"Hyperparameters for model {idx+1}:")
    print(best_hyper.values)


In [None]:
# Get top 3 best models
best_models = tuner.get_best_models(3)

# Evaluate each best model against full test data and print accuracy
for idx, best_model in enumerate(best_models):
    model_loss, model_accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)
    print(f"Accuracy for model {idx + 1}: {model_accuracy}")


In [None]:
# Dropping specified columns to create feature matrix X and target vector y
X = encoded_df.drop(['Exercise Hours Per Week',
                     'Stress Level',
                     'Sedentary Hours Per Day',
                     'Income',
                     'Physical Activity Days Per Week',
                     'Sleep Hours Per Day',
                     'Heart Attack Risk',
                     'Diet_Average',
                     'Diet_Healthy',
                     'Diet_Unhealthy',
                     'Country',
                     'Capital',
                     'lat',
                     'long',
                     'Continent',
                     'Patient ID',
                     'Hemisphere'], axis=1)

y = encoded_df['Heart Attack Risk']

In [None]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 ,random_state=42)

In [None]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  40
hidden_nodes_layer2 = 20
hidden_nodes_layer3 = 5
hidden_nodes_layer4 = 2

nn = tf.keras.models.Sequential()


# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="tanh"))

# Fourth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="softmax"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model with early stopping and provide validation data
hist = nn.fit(X_train_scaled, y_train, epochs=100)

In [None]:
# Import matplotlib.pyplot
import matplotlib.pyplot as plt

# Plot the training loss
plt.plot(hist.history['loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.show()