In [1]:

import pandas as pd  
import matplotlib.pyplot as plt   
from sklearn.model_selection import train_test_split

Heart_Attack = pd.read_csv("Heart Attack.csv")
Heart_Attack.head()



X = Heart_Attack['glucose'] #Assign the column “glucose” as your feature data
y = Heart_Attack['class']  #Assign the column "class" as your target variable




# Step 1: Split data into temporary training and test sets (80% training, 20% test)
X_train_temporary, X_test, y_train_temporary, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Split the temporary training set into final training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train_temporary, y_train_temporary, test_size=0.5, random_state=42)


Logistic Regression

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


#Create a Logistic Regression model
model = LogisticRegression(random_state=42)

#Train the model on the training data
model.fit(X_train.values.reshape(-1, 1), y_train)

#Make predictions on the test data
y_pred = model.predict(X_test.values.reshape(-1, 1))

#Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

#Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.63
              precision    recall  f1-score   support

    negative       0.64      0.07      0.13       101
    positive       0.63      0.98      0.76       163

    accuracy                           0.63       264
   macro avg       0.63      0.52      0.44       264
weighted avg       0.63      0.63      0.52       264

[[  7  94]
 [  4 159]]


Random Forest Classifier

In [3]:
from sklearn.ensemble import RandomForestClassifier



#Create a Random Forest classifier with 100 trees
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)

#Train the model on the training data
model_rf.fit(X_train.values.reshape(-1, 1), y_train)

#Make predictions on the test data
y_pred_rf = model_rf.predict(X_test.values.reshape(-1, 1))

#Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")

#Print classification report and confusion matrix
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 0.52
              precision    recall  f1-score   support

    negative       0.36      0.35      0.36       101
    positive       0.61      0.63      0.62       163

    accuracy                           0.52       264
   macro avg       0.49      0.49      0.49       264
weighted avg       0.51      0.52      0.52       264

[[ 35  66]
 [ 61 102]]


In [4]:
#In order to get a higher Accuracy-score, I will perform a Grid Search to find the best hyperparamters for the Random Forest Classifier.

from sklearn.model_selection import GridSearchCV



#Define the hyperparameters to search through
param_grid = {
    'n_estimators': [50, 100, 150],          
    'max_depth': [None, 5, 10, 15],         
    'min_samples_split': [2, 5, 10]          
}

#Set up Grid Search with cross-validation (using the validation set)
grid_search = GridSearchCV(model_rf, param_grid, cv=5, scoring='accuracy')

#Fit the Grid Search to find the best hyperparameters
grid_search.fit(X_val.values.reshape(-1, 1), y_val)

#Get the best hyperparameters found by Grid Search
best_params = grid_search.best_params_
print(best_params)

# Create a new Random Forest classifier with the best hyperparameters
best_model_rf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    random_state=42
)

#Train the best model on the training data
best_model_rf.fit(X_train.values.reshape(-1, 1), y_train)

#Make predictions on the test data using the best model
y_pred_best_rf = best_model_rf.predict(X_test.values.reshape(-1, 1))

#Evaluate the best model
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
print(f"Best Random Forest Accuracy: {accuracy_best_rf:.2f}")

#Print classification report and confusion matrix for the best model
print(classification_report(y_test, y_pred_best_rf))
print(confusion_matrix(y_test, y_pred_best_rf))


{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 150}
Best Random Forest Accuracy: 0.55
              precision    recall  f1-score   support

    negative       0.35      0.19      0.24       101
    positive       0.61      0.78      0.68       163

    accuracy                           0.55       264
   macro avg       0.48      0.48      0.46       264
weighted avg       0.51      0.55      0.51       264

[[ 19  82]
 [ 36 127]]


In [5]:
#Since the accuracy-score is still pretty low, I decided to try my luck with an SVM model

from sklearn.svm import SVC

#Create an SVM classifier with a linear kernel
model_svm = SVC(kernel='linear', random_state=42)

#Train the model on the training data
model_svm.fit(X_train.values.reshape(-1, 1), y_train)

#Make predictions on the test data
y_pred_svm = model_svm.predict(X_test.values.reshape(-1, 1))

#Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.2f}")

#Print classification report and confusion matrix
print(classification_report(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))


SVM Accuracy: 0.62
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       101
    positive       0.62      1.00      0.76       163

    accuracy                           0.62       264
   macro avg       0.31      0.50      0.38       264
weighted avg       0.38      0.62      0.47       264

[[  0 101]
 [  0 163]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
#Since one's heart health deteroriates over the years, I also wanted to train some models using the variable "age" as a predictor

X2 = Heart_Attack['age'] 
y2 = Heart_Attack['class']

#80/10/10 split again
X_train_temporary, X_test, y_train_temporary, y_test = train_test_split(X2, y2, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_temporary, y_train_temporary, test_size=0.5, random_state=42)

#Create a LR model
model_logistic = LogisticRegression(random_state=42)

#Train the model on the training data
model_logistic.fit(X_train.values.reshape(-1, 1), y_train)

#Make predictions on the test data
y_pred_logistic = model_logistic.predict(X_test.values.reshape(-1, 1))

#Evaluate the model
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f"Logistic Regression Accuracy: {accuracy_logistic:.2f}")

#Print classification report and confusion matrix
print(classification_report(y_test, y_pred_logistic))
print(confusion_matrix(y_test, y_pred_logistic))

Logistic Regression Accuracy: 0.66
              precision    recall  f1-score   support

    negative       0.58      0.38      0.46       101
    positive       0.68      0.83      0.75       163

    accuracy                           0.66       264
   macro avg       0.63      0.61      0.60       264
weighted avg       0.65      0.66      0.64       264

[[ 38  63]
 [ 27 136]]


In [6]:
#Achieving a higher accuracy with "age" as a predictor compared to "glucose" 
#indicates that age might be a more informative feature for predicting heart attack outcomes in this specific dataset.
#Since as we age, the pancreas produces less insulin, blood sugar remains elevated for longer - Therefore I decided to create one last model,
#which combines both features, "age" and "glucose"


# Separate features and target variable
X3 = Heart_Attack[['age', 'glucose']]
y3 = Heart_Attack['class']

# 80/10/10 split
X_train, X_temp, y_train, y_temp = train_test_split(X3, y3, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

#Create a Logistic Regression model
model_logistic = LogisticRegression(random_state=42)

#Train the model on the training data
model_logistic.fit(X_train, y_train)

#Make predictions on the test data
y_pred_logistic = model_logistic.predict(X_test)

#Evaluate the model
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f"Logistic Regression Accuracy: {accuracy_logistic:.2f}")

#Print classification report and confusion matrix
print(classification_report(y_test, y_pred_logistic))
print(confusion_matrix(y_test, y_pred_logistic))




Logistic Regression Accuracy: 0.65
              precision    recall  f1-score   support

    negative       0.74      0.25      0.38        55
    positive       0.64      0.94      0.76        77

    accuracy                           0.65       132
   macro avg       0.69      0.59      0.57       132
weighted avg       0.68      0.65      0.60       132

[[14 41]
 [ 5 72]]


In [7]:
#Since this model only got an accuracy of 0.65, I decided to try and increase it by using Feature Scaling
#Since "age" and "glucose" have different scales, 
#applying feature scaling helps bring them to a similar range. 

from sklearn.preprocessing import StandardScaler

#Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X3)

X_train, X_temp, y_train, y_temp = train_test_split(X_standardized, y3, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


#Create a Logistic Regression model
model_logistic = LogisticRegression(random_state=42)

#Train the model on the training data
model_logistic.fit(X_train, y_train)

#Make predictions on the test data
y_pred_logistic = model_logistic.predict(X_test)

#Evaluate the model
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f"Logistic Regression Accuracy: {accuracy_logistic:.2f}")


#Print classification report and confusion matrix
print(classification_report(y_test, y_pred_logistic))
print(confusion_matrix(y_test, y_pred_logistic))


Logistic Regression Accuracy: 0.65
              precision    recall  f1-score   support

    negative       0.74      0.25      0.38        55
    positive       0.64      0.94      0.76        77

    accuracy                           0.65       132
   macro avg       0.69      0.59      0.57       132
weighted avg       0.68      0.65      0.60       132

[[14 41]
 [ 5 72]]


In [8]:
#Since we have gotten 2 features and a moderate sample size, I will try running a Random Forest Model

from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest classifier
model_rf = RandomForestClassifier(random_state=42)

# Train the model on the training data
model_rf.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = model_rf.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 0.58
              precision    recall  f1-score   support

    negative       0.49      0.42      0.45        55
    positive       0.62      0.69      0.65        77

    accuracy                           0.58       132
   macro avg       0.56      0.55      0.55       132
weighted avg       0.57      0.58      0.57       132

[[23 32]
 [24 53]]


In [9]:
#I will have to rest my case, and accept that with this data and my models, the Logistic regression Model using only the feature "Age", 
#gives the highest prediction accuracy

In [5]:
#summary table

import pandas as pd  




summary_results = {'algorithm': ['Logistic Regression', "Random Forest", "Random Forest", "SVM", "Logistic Regression", 
                                 "Logistic Regression", "Logistic Regression", "Random Forest" ],
                   
                   
'data': ["y", "y", "y", "y", "y2", "y3", "y3 (X3=scaled)", "y3 (X3=scaled)" ],
                   
'hyperparameter': ['random_state=42', "{n_estimators=100, random_state=42}","{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 150}",
                  "{kernel='linear', random_state=42}", "random_state=42", "random_state=42", "random_state=42", "random_state=42" ],
                   
'accuracy': [0.63, 0.52, 0.55, 0.62, 0.66, 0.65, 0.65, 0.58]}

#create a df from the results dictionary
summary_table = pd.DataFrame(summary_results)

#save the summary table as a CSV file
summary_table.to_csv('summary_table.csv', index=False)