In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE  # SMOTE to balance classes

# Load dataset
df = pd.read_csv(r'C:\Users\cheyanne.gardner\Downloads\heart_attack_prediction_dataset.csv')

# Data Cleaning 
df[['Systolic_BP', 'Diastolic_BP']] = df['Blood Pressure'].str.split('/', expand=True)
df['Systolic_BP'] = pd.to_numeric(df['Systolic_BP'], errors='coerce')
df['Diastolic_BP'] = pd.to_numeric(df['Diastolic_BP'], errors='coerce')
df.drop(['Patient ID', 'Country', 'Continent', 'Hemisphere', 'Blood Pressure'], axis=1, inplace=True)

# Encode 
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
diet_mapping = {'Unhealthy': 0, 'Average': 1, 'Healthy': 2}
df['Diet'] = df['Diet'].map(diet_mapping)

# Fill missing values
df.fillna(df.median(numeric_only=True), inplace=True)

# Define features 
X = df.drop('Heart Attack Risk', axis=1)
y = df['Heart Attack Risk']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE 
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Feature scaling
scaler = StandardScaler()
X_train_balanced_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)


# Logistic Regression
logistic_regression = LogisticRegression(max_iter=200, random_state=42)
logistic_regression.fit(X_train_balanced_scaled, y_train_balanced)

def predict_with_threshold(model, X, threshold=0.3):
    y_probs = model.predict_proba(X)[:, 1]
    return (y_probs >= threshold).astype(int)

# Decision Tree - Tuned 
dt = DecisionTreeClassifier(random_state=42)
param_grid = {
    'max_depth': [5, 7, 10],
    'min_samples_leaf': [1, 2, 5]
}
grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=0
)
grid_search_dt.fit(X_train_balanced, y_train_balanced)
best_decision_tree = grid_search_dt.best_estimator_


# Random Forest - Tuned
random_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    class_weight='balanced',
    random_state=42
)
random_forest.fit(X_train_balanced, y_train_balanced)

# Model Review
def evaluate_model(model, X_test, y_test, threshold_adjustment=False):
    if threshold_adjustment:
        y_pred = predict_with_threshold(model, X_test)
    else:
        y_pred = model.predict(X_test)
    return {
        'Accuracy': round(accuracy_score(y_test, y_pred), 3),
        'Precision': round(precision_score(y_test, y_pred), 3),
        'Recall': round(recall_score(y_test, y_pred), 3),
        'F1 Score': round(f1_score(y_test, y_pred), 3)
    }

# Review all models
results = {
    'Logistic Regression (Adj Threshold)': evaluate_model(logistic_regression, X_test_scaled, y_test, threshold_adjustment=True),
    'Decision Tree (Tuned)': evaluate_model(best_decision_tree, X_test, y_test),
    'Random Forest (Tuned)': evaluate_model(random_forest, X_test, y_test)
}

# Convert results
results_df = pd.DataFrame(results).T

# Display Results
print("\n=== Model Performance Comparison (with SMOTE and Tuning) ===")
print(results_df)


# Display Best Decision Tree Hyperparameters
print("\n=== Best Decision Tree Hyperparameters ===")
print(grid_search_dt.best_params_)


[WinError 2] The system cannot find the file specified
  File "C:\Users\cheyanne.gardner\AppData\Local\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\cheyanne.gardner\AppData\Local\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cheyanne.gardner\AppData\Local\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\cheyanne.gardner\AppData\Local\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



=== Model Performance Comparison (with SMOTE and Tuning) ===
                                     Accuracy  Precision  Recall  F1 Score
Logistic Regression (Adj Threshold)     0.381      0.352   0.866     0.501
Decision Tree (Tuned)                   0.501      0.345   0.436     0.385
Random Forest (Tuned)                   0.544      0.315   0.232     0.267

=== Best Decision Tree Hyperparameters ===
{'max_depth': 10, 'min_samples_leaf': 1}
