# **Heart Attack Prediction Model**

### Project Setup

In [None]:
# Install required packages
# The leading "!" is the notebook shell escape, so this line only works inside
# Jupyter/IPython; run it once per environment.
!pip install numpy pandas matplotlib seaborn scikit-learn tensorflow keras xgboost imbalanced-learn shap eli5

# Import libraries

In [None]:
# Standard library
import pickle
import warnings

# Import basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn tools
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report,
                             roc_curve, precision_recall_curve)

# ML algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# XGBoost
import xgboost as xgb

# Imbalanced data handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Visualization & interpretation
import shap
import eli5
from eli5.sklearn import PermutationImportance

# Warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides convergence and deprecation warnings raised by
# the modelling libraries; consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')


# Set Seaborn theme and color palette
sns.set_theme(style="whitegrid", palette="viridis")

# NOTE(review): the theme call above already requests the viridis palette, so
# this second call looks redundant — confirm before removing.
sns.set_palette('viridis')

# Set random seed for reproducibility
# (this seeds NumPy's global generator only)
np.random.seed(42)

# For Jupyter notebooks (optional, uncomment if using Jupyter)
# %matplotlib inline


# **2. Data Collection and Exploration**

### Load the Heart Disease Dataset

In [None]:
# Load the Cleveland Heart Disease dataset (processed file hosted on the UCI archive)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# The file carries no header row, so the 14 column names are supplied here
column_names = (
    'age sex cp trestbps chol fbs restecg thalach '
    'exang oldpeak slope ca thal target'
).split()

# '?' entries in the file are read as missing values (NaN)
data = pd.read_csv(url, names=column_names, header=None, na_values='?')

# Report the size, then preview the first few rows
print(f"Dataset shape: {data.shape}")
data.head()

In [None]:

# Print the (rows, columns) tuple of the loaded DataFrame
print(data.shape)

# Understanding the Data

In [None]:
# Column dtypes and non-null counts
data.info()

# Number of missing entries in each column
missing_per_column = data.isna().sum()
print("Missing values per column:")
print(missing_per_column)


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
data.describe()

# **EDA ANALYSIS**

In [None]:
# Continuous features; this list is reused by later cells
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# One histogram (with KDE overlay) per feature on a 2x3 grid, coloured by target value
plt.figure(figsize=(15, 10))
for slot, column in enumerate(numerical_features, start=1):
    plt.subplot(2, 3, slot)
    sns.histplot(data=data, x=column, hue='target', kde=True, bins=30)
    plt.title(f'Distribution of {column}')

plt.tight_layout()
plt.show()


In [None]:
# Pie chart of how many records fall into each target value.
plt.figure(figsize=(8, 6))

target_counts = data['target'].value_counts()

# Slice labels show the class value together with its absolute count
labels = [f'{cls}: {count}' for cls, count in zip(target_counts.index, target_counts.values)]

# One distinct colour per class. The target has not been binarised yet at this
# point in the notebook, so it can hold more than four distinct values; with
# only four colours matplotlib would cycle and paint two slices the same colour.
colors = ['blue', 'yellow', 'lightgreen', 'orange', 'red'][:len(target_counts)]

# Plot pie chart
plt.pie(target_counts, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
plt.title('Target Distribution')
plt.show()


# Correlation Heatmap

In [None]:
# Lower-triangle heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(12, 10))
correlation = data.corr()

# Boolean mask that hides the diagonal and the upper triangle. Building it from
# a matrix of ones (rather than from the correlation values themselves) keeps a
# cell masked even when its correlation happens to be exactly 0.
mask = np.triu(np.ones_like(correlation, dtype=bool))

sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', mask=mask, linewidths=0.5)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Box plot of every continuous feature, grouped by target value, on a 2x3 grid
plt.figure(figsize=(15, 10))
for slot, column in enumerate(numerical_features, start=1):
    plt.subplot(2, 3, slot)
    sns.boxplot(data=data, x='target', y=column)
    plt.title(f'{column} vs. Heart Disease')

plt.tight_layout()
plt.show()

In [None]:
# Stacked bar chart of target counts for each categorical feature (3x3 grid).
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
plt.figure(figsize=(20, 15))
for i, feature in enumerate(categorical_features):
    ax = plt.subplot(3, 3, i + 1)
    # Rows: feature value, columns: target value, cells: record count
    feature_data = pd.crosstab(data[feature], data['target'])
    feature_data.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'{feature} vs. Heart Disease')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    # Label the legend with the actual target values. The target is still
    # un-binarised when this cell runs, so a hard-coded two-entry legend
    # ('No Disease', 'Disease') would mislabel or drop the other series.
    ax.legend(title='target')

plt.tight_layout()
plt.show()

# Feature Interactions

In [None]:
# Create pair plots for interesting feature combinations
sns.pairplot(data[['age', 'thalach', 'chol', 'oldpeak', 'target']], hue='target', diag_kind='kde')
plt.suptitle('Pair Plot of Key Features', y=1.02)
plt.show()

# Age and gender analysis
# split=True draws one half-violin per hue level, which only reads as intended
# with a two-level hue. The raw target is still multi-valued at this point, so
# collapse it to a disease / no-disease flag for this plot only (`data` itself
# is left untouched).
violin_data = data[['sex', 'age']].assign(
    heart_disease=(data['target'] > 0).map({False: 'No Disease', True: 'Disease'})
)

plt.figure(figsize=(12, 6))
sns.violinplot(
    x='sex',
    y='age',
    hue='heart_disease',
    hue_order=['No Disease', 'Disease'],
    data=violin_data,
    split=True,
    inner='quart',
)
plt.title('Age Distribution by Gender and Heart Disease')
plt.xlabel('Sex (0=Female, 1=Male)')
plt.ylabel('Age')
plt.show()

# **3. Data Preprocessing**

In [None]:
# Snapshot of the per-column missing-value counts before imputation
missing_before = data.isna().sum()
print("Missing values before handling:")
print(missing_before)


In [None]:
# Fill the gaps in 'ca' and 'thal' with each column's own median
columns_to_impute = ['ca', 'thal']
column_medians = data[columns_to_impute].median()
data[columns_to_impute] = data[columns_to_impute].fillna(column_medians)

# Confirm nothing is missing any more
print("\nMissing values after handling:")
print(data.isna().sum())


In [None]:
# Collapse the target to a binary label: any value above 0 becomes 1, everything else 0
data['target'] = (data['target'] > 0).astype('int64')

print("Target variable distribution after conversion:")
print(data['target'].value_counts())

In [None]:
# Separate the label column from the predictors
y = data['target']
X = data.drop(columns=['target'])

# Hold out 20% of the rows for testing; stratify so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")

In [None]:
# Standardise the continuous columns.
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])

X_train[numerical_features] = scaler.transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])


In [None]:
# Persist the fitted scaler so the same transformation can be reapplied later
with open('heart_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Summary statistics of the scaled training columns
print("Scaled features:")
X_train[numerical_features].describe()

In [None]:
# Preview the first rows of the full DataFrame
data.head()

# 4. Feature Engineering

In [None]:
# Create new features
def add_features(df):
    """Return a copy of ``df`` with five derived columns appended.

    The input must contain the columns 'age', 'chol', 'thalach', 'trestbps',
    'exang' and 'oldpeak'. The caller's DataFrame is left untouched.

    Added columns:
        age_chol_ratio      age / chol
        max_heart_rate      220 - age
        heart_rate_reserve  max_heart_rate - thalach
        bp_chol_product     trestbps * chol / 1000
        exang_oldpeak       exang * oldpeak

    NOTE(review): the formulas (e.g. ``220 - age``) presumably expect the
    original clinical units rather than standardised values — confirm callers
    pass unscaled data.
    """
    # Work on a copy so the function has no side effects on its argument
    df = df.copy()

    # Age and cholesterol interaction
    df['age_chol_ratio'] = df['age'] / df['chol']

    # Heart rate reserve (approximate)
    df['max_heart_rate'] = 220 - df['age']
    df['heart_rate_reserve'] = df['max_heart_rate'] - df['thalach']

    # Blood pressure and cholesterol risk
    df['bp_chol_product'] = df['trestbps'] * df['chol'] / 1000

    # Exercise-induced ST depression severity
    df['exang_oldpeak'] = df['exang'] * df['oldpeak']

    return df

# The engineered ratios and products are only meaningful in the original
# clinical units, but the numeric columns of X_train / X_test were standardised
# earlier in the notebook. Derive the new columns from the matching raw rows of
# `data` (same index labels) and attach them to the already-scaled splits.
# NOTE: the engineered columns themselves are left unscaled.
raw_input_columns = ['age', 'chol', 'thalach', 'trestbps', 'exang', 'oldpeak']


def _attach_engineered_features(split):
    """Return ``split`` plus the add_features columns computed from raw ``data`` rows."""
    raw_rows = data.loc[split.index, raw_input_columns].copy()
    engineered = add_features(raw_rows).drop(columns=raw_input_columns)
    # assign() aligns on the index and overwrites on re-run, so the cell is idempotent
    return split.assign(**{name: engineered[name] for name in engineered.columns})


X_train = _attach_engineered_features(X_train)
X_test = _attach_engineered_features(X_test)

print("New features added:")
print(X_train.columns.tolist())

In [None]:
# Class counts in the training split before resampling
print("Original class distribution:")
print(y_train.value_counts())

# Oversample the training split with SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("\nBalanced class distribution:")
print(pd.Series(y_train_balanced).value_counts())

# Side-by-side count plots: before (left) and after (right) SMOTE
plt.figure(figsize=(12, 5))
panels = (
    (y_train, 'Original Class Distribution'),
    (y_train_balanced, 'Balanced Class Distribution after SMOTE'),
)
for slot, (class_labels, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    sns.countplot(x=class_labels)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# **5. Model Selection and Training**

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name="Model"):
    """Fit ``model`` on the training data and report its test-set performance.

    Prints accuracy, precision, recall, F1 and ROC AUC followed by the full
    classification report, then shows a confusion-matrix heatmap and a ROC curve.

    Returns a dict holding the fitted model ('model'), the five metric values
    ('accuracy', 'precision', 'recall', 'f1', 'auc'), the test predictions
    ('y_pred') and the second column of ``predict_proba`` ('y_prob').
    """
    model.fit(X_train, y_train)

    # Hard labels for the threshold metrics, column 1 of predict_proba for ROC/AUC
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_prob),
    }

    # Text report
    display_names = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1 Score', 'f1'),
        ('ROC AUC', 'auc'),
    )
    print(f"{model_name} Performance:")
    for display_name, key in display_names:
        print(f"{display_name}: {scores[key]:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix heatmap
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # ROC curve with the chance diagonal for reference
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC = {scores['auc']:.4f}")
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend(loc='lower right')
    plt.show()

    return {'model': model, **scores, 'y_pred': y_pred, 'y_prob': y_prob}

In [None]:
# Candidate classifiers, keyed by the display name used in reports and plots
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'Neural Network': MLPClassifier(max_iter=1000, random_state=42),
}

# Fit every candidate on the SMOTE-balanced training data and score it on the test split
separator = "-" * 50
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    results[name] = evaluate_model(
        model, X_train_balanced, y_train_balanced, X_test, y_test, name
    )
    print(separator)

In [None]:
# Map each table column to the matching key in the per-model result dicts
metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1',
    'AUC': 'auc',
}
metrics = list(metric_keys)

# One row per model, one column per metric
performance_df = pd.DataFrame(
    [
        {'Model': model_name, **{column: scores[key] for column, key in metric_keys.items()}}
        for model_name, scores in results.items()
    ],
    columns=['Model', *metrics],
)

# Best accuracy first
performance_df = performance_df.sort_values('Accuracy', ascending=False).reset_index(drop=True)
print("Model Performance Comparison:")
print(performance_df)

# One bar chart per metric on a 2x3 grid
plt.figure(figsize=(15, 10))
for slot, metric in enumerate(metrics, start=1):
    plt.subplot(2, 3, slot)
    sns.barplot(data=performance_df, x='Model', y=metric)
    plt.title(f'Model Comparison - {metric}')
    plt.xticks(rotation=45, ha='right')
    plt.ylim(0.5, 1.0)  # shared y-range so the panels are directly comparable

plt.tight_layout()
plt.show()

# **6. Model Evaluation**

In [None]:
# The comparison table is sorted by accuracy, so its first row is the winner
top_row = performance_df.iloc[0]
best_model_name = top_row['Model']
best_model = results[best_model_name]['model']
print(f"Best performing model: {best_model_name} with accuracy: {top_row['Accuracy']:.4f}")

In [None]:
# Perform cross-validation on the best model.
# SMOTE is placed inside a pipeline so that oversampling is fitted on the
# training part of every fold only. Cross-validating on data that was already
# resampled would let synthetic neighbours of a validation row end up in the
# training fold and inflate the scores.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', best_model),
])

# cross_val_score clones the pipeline for every fold, so best_model keeps its current fit
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv, scoring='accuracy')

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f}")
print(f"Standard deviation: {cv_scores.std():.4f}")

In [None]:
# Feature importance analysis: permutation importance on the test split for the
# non-tree models, built-in importances for the tree ensembles
if best_model_name not in ['Random Forest', 'Gradient Boosting', 'XGBoost']:
    perm = PermutationImportance(best_model, random_state=42).fit(X_test, y_test)
    perm_importance = eli5.explain_weights_df(perm, feature_names=X_test.columns.tolist())

    # Plot the first 15 rows of the weights table
    plt.figure(figsize=(12, 8))
    sns.barplot(data=perm_importance.head(15), x='weight', y='feature')
    plt.title(f'Permutation Feature Importance - {best_model_name}')
    plt.show()

else:
    importance_table = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_model.feature_importances_
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    # Plot the 15 most important features
    plt.figure(figsize=(12, 8))
    sns.barplot(data=feature_importance.head(15), x='Importance', y='Feature')
    plt.title(f'Feature Importance - {best_model_name}')
    plt.show()

In [None]:
# Compute SHAP values on the test split, but only when the best model is one of
# the three tree-based candidates that shap.TreeExplainer is applied to here.
# NOTE(review): shap_values stays undefined for any other best model — any code
# that reads it needs the same check.
if best_model_name in ['Random Forest', 'Gradient Boosting', 'XGBoost']:
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP values are only computed for the tree-based models, so the plots are
# guarded by the same condition instead of failing on an undefined variable.
if best_model_name in ['Random Forest', 'Gradient Boosting', 'XGBoost']:
    # Summary plot - Feature Importance (bar)
    # show=False keeps the figure open so the title lands on the SHAP plot
    # (summary_plot otherwise displays the figure before plt.title runs);
    # plot_size=None keeps the figure size requested just above.
    plt.figure(figsize=(12, 10))
    shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, plot_size=None)
    plt.title(f'SHAP Feature Importance - {best_model_name}')
    plt.show()

    # Detailed SHAP values summary plot (beeswarm)
    plt.figure(figsize=(12, 10))
    shap.summary_plot(shap_values, X_test, show=False, plot_size=None)
    plt.title(f'SHAP Summary Plot - {best_model_name}')
    plt.show()
else:
    print(f"SHAP summary plots skipped: {best_model_name} is not a tree-based model.")


# **7. Model Optimization**

In [None]:
# Hyperparameter tuning for the best model.
# Search spaces are keyed by model name. The logistic-regression space is a
# list of sub-grids so that only solver/penalty pairs the estimator accepts are
# tried: a plain cross-product would spend most of its fits on combinations
# that error out (e.g. lbfgs with l1, or elasticnet without l1_ratio).
# The unpenalised option is left out because its spelling differs between
# scikit-learn releases; C=100 is already close to unpenalised.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grids = {
    'Logistic Regression': [
        {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
        {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': c_values},
        {'solver': ['saga'], 'penalty': ['l1', 'l2'], 'C': c_values},
        {'solver': ['saga'], 'penalty': ['elasticnet'],
         'l1_ratio': [0.25, 0.5, 0.75], 'C': c_values},
    ],
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'colsample_bytree': [0.7, 0.8, 0.9]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.1, 0.01],
        'kernel': ['rbf', 'poly', 'sigmoid']
    },
    'Neural Network': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
}

# Any name without its own entry falls back to the neural-network grid
param_grid = param_grids.get(best_model_name, param_grids['Neural Network'])

# Create grid search with cross-validation
grid_search = GridSearchCV(
    estimator=best_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit grid search
grid_search.fit(X_train_balanced, y_train_balanced)

# Best parameters and score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Get the optimized model
optimized_model = grid_search.best_estimator_

# Evaluate optimized model
optimized_results = evaluate_model(
    optimized_model,
    X_train_balanced,
    y_train_balanced,
    X_test,
    y_test,
    f"Optimized {best_model_name}"
)

In [None]:
# Create a final pipeline that includes preprocessing and the optimized model.
# Steps: standardise -> SMOTE oversampling (applied while fitting) -> classifier.
preprocessor = StandardScaler()

# For imbalanced data handling
final_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', optimized_model)
])

# The numeric columns of X_train / X_test were standardised in place earlier in
# the notebook. The pipeline carries its own scaler, so it must be fitted on
# the original values; otherwise the saved model would expect pre-scaled input
# and mis-predict on raw measurements. `data` still holds the unscaled columns
# and shares its index with the splits.
X_train_raw = data.loc[X_train.index, numerical_features]
X_test_raw = data.loc[X_test.index, numerical_features]

# Fit the final pipeline
# NOTE: this refits optimized_model in place on the five numerical columns.
final_pipeline.fit(X_train_raw, y_train)

# Evaluate the pipeline
y_pred = final_pipeline.predict(X_test_raw)
final_accuracy = accuracy_score(y_test, y_pred)
print(f"Final pipeline accuracy: {final_accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Save the final model
with open('heart_attack_prediction_model.pkl', 'wb') as f:
    pickle.dump(final_pipeline, f)

print("Final model saved as 'heart_attack_prediction_model.pkl'")