# Heart Disease Prediction — Complete Project

Author: Generated for you

This notebook contains a full project workflow aligned to your project brief:
- Data loading & cleaning
- Exploratory Data Analysis (EDA)
- Feature engineering & preprocessing
- Model training & comparison (Logistic Regression, Random Forest, SVM, KNN, XGBoost)
- Model selection and export
- Recommendations for hospital use and discussion of challenges

You can run this notebook end-to-end. It expects `values.csv` and `labels.csv` to be in the same directory.

In [None]:
# Setup: imports and loading data
import os, sys
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib

# XGBoost is an optional dependency: record whether it can be imported so
# later cells can decide whether to include an XGBoost model.
try:
    import xgboost as xgb
except Exception:
    # Any import-time failure is treated as "not available".
    has_xgb = False
else:
    has_xgb = True

print('xgboost available:', has_xgb)

# Input files: both CSVs are read from the current working directory
VALUES = 'values.csv'
LABELS = 'labels.csv'

# Report whether each input file is present before trying to read it
for csv_path in (VALUES, LABELS):
    found = os.path.exists(csv_path)
    print('Exists', csv_path, found)

# Read the feature table first, then the label table
df_values = pd.read_csv(VALUES)
df_labels = pd.read_csv(LABELS)

# Show the (rows, columns) shape of each table
values_shape = df_values.shape
labels_shape = df_labels.shape
print('\nvalues.csv shape:', values_shape)
print('labels.csv shape:', labels_shape)

# Combine features and labels into a single DataFrame `df`.
# Preferred path: join on a column name shared by both files.
# Shared columns are collected in values.csv column order, so the chosen key is
# the same on every run (taking the first element of a set is arbitrary when
# more than one column name is shared).
common_cols = [c for c in df_values.columns if c in df_labels.columns]
if common_cols:
    key = common_cols[0]
    print('Merging on', key)
    df = df_values.merge(df_labels, on=key)
    # The default merge is an inner join: rows without a match are dropped and
    # duplicated keys multiply rows. Make that visible instead of silent.
    if len(df) != len(df_values):
        print('Warning: merge changed row count from', len(df_values), 'to', len(df))
else:
    # No shared column: fall back to positional alignment of the two files.
    if len(df_values) != len(df_labels):
        print('Warning: values and labels have different row counts:',
              len(df_values), 'vs', len(df_labels))
    # if labels file only has a single column 'target' or 'label', try concat by index
    if df_labels.shape[1] == 1:
        label_col = df_labels.columns[0]
        df = df_values.copy()
        # Assignment aligns on the index, so rows are paired by position in the file
        df[label_col] = df_labels[label_col]
        print('Appended labels column as', label_col)
    else:
        # fallback to concat columns side-by-side
        df = pd.concat([df_values.reset_index(drop=True), df_labels.reset_index(drop=True)], axis=1)
        print('Concatenated values and labels')

print('Merged df shape:', df.shape)
df.head()

## Exploratory Data Analysis (EDA)

This section performs basic EDA: missing values, distributions, and correlations.

In [None]:
# EDA: missing-value counts, summary statistics, histograms and correlations
missing_counts = df.isnull().sum()
print('\nMissing values per column:\n', missing_counts)

# Summary statistics for every column, numeric or not
summary = df.describe(include='all')
display(summary)

# Numeric columns are those stored as int64 or float64
num_cols = list(df.select_dtypes(include=['int64','float64']).columns)
print('Numeric columns:', num_cols)

import matplotlib.pyplot as plt
# One histogram per numeric column, limited to the first 8 to avoid overcrowding
for column in num_cols[:8]:
    observed = df[column].dropna()
    plt.figure(figsize=(6,3))
    plt.hist(observed, bins=30)
    plt.title(f'Distribution of {column}')
    plt.tight_layout()
    plt.show()

# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr_matrix = df[num_cols].corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation matrix (numeric features)')
plt.show()

## Preprocessing & Feature Engineering

- Handle missing values
- Encode categorical variables
- Scale numeric features
- Prepare X and y

In [None]:
# Identify feature columns and target
# Try to detect a target column automatically (common names)
possible_targets = ['target','heart_disease_present','label','diagnosis','has_disease','disease']
target = next((t for t in possible_targets if t in df.columns), None)
if target is None:
    # fallback: assume last column is target
    target = df.columns[-1]
print('Using target column:', target)

X = df.drop(columns=[target])
y = df[target]

# Identify numeric and categorical features by dtype
num_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object','category','bool']).columns.tolist()

# Keep row identifiers out of the model. A column is treated as an identifier
# when its name is 'id' or ends in '_id', or when it is a non-numeric column
# whose value is different on every row. One-hot encoding such a column adds
# one indicator per training row and carries no signal for unseen patients.
# The columns stay in X; they are simply not listed in either feature group,
# and ColumnTransformer's default remainder='drop' ignores unlisted columns.
n_rows = len(X)
id_features = []
for col in X.columns:
    col_name = str(col).lower()
    named_like_id = col_name == 'id' or col_name.endswith('_id')
    unique_per_row = (
        col in cat_features
        and n_rows > 1
        and X[col].nunique(dropna=False) == n_rows
    )
    if named_like_id or unique_per_row:
        id_features.append(col)
num_features = [c for c in num_features if c not in id_features]
cat_features = [c for c in cat_features if c not in id_features]
print('Identifier-like columns excluded from features:', id_features)

# Some integer-coded categorical columns: check cardinality.
# Columns named here are always kept numeric, even with few distinct values.
always_numeric = {'age','resting_blood_pressure','serum_cholesterol_mg_per_dl','max_heart_rate_achieved'}
for col in list(num_features):  # iterate over a copy: the list is modified below
    if X[col].nunique() < 10 and col not in always_numeric:
        # treat as categorical
        cat_features.append(col)
        num_features.remove(col)

print('Numeric features:', num_features)
print('Categorical features:', cat_features)

# Build preprocessing pipeline
# Numeric columns: fill missing values with the median, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the mode, then one-hot encode.
# handle_unknown='ignore' keeps categories unseen during fit from raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features)
])

## Model Training & Comparison
We'll train several models and compare their performance on a stratified hold-out test set (20% of the data).

In [None]:
# Hold out 20% of the rows for testing; stratify on y only when it has more than one class
from sklearn.model_selection import train_test_split
split_options = {'test_size': 0.2, 'random_state': 42}
if len(y.unique()) > 1:
    split_options['stratify'] = y
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

# Candidate classifiers, keyed by the name used in the results table
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
    RandomForest=RandomForestClassifier(n_estimators=200, random_state=42),
    SVC=SVC(probability=True, random_state=42),
    KNN=KNeighborsClassifier(n_neighbors=5),
)
# XGBoost joins the comparison only when the optional import succeeded
if has_xgb:
    models['XGBoost'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test, preprocessor):
    """Fit preprocessor + model on the training split and score it on the test split.

    The preprocessor and model are cloned before fitting, so the objects passed
    in are left unfitted and can be reused for other pipelines.

    Precision, recall and F1 are computed with the metric functions' default
    settings, and the ROC AUC uses the second column of ``predict_proba``.
    NOTE(review): this presumably expects a binary target encoded as 0/1 -
    confirm against the label file.

    Args:
        name: Label printed in the report and stored in the returned dict.
        model: Unfitted classifier used as the final pipeline step.
        X_train, X_test: Feature tables for fitting and for scoring.
        y_train, y_test: Targets matching ``X_train`` and ``X_test``.
        preprocessor: Transformer used as the first pipeline step.

    Returns:
        dict with keys ``name``, ``accuracy``, ``precision``, ``recall``,
        ``f1`` and ``roc_auc``. ``roc_auc`` is None when the pipeline exposes
        no ``predict_proba`` or ``y_test`` contains a single class.
    """
    from sklearn.base import clone

    # Fresh copies: Pipeline.fit fits its steps in place, which would otherwise
    # mutate the shared preprocessor and the entries of the caller's model dict.
    pipe = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('clf', clone(model))])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Ask whether probabilities are supported instead of swallowing every exception
    y_proba = None
    if hasattr(pipe, 'predict_proba'):
        y_proba = pipe.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    # ROC AUC is only computed when probabilities exist and both classes are present in y_test
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None and len(np.unique(y_test))>1 else None

    print(f'== {name} ==')
    print('Accuracy:', acc)
    print('Precision:', prec)
    print('Recall:', rec)
    print('F1:', f1)
    if auc is not None:
        print('ROC AUC:', auc)
    print('\nClassification report:\n', classification_report(y_test, y_pred, zero_division=0))
    print('\nConfusion matrix:\n', confusion_matrix(y_test, y_pred))
    return dict(name=name, accuracy=acc, precision=prec, recall=rec, f1=f1, roc_auc=auc)

# Evaluate every candidate; a failure in one model is reported and does not stop the rest
results = []
for name, model in models.items():
    try:
        res = evaluate_model(name, model, X_train, X_test, y_train, y_test, preprocessor)
        results.append(res)
    except Exception as e:
        print('Error training', name, e)

# Rank by F1 (best first). With no successful model the frame has no 'f1'
# column and sort_values would raise KeyError, so only sort when there are rows.
results_df = pd.DataFrame(results)
if results_df.empty:
    print('No model could be evaluated; see the errors above.')
else:
    results_df = results_df.sort_values(by='f1', ascending=False)
display(results_df)

## Save Best Model
We'll save the best model pipeline to `best_model.joblib` for later deployment.

In [None]:
# Refit the top-ranked model on all available data and persist the whole pipeline
if not results_df.empty:
    from sklearn.base import clone

    best_name = results_df.iloc[0]['name']
    print('Best model by F1:', best_name)
    # Reuse the definition from `models` rather than repeating each constructor,
    # so the saved model always has the same hyperparameters as the evaluated one.
    model_template = models.get(best_name)
    if model_template is None:
        print('No model definition found for', best_name, '- nothing saved')
    else:
        # clone() gives unfitted copies with identical parameters
        full_pipe = Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('clf', clone(model_template)),
        ])
        # retrain best model on full data (train+test)
        full_pipe.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))
        joblib.dump(full_pipe, 'best_model.joblib')
        print('Saved best model pipeline to best_model.joblib')

## Recommendations for Hospital

- Integrate the saved model into screening workflows to flag high-risk patients.
- Collect more labeled data, especially for underrepresented groups.
- Add follow-up and outcome data to improve temporal predictions.
- Use explainability (SHAP or LIME) to provide clinicians with reasons for predictions.

## Challenges Faced

- Missing values and inconsistent column naming across files.
- Potential class imbalance in heart disease labels (use stratified sampling).
- Categorical variables encoded as integers needing domain knowledge.
- Need to avoid data leakage by proper pipeline usage.

---

## How to run
1. Place `values.csv` and `labels.csv` in the same directory as this notebook.
2. Run cells top to bottom. `xgboost` is optional but recommended if available.
3. The notebook will produce evaluation metrics and save `best_model.joblib`.


In [None]:
# Usage example after running and saving model
if os.path.exists('best_model.joblib'):
    # NOTE: joblib files are pickle-based; only load files you created or trust.
    pipe = joblib.load('best_model.joblib')
    # take the first two rows as a small demo batch (target column dropped)
    sample = df.drop(columns=[target]).iloc[0:2]
    print('Sample input:\n', sample)
    preds = pipe.predict(sample)
    # Probabilities are optional: check for support instead of hiding every exception
    probs = pipe.predict_proba(sample)[:, 1] if hasattr(pipe, 'predict_proba') else None
    print('Predictions:', preds)
    if probs is not None:
        print('Probabilities:', probs)