# Heart Disease Prediction

**Simple notebook following the same STEP 1 / STEP 2... layout**

This notebook uses `values.csv` and `labels.csv` (provided) and performs a basic end-to-end workflow: data merge, EDA, preprocessing, model training/comparison, and final conclusion.

---

## STEP 1: Import Libraries

In [None]:
# STEP 1: Import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score
import joblib

print('Libraries imported')

## STEP 2: Load and merge data (same as your file)
We will merge `values.csv` and `labels.csv` by row order (column-wise concatenation) to reconstruct the full dataset. This assumes both files list the same records in the same order.

In [None]:
# STEP 2: Load and merge data
# Reads the feature table and the label table, verifies that they can be
# aligned row-by-row, and combines them side-by-side into `data`.
VALUES = 'values.csv'
LABELS = 'labels.csv'

print('Files in working directory:', os.listdir('.'))

# Fail early with a clear message if either input file is missing.
for required_file in (VALUES, LABELS):
    if not os.path.exists(required_file):
        raise FileNotFoundError(f"{required_file} not found in working directory.")

values = pd.read_csv(VALUES)
labels = pd.read_csv(LABELS)

print('\nvalues shape:', values.shape)
print('labels shape:', labels.shape)

# A positional merge only makes sense when both tables have the same number
# of rows; otherwise concat would silently pad the shorter one with NaN.
if len(values) != len(labels):
    raise ValueError(
        f"Cannot merge by row order: {VALUES} has {len(values)} rows "
        f"but {LABELS} has {len(labels)} rows."
    )

values_aligned = values.reset_index(drop=True)
labels_aligned = labels.reset_index(drop=True)

# Columns present in both files (e.g. a record identifier) would show up twice
# under the same name after concatenation, which breaks later column selection
# by name. Use them to confirm the rows line up, then keep a single copy.
shared_cols = [c for c in labels_aligned.columns if c in values_aligned.columns]
if shared_cols:
    if not values_aligned[shared_cols].equals(labels_aligned[shared_cols]):
        raise ValueError(
            f"Columns {shared_cols} appear in both files but differ row-by-row; "
            "the files are not in the same order, so a merge by row order "
            "would pair features with the wrong labels."
        )
    labels_aligned = labels_aligned.drop(columns=shared_cols)

# Merge by index (concatenate columns side-by-side)
data = pd.concat([values_aligned, labels_aligned], axis=1)
print('merged data shape:', data.shape)

# Show first few rows
display(data.head())

## STEP 3: Quick data checks
Check for missing values, datatypes, and basic statistics.

In [None]:
# STEP 3: Quick data checks
# Report the column names, per-column dtypes, and per-column null counts
# of the merged frame.
column_names = data.columns.tolist()
print('Columns:', column_names)

column_dtypes = data.dtypes
print('\nData types:\n', column_dtypes)

missing_per_column = data.isna().sum()
print('\nMissing values per column:\n', missing_per_column)

# Descriptive statistics across all columns, numeric and non-numeric alike.
summary_stats = data.describe(include='all')
display(summary_stats)

## STEP 4: Exploratory Data Analysis (simple plots)
Histograms of the first six numeric columns and a correlation heatmap of all numeric columns.

In [None]:
# STEP 4: EDA plots (simple)
# Collect the int64/float64 columns; these drive both plot types below.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_frame.columns.tolist()

# One histogram per numeric column, limited to the first six columns.
for feature in num_cols[:6]:
    plt.figure(figsize=(6,3))
    non_missing = data[feature].dropna()
    plt.hist(non_missing, bins=30)
    plt.title(f'Distribution of {feature}')
    plt.tight_layout()
    plt.show()

# A correlation matrix needs at least two numeric columns to be meaningful.
if len(num_cols) > 1:
    corr = data[num_cols].corr()
    tick_positions = range(len(num_cols))
    plt.figure(figsize=(8,6))
    plt.imshow(corr, aspect='auto')
    plt.colorbar()
    plt.xticks(tick_positions, num_cols, rotation=90)
    plt.yticks(tick_positions, num_cols)
    plt.title('Correlation matrix (numeric features)')
    plt.tight_layout()
    plt.show()

## STEP 5: Prepare data (features X and target y)
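The target column is detected automatically: a list of common target names is checked first; if none is present and `labels.csv` has exactly one column, that column is used; otherwise the last column of the merged data is used.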

In [None]:
# STEP 5: Prepare data for modeling
# Candidate target names, checked in order against the merged frame's columns.
possible_targets = ['target','heart_disease_present','label','diagnosis','has_disease','disease']
target = next(
    (candidate for candidate in possible_targets if candidate in data.columns),
    None,
)

# No known name matched: use the sole labels column when there is exactly one,
# otherwise fall back to the last column of the merged frame.
if target is None:
    target = labels.columns[0] if labels.shape[1] == 1 else data.columns[-1]

print('Using target column:', target)

# Split the frame into the target series and the remaining feature columns.
y = data[target]
X = data.drop(columns=[target])

print('Feature shape:', X.shape, 'Target shape:', y.shape)
print('Target distribution:\n', y.value_counts())

## STEP 6: Preprocessing
Numeric features get median imputation and standard scaling; categorical features get most-frequent imputation and one-hot encoding.

In [None]:
# STEP 6: Preprocessing setup
# Partition the feature columns by dtype. Integer-coded categoricals are
# deliberately left among the numeric features to keep things simple.
cat_features = X.select_dtypes(include=['object','category','bool']).columns.tolist()
num_features = X.select_dtypes(include=['int64','float64']).columns.tolist()

print('Numeric features:', num_features)
print('Categorical features:', cat_features)

# Numeric branch: fill gaps with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ]
)

## STEP 7: Train/test split and Model training
We train Logistic Regression, Random Forest, SVM, and KNN. We'll compare results on the test set.

In [None]:
# STEP 7: Train/test split
# Hold out 20% of the rows for evaluation, stratified on the target so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print('Train:', X_train.shape, 'Test:', X_test.shape)

# Define models
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# ROC AUC cannot be computed when the test labels contain a single class;
# this does not depend on the model, so evaluate it once.
test_has_both_classes = len(y_test.unique()) > 1

results = []
for name, model in models.items():
    print('\nTraining', name)
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('clf', model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Ask for probabilities only from pipelines that expose predict_proba.
    # An explicit capability check keeps genuine failures visible instead of
    # swallowing every exception and silently dropping the ROC AUC.
    if hasattr(pipe, 'predict_proba'):
        # Column 1 holds the probability of the second class in classes_.
        y_proba = pipe.predict_proba(X_test)[:, 1]
    else:
        y_proba = None

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    if y_proba is not None and test_has_both_classes:
        auc = roc_auc_score(y_test, y_proba)
    else:
        auc = None

    print('Accuracy:', acc)
    print('Precision:', prec)
    print('Recall:', rec)
    print('F1:', f1)
    if auc is not None:
        print('ROC AUC:', auc)
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    results.append({'model': name, 'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'roc_auc': auc})

# Rank models by F1, best first.
results_df = pd.DataFrame(results).sort_values(by='f1', ascending=False).reset_index(drop=True)
print('\nModel comparison:')
display(results_df)

## STEP 8: Conclusion and save best model
We refit the best model (by F1 score) on the full dataset, save it to `best_model.joblib`, and provide a short recommendation.

In [None]:
# STEP 8: Save best model and conclusion
# Refit the top-ranked model on all available rows and persist the whole
# pipeline (preprocessing + classifier) to disk.
from sklearn.base import clone

if not results_df.empty:
    best = results_df.iloc[0]['model']
    print('Best model by F1:', best)

    # Clone the estimator that was actually evaluated: this yields a fresh,
    # unfitted copy with identical hyperparameters, so the saved model cannot
    # drift from the configuration held in `models`.
    model_inst = clone(models[best]) if best in models else None

    if model_inst is not None:
        # Clone the preprocessor as well so refitting on the full data does
        # not overwrite the state of the shared `preprocessor` object.
        best_pipe = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('clf', model_inst)])
        # retrain on full data
        best_pipe.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))
        joblib.dump(best_pipe, 'best_model.joblib')
        print('Saved best model pipeline to best_model.joblib')

# Short recommendation
print('\nRecommendation:')
print('- Use the best model to screen patients and flag high-risk cases.')
print('- Collect more labeled data and add domain-specific features for better performance.')
print('- For production, wrap the pipeline with input validation and monitoring.')