In [3]:
"""
Classification model for the provided heart_disease.csv dataset.

Target column: "Heart Disease Status" (Yes / No)
Features:
- Mix of numeric columns (Age, Blood Pressure, Cholesterol Level, BMI, etc.)
- Categorical columns (Gender, Exercise Habits, Smoking, etc.)
"""

from dataclasses import dataclass
from pathlib import Path
from typing import Tuple, List

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


@dataclass
class TrainingConfig:
    """
    Configuration for training the heart disease classifier.
    """
    data_path: Path = Path("/kaggle/input/heart-disease/heart_disease.csv")
    target_column: str = "Heart Disease Status"
    test_size: float = 0.2
    random_state: int = 42


def load_data(config: TrainingConfig) -> pd.DataFrame:
    """
    Load the dataset and perform basic sanity checks.
    """
    if not config.data_path.is_file():
        raise FileNotFoundError(
            f"Dataset file not found at {config.data_path.resolve()}. "
            "Place heart_disease.csv in the same folder as this script."
        )

    df = pd.read_csv(config.data_path)

    if config.target_column not in df.columns:
        raise ValueError(
            f"Target column '{config.target_column}' not found. "
            f"Available columns: {list(df.columns)}"
        )

    return df


def split_features_and_target(
    df: pd.DataFrame, config: TrainingConfig
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Separate feature matrix X and target vector y.
    """
    X = df.drop(columns=[config.target_column])
    y = df[config.target_column]

    return X, y


def get_feature_types(X: pd.DataFrame) -> Tuple[List[str], List[str]]:
    """
    Identify numeric and categorical feature columns.
    """
    numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_features = X.select_dtypes(include=["object", "bool"]).columns.tolist()

    return numeric_features, categorical_features


def build_pipeline(
    numeric_features: List[str],
    categorical_features: List[str],
    config: TrainingConfig,
) -> Pipeline:
    """
    Build a preprocessing + model pipeline.

    - Numeric features:
        * Impute missing values with median.
        * Standardize with StandardScaler.
    - Categorical features:
        * Impute missing values with most frequent category.
        * One-hot encode.
    - Model:
        * RandomForestClassifier.
    """
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    model = RandomForestClassifier(
        n_estimators=200,
        random_state=config.random_state,
        n_jobs=-1,
    )

    clf = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("model", model),
        ]
    )

    return clf


def evaluate_model(model: Pipeline, X_test: pd.DataFrame, y_test: pd.Series) -> None:
    """
    Evaluate the trained model on the test set and print metrics.
    """
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}\n")

    print("Classification Report:")
    print(classification_report(y_test, y_pred))


def main() -> None:
    """
    Run the full ML pipeline on heart_disease.csv.
    """
    config = TrainingConfig()

    # 1. Load data
    df = load_data(config)

    # 2. Split into features and target
    X, y = split_features_and_target(df, config)

    # 3. Identify feature types
    numeric_features, categorical_features = get_feature_types(X)

    # 4. Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=config.test_size,
        random_state=config.random_state,
        stratify=y,  # preserve class balance
    )

    # 5. Build pipeline and train
    pipeline = build_pipeline(numeric_features, categorical_features, config)
    pipeline.fit(X_train, y_train)

    # 6. Evaluate
    evaluate_model(pipeline, X_test, y_test)


if __name__ == "__main__":
    main()

Test Accuracy: 0.8000

Classification Report:
              precision    recall  f1-score   support

          No       0.80      1.00      0.89      1600
         Yes       0.00      0.00      0.00       400

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.64      0.80      0.71      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
