# 04 – Supervised Learning (Classification)

Models: Logistic Regression, Decision Tree, Random Forest, SVM.
Evaluation: Accuracy, Precision, Recall, F1, ROC AUC.
Use a consistent preprocessing pipeline and the same cross-validation scheme for every model.

In [None]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import RocCurveDisplay, roc_auc_score
import matplotlib.pyplot as plt
import joblib

# Load the dataset and split it into the label column and the feature matrix.
df = pd.read_csv(Path('../data/heart_disease.csv'))
target_col = 'target' if 'target' in df.columns else 'num'
y = df[target_col]
# The scorers used below ('precision', 'recall', 'f1', 'roc_auc') are binary
# scorers, and the evaluation cell reads predict_proba[:, 1], so the label must
# have exactly two classes. A numeric label with more than two levels is
# collapsed to 0 (zero) vs 1 (any positive value).
# NOTE(review): presumably the raw 'num' column grades disease severity with
# 0 meaning "no disease" -- confirm against the dataset's data dictionary.
if y.nunique() > 2 and pd.api.types.is_numeric_dtype(y):
    y = (y > 0).astype(int)
X = df.drop(columns=[target_col])

# Anything that is not numeric (object, category, string, ...) is treated as
# categorical so it is never sent to median imputation / scaling.
categorical = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
numeric = [c for c in X.columns if c not in categorical]

# Numeric columns: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical columns: mode imputation, then one-hot encoding; categories
# unseen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric),
    ('cat', categorical_transformer, categorical),
])

# Stratified 80/20 hold-out split; the test part is not touched by the
# cross-validation below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Candidate baseline classifiers, seeded where the estimator accepts a seed.
models = {
    'log_reg': LogisticRegression(max_iter=1000),
    'dtree': DecisionTreeClassifier(random_state=42),
    'rf': RandomForestClassifier(n_estimators=300, random_state=42),
    # probability=True is required so the SVM exposes predict_proba.
    'svm': SVC(probability=True, kernel='rbf', random_state=42),
}

# Metric name -> scikit-learn scorer name.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc',
}

# 5-fold cross-validation on the training split only. Preprocessing lives
# inside the pipeline, so it is refit on each training fold.
results = []
for name, clf in models.items():
    pipe = Pipeline([('prep', preprocessor), ('clf', clf)])
    cv_scores = cross_validate(pipe, X_train, y_train, scoring=scoring, cv=5, n_jobs=-1)
    # One row per model holding the mean of every metric across the folds.
    results.append({
        'model': name,
        **{metric: cv_scores[f'test_{metric}'].mean() for metric in scoring},
    })
pd.DataFrame(results)

## 1. Fit the Best Baseline on the Full Training Set & Evaluate on the Test Set

In [None]:
# Pick the model with the highest mean cross-validated ROC AUC, refit it on the
# whole training split, and report its performance on the held-out test split.
from sklearn.base import clone
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

metric_df = pd.DataFrame(results)
best_name = metric_df.sort_values('roc_auc', ascending=False).iloc[0]['model']
print('Best baseline model:', best_name)

# Fit on clones so the shared `preprocessor` and the estimators stored in
# `models` stay unfitted; re-running earlier cells is then unaffected by
# state left behind by this one.
best_clf = clone(models[best_name])
best_pipe = Pipeline([('prep', clone(preprocessor)), ('clf', best_clf)])
best_pipe.fit(X_train, y_train)

y_pred = best_pipe.predict(X_test)
# Column 1 of predict_proba is the probability of the second class in
# classes_ (the positive class for a 0/1 label).
y_proba = best_pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print('Test ROC AUC:', roc_auc_score(y_test, y_proba))
ConfusionMatrixDisplay.from_estimator(best_pipe, X_test, y_test)

In [None]:
# Plot the ROC curve for the test split from the positive-class probabilities
# (`y_proba`) produced by the fitted best pipeline.
RocCurveDisplay.from_predictions(y_test, y_proba)

## 2. Save Baseline Best Pipeline

In [None]:
# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
model_path = Path('../models/baseline_best.pkl')
# Create the output directory first; joblib.dump raises if it does not exist.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_pipe, model_path)
print('Saved baseline model.')

## Notes
- SVM with probability=True incurs Platt scaling cost; acceptable for dataset size.
- For imbalanced classes, also inspect the precision–recall (PR) curve.
- Fix random seeds for reproducibility.