# Import libraries

In [10]:
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)

import matplotlib.pyplot as plt

# Load Data

In [4]:
DATA_PATH = Path("..") / "data" / "raw" / "heart_disease_uci.csv"
df = pd.read_csv(DATA_PATH)

print(df.shape)
display(df.head())
display(df.describe(include="all").T.head(20))
print("Missing values per column:\n", df.isna().sum().sort_values(ascending=False).head(20))


(920, 16)


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,920.0,,,,460.5,265.725422,1.0,230.75,460.5,690.25,920.0
age,920.0,,,,53.51087,9.424685,28.0,47.0,54.0,60.0,77.0
sex,920.0,2.0,Male,726.0,,,,,,,
dataset,920.0,4.0,Cleveland,304.0,,,,,,,
cp,920.0,4.0,asymptomatic,496.0,,,,,,,
trestbps,861.0,,,,132.132404,19.06607,0.0,120.0,130.0,140.0,200.0
chol,890.0,,,,199.130337,110.78081,0.0,175.0,223.0,268.0,603.0
fbs,830.0,2.0,False,692.0,,,,,,,
restecg,918.0,3.0,normal,551.0,,,,,,,
thalch,865.0,,,,137.545665,25.926276,60.0,120.0,140.0,157.0,202.0


Missing values per column:
 ca          611
thal        486
slope       309
fbs          90
oldpeak      62
trestbps     59
exang        55
thalch       55
chol         30
restecg       2
cp            0
dataset       0
id            0
age           0
sex           0
num           0
dtype: int64


# 2) Define target / basic cleaning

In [5]:
# --- pick target column (adjust if needed) ---
possible_targets = ["target", "num", "condition", "label", "y"]
target_col = None
for c in possible_targets:
    if c in df.columns:
        target_col = c
        break

if target_col is None:
    raise ValueError(f"Cannot find target column. Columns: {df.columns.tolist()}")

print("Using target_col =", target_col)

# --- make y binary if needed ---
y_raw = df[target_col]

# If target is like {0,1} already -> keep
# If target is like {0,1,2,3,4} -> map to binary (>=1 => 1)
if y_raw.nunique() > 2:
    y = (y_raw.astype(float) >= 1).astype(int)
else:
    y = y_raw.astype(int)

X = df.drop(columns=[target_col])

print("X shape:", X.shape, "y distribution:\n", pd.Series(y).value_counts(normalize=True))


Using target_col = num
X shape: (920, 15) y distribution:
 num
1    0.553261
0    0.446739
Name: proportion, dtype: float64


# 3) Split (stratify)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (736, 15) Test: (184, 15)


# 4) Preprocessing: impute + scaling (compare scalers)

In [8]:
# identify numeric vs categorical columns
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

print("num_cols:", len(num_cols), "cat_cols:", len(cat_cols))
if cat_cols:
    print("categorical columns:", cat_cols[:20])


num_cols: 7 cat_cols: 8
categorical columns: ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']


In [11]:
numeric_preprocess = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),   # placeholder, later overridden by grid
])

categorical_preprocess = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_preprocess, num_cols),
        ("cat", categorical_preprocess, cat_cols),
    ],
    remainder="drop"
)

pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("knn", KNeighborsClassifier())
])

# 5) Build pipeline + GridSearchCV

grid design（scaling、selection of k、metric+ weights）

scaling: StandardScaler vs MinMaxScaler vs “no scaling”

metric: euclidean / manhattan / minkowski(p=2) / minkowski(p=1)

k: [1,3,5,7,9,11,15,21,31]

weights: uniform / distance

metric="euclidean" / "manhattan" 

or metric="minkowski", p=2（equivalent to euclidean），p=1（equivalent to manhattan）

In [12]:
clf = KNeighborsClassifier()

pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("knn", clf)
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

k_grid = [1,3,5,7,9,11,15,21,31]

param_grid = [
    # StandardScaler
    {
        "preprocess__num__scaler": [StandardScaler()],
        "knn__n_neighbors": k_grid,
        "knn__weights": ["uniform", "distance"],
        "knn__metric": ["euclidean", "manhattan"],
    },
    # MinMaxScaler
    {
        "preprocess__num__scaler": [MinMaxScaler()],
        "knn__n_neighbors": k_grid,
        "knn__weights": ["uniform", "distance"],
        "knn__metric": ["euclidean", "manhattan"],
    },
    # No scaling: use "passthrough" scaler
    {
        "preprocess__num__scaler": ["passthrough"],
        "knn__n_neighbors": k_grid,
        "knn__weights": ["uniform", "distance"],
        "knn__metric": ["euclidean", "manhattan"],
    }
]

# scoring: choose one primary for refit, keep others for reporting
scoring = {
    "acc": "accuracy",
    "f1": "f1",
    "roc_auc": "roc_auc"
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scoring,
    refit="roc_auc",   # pick primary refit metric (roc_auc or f1)
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=False
)

gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("Best CV score (refit metric):", gs.best_score_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best params: {'knn__metric': 'manhattan', 'knn__n_neighbors': 31, 'knn__weights': 'distance', 'preprocess__num__scaler': StandardScaler()}
Best CV score (refit metric): 0.9098265891490824


# 6) Result table (sorted)

In [13]:
results = pd.DataFrame(gs.cv_results_)

cols_show = [
    "mean_test_acc", "mean_test_f1", "mean_test_roc_auc",
    "param_preprocess__num__scaler",
    "param_knn__metric", "param_knn__weights", "param_knn__n_neighbors",
    "rank_test_roc_auc"
]

res_view = results[cols_show].copy()

# cleaner display for scaler
def scaler_name(x):
    if x == "passthrough":
        return "none"
    return x.__class__.__name__

res_view["scaler"] = res_view["param_preprocess__num__scaler"].apply(scaler_name)
res_view = res_view.drop(columns=["param_preprocess__num__scaler"])

res_view = res_view.rename(columns={
    "param_knn__metric": "metric",
    "param_knn__weights": "weights",
    "param_knn__n_neighbors": "k",
})

res_view = res_view.sort_values(by="mean_test_roc_auc", ascending=False)

display(res_view.head(20))


Unnamed: 0,mean_test_acc,mean_test_f1,mean_test_roc_auc,metric,weights,k,rank_test_roc_auc,scaler
35,0.836955,0.853841,0.909827,manhattan,distance,31,1,StandardScaler
33,0.831541,0.849026,0.909489,manhattan,distance,21,2,StandardScaler
34,0.836946,0.854189,0.908025,manhattan,uniform,31,3,StandardScaler
32,0.826108,0.843688,0.906807,manhattan,uniform,21,4,StandardScaler
31,0.832892,0.850398,0.904459,manhattan,distance,15,5,StandardScaler
29,0.83018,0.848908,0.904367,manhattan,distance,11,6,StandardScaler
27,0.835622,0.85461,0.903967,manhattan,distance,9,7,StandardScaler
30,0.839676,0.855897,0.903034,manhattan,uniform,15,8,StandardScaler
28,0.835613,0.854294,0.902996,manhattan,uniform,11,9,StandardScaler
26,0.842407,0.860611,0.901811,manhattan,uniform,9,10,StandardScaler


# 8) Evaluate best model on test set

In [14]:
best_model = gs.best_estimator_

y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]  # requires predict_proba

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print({"test_acc": acc, "test_f1": f1, "test_roc_auc": auc})
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


{'test_acc': 0.8532608695652174, 'test_f1': 0.8720379146919431, 'test_roc_auc': 0.9370516499282641}
Confusion matrix:
 [[65 17]
 [10 92]]
              precision    recall  f1-score   support

           0     0.8667    0.7927    0.8280        82
           1     0.8440    0.9020    0.8720       102

    accuracy                         0.8533       184
   macro avg     0.8554    0.8473    0.8500       184
weighted avg     0.8541    0.8533    0.8524       184



In [15]:
y_pred

array([0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1])

In [19]:
np.array(y_test)

array([1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1])