# Modelling (Refactored)

- 前置：需先运行 `DataCleaning&EDA_refactored.ipynb` 生成 `heart_data_cleaned.csv`
- 任务：训练分类模型（含聚类特征）、评估、导出模型与元数据
- 产物（保存在 `outputs/` 目录下）：`best_classification_model.joblib`、`best_model_metadata.json`、`sample_input_5rows.csv`
- 不包含：数据清洗、部署/Streamlit



In [3]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# Cleaned dataset read by this notebook.
DATA_PATH = "heart_data_cleaned.csv"

# Directory that receives every exported artifact; created on first run.
ARTIFACT_DIR = Path("outputs")
ARTIFACT_DIR.mkdir(exist_ok=True)

# Individual artifact locations inside ARTIFACT_DIR.
MODEL_PATH = ARTIFACT_DIR.joinpath("best_classification_model.joblib")
META_PATH = ARTIFACT_DIR.joinpath("best_model_metadata.json")
SAMPLE_PATH = ARTIFACT_DIR.joinpath("sample_input_5rows.csv")

# 1) Load the cleaned data

df = pd.read_csv(DATA_PATH)
print("Clean shape:", df.shape)

# 2) Target and features

# The target may be stored either as "No"/"Yes" strings or already as numbers;
# both forms end up as an integer 0/1 Series.
y = df["HeartDisease"]
if y.dtype == object:
    y = y.map({"No": 0, "Yes": 1})
y = y.astype(int)

# Everything except the target is a model input.
X = df.drop(columns=["HeartDisease"])

# 3) Custom cluster feature

class ClusterFeatureAdder(BaseEstimator, TransformerMixin):
    """Append a KMeans cluster id to a DataFrame as the ``cluster_label`` column.

    The selected columns are standardized, clustered with KMeans, and the
    predicted cluster of every row is added as a string-valued column so it
    can be one-hot encoded downstream.

    Parameters
    ----------
    cluster_cols : sequence of str or None, default=None
        Columns to cluster on. ``None`` (or an empty selection) falls back to
        ``DEFAULT_CLUSTER_COLS``.
    n_clusters : int, default=4
        Number of KMeans clusters.
    random_state : int, default=42
        Seed passed to KMeans.

    Attributes
    ----------
    cols_ : list of str
        Columns actually used for clustering.
    scaler_ : StandardScaler
        Scaler fitted on ``cols_``.
    kmeans_ : KMeans
        Clustering model fitted on the standardized columns.
    """

    # Columns clustered on when ``cluster_cols`` is not provided.
    DEFAULT_CLUSTER_COLS = ("Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak")

    def __init__(self, cluster_cols=None, n_clusters=4, random_state=42):
        self.cluster_cols = cluster_cols
        self.n_clusters = n_clusters
        self.random_state = random_state

    @staticmethod
    def _fill_missing(Z):
        """Replace NaNs in standardized data with 0, i.e. the column mean.

        StandardScaler ignores NaNs when fitting and passes them through when
        transforming, but KMeans rejects them. This step runs before any
        imputer in the surrounding pipeline, so missing values must be
        neutralised here. Data without NaNs is returned with unchanged values.
        """
        return np.where(np.isnan(Z), 0.0, Z)

    def fit(self, X, y=None):
        """Fit the scaler and KMeans on the clustering columns of ``X``.

        ``X`` must support column selection by a list of names (DataFrame).
        ``y`` is ignored. Returns ``self``.
        """
        # Materialise to a list so any sequence of names (tuple, pandas Index,
        # ndarray) works; a bare truthiness test would raise on those types.
        cols = [] if self.cluster_cols is None else list(self.cluster_cols)
        if not cols:
            cols = list(self.DEFAULT_CLUSTER_COLS)
        self.cols_ = cols
        self.scaler_ = StandardScaler()
        Z = self._fill_missing(self.scaler_.fit_transform(X[cols]))
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=self.random_state)
        self.kmeans_.fit(Z)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the extra ``cluster_label`` column.

        The input frame is not modified. Labels are stored as strings so the
        column is treated as categorical rather than ordinal.
        """
        Xc = X.copy()
        Z = self._fill_missing(self.scaler_.transform(Xc[self.cols_]))
        labels = self.kmeans_.predict(Z)
        Xc["cluster_label"] = labels.astype(str)
        return Xc

# 4) Preprocessing

# Select numeric columns by dtype family rather than by the exact widths
# int64/float64: a column stored as int32, float32, uint8, ... would otherwise
# land in neither list and be silently dropped by the ColumnTransformer.
# "number" does not match bool, so the two lists stay disjoint.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(include=["object", "bool", "category"]).columns.tolist()

# Numeric branch: median imputation, then standardization.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: mode imputation, then one-hot encoding. Categories not
# seen during fit are ignored at transform time (all-zero row for that
# feature) instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

def make_preprocessor():
    """Build the ColumnTransformer used as the ``prep`` step of a pipeline.

    Numeric columns go through ``numeric_transformer``; the categorical
    columns plus the ``cluster_label`` column go through
    ``categorical_transformer``.
    """
    categorical_columns = categorical_features + ["cluster_label"]
    branches = [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_columns),
    ]
    return ColumnTransformer(branches)

# 5) Split the data

# 70/30 hold-out split; stratifying on y keeps the class ratio the same in
# both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.30, random_state=42)

# 6) Pipeline + GridSearch (Logistic and RF)

# Configuration template for the clustering step. Pipeline does not clone its
# steps, so placing this single object in both pipelines would make them share
# (and overwrite) one fitted transformer if either pipeline were fitted
# directly. Each pipeline therefore receives its own unfitted copy.
cluster = ClusterFeatureAdder()


def _cluster_step():
    """Return a new, unfitted ClusterFeatureAdder configured like ``cluster``."""
    return ClusterFeatureAdder(**cluster.get_params())


logi = Pipeline([
    ("cluster", _cluster_step()),
    ("prep", make_preprocessor()),
    ("model", LogisticRegression(max_iter=200, n_jobs=-1)),
])

rf = Pipeline([
    ("cluster", _cluster_step()),
    ("prep", make_preprocessor()),
    ("model", RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# Hyper-parameter grids; the number of clusters is tuned jointly with each
# model's own parameters.
logi_grid = {
    "cluster__n_clusters": [3, 4, 5],
    "model__C": [0.1, 0.5, 1.0],
    "model__penalty": ["l2"],
    "model__solver": ["lbfgs"],
}

rf_grid = {
    "cluster__n_clusters": [3, 4, 5],
    "model__n_estimators": [200, 400],
    "model__max_depth": [None, 8, 12],
    "model__min_samples_leaf": [1, 2],
}

# Run a 4-fold grid search per candidate, scored on recall, and keep every
# fitted search as a (name, search) pair for the selection step.
models = []
for name, pipe, grid in [
    ("Logistic", logi, logi_grid),
    ("RandomForest", rf, rf_grid),
]:
    gs = GridSearchCV(pipe, grid, cv=4, scoring="recall", n_jobs=-1)
    gs.fit(X_train, y_train)
    models.append((name, gs))
    print(f"{name} best params: {gs.best_params_}, best CV recall: {gs.best_score_:.3f}")

# 7) Select the best model and evaluate it on the test set

def eval_model(model, Xte, yte, threshold=0.5):
    """Compute binary-classification metrics for ``model`` on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    Xte : features to score
    yte : true labels
    threshold : float, default=0.5
        A row is predicted positive when its positive-class probability is
        greater than or equal to this value. The default reproduces the
        previous fixed cut-off.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall``, ``f1`` (computed from the
        thresholded predictions), ``roc_auc`` (computed from the raw
        probabilities, so it does not depend on ``threshold``) and ``cm``,
        the confusion matrix as nested lists.
    """
    # Column 1 of predict_proba is the probability of the positive class.
    proba = model.predict_proba(Xte)[:, 1]
    pred = (proba >= threshold).astype(int)
    return {
        "accuracy": accuracy_score(yte, pred),
        "precision": precision_score(yte, pred),
        "recall": recall_score(yte, pred),
        "f1": f1_score(yte, pred),
        "roc_auc": roc_auc_score(yte, proba),
        # .tolist() turns the ndarray into plain lists so the result can be
        # passed to json.dumps.
        "cm": confusion_matrix(yte, pred).tolist(),
    }

# The winner is the search with the highest mean cross-validated score; its
# refitted best estimator is what gets evaluated and exported.
best_name, best_gs = max(models, key=lambda entry: entry[1].best_score_)
best_model = best_gs.best_estimator_

# Hold-out performance of the selected model.
metrics = eval_model(best_model, X_test, y_test)

print("=== Best Model ===")
print("Name:", best_name)
print("Test metrics:", metrics)

# 8) Export the model and its metadata

# Persist the fitted pipeline.
joblib.dump(best_model, MODEL_PATH)
print("Saved model ->", MODEL_PATH)

# Metadata written next to the model: which model won, its tuned parameters,
# its test-set metrics and the input columns it was trained on.
meta = dict(
    final_model_name=best_name,
    best_params=best_gs.best_params_,
    test_set_performance=metrics,
    expected_input_columns=X.columns.tolist(),
)
META_PATH.write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("Saved meta ->", META_PATH)

# Export a sample input: five feature rows drawn with a fixed seed.
X.sample(n=5, random_state=42).to_csv(SAMPLE_PATH, index=False)
print("Saved sample ->", SAMPLE_PATH)

print("下一步：在 Streamlit_refactored.ipynb 或 app.py 启动前端。")



Clean shape: (918, 12)
Logistic best params: {'cluster__n_clusters': 3, 'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}, best CV recall: 0.884
RandomForest best params: {'cluster__n_clusters': 5, 'model__max_depth': None, 'model__min_samples_leaf': 2, 'model__n_estimators': 200}, best CV recall: 0.890
=== Best Model ===
Name: RandomForest
Test metrics: {'accuracy': 0.8913043478260869, 'precision': 0.896774193548387, 'recall': 0.9084967320261438, 'f1': 0.9025974025974026, 'roc_auc': 0.9410701950156757, 'cm': [[107, 16], [14, 139]]}
Saved model -> outputs/best_classification_model.joblib
Saved meta -> outputs/best_model_metadata.json
Saved sample -> outputs/sample_input_5rows.csv
下一步：在 Streamlit_refactored.ipynb 或 app.py 启动前端。
