Import data

In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import os
from IPython.display import display

# Load the heart-disease dataset into `data`.
# The CSV is expected one directory above the current working directory,
# under data/heart.csv.
data_file = os.path.join(os.getcwd(), '..', 'data', 'heart.csv')

# Fail fast when the file is missing. Merely displaying a message would let
# execution continue with `data` undefined (or, on a warm kernel, with a stale
# `data` left over from an earlier run), which surfaces later as a confusing
# error or as silently wrong results.
if not os.path.exists(data_file):
    raise FileNotFoundError(f"数据文件未找到: {data_file}")

try:
    data = pd.read_csv(data_file)
except Exception as e:
    # Show the readable message, then re-raise so the failure is not swallowed
    # and the original traceback is preserved.
    display(f"加载数据时出现错误: {e}")
    raise

print(f"数据加载成功，形状: {data.shape}")  # expected shape: (918, 12)

数据加载成功，形状: (918, 12)


Preprocessing

In [15]:

    # Preprocessing pipeline: choose the train/val/test rows, then impute,
    # encode and scale `data`, and finally materialise the split frames.
    #
    # The rows are chosen *before* any statistic is computed so that the
    # imputation median and the scaler's mean/std come from training rows only;
    # computing them on the full dataset leaks validation/test information into
    # the training features.
    #
    # NOTE: this cell mutates `data` in place. Re-running it without re-running
    # the load cell operates on already-processed values (the zero counts
    # printed below will then be 0).

    # Row selection below is label-based, so duplicate labels would be ambiguous.
    assert data.index.is_unique, "row labels must be unique for label-based splitting"

    y = data['HeartDisease']

    # 80% train / 20% temporary hold-out, stratified on the target. Only the row
    # labels are split here; the partition depends on `y`, the row count and
    # `random_state`, not on the feature values.
    train_idx, temp_idx = train_test_split(
        data.index,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # Halve the hold-out into validation and test (10% of the data each).
    val_idx, test_idx = train_test_split(
        temp_idx,
        test_size=0.5,
        random_state=42,
        stratify=y.loc[temp_idx],
    )

    # --- Outlier handling ---
    # deng.wei: a resting blood pressure or cholesterol of 0 is not plausible,
    # so zeros in these columns are treated as missing values.
    bp_zero_count = (data['RestingBP'] == 0).sum()
    chol_zero_count = (data['Cholesterol'] == 0).sum()
    for col in ('RestingBP', 'Cholesterol'):
        train_values = data.loc[train_idx, col]
        # Exclude the zeros themselves from the median; including them drags
        # the fill value towards 0 when many entries are missing.
        fill_value = train_values[train_values != 0].median()
        data[col] = data[col].replace(0, fill_value)
    print("异常值处理完成")
    print(f"异常值处理完成：替换了{bp_zero_count}个血压0值，{chol_zero_count}个胆固醇0值")

    # --- Categorical encoding ---
    categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

    # Keep every fitted encoder so the integer codes can be mapped back to the
    # original categories (or applied to new data) later on.
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
    print("类别特征编码完成")

    # --- Standardise numeric features ---
    numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
    scaler = StandardScaler()
    # Fit on the training rows only, then apply the same transform to all rows.
    scaler.fit(data.loc[train_idx, numerical_cols])
    data[numerical_cols] = scaler.transform(data[numerical_cols])
    print("数值特征标准化完成")

    # --- Materialise the splits from the processed frame ---
    X = data.drop('HeartDisease', axis=1)

    X_train, y_train = X.loc[train_idx], y.loc[train_idx]
    X_temp, y_temp = X.loc[temp_idx], y.loc[temp_idx]
    X_val, y_val = X.loc[val_idx], y.loc[val_idx]
    X_test, y_test = X.loc[test_idx], y.loc[test_idx]

    # Sanity check: the three splits together cover every row exactly once.
    assert len(X_train) + len(X_val) + len(X_test) == len(X)

    print("数据集划分完成：")
    print(f"训练集 {X_train.shape} ({len(X_train)/len(X):.1%})")
    print(f"验证集 {X_val.shape} ({len(X_val)/len(X):.1%})")
    print(f"测试集 {X_test.shape} ({len(X_test)/len(X):.1%})")

异常值处理完成
异常值处理完成：替换了0个血压0值，0个胆固醇0值
类别特征编码完成
数值特征标准化完成
数据集划分完成：
训练集 (734, 11) (80.0%)
验证集 (92, 11) (10.0%)
测试集 (92, 11) (10.0%)


Exploratory Data Analysis 

In [None]:
Model evaluation