Import data

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import os
from IPython.display import display

# Step up one directory level from the working directory to reach the dataset.
data_file = os.path.join(os.getcwd(), '..', 'data', 'heart.csv')

# Fail fast when the dataset is missing. Merely displaying a message would let
# the notebook carry on with `data` undefined (or left over from an earlier
# kernel session), and the cells below would then break far from the real cause.
if not os.path.exists(data_file):
    raise FileNotFoundError(f"数据文件未找到: {data_file}")

# Load the data.
try:
    data = pd.read_csv(data_file)
except Exception as e:
    # Show the read error, then re-raise so execution stops instead of
    # continuing without a freshly loaded `data`.
    display(f"加载数据时出现错误: {e}")
    raise
print(f"数据加载成功，形状: {data.shape}")

数据加载成功，形状: (918, 12)


Preprocessing

In [7]:


    # Invalid-value handling: a reading of 0 in these columns is treated as an
    # invalid measurement and replaced by the column median. The median is taken
    # over the non-zero readings only, so the placeholders being replaced do not
    # pull the replacement value down.
    for col in ['RestingBP', 'Cholesterol']:
        valid_median = data.loc[data[col] != 0, col].median()
        data[col] = data[col].replace(0, valid_median)
    print("异常值处理完成")

    # Categorical encoding: each listed column is replaced by integer codes,
    # using a fresh LabelEncoder per column.
    categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

    for col in categorical_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
    print("类别特征编码完成")

    # Train/test split. This happens BEFORE scaling so that the scaler never
    # sees the held-out rows (fitting it on the full dataset leaks test-set
    # statistics into the training features).
    X = data.drop('HeartDisease', axis=1)
    y = data['HeartDisease']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Standardize numerical features: the mean/std are learned from the training
    # rows only, and the same transform is then applied to every feature frame.
    numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
    scaler = StandardScaler()
    scaler.fit(X_train[numerical_cols])

    # Take explicit copies so the column assignments below write to independent
    # frames rather than to slices of `X`.
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
    # `X` is scaled with the same train-fitted scaler so it stays consistent
    # with X_train / X_test. `data` itself is left in its original units, which
    # keeps this cell idempotent when it is re-run.
    X[numerical_cols] = scaler.transform(X[numerical_cols])
    print("数值特征标准化完成")
    print(f"数据集划分完成：训练集 {X_train.shape}, 测试集 {X_test.shape}")


异常值处理完成
类别特征编码完成
数值特征标准化完成
数据集划分完成：训练集 (734, 11), 测试集 (184, 11)


Exploratory Data Analysis 

In [None]:
Model evaluation