In [1]:
import pandas as pd
import numpy as np
import os
import sys

In [2]:
def add_project_root_to_sys_path(target_file="config.py"):
    """Make the project root importable by adding it to ``sys.path``.

    Starting from the current working directory, walk up through the parent
    directories until one is found that contains a regular file named
    ``target_file``; append that directory to ``sys.path`` unless it is
    already listed.

    Args:
        target_file: Name of the marker file that identifies the project
            root. Defaults to ``"config.py"``.

    Raises:
        FileNotFoundError: If the filesystem root is reached without finding
            ``target_file``.
    """
    current_path = os.getcwd()
    while True:
        # Check for the marker with isfile() rather than scanning os.listdir():
        # a sub-directory that merely shares the marker's name is not a match,
        # and an ancestor whose contents cannot be listed does not abort the
        # search.
        if os.path.isfile(os.path.join(current_path, target_file)):
            if current_path not in sys.path:
                sys.path.append(current_path)
            return

        # Not found here: continue one level up.
        parent_path = os.path.dirname(current_path)
        if parent_path == current_path:
            # dirname() of the filesystem root is the root itself, so there is
            # nowhere left to search.
            raise FileNotFoundError(f"未找到包含 {target_file} 的目录")
        current_path = parent_path

# Make the directory that contains config.py importable for this session.
add_project_root_to_sys_path()

In [3]:
from config import arrhythmia_raw_data_path, arrhythmia_data_path

In [4]:
# Shorter alias for the raw-data directory path imported from config.
data_path = arrhythmia_raw_data_path

In [5]:
def list_folder_contents(folder_path):
    """Print the name of every entry directly inside ``folder_path``.

    A header line is printed first, followed by one entry name per line, in
    the order returned by ``os.listdir``. If the folder does not exist, a
    notice is printed instead and no exception is raised.

    Args:
        folder_path: Path of the directory to list.
    """
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"文件夹 '{folder_path}' 不存在！")
        return

    print(f"文件夹 '{folder_path}' 下的内容：")
    for entry in entries:
        print(entry)

In [6]:
# Print the entries found in the raw-data directory.
list_folder_contents(data_path)

文件夹 'C:\Users\26494\PycharmProjects\MOO-HFS\dataset\Arrhythmia' 下的内容：
arrhythmia.data
arrhythmia.names


In [7]:
# Load the raw data file: it has no header row (header=None), and '?' entries
# are parsed as missing values.
df = pd.read_csv(os.path.join(data_path, "arrhythmia.data"), header=None, na_values='?')

In [8]:
# Name the columns: features are labelled '1'..'N' and the last raw column is
# the target, labelled 'Class'.
num_features = df.shape[1] - 1
feature_names = [f'{i+1}' for i in range(num_features)] + ['Class']

# Rename only while the frame is still unlabelled. Once 'Class' has been moved
# to the front (below), re-running this cell would otherwise assign the
# positional names again and silently label the target column as feature '1'.
if 'Class' not in df.columns:
    df.columns = feature_names

# Move the class column to the first position.
cols = ['Class'] + [col for col in df.columns if col != 'Class']
df = df[cols]

In [9]:
# Class labels to keep when filtering the dataset.
selected_classes = [1, 10, 2, 6, 16, 3, 4, 5, 9]

In [10]:
# Keep only the rows whose label is one of the selected classes.
class_mask = df['Class'].isin(selected_classes)
df_filtered = df.loc[class_mask]

# Report the resulting shape and the number of samples per class label.
class_counts = df_filtered['Class'].value_counts().sort_index()
print(f"筛选后数据大小：{df_filtered.shape}")
print(class_counts)

筛选后数据大小：(438, 280)
Class
1     245
2      44
3      15
4      15
5      13
6      25
9       9
10     50
16     22
Name: count, dtype: int64


In [11]:
# Preview the first rows of the filtered frame.
df_filtered.head()

Unnamed: 0,Class,1,2,3,4,5,6,7,8,9,...,270,271,272,273,274,275,276,277,278,279
1,6,56,1,165,64,81,174,401,149,39,...,-0.5,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8
2,10,54,0,172,95,138,163,386,185,102,...,0.9,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0
3,1,55,0,175,94,100,202,380,179,143,...,0.1,0.0,12.2,-2.2,0.0,0.0,0.4,2.6,34.6,61.6
6,1,40,1,160,52,77,129,377,133,77,...,-0.4,0.0,6.5,0.0,0.0,0.0,0.4,1.0,14.3,20.5
7,1,49,1,162,54,78,0,376,157,70,...,-0.3,0.0,8.2,-1.9,0.0,0.0,0.1,0.5,15.8,19.8


In [12]:
# (Disabled) optional export of the filtered, not-yet-cleaned subset:
# df_filtered.to_csv('Arrhythmia_raw.csv', index=False)
# Work on an independent copy so later cleaning steps leave df_filtered intact.
data = df_filtered.copy()

In [13]:
# Per-column missing-value statistics: absolute count and share of rows.
missing_count = data.isna().sum()
missing_ratio = missing_count / len(data)

# Assemble both series into one table, most-affected columns first.
missing_df = (
    missing_count.to_frame(name='MissingCount')
    .assign(MissingRatio=missing_ratio)
    .sort_values(by='MissingCount', ascending=False)
)

print(missing_df.head(20))  # show the 20 columns with the most missing values

       MissingCount  MissingRatio
14              367      0.837900
12               17      0.038813
11                8      0.018265
13                1      0.002283
Class             0      0.000000
192               0      0.000000
191               0      0.000000
190               0      0.000000
189               0      0.000000
188               0      0.000000
187               0      0.000000
194               0      0.000000
186               0      0.000000
185               0      0.000000
184               0      0.000000
183               0      0.000000
182               0      0.000000
181               0      0.000000
180               0      0.000000
179               0      0.000000


In [14]:
# Drop the feature that is missing in most rows. A new frame is assigned
# instead of mutating in place, and errors='ignore' makes the cell safe to
# re-run after the column is already gone (the in-place drop raised KeyError
# on a second run).
data = data.drop(columns=['14'], errors='ignore')

In [15]:
# Remaining columns that contain missing values and are imputed below.
cols_to_impute = ['12', '11', '13']

In [21]:
# Impute on a copy so `data` keeps its missing values.
df_group_imputed = data.copy()

for feature in cols_to_impute:
    # Mean of this feature within each row's own class (NaNs are skipped),
    # broadcast back to one value per row.
    # NOTE(review): the fill value depends on the row's Class label; confirm
    # this is intended if Class is later used as a prediction target.
    per_class_mean = df_group_imputed.groupby('Class')[feature].transform('mean')

    # Only missing entries take the class mean; observed values are kept.
    df_group_imputed[feature] = df_group_imputed[feature].fillna(per_class_mean)

# Count the NaNs left in the imputed columns to confirm the fill.
print(df_group_imputed[cols_to_impute].isna().sum())

12    0
11    0
13    0
dtype: int64


In [24]:
# Collect the columns of df_group_imputed that hold a single distinct value
# across all rows (such columns carry no information).
constant_cols = [
    name
    for name in df_group_imputed.columns
    if len(df_group_imputed[name].unique()) <= 1
]

print("下面的列列在所有数据中完全相同:")
print(constant_cols)

下列列在所有数据中完全相同:
['20', '68', '70', '84', '132', '133', '140', '142', '144', '146', '152', '157', '158', '165', '205', '265', '275']


In [25]:
# Remove the constant columns. errors='ignore' keeps this cell re-runnable:
# on a second run the listed columns are already gone, and a strict drop
# would raise KeyError.
df_group_imputed = df_group_imputed.drop(columns=constant_cols, errors='ignore')

In [27]:
# Print the per-class sample counts of the cleaned frame, ordered by class label.
print(df_group_imputed['Class'].value_counts().sort_index())

Class
1     245
2      44
3      15
4      15
5      13
6      25
9       9
10     50
16     22
Name: count, dtype: int64


In [28]:
# Print the (rows, columns) shape of the cleaned frame.
print(df_group_imputed.shape)

(438, 262)


In [29]:
# Write the cleaned frame as CSV to the path configured in config
# (arrhythmia_data_path), without the index column.
df_group_imputed.to_csv(arrhythmia_data_path, index=False)