In [1]:
import pandas as pd
import numpy as np
import sys
import os

In [2]:
def add_project_root_to_sys_path(target_file="config.py"):
    """Locate the nearest ancestor directory containing ``target_file`` and
    make it importable.

    The search starts at the current working directory and climbs one level
    at a time. The first directory whose listing contains ``target_file`` is
    appended to ``sys.path`` unless it is already present.

    Args:
        target_file: Name of the marker entry to look for in each directory.

    Raises:
        FileNotFoundError: If the filesystem root is reached without finding
            ``target_file``.
    """
    directory = os.getcwd()

    # Keep climbing while the marker is absent from the current directory.
    while target_file not in os.listdir(directory):
        parent = os.path.dirname(directory)
        if parent == directory:
            # dirname() of the root is the root itself: nothing left to search.
            raise FileNotFoundError(f"未找到包含 {target_file} 的目录")
        directory = parent

    # Found it: register the directory once.
    if directory not in sys.path:
        sys.path.append(directory)

# Put the directory that contains config.py on sys.path so that
# `from config import ...` in a later cell can be resolved.
add_project_root_to_sys_path()

In [3]:
from config import p53_raw_data_path, p53_data_path
# Build the processed dataset from the raw p53_Mutants data

  arrhythmia_data_path = os.path.join(DATA_DIR, 'Arrhythmia\Arrhythmia.csv')


In [4]:
# Directory holding the raw p53 files (the value comes from config).
raw_data_path = p53_raw_data_path

In [5]:
def list_folder_contents(folder_path):
    """Print the names of the entries directly inside ``folder_path``.

    A header line is printed first, followed by one entry name per line in
    sorted order. If the path does not exist, or exists but is not a
    directory, a message is printed instead of raising.

    Args:
        folder_path: Path of the directory to list.

    Returns:
        None. The listing is written to standard output.
    """
    try:
        # os.listdir returns names in arbitrary order; sort them so the
        # printed listing is the same on every run and platform.
        items = sorted(os.listdir(folder_path))
    except FileNotFoundError:
        print(f"文件夹 '{folder_path}' 不存在！")
        return
    except NotADirectoryError:
        # The path exists but points at a file rather than a folder.
        print(f"'{folder_path}' 不是文件夹！")
        return

    print(f"文件夹 '{folder_path}' 下的内容：")
    for item in items:
        print(item)

In [6]:
# Show which raw files are available before picking the one to load.
list_folder_contents(raw_data_path)

文件夹 'C:\Users\26494\PycharmProjects\MOO-HFS\dataset\p53_Mutants\p53_old_2010' 下的内容：
K1.def
K2.def
K3.def
K4.def
K5.def
K6.def
K7.def
K8.data
K8.def
K8.instance.tags
p53.names
p53_mutants_merged.csv


In [7]:
# Path of the data file to load
data_file = os.path.join(raw_data_path, 'K8.data')

In [43]:
# Read the data matrix together with the class labels (active/inactive).
# The file has no header row, and dtype=str keeps every cell as text at load
# time; the numeric conversion is done explicitly in a later cell.
data = pd.read_csv(data_file, header=None, sep=',', dtype=str)

In [44]:
# Inspect the label column of the raw frame.
print(f"数据维度: {data.shape}")
# Despite its name, last_col is the SECOND-to-last column (index -2). In the
# recorded run that column holds the active/inactive labels, while the final
# column turns out to be entirely missing (see the missing-value table below).
last_col = data.columns[-2]
print(f"列名：{last_col}")
print("前20个样本的数据：")
print(data[last_col].head(20))
print("\n唯一值统计：")
# dropna=False so that missing labels, if any, show up in the counts.
print(data[last_col].value_counts(dropna=False))

数据维度: (16772, 5410)
列名：5408
前20个样本的数据：
0     inactive
1     inactive
2     inactive
3     inactive
4     inactive
5     inactive
6     inactive
7     inactive
8     inactive
9     inactive
10    inactive
11    inactive
12    inactive
13    inactive
14    inactive
15    inactive
16    inactive
17      active
18      active
19      active
Name: 5408, dtype: object

唯一值统计：
5408
inactive    16629
active        143
Name: count, dtype: int64


In [45]:
# Name the columns: features are numbered from 1 as strings, the
# second-to-last column becomes 'Class', and the final column gets the
# next number after the last feature.
num_features = data.shape[1] - 2
feature_names = [str(idx) for idx in range(1, num_features + 1)]
feature_names += ['Class', str(num_features + 1)]

data.columns = feature_names

# Move 'Class' to the front; `cols` keeps every other column in its
# original order.
cols = [name for name in data.columns if name != 'Class']
new_cols = ['Class', *cols]
data = data[new_cols]

In [46]:
# Convert every non-label column from text to numbers in one pass instead of
# assigning 5,000+ columns one at a time. errors='coerce' turns any value that
# cannot be parsed as a number into NaN, so those cells are picked up by the
# missing-value handling that follows.
numeric_part = data[cols].apply(pd.to_numeric, errors='coerce')

# Re-attach the untouched label column in front, keeping the column order
# ('Class' first, then `cols`) and the row index unchanged.
data = pd.concat([data[['Class']], numeric_part], axis=1)

data.head()

Unnamed: 0,Class,1,2,3,4,5,6,7,8,9,...,5400,5401,5402,5403,5404,5405,5406,5407,5408,5409
0,inactive,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,...,0.006,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,
1,inactive,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,...,0.002,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,
2,inactive,,,,,,,,,,...,,,,,,,,,,
3,inactive,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,...,0.019,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,
4,inactive,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,...,0.051,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,


In [47]:
# Missing-value statistics per column: absolute count and share of rows.
missing_count = data.isna().sum()
missing_ratio = missing_count.div(len(data))

missing_df = pd.concat(
    {'MissingCount': missing_count, 'MissingRatio': missing_ratio},
    axis=1,
).sort_values(by='MissingCount', ascending=False)

print(missing_df.head(10))  # the 10 columns with the most missing values

      MissingCount  MissingRatio
5409         16772      1.000000
3225           180      0.010732
3223           180      0.010732
3222           180      0.010732
3221           180      0.010732
3220           180      0.010732
3219           180      0.010732
3218           180      0.010732
3217           180      0.010732
3216           180      0.010732


In [48]:
# Drop the column with a high missing ratio: '5409' is entirely missing
# (ratio 1.0) in the missing-value table printed above.
# The drop is in place, so running this cell a second time raises KeyError
# because the column is already gone.
data.drop(columns=['5409'], inplace=True)

In [49]:
# Collect every row that has at least one missing cell
rows_with_nan = data.loc[data.isna().any(axis="columns")]

print(rows_with_nan)

          Class   1   2   3   4   5   6   7   8   9  ...   5399   5400   5401  \
2      inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ...    NaN    NaN    NaN   
16     inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ...  0.001 -0.018 -0.014   
187    inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... -0.002 -0.013 -0.011   
189    inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... -0.011  0.005  0.021   
191    inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ...  0.004 -0.029 -0.024   
...         ...  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...    ...    ...    ...   
15371  inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ...    NaN    NaN    NaN   
15372  inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ...    NaN    NaN    NaN   
15373  inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ...    NaN    NaN    NaN   
15374  inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ...    NaN    NaN    NaN   
15375  inactive NaN NaN NaN NaN NaN NaN NaN NaN NaN  ...    NaN    NaN    NaN   

        5402   5403   5404 

In [50]:
# Index labels of the rows that contain at least one missing value
miss_list = data.index[data.isnull().any(axis=1)].tolist()
print(miss_list)

[2, 16, 187, 189, 191, 192, 193, 201, 301, 402, 504, 703, 807, 1006, 1009, 1044, 1071, 1081, 1220, 1281, 1310, 1323, 1342, 1408, 1504, 1506, 1509, 1512, 1514, 1609, 1716, 1788, 1914, 2019, 2067, 2081, 2115, 2203, 2221, 2307, 2322, 2420, 2423, 2458, 2461, 2514, 2516, 2519, 2524, 2585, 2586, 2599, 2616, 2632, 2636, 2668, 2693, 2827, 2938, 2950, 3193, 3324, 3325, 3432, 3516, 3527, 3530, 3534, 3565, 3592, 3736, 3828, 3919, 4038, 4342, 4420, 4544, 4746, 4948, 5047, 5251, 5259, 5373, 5554, 5756, 6184, 6875, 6948, 6964, 7176, 7334, 7469, 7480, 7507, 7526, 7735, 7824, 7825, 7837, 7839, 7988, 8007, 8107, 8126, 8297, 9236, 9239, 9330, 9344, 9349, 9356, 9372, 9376, 9380, 9392, 9396, 9421, 9426, 9429, 9434, 9447, 9697, 9700, 9709, 9951, 10035, 10189, 10190, 10274, 10319, 10323, 10350, 10451, 10466, 10485, 10504, 10636, 10700, 11934, 11940, 12143, 13017, 13036, 13207, 13687, 13908, 14277, 14278, 14866, 15189, 15265, 15347, 15348, 15349, 15350, 15351, 15352, 15353, 15354, 15355, 15356, 15357, 15358,

In [51]:
# Simply drop every row that contains a missing value.
# The index is not reset afterwards, so the surviving rows keep their
# original labels (with gaps where rows were removed).
data.dropna(inplace=True)

In [57]:
# Sanity check after cleaning: report the shape and the total number of
# missing cells left in the whole frame.
print(f"数据维度: {data.shape}")
print(f"缺失值个数:{data.isna().sum().sum()}")

数据维度: (16592, 5409)
缺失值个数:0


In [59]:
# Class distribution of the cleaned data; dropna=False would also count
# missing labels if any were present.
print("\n 唯一值统计：")
print(data['Class'].value_counts(dropna=False))


唯一值统计：
Class
inactive    16449
active        143
Name: count, dtype: int64


In [60]:
# Save the cleaned dataset.
# Create the destination folder first so that a fresh checkout, where the
# output directory does not exist yet, does not fail inside to_csv.
out_dir = os.path.dirname(p53_data_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default, and the index has gaps after the dropna above. The format is left
# unchanged here; confirm whether downstream readers expect that column before
# switching to index=False.
data.to_csv(p53_data_path)

print(f"数据保存到：{p53_data_path}")

数据保存到：C:\Users\26494\GA\data\p53_Mutants/p53_Mutants.csv
