In [7]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# 读取 CSV 文件
file_path = "TrainDataset2024.csv"  # 替换为你的文件路径
df = pd.read_csv(file_path)

# 将特定值（例如 999）替换为 NaN，表示缺失值
df.replace(999, np.nan, inplace=True)

# 分离数值列和非数值列
numeric_df = df.select_dtypes(include=[np.number])  # 选择数值列
non_numeric_df = df.select_dtypes(exclude=[np.number])  # 选择非数值列

# 初始化 KNNImputer 并填补数值列的缺失值
imputer = KNNImputer(n_neighbors=5)
numeric_df_imputed = pd.DataFrame(imputer.fit_transform(numeric_df), columns=numeric_df.columns)

# 将数值列和非数值列重新合并
df_imputed = pd.concat([numeric_df_imputed, non_numeric_df], axis=1)

# 检查填补后的数据
print("Data after KNN imputation:")
print(df_imputed.head())

# 将填补后的数据保存到新的 CSV 文件
output_file_path = "imputed_file.csv"  # 替换为要保存的文件路径
df_imputed.to_csv(output_file_path, index=False)
print(f"Data with imputed values saved to {output_file_path}")


Data after KNN imputation:
   pCR (outcome)  RelapseFreeSurvival (outcome)   Age   ER  PgR  HER2  \
0            1.0                          144.0  41.0  0.0  0.0   0.0   
1            0.0                          142.0  39.0  1.0  1.0   0.0   
2            1.0                          135.0  31.0  0.0  0.0   0.0   
3            0.0                           12.0  35.0  0.0  0.0   0.0   
4            0.0                          109.0  61.0  1.0  0.0   0.0   

   TrippleNegative  ChemoGrade  Proliferation  HistologyType  ...  \
0              1.0         3.0            3.0            1.0  ...   
1              0.0         3.0            3.0            1.0  ...   
2              1.0         2.0            1.0            1.0  ...   
3              1.0         3.0            3.0            1.0  ...   
4              0.0         2.0            1.0            1.0  ...   

   original_glszm_SmallAreaLowGrayLevelEmphasis  original_glszm_ZoneEntropy  \
0                                      0

In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
#from cuml.ensemble import RandomForestRegressor  # 使用 cuML 中的随机森林回归


# 加载 CSV 文件
file_path = "TrainDataset2024.csv"  
df = pd.read_csv(file_path)

# 将特定值（999）替换为 NaN，表示缺失值
df.replace(999, np.nan, inplace=True)

# 将非数值列编码为数值
df = pd.get_dummies(df, drop_first=True)

# 遍历每一列并使用随机森林回归模型填补缺失值
for column in df.columns:
    if df[column].isna().sum() > 0:  # 仅处理包含缺失值的列
        # 将数据划分为训练集（无缺失值）和测试集（缺失值位置）
        train_data = df[df[column].notna()]  # 有该列数据的样本
        test_data = df[df[column].isna()]     # 缺少该列数据的样本
        
        # 如果缺失值数量过多或训练数据量过少，跳过该列
        if train_data.shape[0] < 5:
            print(f"Skipping {column} due to insufficient data.")
            continue

        # 使用其他列作为特征进行训练
        X_train = train_data.drop(columns=[column])
        y_train = train_data[column]
        X_test = test_data.drop(columns=[column])

        # 初始化随机森林回归模型
        rf = RandomForestRegressor(n_estimators=1000, random_state=42)
        rf.fit(X_train, y_train)  # 训练模型

        # 使用模型预测缺失值并填补
        predicted_values = rf.predict(X_test)
        df.loc[df[column].isna(), column] = predicted_values  # 用预测值填补缺失值

print("Data after filling missing values with RandomForestRegressor:")
print(df.head())

# 保存填补后的数据到新文件
output_file_path = "Imputed_RandomForest.csv"
df.to_csv(output_file_path, index=False)
print(f"Data with imputed values saved to {output_file_path}")


Data after filling missing values with RandomForestRegressor:
   pCR (outcome)  RelapseFreeSurvival (outcome)   Age  ER  PgR  HER2  \
0            1.0                          144.0  41.0   0  0.0   0.0   
1            0.0                          142.0  39.0   1  1.0   0.0   
2            1.0                          135.0  31.0   0  0.0   0.0   
3            0.0                           12.0  35.0   0  0.0   0.0   
4            0.0                          109.0  61.0   1  0.0   0.0   

   TrippleNegative  ChemoGrade  Proliferation  HistologyType  ...  \
0              1.0         3.0            3.0            1.0  ...   
1              0.0         3.0            3.0            1.0  ...   
2              1.0         2.0            1.0            1.0  ...   
3              1.0         3.0            3.0            1.0  ...   
4              0.0         2.0            1.0            1.0  ...   

   ID_TRG002941  ID_TRG002943  ID_TRG002944  ID_TRG002945  ID_TRG002947  \
0         False