# 特征筛选
## 使用RFECV对ssGSEA、PPI30、WGCNA进行特征筛选

In [1]:
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler

np.random.seed(42)
random_state = 42

# A single scaler object is reused for all three datasets. Every
# fit_transform call refits it from scratch, so no statistics carry over
# from one dataset to the next.
scaler = StandardScaler()

# ---- ssGSEA ----
# Load the dataset. The label is assumed to be in the first column; every
# remaining column is treated as a feature.
data_ssgsea = pd.read_csv('ssgsea_10.csv')
X_ssgsea = data_ssgsea.iloc[:, 1:]
y_ssgsea = data_ssgsea.iloc[:, 0]

# Split BEFORE scaling: the scaler must learn its mean/variance from the
# training rows only, otherwise test-set statistics leak into the training
# features. The row partition depends only on the number of rows and the
# seed, so it is the same partition as when the scaled array was split.
X_train_ssgsea, X_test_ssgsea, y_train_ssgsea, y_test_ssgsea = train_test_split(X_ssgsea, y_ssgsea, test_size=0.2, random_state=random_state)
X_train_ssgsea = scaler.fit_transform(X_train_ssgsea)
X_test_ssgsea = scaler.transform(X_test_ssgsea)
# Full matrix expressed on the training-set scale (name kept for later use).
X_ssgsea_scaled = scaler.transform(X_ssgsea)

# ---- PPI ----
# Same layout assumption: label in the first column, features after it.
data_ppi = pd.read_csv('ppi_10.csv')
X_ppi = data_ppi.iloc[:, 1:]
y_ppi = data_ppi.iloc[:, 0]

# Split first, then fit the scaler on the training rows only.
X_train_ppi, X_test_ppi, y_train_ppi, y_test_ppi = train_test_split(X_ppi, y_ppi, test_size=0.2, random_state=random_state)
X_train_ppi = scaler.fit_transform(X_train_ppi)
X_test_ppi = scaler.transform(X_test_ppi)
X_ppi_scaled = scaler.transform(X_ppi)

# ---- WGCNA ----
# Same layout assumption: label in the first column, features after it.
data_wgcna = pd.read_csv('wgcna_10.csv')
X_wgcna = data_wgcna.iloc[:, 1:]
y_wgcna = data_wgcna.iloc[:, 0]

# Split first, then fit the scaler on the training rows only.
X_train_wgcna, X_test_wgcna, y_train_wgcna, y_test_wgcna = train_test_split(X_wgcna, y_wgcna, test_size=0.2, random_state=random_state)
X_train_wgcna = scaler.fit_transform(X_train_wgcna)
X_test_wgcna = scaler.transform(X_test_wgcna)
X_wgcna_scaled = scaler.transform(X_wgcna)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV


# RFECV-based feature selection
def feature_selection_rfecv(X, y, num_features=10):
    """Pick at most ``num_features`` columns of ``X`` by recursive feature elimination.

    A logistic-regression model is wrapped in ``RFECV`` (5-fold stratified
    CV, ROC-AUC scoring), which never goes below ``num_features`` columns.
    If cross-validation keeps more than ``num_features`` columns, the
    elimination is continued with ``RFE`` on the surviving columns until
    exactly ``num_features`` remain, so the extra columns are dropped by
    model importance rather than by their position in ``X``.

    Args:
        X: DataFrame of features; the column labels are what gets returned.
        y: Class labels aligned row-by-row with ``X``.
        num_features: Lower bound for RFECV and the cap on the result size.

    Returns:
        The selected column labels of ``X`` (a pandas Index), in the same
        order as they appear in ``X``.
    """
    # Imported here so the function carries its own dependency; only the
    # fallback branch below needs it.
    from sklearn.feature_selection import RFE

    # saga with a high max_iter is used so the solver has room to converge.
    model = LogisticRegression(max_iter=5000, random_state=42, solver='saga')
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    selector = RFECV(estimator=model, step=1, cv=cv, scoring='roc_auc', min_features_to_select=num_features)
    selector = selector.fit(X, y)

    # Boolean-mask selection keeps the original column order.
    kept = X.loc[:, selector.support_]
    selected_features = kept.columns

    if len(selected_features) > num_features:
        # RFECV's optimum is larger than requested: keep eliminating the
        # weakest column one at a time instead of slicing by position.
        pruner = RFE(estimator=model, n_features_to_select=num_features, step=1)
        pruner.fit(kept, y)
        selected_features = selected_features[pruner.support_]

    return selected_features

# Run the selection on the standardized training matrices. Each matrix is
# wrapped in a DataFrame that carries the original column names, because
# feature_selection_rfecv reads the names from X.columns.
selected_features_ssgsea, selected_features_ppi, selected_features_wgcna = (
    feature_selection_rfecv(pd.DataFrame(train_matrix, columns=feature_names), labels, num_features=10)
    for train_matrix, feature_names, labels in (
        (X_train_ssgsea, X_ssgsea.columns, y_train_ssgsea),
        (X_train_ppi, X_ppi.columns, y_train_ppi),
        (X_train_wgcna, X_wgcna.columns, y_train_wgcna),
    )
)

# Report the chosen feature names, one dataset per line.
print("\n".join(
    f"{title} {chosen.tolist()}"
    for title, chosen in (
        ("SSGSEA筛选的特征:", selected_features_ssgsea),
        ("PPI筛选的特征:", selected_features_ppi),
        ("WGCNA筛选的特征:", selected_features_wgcna),
    )
))


SSGSEA筛选的特征: ['CD79A', 'PPP3CA', 'IFNG', 'KRAS', 'CD40LG', 'CTSH', 'FCN1', 'PTPN6', 'F2RL2', 'FCGR2B']
PPI筛选的特征: ['PDGFB', 'H3C13', 'SERPINE1', 'CALML4', 'MMP1', 'CDH5', 'H2AC8', 'CENPW', 'H2BC5', 'CCNB1']
WGCNA筛选的特征: ['APLP2', 'GRB2', 'BST1', 'MAP3K3', 'CHP1', 'HSPA8', 'PABPC1', 'DYNLT1', 'ELF1', 'RBM38']


In [3]:
# Pin the selected feature names as plain lists (they match the lists printed
# by the selection cell) and cut each dataset down to the 'group' column
# followed by its selected features.

# ssGSEA
selected_features_ssgsea = [
    'CD79A', 'PPP3CA', 'IFNG', 'KRAS', 'CD40LG',
    'CTSH', 'FCN1', 'PTPN6', 'F2RL2', 'FCGR2B',
]
new_data_ssgsea = data_ssgsea[['group', *selected_features_ssgsea]]

# WGCNA
selected_features_wgcna = [
    'APLP2', 'GRB2', 'BST1', 'MAP3K3', 'CHP1',
    'HSPA8', 'PABPC1', 'DYNLT1', 'ELF1', 'RBM38',
]
new_data_wgcna = data_wgcna[['group', *selected_features_wgcna]]

# PPI
selected_features_ppi = [
    'PDGFB', 'H3C13', 'SERPINE1', 'CALML4', 'MMP1',
    'CDH5', 'H2AC8', 'CENPW', 'H2BC5', 'CCNB1',
]
new_data_ppi = data_ppi[['group', *selected_features_ppi]]

In [4]:
# Write the reduced datasets to CSV without the DataFrame index.
# NOTE(review): these are the same file names the first cell reads with
# pd.read_csv, so running this cell overwrites the input files and a re-run
# of the notebook would start from the already-reduced data. Confirm that
# this is intended, or read from / write to differently named files.
new_data_ssgsea.to_csv('ssgsea_10.csv', index=False)
new_data_wgcna.to_csv('wgcna_10.csv', index=False)
new_data_ppi.to_csv('ppi_10.csv', index=False)