# In [None]:
import pandas as pd
import numpy as np
from gmdhpy.gmdh import MultilayerGMDH

# Fit a GMDH regression model that predicts overall-survival time from the
# merged clinical / gene tables, restricted to samples with OS.state == 1,
# then report MSE and R^2 on a held-out 20% split.

# scikit-learn helpers used by the split and evaluation steps below.
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 1. Load the data.
# NOTE(review): both files are read with pandas' default comma delimiter; if
# the .txt file is tab-separated, pass sep='\t' -- confirm against the file.
# The variable names now match the file names (the original had them
# swapped); the merge order below is unchanged, so the result is identical.
clinical_data = pd.read_csv('Training_clinical_infor.txt')  # presumably survival time and status
gene_data = pd.read_csv('Training_selected_genes.csv')  # presumably gene expression values

# 2. Merge the two tables on the shared sample identifier column 'id'.
data = pd.merge(clinical_data, gene_data, on='id')

# 3. Keep only the samples whose OS.state equals 1 (deceased samples).
data_dead = data[data['OS.state'] == 1]

# 4. Build predictors and target: every column except the identifier and the
#    two survival columns is a predictor; OS.time is the regression target.
X = data_dead.drop(columns=['id', 'OS.time', 'OS.state'])
y = data_dead['OS.time']
feature_names = list(X.columns)

# 5. Split into training and test sets (80% / 20%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Fill missing predictor values with column means. The means come from the
#    training partition only, so no test-set information leaks into training.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# 7. Train the GMDH model. The original called the undefined name `GMDHreg`;
#    `MultilayerGMDH` is the class this file actually imports from gmdhpy.
#    NumPy arrays are passed to the model; column names go in separately.
gmdh = MultilayerGMDH(feature_names=feature_names)
gmdh.fit(X_train.to_numpy(), y_train.to_numpy())

# 8. Predict survival time for the held-out samples.
y_pred = gmdh.predict(X_test.to_numpy())

# 9. Evaluate the predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"均方误差 (MSE): {mse}")
print(f"R² 分数: {r2}")

# 10. Show the features the model selected (optional).
# NOTE(review): gmdhpy has no `selected_features_` attribute as far as I can
# tell; `get_selected_features()` is the accessor -- confirm against the
# installed gmdhpy version.
print("选择的特征:", gmdh.get_selected_features())