```shell
pip install pandas numpy scikit-learn six matplotlib gmdhpy
```

In [3]:
import pandas as pd
from gmdhpy.gmdh import MultilayerGMDH
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# 1. Load the data
# Both file names are assumptions about where the inputs live; adjust as needed.
CLINICAL_DATA_PATH = 'Training_clinical_infor.txt'      # clinical information table
GENE_EXPRESSION_PATH = 'Training_selected_genes.txt'    # selected gene expression table

# Each file is parsed with pandas' read_csv defaults.
clinical_data = pd.read_csv(CLINICAL_DATA_PATH)
gene_expression_data = pd.read_csv(GENE_EXPRESSION_PATH)

In [None]:

# 2. Merge the two tables
# Assumes 'id' is the key column shared by both tables. pd.merge defaults to an
# inner join, so only ids present in both tables are kept.
data = pd.merge(clinical_data, gene_expression_data, on='id')

# 3. Separate the target variable from the features
# Drop the identifier and the target so neither is used as a predictor.
# The original list also contained an empty-string label (''), which makes
# DataFrame.drop raise KeyError unless a column is literally named ''.
# NOTE(review): if the data also carries an event/censoring indicator for
# 'OS.time', it probably belongs in this list as well — TODO confirm.
X = data.drop(['id', 'OS.time'], axis=1)
y = data['OS.time']  # target variable: survival time


In [None]:

# 4. Data preprocessing
# Columns handled as categorical; every other column of X is scaled as numeric.
categorical_features = ['subtype', 'IDH stat', 'PRS_type', 'Histology', 'Grade',
                        'Gender', 'Radio_status', 'Chemo_status', '1p19q_codeletion_status',
                        '2021classification_created By BZT']

# Fill missing values with each column's most frequent value.
# NOTE(review): the imputer, encoder and scaler are all fitted on the full data
# set before the train/test split, so test-set statistics leak into the
# preprocessing. Fitting them on the training split only would be cleaner, but
# needs a strategy for categories unseen in training (drop='first' would fail).
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# One-hot encode the categorical columns, dropping the first category of each
# feature to avoid collinearity. The encoder's default sparse output is
# densified with .toarray() instead of passing `sparse=False`: that keyword was
# renamed to `sparse_output` and later removed, so this form runs on both old
# and new scikit-learn releases.
encoder = OneHotEncoder(drop='first')
X_categorical = encoder.fit_transform(X_imputed[categorical_features]).toarray()

# Name each encoded column "<feature>_<category>". With drop='first' the first
# entry of every categories_ array is the dropped one, so it is skipped here.
categorical_columns = [
    f'{feature}_{category}'
    for feature, categories in zip(categorical_features, encoder.categories_)
    for category in categories[1:]
]

# Standardise the remaining (numeric) columns.
numerical_features = X_imputed.drop(categorical_features, axis=1)
scaler = StandardScaler()
X_numerical = scaler.fit_transform(numerical_features)

# Combine the processed features. Explicit column names keep the labels unique
# (two unnamed frames would both be labelled 0..k), and the shared index keeps
# the rows aligned with y.
X_final = pd.concat(
    [
        pd.DataFrame(X_categorical, columns=categorical_columns, index=X_imputed.index),
        pd.DataFrame(X_numerical, columns=numerical_features.columns, index=X_imputed.index),
    ],
    axis=1,
)

# 5. Split into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# 6. Train the GMDH model with its default settings
model = MultilayerGMDH()
model.fit(X_train, y_train)

# 7. Predict on the test set and evaluate
# R^2 is computed explicitly from the predictions, so evaluation does not
# depend on the GMDH estimator exposing a `score` method, and the model only
# has to predict once.
predictions = model.predict(X_test)
score = r2_score(y_test, predictions)
print(f'模型得分: {score}')

# 8. Show the first few predictions
print(f'前5个预测值: {predictions[:5]}')