In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
import pandas as pd
import numpy as np

# Paths of the training and test feature tables.
dataFile = "D:/Apple-paper/Radiomics/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/2_2_all_feature_divide_train_test/Total_GBM+LGG_flair_s_add_os_age_gender_label_train.csv"
dataFile_test = "D:/Apple-paper/Radiomics/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/2_2_all_feature_divide_train_test/Total_GBM+LGG_flair_s_add_os_age_gender_label_test.csv"

data = pd.read_csv(dataFile)
data_test = pd.read_csv(dataFile_test)

# Columns that must not take part in the Lasso regression: the row identifier,
# the survival outcome columns, the clinical covariates and the target itself.
non_feature_columns = ['index', 'OS', 'OS.time', 'age_at_index', 'label', 'gender']

# Predictors are everything except the columns listed above; the target is
# `label` (swap in another column here for a different task).
x, y = data.drop(columns=non_feature_columns), data["label"]
x_test, y_test = data_test.drop(columns=non_feature_columns), data_test["label"]

# Show the shape of the training feature matrix as the cell output.
x.shape


In [None]:
# Candidate regularisation strengths: 100 values evenly spaced on a log10
# scale between 1e-10 and 1e-1. The grid is deterministic, not random.
alphas = np.logspace(-10, -1, 100, base=10)

# Lasso with 5-fold cross-validation over the alpha grid defined above.
selector_lasso = LassoCV(alphas=alphas, cv=5, max_iter=int(1e6))
selector_lasso.fit(x, y)

# Only the last expression of a cell is displayed, so intermediate results
# are printed explicitly instead of being left as bare expressions.
selected_mask = selector_lasso.coef_ != 0  # features with a zero coefficient are dropped
selected_names = list(x.columns[selected_mask])

print("Best alpha:", selector_lasso.alpha_)
# The optimum should lie strictly inside the searched range; if it sits on
# either edge, the grid bounds were chosen badly and should be widened.
# atol=0 because the smallest alpha (1e-10) is below numpy's default atol.
if np.isclose(selector_lasso.alpha_, [alphas.min(), alphas.max()], rtol=1e-9, atol=0).any():
    print("WARNING: best alpha lies on the edge of the alpha grid; widen the search range.")
print("Intercept:", selector_lasso.intercept_)
print("Selected %d of %d features:" % (len(selected_names), x.shape[1]), selected_names)
print("mse_path_ shape:", selector_lasso.mse_path_.shape)

# Cross-validation error for each alpha, averaged over axis 1 of mse_path_
# (one column per CV fold); shown as the cell output.
selector_lasso.mse_path_.mean(axis=1)


In [None]:
import os

# Names of the features that kept a non-zero Lasso coefficient. The mask is
# aligned with the training matrix `x`, so the names are taken from `x` and
# then used to index BOTH frames. Applying the boolean mask positionally to
# `x_test.columns` would silently pick wrong features if the test file had a
# different column order; selecting by name raises a KeyError instead.
selected_columns = x.columns[selector_lasso.coef_ != 0]
selected_features_df = x[selected_columns]
selected_features_test = x_test[selected_columns]

# Target, survival and clinical columns carried along with the selected
# features (adjust the list if the column names differ).
passthrough_columns = ["label", "OS", "OS.time", "age_at_index", "index", 'gender']
labels = data[passthrough_columns]
labels_test = data_test[passthrough_columns]

# Append the passthrough columns to the right of the selected features.
selected_features_with_labels = pd.concat(
    [selected_features_df, labels], axis=1)
selected_features_with_labels_test = pd.concat(
    [selected_features_test, labels_test], axis=1)

output_dir = 'D:/Apple-paper/Radiomics/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/2_3_lasso_feature_divide_train_test'

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_dir, exist_ok=True)

# Write the combined tables to CSV without the pandas row index.
selected_features_with_labels.to_csv(os.path.join(output_dir, 'Total_GBM+LGG_flair_s_add_os_age_gender_label_train_selsect_lasso.csv'), index=False)
selected_features_with_labels_test.to_csv(os.path.join(output_dir, 'Total_GBM+LGG_flair_s_add_os_age_gender_label_test_selsect_lasso.csv'), index=False)

In [None]:
# Figure 1: cross-validated MSE as a function of the regularisation strength.

# Per-alpha mean and standard deviation of the error, taken over axis 1 of
# mse_path_.
fold_errors = selector_lasso.mse_path_
MSEs_mean = fold_errors.mean(axis=1)
MSEs_std = fold_errors.std(axis=1)

# Appearance of the markers and of the error bars.
errorbar_style = dict(
    fmt="o",             # marker shape of the data points
    ms=3,                # marker size
    mfc="r",             # marker face colour
    mec="r",             # marker edge colour
    ecolor="lightblue",  # error-bar colour
    elinewidth=2,        # error-bar line width
    capsize=4,           # length of the error-bar caps
    capthick=1,          # thickness of the error-bar caps
)

fig, ax = plt.subplots()
# x: alpha grid, y: mean MSE, yerr: spread of the MSE.
ax.errorbar(selector_lasso.alphas_, MSEs_mean, yerr=MSEs_std, **errorbar_style)
ax.set_xscale("log")  # logarithmic x axis
# Dashed vertical line at the alpha selected by cross-validation.
ax.axvline(selector_lasso.alpha_, color="black", ls="--")
ax.set_xlabel("Lambda")
ax.set_ylabel("MSE")
plt.show()

In [None]:

import os

# Feature importance is taken as the absolute value of the Lasso coefficient.
feature_importances = np.abs(selector_lasso.coef_)

# Table pairing every feature name with its importance.
features_df = pd.DataFrame({
    'Feature Name': x.columns,
    'Importance': feature_importances
})

# Save the complete importance table to CSV. The target folder is created
# first because to_csv raises OSError when the parent directory is missing.
importance_csv = 'D:/Apple-paper/Radiomics/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/2_3_lasso_feature_divide_train_test/flair/flair_lasso_csv/flair_s_Lasso_feature_importances.csv'
os.makedirs(os.path.dirname(importance_csv), exist_ok=True)
features_df.to_csv(importance_csv, index=False)

# Keep the ten features with the largest importance.
sorted_features_df = features_df.sort_values(by='Importance', ascending=False).head(10)

# Horizontal bar chart of the top ten features.
plt.figure(figsize=(10, 6))
plt.barh(sorted_features_df['Feature Name'], sorted_features_df['Importance'])
plt.xlabel('Feature Importance')
plt.ylabel('Feature Name')
plt.title('Top 10 Important Features')
plt.tight_layout()
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.show()


In [None]:

# Figure 2: Lasso coefficient paths over the regularisation grid.

# LassoCV fits an intercept, which amounts to solving the Lasso on centred
# data, whereas the `path` helper performs no intercept handling. Centre x and
# y first so the plotted paths belong to the same model that was fitted above.
x_centered = x - x.mean(axis=0)
y_centered = y - y.mean()

# max_iter is passed as an int, matching the LassoCV constructor call.
path_result = selector_lasso.path(
    x_centered, y_centered, alphas=alphas, max_iter=int(1e6))
# path returns (alphas, coefs, ...); use its own alphas for the x axis so the
# ordering is guaranteed to match the coefficient columns.
path_alphas = path_result[0]
coefs = path_result[1].T  # transpose to one row per alpha, one column per feature

plt.figure()
plt.semilogx(path_alphas, coefs, '-')  # logarithmic x axis
# Dashed vertical line at the alpha selected by cross-validation.
plt.axvline(selector_lasso.alpha_, color='black', ls="--")
plt.xlabel('Lambda')
plt.ylabel('Coefficients')
plt.show()
