In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LassoCV

# Directory that holds the pre-split train/test feature tables.
split_dir = "D:/Apple-paper/Radiomics/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/2_2_all_feature_divide_train_test"
dataFile = split_dir + "/Total_GBM+LGG_flair_s_add_os_age_gender_label_train.csv"
dataFile_test = split_dir + "/Total_GBM+LGG_flair_s_add_os_age_gender_label_test.csv"

# Load both tables with the same reader.
data, data_test = (pd.read_csv(csv_path) for csv_path in (dataFile, dataFile_test))

# Columns that must not enter the Lasso fit: the row identifier, the
# survival/clinical columns and the target itself.
non_feature_columns = ['index', 'OS', 'OS.time', 'age_at_index', 'label', 'gender']

# Feature matrices (everything except the columns listed above).
x = data.drop(columns=non_feature_columns)
x_test = data_test.drop(columns=non_feature_columns)

# Targets: the "label" column. Swap in another column for a different task.
y = data["label"]
y_test = data_test["label"]

# Shape of the training feature matrix: (n_samples, n_features).
x.shape


In [None]:
# Candidate regularisation strengths: 100 values evenly spaced on a log10
# scale between 1e-10 and 1e-1 (a deterministic grid, not a random sample).
alphas = np.logspace(-10, -1, 100, base=10)

# Lasso with the penalty chosen by 5-fold cross-validation over `alphas`.
selector_lasso = LassoCV(alphas=alphas, cv=5, max_iter=int(1e6))
selector_lasso.fit(x, y)

# A notebook cell only displays its last expression, so intermediate results
# are printed explicitly instead of being left as bare expressions.
best_alpha = selector_lasso.alpha_
print("Optimal alpha:", best_alpha)

# The selected alpha should lie strictly inside the searched grid; if it sits
# on either end, the true optimum may be outside the range and the grid
# should be widened.
grid_bounds = (alphas.min(), alphas.max())
if any(np.isclose(best_alpha, bound, rtol=1e-9, atol=0.0) for bound in grid_bounds):
    print("WARNING: optimal alpha is on the boundary of the alpha grid "
          "[%g, %g]; consider widening the search range." % grid_bounds)

# Features kept by the model are those with a non-zero coefficient.
selected_mask = selector_lasso.coef_ != 0
selected_columns = x.columns[selected_mask]
print("Number of features with non-zero coefficient:",
      int(selected_mask.sum()), "of", len(selected_mask))
print("Selected features:", list(selected_columns))
print("Non-zero coefficients:", selector_lasso.coef_[selected_mask])
print("Intercept:", selector_lasso.intercept_)

# Training data restricted to the selected features (shape only, to avoid
# dumping the whole frame).
print("Selected-feature training matrix shape:", x[selected_columns].shape)

# mse_path_ holds the test-fold MSE per alpha and per CV fold
# (see the LassoCV documentation: shape (n_alphas, n_folds)).
print("mse_path_ shape:", selector_lasso.mse_path_.shape)

# Mean MSE across the folds (axis=1) for every alpha; shown as the cell output.
selector_lasso.mse_path_.mean(axis=1)


In [None]:

# Names of the features whose Lasso coefficient is non-zero. The mask is
# aligned with the columns of the training matrix `x` used for fitting.
selected_columns = x.columns[selector_lasso.coef_ != 0]

# Select by column *name* in both sets. Applying the training mask
# positionally to `x_test.columns` would silently pick the wrong features if
# the test CSV had a different column order; selecting by name raises a
# KeyError instead when a feature is missing.
selected_features_df = x[selected_columns]
selected_features_test = x_test[selected_columns]

# Target and clinical/identifier columns carried along with the features.
# Adjust the names here if the input tables use different ones.
meta_columns = ["label", "OS", "OS.time", "age_at_index", "index", 'gender']
labels = data[meta_columns]
labels_test = data_test[meta_columns]

# Append the metadata columns to the right of the selected features.
selected_features_with_labels = pd.concat(
    [selected_features_df, labels], axis=1)
selected_features_with_labels_test = pd.concat(
    [selected_features_test, labels_test], axis=1)

output_dir = 'D:/Apple-paper/Radiomics/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/2_3_lasso_feature_divide_train_test'

# Create the output directory if needed (no error when it already exists).
os.makedirs(output_dir, exist_ok=True)

# Save the merged tables as CSV. File names are kept exactly as before
# (including the "selsect" spelling) so existing paths keep working.
selected_features_with_labels.to_csv(os.path.join(output_dir, 'Total_GBM+LGG_flair_s_add_os_age_gender_label_train_selsect_lasso.csv'), index=False)
selected_features_with_labels_test.to_csv(os.path.join(output_dir, 'Total_GBM+LGG_flair_s_add_os_age_gender_label_test_selsect_lasso.csv'), index=False)

In [None]:
# Figure 1: cross-validated MSE as a function of the Lasso penalty.

# Per-alpha mean and standard deviation of the MSE, taken across axis 1 of
# mse_path_ (the CV folds according to the LassoCV documentation).
cv_mse = selector_lasso.mse_path_
MSEs_mean = cv_mse.mean(axis=1)
MSEs_std = cv_mse.std(axis=1)

# Appearance of the points (small red dots) and of the error bars
# (light-blue bars with thin caps).
errorbar_style = dict(
    fmt="o",
    ms=3,
    mfc="r",
    mec="r",
    ecolor="lightblue",
    elinewidth=2,
    capsize=4,
    capthick=1,
)

fig, ax = plt.subplots()
# x: alpha grid used by the model, y: mean MSE, bars: +/- one std.
ax.errorbar(selector_lasso.alphas_, MSEs_mean, yerr=MSEs_std, **errorbar_style)
# Logarithmic x-axis, since the alphas span several orders of magnitude.
ax.set_xscale("log")
# Dashed vertical line at the alpha selected by cross-validation.
ax.axvline(selector_lasso.alpha_, color="black", ls="--")
ax.set_xlabel("Lambda")
ax.set_ylabel("MSE")
plt.show()

In [None]:
# Feature importance is taken as the absolute value of each Lasso coefficient.
feature_importances = np.abs(selector_lasso.coef_)

# One row per feature of `x`, in the same order as the fitted coefficients.
features_df = pd.DataFrame({
    'Feature Name': x.columns,
    'Importance': feature_importances
})

# Save the complete list of feature importances to a CSV file.
importance_csv = 'D:/Apple-paper/Radiomics/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/2_3_lasso_feature_divide_train_test/flair/flair_lasso_csv/flair_s_Lasso_feature_importances.csv'
# to_csv does not create missing folders, so make sure the nested target
# directory exists before writing.
os.makedirs(os.path.dirname(importance_csv), exist_ok=True)
features_df.to_csv(importance_csv, index=False)

# Keep the 10 features with the largest importance, most important first.
sorted_features_df = features_df.sort_values(by='Importance', ascending=False).head(10)

# Horizontal bar chart of the top 10 features.
plt.figure(figsize=(10, 6))
plt.barh(sorted_features_df['Feature Name'], sorted_features_df['Importance'])
plt.xlabel('Feature Importance')
plt.ylabel('Feature Name')
plt.title('Top 10 Important Features')
plt.tight_layout()
plt.gca().invert_yaxis()  # Put the most important feature at the top
plt.show()

In [None]:
# Figure 2: Lasso coefficient paths over the alpha grid.

# `path` returns (alphas, coefs, dual_gaps); coefs has one row per feature and
# one column per alpha. max_iter is passed as an int, matching the LassoCV
# call above (an iteration count must not be a float).
path_alphas, path_coefs, _ = selector_lasso.path(
    x, y, alphas=alphas, max_iter=int(1e6))

# Transpose so that each row corresponds to one alpha and each column to one
# feature; matplotlib then draws one line per column, i.e. per feature.
coefs = path_coefs.T

# NOTE(review): `path` is run on the raw x and y, whereas the fitted LassoCV
# model also estimates an intercept; the curves may therefore differ slightly
# from the fitted model's coefficients - confirm whether centring is wanted.

plt.figure()
# Use the alphas returned by `path` itself so the x values are guaranteed to
# line up with the rows of `coefs`. Logarithmic x-axis, solid lines.
plt.semilogx(path_alphas, coefs, '-')

# Dashed black vertical line at the alpha selected by cross-validation.
plt.axvline(selector_lasso.alpha_, color='black', ls="--")

plt.xlabel('Lambda')
plt.ylabel('Coefficients')
plt.show()