In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
import os

# Setup output folder
os.makedirs("BRCA_ML_Interpretability", exist_ok=True)

# Load data
expr = pd.read_csv("BRCA_VST_Normalized_Matrix.csv", index_col=0)
meta = pd.read_csv("BRCA_Metadata_Final.csv", index_col=0)
genes = pd.read_excel("Common_Genes_BRCA.xlsx", header=None).iloc[:, 0].str.upper().tolist()

# Prepare data
expr.index = expr.index.str.upper()
X = expr.loc[genes].T
y = meta.loc[X.index, 'sample_type'].replace({'Solid Tissue Normal': 0, 'Primary Tumor': 1}).values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train LASSO
lasso = LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5,
                             class_weight='balanced', random_state=42, max_iter=1000)
lasso.fit(X_scaled, y)

# Extract coefficients
coefs = lasso.coef_[0]
coef_df = pd.DataFrame({'Gene': genes, 'Coefficient': coefs})
coef_df = coef_df.sort_values(by='Coefficient', key=abs, ascending=False)
coef_df.to_csv("BRCA_ML_Interpretability/LASSO_Coefficients.csv", index=False)

# Plot (high-res, serif, bold)
plt.figure(figsize=(8, 6))
plt.rcParams.update({'font.family': 'serif', 'font.weight': 'bold'})
bars = plt.barh(coef_df['Gene'], coef_df['Coefficient'], color='teal')
plt.axvline(0, linestyle='--', color='gray')
plt.xlabel("Coefficient Weight", fontsize=14, fontweight='bold')
plt.title("LASSO Coefficients – BRCA 9-Gene Panel", fontsize=15, fontweight='bold')
plt.xticks(fontsize=12, fontweight='bold')
plt.yticks(fontsize=12, fontweight='bold')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig("BRCA_ML_Interpretability/LASSO_Coefficients_Plot_HighRes.png", dpi=600)
plt.close()

  y = meta.loc[X.index, 'sample_type'].replace({'Solid Tissue Normal': 0, 'Primary Tumor': 1}).values


In [3]:
import shap
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt

# Train XGB model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_scaled, y)

# SHAP Explainer
explainer = shap.Explainer(model)
shap_values = explainer(X_scaled)

# Save SHAP values
shap_df = pd.DataFrame(shap_values.values, columns=genes)
shap_df.to_csv("BRCA_ML_Interpretability/XGB_SHAP_Values.csv", index=False)

# SHAP Global Bar Plot
plt.rcParams.update({'font.family': 'serif', 'font.weight': 'bold'})
shap.plots.bar(shap_values, max_display=9, show=False)
plt.tight_layout()
plt.savefig("BRCA_ML_Interpretability/XGB_SHAP_BarPlot_HighRes.png", dpi=600)
plt.close()

# SHAP Beeswarm Plot
shap.plots.beeswarm(shap_values, max_display=9, show=False)
plt.tight_layout()
plt.savefig("BRCA_ML_Interpretability/XGB_SHAP_Beeswarm_HighRes.png", dpi=600)
plt.close()

# SHAP Waterfall Plot (first sample)
shap.plots.waterfall(shap_values[0], show=False)
plt.tight_layout()
plt.savefig("BRCA_ML_Interpretability/XGB_SHAP_Waterfall_Sample0_HighRes.png", dpi=600)
plt.close()

  from .autonotebook import tqdm as notebook_tqdm
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
