用于 n=24 队列的作图（TNB 箱线图、ROC 曲线、KM 生存曲线）

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from scipy import stats
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
import warnings
warnings.filterwarnings('ignore')

In [17]:


def plot_tnb_boxplot_dual_axis(qc_df, clinical_df, tnb_col, output_path, title_suffix=""):
    """
    Draw a dual-Y-axis TNB figure comparing responders and non-responders.

    The left axis shows the raw TNB values as a scatter plot, the right axis
    shows log10(TNB + 1) as a box plot. Samples whose Response is 'NE', or is
    not one of CR/PR/SD/PD, are excluded. CR/PR are grouped as 'Responder',
    SD/PD as 'Non-responder'.

    Parameters:
        qc_df: TNB data frame (needs a 'sample' column and the `tnb_col` column)
        clinical_df: clinical data frame (needs 'Patient' and 'Response' columns;
            'Patient' is renamed to 'sample' and used as the merge key)
        tnb_col: name of the TNB column to plot (e.g. 'binding_only_raw')
        output_path: where the figure is written
        title_suffix: text appended to the first line of the figure title
    Returns:
        None (the figure is saved to disk and summary statistics are printed)
    Raises:
        KeyError: a required column is missing from one of the inputs
        ValueError: no sample is left after merging and filtering
    """
    # Build the merge key without mutating the caller's frame.
    clinical = clinical_df.rename(columns={'Patient': 'sample'})

    # Fail early with a readable message instead of an error deep inside pandas.
    missing = [f"qc_df[{c!r}]" for c in ('sample', tnb_col) if c not in qc_df.columns]
    if 'sample' not in clinical.columns:
        missing.append("clinical_df['Patient']")
    if 'Response' not in clinical.columns:
        missing.append("clinical_df['Response']")
    if missing:
        raise KeyError(f"required column(s) not found: {', '.join(missing)}")

    merged_df = pd.merge(qc_df, clinical, on='sample', how='inner')

    # Drop non-evaluable samples and collapse the response codes into two groups.
    merged_df = merged_df[merged_df['Response'] != 'NE'].copy()
    response_map = {'CR': 'Responder', 'PR': 'Responder', 'SD': 'Non-responder', 'PD': 'Non-responder'}
    merged_df['Response_group'] = merged_df['Response'].map(response_map)

    # An ordered categorical pins the group order used on the x axis.
    group_order = ['Responder', 'Non-responder']
    merged_df['Response_group'] = pd.Categorical(
        merged_df['Response_group'],
        categories=group_order,
        ordered=True
    )
    # Response codes outside the map became NaN above; remove those rows.
    merged_df = merged_df.dropna(subset=['Response_group']).reset_index(drop=True)
    if merged_df.empty:
        raise ValueError("no samples left after merging qc_df with clinical_df and filtering Response")

    # log10(x + 1) keeps zero counts finite.
    log_col = f'{tnb_col}_log'
    merged_df[log_col] = np.log10(merged_df[tnb_col] + 1)

    fig, ax1 = plt.subplots(figsize=(9, 6))

    # Left axis: raw TNB values as points.
    color1 = '#3498db'
    ax1.set_xlabel('Response to Nivolumab', fontsize=11, labelpad=10)
    ax1.set_ylabel('TNB (Raw Value)', color=color1, fontsize=11, labelpad=10)

    # One scatter call per group, in the fixed order, each with its own colour
    # (green for responders, red for non-responders).
    group_frames = {}
    for group_name, point_color in zip(group_order, ('#2ecc71', '#e74c3c')):
        group_df = merged_df[merged_df['Response_group'] == group_name]
        group_frames[group_name] = group_df
        sns.scatterplot(
            x='Response_group',
            y=tnb_col,
            data=group_df,
            color=point_color,
            s=80,
            alpha=0.7,
            ax=ax1,
            legend=False
        )
    responder_df = group_frames['Responder']
    non_responder_df = group_frames['Non-responder']

    ax1.tick_params(axis='y', labelcolor=color1)
    ax1.grid(axis='y', linestyle='--', alpha=0.3)

    # Reserve a band below the lowest point for the "n=" labels. Positioning
    # them relative to the axis limits works for zero or negative minima,
    # where scaling the minimum value would put the label on or above the data.
    y_low, y_high = ax1.get_ylim()
    label_band = 0.08 * (y_high - y_low)
    ax1.set_ylim(y_low - label_band, y_high)
    label_y = y_low - label_band / 2

    # Right axis: box plot of the log-transformed values.
    ax2 = ax1.twinx()
    color2 = '#9b59b6'
    ax2.set_ylabel('TNB (log10(TNB + 1))', color=color2, fontsize=11, labelpad=10)
    sns.boxplot(
        x='Response_group',
        y=log_col,
        data=merged_df,
        color='white',
        width=0.3,
        showfliers=False,
        ax=ax2,
        order=group_order
    )
    ax2.tick_params(axis='y', labelcolor=color2)

    # The twin axes are drawn after ax1, so the opaque white boxes would cover
    # the points. Raise ax1 and hide its background so the points stay visible.
    ax1.set_zorder(ax2.get_zorder() + 1)
    ax1.patch.set_visible(False)

    # Integer-exponent ticks from 0 up to the first power of ten above the data:
    # strictly increasing, no duplicates, no forced 10^4 ceiling. The exponent
    # is wrapped in braces so mathtext superscripts multi-digit exponents whole.
    max_log = merged_df[log_col].max()
    top_exp = int(np.ceil(max_log)) if np.isfinite(max_log) else 1
    yticks = list(range(0, max(top_exp, 1) + 1))
    ytick_labels = ['0' if t == 0 else f'$10^{{{t}}}-1$' for t in yticks]
    ax2.set_yticks(yticks)
    ax2.set_yticklabels(ytick_labels)

    # Sample counts, written under each group inside the reserved band.
    responder_count = len(responder_df)
    non_responder_count = len(non_responder_df)
    ax1.text(0, label_y, f'n={responder_count}', ha='center', va='center',
             fontsize=10, fontweight='bold')
    ax1.text(1, label_y, f'n={non_responder_count}', ha='center', va='center',
             fontsize=10, fontweight='bold')

    # Title and layout.
    fig.suptitle(f'TNB Distribution in Responders vs Non-responders {title_suffix}\nIpi-N Cohort',
                 fontsize=12, fontweight='bold', y=0.98)
    fig.tight_layout(rect=[0, 0, 0.95, 0.96])
    sns.despine(top=True, right=False)

    # Best-effort save: if the requested path fails, fall back to the working
    # directory. The figure is closed even when the fallback fails as well.
    try:
        try:
            fig.savefig(output_path, dpi=300, bbox_inches='tight')
            print(f"箱线图已保存至：{output_path}")
        except Exception as e:
            print(f"保存图片失败：{e}")
            print("将保存至默认路径：./temp_boxplot.png")
            fig.savefig('./temp_boxplot.png', dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)

    # Summary statistics, in the same fixed group order.
    print(f"=== 箱线图统计 ({tnb_col}) ===")
    print(f"Responder: n={responder_count}, 中位TNB={responder_df[tnb_col].median():.0f}")
    print(f"Non-responder: n={non_responder_count}, 中位TNB={non_responder_df[tnb_col].median():.0f}")

In [15]:
# Load the two inputs: the TNB summary table (qc) and the clinical table (clinical).
qc_csv_path = "/work/longyh/BY/processed/TNB/TNB_summary_n24.csv"
clinical_xlsx_path = "/work/longyh/BY/raw/1-s2.0-S0092867417311224-mmc2.xlsx"

qc = pd.read_csv(qc_csv_path)
# The first two rows of the spreadsheet are skipped before the header is read.
clinical = pd.read_excel(clinical_xlsx_path, skiprows=2)

qc 表的列：sample, total_raw, binding_only_raw, ic50_500_tpm1_raw, ic50_50_tpm5_raw, high_quality_raw

In [18]:
# TNB column to analyse -> (strategy number, filter description).
# Keeping number and description keyed by the column stops the figure titles
# from drifting away from the data actually plotted (previously the
# binding-only column was titled "IC50 < 500 nM & TPM > 1").
# NOTE(review): descriptions are inferred from the qc column names and the
# earlier example calls in this notebook — confirm against the TNB pipeline.
STRATEGY_INFO = {
    'binding_only_raw': ('1', 'IC50 < 500 nM'),
    'ic50_500_tpm1_raw': ('2', 'IC50 < 500 nM & TPM > 1'),
    'ic50_50_tpm5_raw': ('3', 'IC50 < 50 nM & TPM > 5'),
}

# 1. Choose the strategy column; everything below is derived from it.
strategy = 'binding_only_raw'
strategy_num, strategy_desc = STRATEGY_INFO[strategy]
output_prefix = f"/work/longyh/BY/processed/TNB/n24_strategy{strategy_num}"
title_suffix = f"(Strategy {strategy_num}: {strategy_desc})"

# 2. Box plot
plot_tnb_boxplot_dual_axis(
    qc_df=qc,
    clinical_df=clinical,
    tnb_col=strategy,
    output_path=f"{output_prefix}_boxplot.png",
    title_suffix=title_suffix
)

# NOTE(review): plot_tnb_roc and plot_tnb_km_curve are not defined in the
# visible part of this notebook — make sure their defining cells run first.

# 3. ROC curve
auc_str2, ci_str2 = plot_tnb_roc(
    qc_df=qc,
    clinical_df=clinical,
    tnb_col=strategy,
    output_path=f"{output_prefix}_roc.png",
    title_suffix=title_suffix
)

# 4. Kaplan-Meier survival curve
lr_p_str2, median_st_str2 = plot_tnb_km_curve(
    qc_df=qc,
    clinical_df=clinical,
    tnb_col=strategy,
    output_path=f"{output_prefix}_km.png",
    title_suffix=title_suffix,
    group_method="median"  # or "quantile"
)

箱线图已保存至：/work/longyh/BY/processed/TNB/n24_strategy1_boxplot.png
=== 箱线图统计 (binding_only_raw) ===
Responder: n=6, 中位TNB=184
Non-responder: n=16, 中位TNB=126

=== ROC分析结果 (binding_only_raw) ===
AUC: 0.562 (95%CI: 0.421-0.704)
纳入样本: 22 (Responder: 6, Non-responder: 16)
=== 生存分析统计 (binding_only_raw) ===
纳入样本: 22
Low TNB: n=11, 事件数=8, 中位生存时间=67.3周
High TNB: n=11, 事件数=7, 中位生存时间=101.1周

=== 生存分析结果 (binding_only_raw) ===
Log-rank检验P值: 0.462
高TNB组中位生存时间: 101.1周
低TNB组中位生存时间: 67.3周


In [None]:
# # 箱线图
# plot_tnb_boxplot_dual_axis(
#     qc_df=qc,
#     clinical_df=clinical,
#     tnb_col='binding_only_raw',
#     output_path='/work/longyh/BY/processed/TNB/n24_strategy1_boxplot.png',
#     title_suffix='(Strategy 1: IC50 < 500 nM)'
# )

In [None]:
# # ROC曲线
# auc_val, auc_ci = plot_tnb_roc(
#     qc_df=qc,
#     clinical_df=clinical,
#     tnb_col='ic50_500_tpm1_raw',
#     output_path='/work/longyh/BY/processed/TNB/n24_strategy2_roc.png',
#     title_suffix='(Strategy 2: IC50 < 500 & TPM > 1)'
# )

In [None]:
# # KM曲线
# logrank_p, median_st = plot_tnb_km_curve(
#     qc_df=qc,
#     clinical_df=clinical,
#     tnb_col='ic50_50_tpm5_raw',
#     output_path='/work/longyh/BY/processed/TNB/n24_strategy3_km.png',
#     title_suffix='(Strategy 3: IC50 < 50 & TPM > 5)',
#     group_method='median'
# )