In [1]:
import pandas as pd
from tqdm import tqdm # 使用tqdm来显示处理进度，对于大数据集很有用
df_brca = pd.read_csv('df_brca.csv')
df_crc = pd.read_csv('df_crc.csv')
df_nsclc = pd.read_csv('df_nsclc.csv')
df_panc = pd.read_csv('df_panc.csv')
df_prostate = pd.read_csv('df_prostate.csv')
print("BRCA shape:", df_brca.shape)
print("CRC shape:", df_crc.shape)
print("NSCLC shape:", df_nsclc.shape)
print("PANC shape:", df_panc.shape)
print("PROSTATE shape:", df_prostate.shape)

BRCA shape: (740545, 90)
CRC shape: (699010, 95)
NSCLC shape: (773089, 80)
PANC shape: (349115, 84)
PROSTATE shape: (381656, 89)


In [2]:
def create_landmark_snapshots(input_filepath="df_full.csv", 
                              output_filepath="df_landmarks.csv",
                              landmark_times_in_days=None):
    """
    【已修正】将一个包含完整时序历史的数据集，转换为一个用于动态风险预测的
    时间点快照（Landmark）数据集。
    此版本会保留原始的 'entry' 和 'stop' 列以供参考。

    :param input_filepath: 包含所有拼接好的数据的CSV文件路径。
    :param output_filepath: 将要保存的landmark数据集的文件路径。
    :param landmark_times_in_days: 一个包含所有预测时间点（以天为单位）的列表。
    """
    
    # --- 0. 参数设置与数据加载 ---
    if landmark_times_in_days is None:
        landmark_times_in_days = [90, 180, 365, 548, 730, 1095]
    print(f"使用的Landmark时间点 (天): {landmark_times_in_days}")

    print(f"正在从 '{input_filepath}' 加载数据...")
    try:
        df_full = pd.read_csv(input_filepath)
    except FileNotFoundError:
        print(f"错误: 输入文件 '{input_filepath}' 未找到。")
        return

    # --- 1. 确保必要的列存在并计算总生存时长 'time' ---
    required_cols = ['PATIENT_ID', 'START_DATE', 'stop', 'dead']
    if not all(col in df_full.columns for col in required_cols):
        print(f"错误: 输入文件必须包含以下列: {required_cols}")
        return
        
    # 获取每个病人的唯一结局信息
    patient_outcomes = df_full[['PATIENT_ID', 'stop', 'dead', 'entry']].drop_duplicates().copy()
    
    # 计算相对生存时长 'time_total'
    if 'entry' not in patient_outcomes.columns:
        patient_outcomes['entry'] = 0
    patient_outcomes['time_total'] = patient_outcomes['stop'] - patient_outcomes['entry']
    
    patient_outcomes = patient_outcomes[patient_outcomes['time_total'] >= 0]
    
    # 将结局信息合并回主DataFrame
    df_full = pd.merge(
        df_full.drop(columns=['time_total'], errors='ignore'),
        patient_outcomes[['PATIENT_ID', 'time_total']],
        on='PATIENT_ID'
    )
    
    print("数据加载与初步处理完成。")

    # --- 2. 遍历病人与Landmarks，生成快照 ---
    all_snapshots = []
    patient_groups = df_full.groupby('PATIENT_ID')
    unique_patient_ids = df_full['PATIENT_ID'].unique()
    
    print(f"正在为 {len(unique_patient_ids)} 名患者生成快照...")
    
    for patient_id in tqdm(unique_patient_ids, desc="Processing Patients"):
        patient_data = patient_groups.get_group(patient_id)
        total_survival_time = patient_data['time_total'].iloc[0]
        original_dead_status = patient_data['dead'].iloc[0]

        for landmark_time in landmark_times_in_days:
            
            if total_survival_time < landmark_time:
                continue
            
            feature_sequence = patient_data[patient_data['START_DATE'] < landmark_time].copy()
            
            if feature_sequence.empty:
                continue
            
            time_residual = total_survival_time - landmark_time
            
            feature_sequence['SAMPLE_ID'] = f"{patient_id}_{landmark_time}"
            feature_sequence['time_landmark'] = landmark_time
            feature_sequence['time_residual'] = time_residual
            feature_sequence['dead_residual'] = original_dead_status
            
            all_snapshots.append(feature_sequence)

    # --- 3. 合并并保存最终的数据集 ---
    if not all_snapshots:
        print("错误: 未能生成任何有效的快照样本。请检查您的数据和landmark设置。")
        return
        
    df_landmarks = pd.concat(all_snapshots, ignore_index=True)
    
    # 【核心修正】重命名列以供后续使用，并保留原始列
    df_landmarks = df_landmarks.rename(columns={
        'time_residual': 'time',       # 'time' 现在是模型的因变量 (时长)
        'dead_residual': 'dead'        # 'dead' 现在是模型的因变量 (事件)
    })
    
    # 清理掉可能引起混淆的中间列
    df_landmarks = df_landmarks.drop(columns=['time_total'], errors='ignore')

    print("\n快照生成完毕。")
    print(f"  - 原始事件记录数: {len(df_full)}")
    print(f"  - 生成的Landmark事件记录数: {len(df_landmarks)}")
    print(f"  - 生成的独立样本（快照）总数: {df_landmarks['SAMPLE_ID'].nunique()}")
    
    print(f"正在将结果保存到 '{output_filepath}'...")
    df_landmarks.to_csv(output_filepath, index=False)
    print("处理完成！")

In [4]:
365*3

1095

In [5]:
custom_landmarks = [180, 365, 545, 730, 1095, 1825]
create_landmark_snapshots(input_filepath="df_brca.csv", 
                            output_filepath="df_brca_landmarks.csv",
                            landmark_times_in_days=custom_landmarks)
create_landmark_snapshots(input_filepath="df_crc.csv", 
                            output_filepath="df_crc_landmarks.csv",
                            landmark_times_in_days=custom_landmarks)
create_landmark_snapshots(input_filepath="df_nsclc.csv", 
                            output_filepath="df_nsclc_landmarks.csv",
                            landmark_times_in_days=custom_landmarks)
create_landmark_snapshots(input_filepath="df_panc.csv", 
                            output_filepath="df_panc_landmarks.csv",
                            landmark_times_in_days=custom_landmarks)
create_landmark_snapshots(input_filepath="df_prostate.csv", 
                            output_filepath="df_prostate_landmarks.csv",
                            landmark_times_in_days=custom_landmarks)

使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_brca.csv' 加载数据...
数据加载与初步处理完成。
正在为 5368 名患者生成快照...


Processing Patients: 100%|██████████| 5368/5368 [00:19<00:00, 274.37it/s]



快照生成完毕。
  - 原始事件记录数: 740545
  - 生成的Landmark事件记录数: 1833037
  - 生成的独立样本（快照）总数: 20687
正在将结果保存到 'df_brca_landmarks.csv'...
处理完成！
使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_crc.csv' 加载数据...
数据加载与初步处理完成。
正在为 5543 名患者生成快照...


Processing Patients: 100%|██████████| 5543/5543 [00:17<00:00, 314.37it/s]



快照生成完毕。
  - 原始事件记录数: 699010
  - 生成的Landmark事件记录数: 1649879
  - 生成的独立样本（快照）总数: 18139
正在将结果保存到 'df_crc_landmarks.csv'...
处理完成！
使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_nsclc.csv' 加载数据...
数据加载与初步处理完成。
正在为 7809 名患者生成快照...


Processing Patients: 100%|██████████| 7809/7809 [00:22<00:00, 344.64it/s]



快照生成完毕。
  - 原始事件记录数: 773089
  - 生成的Landmark事件记录数: 1643315
  - 生成的独立样本（快照）总数: 23924
正在将结果保存到 'df_nsclc_landmarks.csv'...
处理完成！
使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_panc.csv' 加载数据...
数据加载与初步处理完成。
正在为 3109 名患者生成快照...


Processing Patients: 100%|██████████| 3109/3109 [00:06<00:00, 494.95it/s]



快照生成完毕。
  - 原始事件记录数: 349115
  - 生成的Landmark事件记录数: 615424
  - 生成的独立样本（快照）总数: 6914
正在将结果保存到 'df_panc_landmarks.csv'...
处理完成！
使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_prostate.csv' 加载数据...
数据加载与初步处理完成。
正在为 3211 名患者生成快照...


Processing Patients: 100%|██████████| 3211/3211 [00:11<00:00, 281.49it/s]



快照生成完毕。
  - 原始事件记录数: 381656
  - 生成的Landmark事件记录数: 935187
  - 生成的独立样本（快照）总数: 11799
正在将结果保存到 'df_prostate_landmarks.csv'...
处理完成！


In [6]:
df_brca = pd.read_csv('df_brca_landmarks.csv')
df_crc = pd.read_csv('df_crc_landmarks.csv')
df_nsclc = pd.read_csv('df_nsclc_landmarks.csv')
df_panc = pd.read_csv('df_panc_landmarks.csv')
df_prostate = pd.read_csv('df_prostate_landmarks.csv')
print("BRCA shape:", df_brca.shape)
print("CRC shape:", df_crc.shape)
print("NSCLC shape:", df_nsclc.shape)
print("PANC shape:", df_panc.shape)
print("PROSTATE shape:", df_prostate.shape)

BRCA shape: (1833037, 94)
CRC shape: (1649879, 99)
NSCLC shape: (1643315, 84)
PANC shape: (615424, 88)
PROSTATE shape: (935187, 93)


In [7]:
df_brca.drop(['time.1', 'dead.1'], axis=1, inplace=True)
df_crc.drop(['time.1', 'dead.1'], axis=1, inplace=True)
df_nsclc.drop(['time.1', 'dead.1'], axis=1, inplace=True)
df_panc.drop(['time.1', 'dead.1'], axis=1, inplace=True)
df_prostate.drop(['time.1', 'dead.1'], axis=1, inplace=True)
print("BRCA shape:", df_brca.shape)
print("CRC shape:", df_crc.shape)
print("NSCLC shape:", df_nsclc.shape)
print("PANC shape:", df_panc.shape)
print("PROSTATE shape:", df_prostate.shape)

BRCA shape: (1833037, 92)
CRC shape: (1649879, 97)
NSCLC shape: (1643315, 82)
PANC shape: (615424, 86)
PROSTATE shape: (935187, 91)


In [8]:
# 保存为csv文件
df_brca.to_csv('df_brca_landmarks.csv', index=False)
df_crc.to_csv('df_crc_landmarks.csv', index=False)
df_nsclc.to_csv('df_nsclc_landmarks.csv', index=False)
df_panc.to_csv('df_panc_landmarks.csv', index=False)
df_prostate.to_csv('df_prostate_landmarks.csv', index=False)

In [6]:
import toad
pd.set_option ('display.max_columns', None)
pd.set_option ('display.max_rows', None)
toad.detect(df_panc)

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
PATIENT_ID,object,349115,0.00%,3109,P-0011000:0.21%,P-0009421:0.18%,P-0007565:0.15%,P-0012792:0.14%,P-0018942:0.14%,P-0082660:0.00%,P-0071303:0.00%,P-0080504:0.00%,P-0071773:0.00%,P-0025682:0.00%
START_DATE,int64,349115,0.00%,6304,132.623926,772.221585,-11513.0,-2749.86,-400.0,88.0,410.0,877.0,2140.0,3360.0
EVENT_DURATION,int64,349115,0.00%,729,6.599671,50.819059,0.0,0.0,0.0,0.0,0.0,0.0,182.0,6028.0
EVENT_TYPE,object,349115,0.00%,9,Diagnosis:59.68%,LAB_TEST:18.62%,Treatment:15.44%,SURGERY:2.35%,TREATMENT:2.01%,TREATMENT:2.01%,Sample acquisition:0.90%,Sequencing:0.90%,PATHOLOGY:0.08%,Pathology:0.03%
EVENT_SUBTYPE,object,349115,1.80%,42,IMAGING_PROGRESSION:11.41%,CA_19-9:11.26%,IMAGING_PELVIS:7.99%,IMAGING_ABDOMEN:7.44%,CEA:6.93%,Bone Treatment:0.04%,Immuno:0.03%,Gleason Score:0.03%,Biologic:0.02%,PD-L1 Positive:0.01%
VALUE_NUMERIC,float64,349115,68.49%,8564,1132.021617,12319.651468,0.0,0.0,0.0,2.8,40.0,507.0,21640.2,1268920.0
VALUE_CATEGORICAL,object,349115,23.12%,169,CT:34.02%,Units/ml :11.41%,ng/ml :7.21%,N:6.55%,MR:3.46%,ADO-TRASTUZUMAB EMTANSINE:0.00%,BUSULFAN:0.00%,TRETINOIN:0.00%,LUTETIUM LU-177 5B1-MVT1075:0.00%,VENETOCLAX:0.00%
AGE,int64,349115,0.00%,71,64.83201,10.999944,5.0,34.0,50.0,66.0,73.0,78.0,86.0,90.0
MALE,int64,349115,0.00%,2,0.517537,0.499693,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
WHITE,int64,349115,0.00%,2,0.824863,0.380085,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0


In [68]:
lst = ['AGE','MALE','WHITE','ASIAN','BLACK', 'SMOKER',
        'KRAS', 'HRAS', 'RET', 'MET', 'GNAQ', 'PTEN', 'KIT', 'EGFR', 'FGFR3', 
        'PDGFRA', 'ERBB2', 'TP53', 'NRAS', 'NOTCH1', 'GNA11', 'CTNNB1', 
        'FGFR2', 'PIK3CA', 'IDH1', 'BRAF', 'FGFR1', 'ALK', 'AKT1', 
        'STAGE 1', 'STAGE 2', 'STAGE 3', 'STAGE 4']

for i in lst:
    x = sum(df_nsclc[i])/len(df_nsclc)
    print(f"{i}: {x}")

AGE: 65.72347685712771
MALE: 0.4040854287151932
WHITE: 0.7740467138971063
ASIAN: 0.10972992760212602
BLACK: 0.05756905091134397
SMOKER: 0.6495474647808984
KRAS: 0.25825228401904565
HRAS: 0.0026504063568360174
RET: 0.0007062576236371232
MET: 0.04837088614635572
GNAQ: 6.984965508499022e-05
PTEN: 0.03135732108463579
KIT: 0.007534708164260519
EGFR: 0.2774234273156131
FGFR3: 0.0016349993338412524
PDGFRA: 0.007016009799647906
ERBB2: 0.04175715861951211
TP53: 0.5259368584988274
NRAS: 0.004809278103814697
NOTCH1: 0.00883468785612006
GNA11: 0.0
CTNNB1: 0.02767598555923057
FGFR2: 0.0015405729482633954
PIK3CA: 0.056839510069345185
IDH1: 0.0025469253863397356
BRAF: 0.03604630256024856
FGFR1: 0.02158095639699957
ALK: 0.0029272179529135716
AKT1: 0.007416998560320998
STAGE 1: 0.24531974973127285
STAGE 2: 0.06645289222844976
STAGE 3: 0.1722259662212242
STAGE 4: 0.5114391745322984


In [2]:
import pandas as pd
df_full = pd.read_csv('df_prostate_landmarks.csv')
df_full

Unnamed: 0,PATIENT_ID,START_DATE,EVENT_DURATION,EVENT_TYPE,EVENT_SUBTYPE,VALUE_NUMERIC,VALUE_CATEGORICAL,AGE,MALE,WHITE,...,stop_DMETS_DX_LUNG,stop_DMETS_DX_LIVER,stop_DMETS_DX_BONE,dead_DMETS_DX_BRAIN,dead_DMETS_DX_LUNG,dead_DMETS_DX_LIVER,dead_DMETS_DX_BONE,time,SAMPLE_ID,time_landmark
0,P-0000373,-224,0,Diagnosis,IMAGING_ABDOMEN,,MR,55,1,1,...,-28,1102,-75,1,1,1,1,1037,P-0000373_180,180
1,P-0000373,-224,0,Diagnosis,IMAGING_PELVIS,,MR,55,1,1,...,-28,1102,-75,1,1,1,1,1037,P-0000373_180,180
2,P-0000373,-224,0,Diagnosis,IMAGING_PROGRESSION,,Indeterminate,55,1,1,...,-28,1102,-75,1,1,1,1,1037,P-0000373_180,180
3,P-0000373,-224,0,Diagnosis,TUMOR_SITE_Lymph Nodes,,MR,55,1,1,...,-28,1102,-75,1,1,1,1,1037,P-0000373_180,180
4,P-0000373,-224,0,Diagnosis,TUMOR_SITE_Other,,MR,55,1,1,...,-28,1102,-75,1,1,1,1,1037,P-0000373_180,180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935182,P-0089592,130,0,Diagnosis,IMAGING_ABDOMEN,,CT,64,1,0,...,2765,2765,2503,0,0,0,1,189,P-0089592_180,180
935183,P-0089592,130,0,Diagnosis,IMAGING_CHEST,,CT,64,1,0,...,2765,2765,2503,0,0,0,1,189,P-0089592_180,180
935184,P-0089592,130,0,Diagnosis,IMAGING_PELVIS,,CT,64,1,0,...,2765,2765,2503,0,0,0,1,189,P-0089592_180,180
935185,P-0089592,130,0,Diagnosis,IMAGING_PROGRESSION,,N,64,1,0,...,2765,2765,2503,0,0,0,1,189,P-0089592_180,180


In [45]:
pd.set_option ('display.max_columns', None)
pd.set_option ('display.max_rows', None)
tmp = pd.read_csv('./data/source_panc_dx_1st_seq_OS')
tmp.head()

FileNotFoundError: [Errno 2] No such file or directory: './data/source_panc_dx_1st_seq_OS'

threshold = 0.01
BRCA Statics: AGE(56.0), MALE(0.01), WHITE(0.74), ASIAN(0.08), BLACK(0.10), KRAS(0.02), PTEN(0.08), EGFR(0.01), ERBB2(0.18), TP53(0.38), FGFR2(0.02), PIK3CA(0.36), FGFR1(0.13), AKT1(0.05)
CRC Statics: AGE(56.6), MALE(0.56), WHITE(0.79), ASIAN(0.08), BLACK(0.07), KRAS(0.43),  PTEN(0.06), EGFR(0.02), ERBB2(0.05), TP53(0.0.77), NRAS(0.04), CTNNB1(0.02), PIK3CA(0.16), BRAF(0.08), FGFR1(0.03), AKT1(0.01)
NSCLC Statics: AGE(65.7), MALE(0.40), WHITE(0.77), ASIAN(0.11), BLACK(0.06), KRAS(0.26), MET(0.05), PTEN(0.03), EGFR(0.28), ERBB2(0.04), TP53(0.53), CTNNB1(0.03), PIK3CA(0.06), BRAF(0.04), FGFR1(0.02)
PANC Statics: AGE(64.8), MALE(0.52), WHITE(0.82), ASIAN(0.07), BLACK(0.05), KRAS(0.80), PTEN(0.02), ERBB2(0.01), TP53(0.64), PIK3CA(0.02), BRAF(0.01), FGFR1(0.01)
PROSTATE Statics: AGE(67.5), MALE(1.00), WHITE(0.82), ASIAN(0.04), BLACK(0.08), PTEN(0.19), TP53(0.31), CTNNB1(0.03), PIK3CA(0.04), BRAF(0.02), FGFR1(0.01), AKT1(0.02)