In [1]:
import pandas as pd
from tqdm import tqdm # 使用tqdm来显示处理进度，对于大数据集很有用
# Load the per-cohort event tables from the working directory.
df_brca = pd.read_csv('df_brca.csv')
df_crc = pd.read_csv('df_crc.csv')
df_nsclc = pd.read_csv('df_nsclc.csv')
df_panc = pd.read_csv('df_panc.csv')
df_prostate = pd.read_csv('df_prostate.csv')

# Report (rows, columns) for each cohort as a quick sanity check.
for caption, frame in (
    ("BRCA shape:", df_brca),
    ("CRC shape:", df_crc),
    ("NSCLC shape:", df_nsclc),
    ("PANC shape:", df_panc),
    ("PROSTATE shape:", df_prostate),
):
    print(caption, frame.shape)

BRCA shape: (740545, 90)
CRC shape: (699010, 95)
NSCLC shape: (773089, 80)
PANC shape: (349115, 84)
PROSTATE shape: (381656, 89)


In [None]:
def create_landmark_snapshots(input_filepath="df_full.csv",
                              output_filepath="df_landmarks.csv",
                              landmark_times_in_days=None):
    """Build landmark snapshots from a per-event CSV and write them to a new CSV.

    For each patient and each landmark time that does not exceed the patient's
    total follow-up (``stop - entry``), the patient's rows whose ``START_DATE``
    is strictly smaller than the landmark are copied as one snapshot. Every
    snapshot row is tagged with ``SAMPLE_ID`` (``"<PATIENT_ID>_<landmark>"``),
    ``time_landmark``, the residual follow-up time (total minus landmark) and
    the patient's ``dead`` flag. Landmarks with no rows before them are skipped.

    Parameters
    ----------
    input_filepath : str
        CSV with at least ``PATIENT_ID``, ``START_DATE``, ``stop`` and ``dead``.
        ``entry`` is optional; when it is absent an entry time of 0 is used.
    output_filepath : str
        Destination CSV for the concatenated snapshots (written without index).
    landmark_times_in_days : list of numbers, optional
        Landmark times; defaults to ``[90, 180, 365, 548, 730, 1095]``.

    Returns
    -------
    None
        The result is written to ``output_filepath``. If the input file is
        missing, required columns are absent, or no snapshot can be built, a
        message is printed and nothing is written.

    Notes
    -----
    The residual outcome columns are appended last and renamed to ``time`` and
    ``dead``. Since ``dead`` is a required input column (and ``time`` may exist
    as well), the written CSV contains duplicate column labels. When the file
    is reloaded with ``pandas.read_csv`` the later duplicates are renamed to
    ``time.1`` / ``dead.1`` -- those suffixed columns are the residual values.
    """
    if landmark_times_in_days is None:
        landmark_times_in_days = [90, 180, 365, 548, 730, 1095]
    print(f"使用的Landmark时间点 (天): {landmark_times_in_days}")

    print(f"正在从 '{input_filepath}' 加载数据...")
    try:
        df_full = pd.read_csv(input_filepath)
    except FileNotFoundError:
        print(f"错误: 输入文件 '{input_filepath}' 未找到。")
        return

    required_cols = ['PATIENT_ID', 'START_DATE', 'stop', 'dead']
    if not all(col in df_full.columns for col in required_cols):
        print(f"错误: 输入文件必须包含以下列: {required_cols}")
        return

    # Only select 'entry' when it exists; selecting it unconditionally raised
    # KeyError and made the "entry defaults to 0" fallback unreachable.
    has_entry = 'entry' in df_full.columns
    outcome_cols = ['PATIENT_ID', 'stop', 'dead'] + (['entry'] if has_entry else [])

    # One outcome row per patient (the first one seen, matching the .iloc[0]
    # reads below). De-duplicating on all columns could leave several rows per
    # patient and make the merge below multiply that patient's event rows.
    patient_outcomes = df_full[outcome_cols].drop_duplicates(subset='PATIENT_ID').copy()
    if not has_entry:
        patient_outcomes['entry'] = 0
    patient_outcomes['time_total'] = patient_outcomes['stop'] - patient_outcomes['entry']

    # Patients with negative (or missing) follow-up are dropped; the inner
    # merge below removes their event rows as well.
    patient_outcomes = patient_outcomes[patient_outcomes['time_total'] >= 0]

    df_full = pd.merge(
        df_full.drop(columns=['time_total'], errors='ignore'),
        patient_outcomes[['PATIENT_ID', 'time_total']],
        on='PATIENT_ID'
    )

    print("数据加载与初步处理完成。")

    all_snapshots = []
    # sort=False keeps patients in order of first appearance in the file.
    patient_groups = df_full.groupby('PATIENT_ID', sort=False)
    n_patients = patient_groups.ngroups

    print(f"正在为 {n_patients} 名患者生成快照...")

    for patient_id, patient_data in tqdm(patient_groups, total=n_patients,
                                         desc="Processing Patients"):
        total_survival_time = patient_data['time_total'].iloc[0]
        original_dead_status = patient_data['dead'].iloc[0]

        for landmark_time in landmark_times_in_days:
            # Patient's follow-up ended before this landmark: no snapshot.
            if total_survival_time < landmark_time:
                continue

            # Only events that started strictly before the landmark are visible.
            feature_sequence = patient_data[patient_data['START_DATE'] < landmark_time].copy()
            if feature_sequence.empty:
                continue

            feature_sequence['SAMPLE_ID'] = f"{patient_id}_{landmark_time}"
            feature_sequence['time_landmark'] = landmark_time
            feature_sequence['time_residual'] = total_survival_time - landmark_time
            feature_sequence['dead_residual'] = original_dead_status

            all_snapshots.append(feature_sequence)

    if not all_snapshots:
        print("错误: 未能生成任何有效的快照样本。请检查您的数据和landmark设置。")
        return

    df_landmarks = pd.concat(all_snapshots, ignore_index=True)

    # NOTE: this yields duplicate 'dead' (and possibly 'time') labels; see the
    # Notes section of the docstring for how they look after a CSV round-trip.
    df_landmarks = df_landmarks.rename(columns={
        'time_residual': 'time',
        'dead_residual': 'dead'
    })

    df_landmarks = df_landmarks.drop(columns=['time_total'], errors='ignore')

    print("\n快照生成完毕。")
    print(f"  - 原始事件记录数: {len(df_full)}")
    print(f"  - 生成的Landmark事件记录数: {len(df_landmarks)}")
    print(f"  - 生成的独立样本（快照）总数: {df_landmarks['SAMPLE_ID'].nunique()}")

    print(f"正在将结果保存到 '{output_filepath}'...")
    df_landmarks.to_csv(output_filepath, index=False)
    print("处理完成！")

In [5]:
# Landmark times (in days) shared by every cohort.
custom_landmarks = [180, 365, 545, 730, 1095, 1825]

# (input CSV, output CSV) pairs, processed in this order.
cohort_files = [
    ("df_brca.csv", "df_brca_landmarks.csv"),
    ("df_crc.csv", "df_crc_landmarks.csv"),
    ("df_nsclc.csv", "df_nsclc_landmarks.csv"),
    ("df_panc.csv", "df_panc_landmarks.csv"),
    ("df_prostate.csv", "df_prostate_landmarks.csv"),
]

for source_csv, landmark_csv in cohort_files:
    create_landmark_snapshots(input_filepath=source_csv,
                              output_filepath=landmark_csv,
                              landmark_times_in_days=custom_landmarks)

使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_brca.csv' 加载数据...
数据加载与初步处理完成。
正在为 5368 名患者生成快照...


Processing Patients: 100%|██████████| 5368/5368 [00:19<00:00, 274.37it/s]



快照生成完毕。
  - 原始事件记录数: 740545
  - 生成的Landmark事件记录数: 1833037
  - 生成的独立样本（快照）总数: 20687
正在将结果保存到 'df_brca_landmarks.csv'...
处理完成！
使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_crc.csv' 加载数据...
数据加载与初步处理完成。
正在为 5543 名患者生成快照...


Processing Patients: 100%|██████████| 5543/5543 [00:17<00:00, 314.37it/s]



快照生成完毕。
  - 原始事件记录数: 699010
  - 生成的Landmark事件记录数: 1649879
  - 生成的独立样本（快照）总数: 18139
正在将结果保存到 'df_crc_landmarks.csv'...
处理完成！
使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_nsclc.csv' 加载数据...
数据加载与初步处理完成。
正在为 7809 名患者生成快照...


Processing Patients: 100%|██████████| 7809/7809 [00:22<00:00, 344.64it/s]



快照生成完毕。
  - 原始事件记录数: 773089
  - 生成的Landmark事件记录数: 1643315
  - 生成的独立样本（快照）总数: 23924
正在将结果保存到 'df_nsclc_landmarks.csv'...
处理完成！
使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_panc.csv' 加载数据...
数据加载与初步处理完成。
正在为 3109 名患者生成快照...


Processing Patients: 100%|██████████| 3109/3109 [00:06<00:00, 494.95it/s]



快照生成完毕。
  - 原始事件记录数: 349115
  - 生成的Landmark事件记录数: 615424
  - 生成的独立样本（快照）总数: 6914
正在将结果保存到 'df_panc_landmarks.csv'...
处理完成！
使用的Landmark时间点 (天): [180, 365, 545, 730, 1095, 1825]
正在从 'df_prostate.csv' 加载数据...
数据加载与初步处理完成。
正在为 3211 名患者生成快照...


Processing Patients: 100%|██████████| 3211/3211 [00:11<00:00, 281.49it/s]



快照生成完毕。
  - 原始事件记录数: 381656
  - 生成的Landmark事件记录数: 935187
  - 生成的独立样本（快照）总数: 11799
正在将结果保存到 'df_prostate_landmarks.csv'...
处理完成！


In [6]:
# Reload the landmark tables written by create_landmark_snapshots.
# Note: these names now refer to the landmark tables, not the raw cohorts.
df_brca = pd.read_csv('df_brca_landmarks.csv')
df_crc = pd.read_csv('df_crc_landmarks.csv')
df_nsclc = pd.read_csv('df_nsclc_landmarks.csv')
df_panc = pd.read_csv('df_panc_landmarks.csv')
df_prostate = pd.read_csv('df_prostate_landmarks.csv')

# Report (rows, columns) for each landmark table.
for caption, frame in (
    ("BRCA shape:", df_brca),
    ("CRC shape:", df_crc),
    ("NSCLC shape:", df_nsclc),
    ("PANC shape:", df_panc),
    ("PROSTATE shape:", df_prostate),
):
    print(caption, frame.shape)

BRCA shape: (1833037, 94)
CRC shape: (1649879, 99)
NSCLC shape: (1643315, 84)
PANC shape: (615424, 88)
PROSTATE shape: (935187, 93)


In [7]:
def _keep_residual_outcomes(frame):
    """Resolve the duplicated outcome columns in favour of the residual ones.

    create_landmark_snapshots appends the residual time / event flag as the
    LAST columns under the already-used labels 'time' and 'dead'. On reload,
    pandas.read_csv renames those later duplicates to 'time.1' / 'dead.1', so
    the suffixed columns are the landmark outcomes and the unsuffixed ones are
    the pre-landmark columns from the input file. Dropping 'time.1' / 'dead.1'
    (as this cell used to) threw the residual time away.

    Returns a new DataFrame in which each suffixed column replaces its
    unsuffixed counterpart. Frames without suffixed columns are returned
    unchanged (as a copy), so the cell can be re-run safely.
    """
    mangled = {
        f'{base}.1': base
        for base in ('time', 'dead')
        if f'{base}.1' in frame.columns
    }
    return frame.drop(columns=list(mangled.values())).rename(columns=mangled)


df_brca = _keep_residual_outcomes(df_brca)
df_crc = _keep_residual_outcomes(df_crc)
df_nsclc = _keep_residual_outcomes(df_nsclc)
df_panc = _keep_residual_outcomes(df_panc)
df_prostate = _keep_residual_outcomes(df_prostate)
print("BRCA shape:", df_brca.shape)
print("CRC shape:", df_crc.shape)
print("NSCLC shape:", df_nsclc.shape)
print("PANC shape:", df_panc.shape)
print("PROSTATE shape:", df_prostate.shape)

BRCA shape: (1833037, 92)
CRC shape: (1649879, 97)
NSCLC shape: (1643315, 82)
PANC shape: (615424, 86)
PROSTATE shape: (935187, 91)


In [8]:
# Write each landmark table back to its CSV file (without the index column).
for frame, csv_path in (
    (df_brca, 'df_brca_landmarks.csv'),
    (df_crc, 'df_crc_landmarks.csv'),
    (df_nsclc, 'df_nsclc_landmarks.csv'),
    (df_panc, 'df_panc_landmarks.csv'),
    (df_prostate, 'df_prostate_landmarks.csv'),
):
    frame.to_csv(csv_path, index=False)