In [6]:
import pandas as pd
import os

In [7]:
def merge_csvs(csv_list, output_filename):
    """Merge several feature CSVs into one wide table and write it to disk.

    Every CSV is read and indexed by the composite string key
    ``"<patient_id>_<study_id>"``. Its feature columns are every column except
    ``patient_id``, ``study_id``, ``label``, ``mask_type`` and any column whose
    name starts with ``diagnostics``. Feature columns are renamed with a prefix
    derived from the file's base name:

    * ``features_adc*`` -> ``adc_``
    * ``features_dwi*`` -> ``dwi_``
    * ``features_t2*``  -> ``t2_``
    * anything else     -> no prefix

    The first file additionally contributes the ``patient_id``, ``study_id``
    and ``label`` columns. All frames are inner-joined on the composite key,
    so only keys present in every file are kept.

    Parameters
    ----------
    csv_list : iterable of str or path-like
        Paths of the CSV files to merge, in the order their columns should
        appear. Each file must contain ``patient_id`` and ``study_id``; the
        first one must also contain ``label``.
    output_filename : str, path-like or writable buffer
        Destination passed to ``DataFrame.to_csv``. When it is a path, missing
        parent directories are created first.

    Returns
    -------
    pandas.DataFrame
        The merged table with a fresh ``0..n-1`` index (the composite key is
        dropped; ``patient_id`` and ``study_id`` remain as regular columns).

    Raises
    ------
    ValueError
        If ``csv_list`` is empty.
    """
    # Materialise once so generators are supported and emptiness is detectable.
    csv_paths = list(csv_list)
    if not csv_paths:
        raise ValueError("csv_list must contain at least one CSV path")

    base_columns = ['patient_id', 'study_id', 'label']
    non_feature_columns = set(base_columns) | {'mask_type'}
    # (file-name start, column prefix) pairs, checked in order; no match -> "".
    prefix_rules = (
        ('features_adc', 'adc_'),
        ('features_dwi', 'dwi_'),
        ('features_t2', 't2_'),
    )

    merged_df = None
    for file_path in csv_paths:
        raw = pd.read_csv(file_path)

        # Build the composite key and use it as the join index.
        composite_key = raw['patient_id'].astype(str) + '_' + raw['study_id'].astype(str)
        df = raw.assign(patient_id_study_id=composite_key).set_index('patient_id_study_id')

        # Pick the column prefix from the file's base name.
        filename = os.path.basename(file_path)
        prefix = next(
            (col_prefix for start, col_prefix in prefix_rules if filename.startswith(start)),
            "",
        )

        # Keep only feature columns and tag them with the prefix.
        feature_columns = [
            col for col in df.columns
            if col not in non_feature_columns and not col.startswith('diagnostics')
        ]
        features_df = df[feature_columns].rename(columns=lambda col: prefix + col)

        if merged_df is None:
            # First file: carry the identifying columns alongside its features.
            merged_df = pd.concat([df[base_columns], features_df], axis=1)
        else:
            # NOTE(review): if a composite key occurs more than once in a file,
            # this inner join multiplies the matching rows - confirm keys are
            # unique per file.
            merged_df = merged_df.merge(
                features_df, left_index=True, right_index=True, how='inner'
            )

    # Discard the composite-key index; patient_id and study_id are already
    # present as regular columns.
    merged_df = merged_df.reset_index(drop=True)

    # Make sure the destination folder exists before writing (paths only;
    # file-like buffers are handed to to_csv untouched).
    if isinstance(output_filename, (str, os.PathLike)):
        output_dir = os.path.dirname(output_filename)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    merged_df.to_csv(output_filename, index=False)

    return merged_df

In [9]:
# Merge the three 'gland' feature CSVs (adc, dwi, t2) into one table,
# save it, and display the result.
gland_csvs = [
    'features_adc_gland.csv',
    'features_dwi_gland.csv',
    'features_t2_gland.csv',
]
df_gland = merge_csvs(
    gland_csvs,
    output_filename='concatenate_data/features_all_gland.csv',
)
df_gland

Unnamed: 0,patient_id,study_id,label,adc_original_shape_Elongation,adc_original_shape_Flatness,adc_original_shape_LeastAxisLength,adc_original_shape_MajorAxisLength,adc_original_shape_Maximum2DDiameterColumn,adc_original_shape_Maximum2DDiameterRow,adc_original_shape_Maximum2DDiameterSlice,...,t2_exponential_gldm_GrayLevelNonUniformity,t2_exponential_gldm_GrayLevelVariance,t2_exponential_gldm_HighGrayLevelEmphasis,t2_exponential_gldm_LargeDependenceEmphasis,t2_exponential_gldm_LargeDependenceHighGrayLevelEmphasis,t2_exponential_gldm_LargeDependenceLowGrayLevelEmphasis,t2_exponential_gldm_LowGrayLevelEmphasis,t2_exponential_gldm_SmallDependenceEmphasis,t2_exponential_gldm_SmallDependenceHighGrayLevelEmphasis,t2_exponential_gldm_SmallDependenceLowGrayLevelEmphasis
0,10000,1000000,0,0.808087,0.633421,29.132328,45.992060,52.023421,45.423126,52.050928,...,60127.673168,0.028000,1.048730,78.122638,78.918905,78.030621,0.995408,0.013927,0.020408,0.013329
1,10001,1000001,0,0.862442,0.764552,44.203477,57.816184,64.509395,64.791501,63.803018,...,139894.070996,0.000528,1.001557,79.162458,79.205704,79.151655,0.999619,0.013044,0.013234,0.013005
2,10002,1000002,0,0.980027,0.814579,40.404389,49.601552,55.095586,57.791057,56.759056,...,117629.356071,0.145071,1.272186,76.342860,83.702612,74.954786,0.964784,0.016139,0.076518,0.013551
3,10003,1000003,0,0.938419,0.817366,41.656787,50.964656,53.287394,60.950602,56.607416,...,131838.351632,0.028115,1.063863,77.948506,79.663506,77.574429,0.988749,0.014376,0.025824,0.013307
4,10004,1000004,0,0.852145,0.637281,34.380191,53.948227,60.310477,57.723667,60.055545,...,94406.183813,0.010063,1.018443,78.676255,78.972203,78.623897,0.997771,0.013463,0.018503,0.013148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,11471,1001495,1,0.878505,0.737381,36.317429,49.251941,56.065829,52.513226,58.636140,...,82269.382031,0.005180,1.014790,78.475659,79.026015,78.340754,0.996524,0.013452,0.014564,0.013223
1496,11472,1001496,1,0.751721,0.595717,28.649831,48.093019,52.415516,41.672912,52.445758,...,52674.778513,0.028067,1.067314,77.201261,79.194182,76.809612,0.987631,0.014327,0.019965,0.013452
1497,11473,1001497,0,0.892180,0.738870,39.542375,53.517373,54.950399,64.840167,56.607416,...,108585.096271,0.058213,1.100875,78.283417,80.474545,78.057671,0.990907,0.014341,0.030646,0.013171
1498,11474,1001498,0,0.937394,0.831139,42.838913,51.542445,60.050439,61.165260,60.823514,...,112059.550402,0.032287,1.075851,77.630569,79.907701,77.160967,0.986433,0.014362,0.022821,0.013376


In [10]:
# Merge the three 'full' feature CSVs (adc, dwi, t2) into one table,
# save it, and display the result.
full_csvs = [
    'features_adc_full.csv',
    'features_dwi_full.csv',
    'features_t2_full.csv',
]
df_full = merge_csvs(
    full_csvs,
    output_filename='concatenate_data/features_all_full.csv',
)
df_full

Unnamed: 0,patient_id,study_id,label,adc_original_shape_Elongation,adc_original_shape_Flatness,adc_original_shape_LeastAxisLength,adc_original_shape_MajorAxisLength,adc_original_shape_Maximum2DDiameterColumn,adc_original_shape_Maximum2DDiameterRow,adc_original_shape_Maximum2DDiameterSlice,...,t2_exponential_gldm_GrayLevelNonUniformity,t2_exponential_gldm_GrayLevelVariance,t2_exponential_gldm_HighGrayLevelEmphasis,t2_exponential_gldm_LargeDependenceEmphasis,t2_exponential_gldm_LargeDependenceHighGrayLevelEmphasis,t2_exponential_gldm_LargeDependenceLowGrayLevelEmphasis,t2_exponential_gldm_LowGrayLevelEmphasis,t2_exponential_gldm_SmallDependenceEmphasis,t2_exponential_gldm_SmallDependenceHighGrayLevelEmphasis,t2_exponential_gldm_SmallDependenceLowGrayLevelEmphasis
0,10000,1000000,0,0.981817,0.501314,118.064389,235.510068,226.722699,223.392320,284.545425,...,5.545001e+06,0.077983,1.112168,79.905342,81.839712,79.621536,0.992376,0.013322,0.052890,0.012583
1,10001,1000001,0,0.934779,0.295118,87.196332,295.462538,249.797988,265.821410,349.010658,...,4.323802e+06,0.017309,1.026033,80.335757,80.686095,80.279780,0.997847,0.012802,0.021848,0.012539
2,10002,1000002,0,0.934779,0.257669,76.131467,295.462538,247.354067,263.526131,349.010658,...,4.485409e+06,0.028817,1.049955,79.927545,80.887786,79.734762,0.994059,0.013271,0.028789,0.012629
3,10003,1000003,0,0.934779,0.269405,79.598997,295.462538,248.135114,264.259384,349.010658,...,4.667221e+06,0.028764,1.054700,79.801616,81.204077,79.512416,0.992267,0.013360,0.027046,0.012649
4,10004,1000004,0,0.934779,0.295118,87.196329,295.462538,249.797987,265.821409,349.010658,...,4.338714e+06,0.004629,1.007573,80.492442,80.582437,80.475682,0.999202,0.012634,0.015873,0.012511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,11471,1001495,1,0.659398,0.259529,76.681158,295.462538,180.113044,263.526131,305.493726,...,3.900496e+06,0.014856,1.026513,80.222358,80.736283,80.124185,0.996704,0.012923,0.021454,0.012561
1496,11472,1001496,1,0.934779,0.266944,78.872048,295.462538,247.818632,263.962235,349.010658,...,3.910057e+06,0.093119,1.109845,80.329087,80.850399,80.279206,0.997559,0.013053,0.085896,0.012541
1497,11473,1001497,0,0.934779,0.295118,87.196330,295.462538,249.797987,265.821409,349.010658,...,4.309923e+06,0.020946,1.033949,80.162304,80.647150,80.078410,0.996588,0.013038,0.025280,0.012592
1498,11474,1001498,0,0.857135,0.538975,161.554943,299.744827,293.307691,261.320313,340.700697,...,5.011707e+06,0.058156,1.106233,79.442040,82.764109,78.998509,0.987848,0.013667,0.026206,0.012681
