# Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import monai

In [2]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas warnings such as
# SettingWithCopyWarning raised by later cells -- consider narrowing it.
warnings.filterwarnings("ignore")

# Image paths

In [14]:
# Location of the image tree relative to this notebook, and the prefix stored
# in the table so that paths are expressed relative to the repository root.
image_dir = "../../../data/images"
relative_base = "data/images"

# Sequence tags looked for in each file name; each one fills "<tag>_path".
modalities = ("t2w", "adc", "hbv")
path_columns = [f"{modality}_path" for modality in modalities]

# One record per (patient_id, study_id), filled in as the files are discovered.
data_dict = {}

for subdir, dirs, files in os.walk(image_dir):
    # os.walk yields entries in arbitrary filesystem order; sorting in place
    # makes the traversal (and therefore the row order of df) reproducible.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".mha"):
            continue

        abs_path = os.path.join(subdir, file)
        relative_path = os.path.relpath(abs_path, start=image_dir)

        # File names look like "10129_100130_t2w.mha": patient, study, sequence.
        file_name = os.path.splitext(file)[0]
        name_parts = file_name.split('_')
        if len(name_parts) < 2:
            # Not a "<patient>_<study>_..." file; skip it instead of crashing.
            continue
        patient_id, study_id = name_parts[0], name_parts[1]

        # Create (or fetch) the record for this (patient_id, study_id) pair.
        key = (patient_id, study_id)
        if key not in data_dict:
            data_dict[key] = {
                "patient_id": patient_id,
                "study_id": study_id,
                **{column: None for column in path_columns},
            }

        # Store the repo-relative path under the first matching sequence tag
        # (same priority as before: t2w, then adc, then hbv).
        for modality in modalities:
            if modality in file:
                data_dict[key][f"{modality}_path"] = os.path.join(relative_base, relative_path)
                break

data = list(data_dict.values())
# Explicit columns keep the schema stable even when no image was found, so the
# following cells fail on missing data rather than on a missing column.
df = pd.DataFrame(data, columns=["patient_id", "study_id"] + path_columns)

# Clinical_data

In [16]:
clinical_df = pd.read_csv("../../../data/labels/clinical_information/marksheet.csv")

# The IDs parsed from file names are strings; cast both sides to str so the
# join keys compare equal regardless of how read_csv inferred them.
key_cols = ['patient_id', 'study_id']
df[key_cols] = df[key_cols].astype(str)
clinical_df[key_cols] = clinical_df[key_cols].astype(str)

# Clinical columns to bring in, in the order they should appear in the table.
cols_to_insert = ['mri_date', 'patient_age', 'psa', 'psad', 'prostate_volume',
                  'case_ISUP', 'case_csPCa']

# Left join: keep every imaging study, even if it has no clinical row.
df_merged = pd.merge(
    df,
    clinical_df[key_cols + cols_to_insert],
    on=key_cols,
    how='left'
)

# Encode the case-level label as 0/1 (values other than 'NO'/'YES' become NaN).
df_merged['case_csPCa'] = df_merged['case_csPCa'].map({'NO': 0, 'YES': 1})

# Place the clinical columns right after the two ID columns, before the paths.
existing_cols = df.columns.tolist()
new_col_order = existing_cols[:2] + cols_to_insert + existing_cols[2:]

# .copy() makes df_2 an independent frame: later cells add columns to it, and
# assigning into a bare column selection triggers SettingWithCopyWarning.
df_2 = df_merged[new_col_order].copy()

df_2

Unnamed: 0,patient_id,study_id,mri_date,patient_age,psa,psad,prostate_volume,case_ISUP,case_csPCa,t2w_path,adc_path,hbv_path
0,10000,1000000,2019-07-02,73,7.70,,55.0,0,0,data/images/10000/10000_1000000_t2w.mha,data/images/10000/10000_1000000_adc.mha,data/images/10000/10000_1000000_hbv.mha
1,10001,1000001,2016-05-27,64,8.70,0.09,102.0,0,0,data/images/10001/10001_1000001_t2w.mha,data/images/10001/10001_1000001_adc.mha,data/images/10001/10001_1000001_hbv.mha
2,10002,1000002,2021-04-18,58,4.20,0.06,74.0,0,0,data/images/10002/10002_1000002_t2w.mha,data/images/10002/10002_1000002_adc.mha,data/images/10002/10002_1000002_hbv.mha
3,10003,1000003,2019-04-05,72,13.00,,71.5,0,0,data/images/10003/10003_1000003_t2w.mha,data/images/10003/10003_1000003_adc.mha,data/images/10003/10003_1000003_hbv.mha
4,10004,1000004,2020-10-21,67,8.00,0.10,78.0,0,0,data/images/10004/10004_1000004_t2w.mha,data/images/10004/10004_1000004_adc.mha,data/images/10004/10004_1000004_hbv.mha
...,...,...,...,...,...,...,...,...,...,...,...,...
1495,11471,1001495,2012-08-25,71,12.50,0.21,62.0,2,1,data/images/11471/11471_1001495_t2w.mha,data/images/11471/11471_1001495_adc.mha,data/images/11471/11471_1001495_hbv.mha
1496,11472,1001496,2019-06-28,81,5.28,0.12,44.0,2,1,data/images/11472/11472_1001496_t2w.mha,data/images/11472/11472_1001496_adc.mha,data/images/11472/11472_1001496_hbv.mha
1497,11473,1001497,2017-09-24,56,29.60,0.34,87.0,0,0,data/images/11473/11473_1001497_t2w.mha,data/images/11473/11473_1001497_adc.mha,data/images/11473/11473_1001497_hbv.mha
1498,11474,1001498,2016-05-03,71,12.00,,83.0,1,0,data/images/11474/11474_1001498_t2w.mha,data/images/11474/11474_1001498_adc.mha,data/images/11474/11474_1001498_hbv.mha


# csPCa_lesion_delineations

In [18]:
# Directories (relative to this notebook) searched for csPCa lesion masks:
# the human-expert folder is checked first, the AI (Bosma22a) folder second.
human_dir = "../../../data/labels/csPCa_lesion_delineations/human_expert/resampled"
ai_dir = "../../../data/labels/csPCa_lesion_delineations/AI/Bosma22a"

In [19]:
def get_delineation_path(row):
    """Locate the lesion mask for one study and report where it came from.

    The mask is expected to be named "<patient_id>_<study_id>.nii.gz". The
    human-expert directory takes precedence over the AI directory.

    Args:
        row: mapping-like object exposing 'patient_id' and 'study_id'.

    Returns:
        A tuple (path, human_flag, ai_flag): the path with the leading
        "../../../" removed and exactly one flag set to 1, or
        (np.nan, 0, 0) when neither directory contains the file.
    """
    mask_name = f"{row['patient_id']}_{row['study_id']}.nii.gz"

    # (directory, human flag, AI flag), listed in priority order.
    sources = (
        (human_dir, 1, 0),
        (ai_dir, 0, 1),
    )
    for directory, human_flag, ai_flag in sources:
        candidate = os.path.join(directory, mask_name)
        if os.path.exists(candidate):
            return candidate.replace("../../../", ""), human_flag, ai_flag

    return np.nan, 0, 0

In [20]:
# Add, for every study, the lesion-mask path plus the two 0/1 provenance flags
# returned by get_delineation_path. Wrapping the returned tuple in a pd.Series
# makes apply produce one column per tuple element.
df_2[['csPCa_lesion_delineation_path', 'human_delineation', 'AI_delineation']] = df_2.apply(
    lambda row: pd.Series(get_delineation_path(row)), axis=1
)

# Shape and ISUP (again)

In [21]:
# Work on an independent copy so re-running this cell never mutates df_2.
df_3 = df_2.copy()

# Columns filled by the loop below; rows that cannot be processed stay missing.
shape_cols = ['height', 'width', 'depth']
df_3[shape_cols] = np.nan
df_3['lesion_value'] = np.nan

# Load the T2 image and the lesion mask channel-first; the T2 intensities are
# also rescaled to [0, 1], although only the T2 shape is read below.
transforms = monai.transforms.Compose([
    monai.transforms.LoadImaged(keys=['t2', 'label'], image_only=False, ensure_channel_first=True),
    monai.transforms.ScaleIntensityd(keys=['t2'], minv=0.0, maxv=1.0, allow_missing_keys=True)
])

for idx, row in df_3.iterrows():
    # Stored paths are repo-relative; prefix them to reach the files from here.
    t2_path = f"../../../{row['t2w_path']}" if pd.notna(row['t2w_path']) else None
    label_path = f"../../../{row['csPCa_lesion_delineation_path']}" if pd.notna(row['csPCa_lesion_delineation_path']) else None

    # Both files are required; otherwise the row keeps its missing values.
    if not (t2_path and os.path.exists(t2_path) and label_path and os.path.exists(label_path)):
        continue

    processed_images = transforms({'t2': t2_path, 'label': label_path})

    # Drop the leading channel axis; the next three axes are stored as
    # height, width and depth.
    t2_shape = processed_images['t2'].shape[1:]
    df_3.loc[idx, 'height'] = int(t2_shape[0])
    df_3.loc[idx, 'width'] = int(t2_shape[1])
    df_3.loc[idx, 'depth'] = int(t2_shape[2])

    # np.unique returns sorted values. When the mask holds more than one value,
    # ignore the 0 entries; a single-valued mask is kept as is.
    unique_values = np.unique(processed_images['label'])
    if len(unique_values) > 1:
        lesion_values = unique_values[unique_values != 0]
    else:
        lesion_values = unique_values

    # Report masks that contain several distinct non-zero values.
    if len(lesion_values) > 1:
        print(lesion_values)

    # Keep the largest remaining value (last element of the sorted array).
    if lesion_values.size > 0:
        df_3.loc[idx, 'lesion_value'] = int(lesion_values[-1])

# Cast once, after the loop: nullable Int32 keeps integers while still allowing
# missing entries. Doing this per iteration re-cast whole columns every row and
# left the columns as float when no row could be processed.
df_3 = df_3.astype({column: pd.Int32Dtype() for column in shape_cols + ['lesion_value']})

[2. 3.]
[2. 3.]
[3. 5.]
[2. 3.]
[2. 4.]


In [22]:
df_3

Unnamed: 0,patient_id,study_id,mri_date,patient_age,psa,psad,prostate_volume,case_ISUP,case_csPCa,t2w_path,adc_path,hbv_path,csPCa_lesion_delineation_path,human_delineation,AI_delineation,height,width,depth,lesion_value
0,10000,1000000,2019-07-02,73,7.70,,55.0,0,0,data/images/10000/10000_1000000_t2w.mha,data/images/10000/10000_1000000_adc.mha,data/images/10000/10000_1000000_hbv.mha,data/labels/csPCa_lesion_delineations/human_ex...,1,0,640,640,31,0
1,10001,1000001,2016-05-27,64,8.70,0.09,102.0,0,0,data/images/10001/10001_1000001_t2w.mha,data/images/10001/10001_1000001_adc.mha,data/images/10001/10001_1000001_hbv.mha,data/labels/csPCa_lesion_delineations/human_ex...,1,0,640,640,21,0
2,10002,1000002,2021-04-18,58,4.20,0.06,74.0,0,0,data/images/10002/10002_1000002_t2w.mha,data/images/10002/10002_1000002_adc.mha,data/images/10002/10002_1000002_hbv.mha,data/labels/csPCa_lesion_delineations/human_ex...,1,0,384,384,22,0
3,10003,1000003,2019-04-05,72,13.00,,71.5,0,0,data/images/10003/10003_1000003_t2w.mha,data/images/10003/10003_1000003_adc.mha,data/images/10003/10003_1000003_hbv.mha,data/labels/csPCa_lesion_delineations/human_ex...,1,0,384,384,23,0
4,10004,1000004,2020-10-21,67,8.00,0.10,78.0,0,0,data/images/10004/10004_1000004_t2w.mha,data/images/10004/10004_1000004_adc.mha,data/images/10004/10004_1000004_hbv.mha,data/labels/csPCa_lesion_delineations/human_ex...,1,0,640,640,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,11471,1001495,2012-08-25,71,12.50,0.21,62.0,2,1,data/images/11471/11471_1001495_t2w.mha,data/images/11471/11471_1001495_adc.mha,data/images/11471/11471_1001495_hbv.mha,data/labels/csPCa_lesion_delineations/human_ex...,1,0,384,384,19,2
1496,11472,1001496,2019-06-28,81,5.28,0.12,44.0,2,1,data/images/11472/11472_1001496_t2w.mha,data/images/11472/11472_1001496_adc.mha,data/images/11472/11472_1001496_hbv.mha,data/labels/csPCa_lesion_delineations/AI/Bosma...,0,1,640,640,19,1
1497,11473,1001497,2017-09-24,56,29.60,0.34,87.0,0,0,data/images/11473/11473_1001497_t2w.mha,data/images/11473/11473_1001497_adc.mha,data/images/11473/11473_1001497_hbv.mha,data/labels/csPCa_lesion_delineations/human_ex...,1,0,640,640,21,0
1498,11474,1001498,2016-05-03,71,12.00,,83.0,1,0,data/images/11474/11474_1001498_t2w.mha,data/images/11474/11474_1001498_adc.mha,data/images/11474/11474_1001498_hbv.mha,data/labels/csPCa_lesion_delineations/human_ex...,1,0,320,320,23,0


In [30]:
# df_3.to_csv("data.csv", index = False)

# Prostate delineations

In [12]:
# Reload the table from the CSV on disk; this replaces the in-memory df_3.
# NOTE(review): this cell's execution count (12) is lower than that of the
# cells above, and the table displayed further down has a `lesion_csPCa`
# column that no visible cell creates -- presumably it was added to data.csv
# outside this notebook; confirm and document where it comes from.
df_3 = pd.read_csv("../../../data/data.csv")

In [26]:
# Directories (relative to this notebook) holding the AI-generated anatomical
# masks: whole-gland (Bosma22b) and zonal pz/tz (HeviAI23).
whole_gland_dir = "../../../data/labels/anatomical_delineations/whole_gland/AI/Bosma22b"
zonal_dir = "../../../data/labels/anatomical_delineations/zonal_pz_tz/AI/HeviAI23"

def get_path(base_dir, patient_id, study_id):
    """Return the mask path for a study if the file exists, else None.

    Args:
        base_dir: directory expected to contain "<patient_id>_<study_id>.nii.gz".
        patient_id: patient identifier used in the file name.
        study_id: study identifier used in the file name.

    Returns:
        The joined path with every "../../../" occurrence removed, or None
        when no such file exists.
    """
    candidate = os.path.join(base_dir, f"{patient_id}_{study_id}.nii.gz")
    if not os.path.exists(candidate):
        return None
    return candidate.replace("../../../", "")

# Fill one path column per anatomical mask type; get_path yields None when the
# study has no file in the corresponding directory.
for column, directory in (
    ('whole_gland_path', whole_gland_dir),
    ('zonal_path', zonal_dir),
):
    df_3[column] = df_3.apply(
        # Bind the directory as a default so each lambda keeps its own value.
        lambda row, directory=directory: get_path(directory, row['patient_id'], row['study_id']),
        axis=1,
    )

In [27]:
# Persist the table without the index column.
# NOTE(review): this writes "data.csv" into the current working directory,
# whereas the reload cell reads "../../../data/data.csv" -- confirm whether
# both are meant to be the same file.
df_3.to_csv("data.csv", index = False)

In [None]:
# df_3

In [3]:
# df_3 = pd.read_csv("../../../data/data.csv")

In [4]:
# df_3

Unnamed: 0,patient_id,study_id,mri_date,patient_age,psa,psad,prostate_volume,case_ISUP,case_csPCa,t2w_path,...,csPCa_lesion_delineation_path,human_delineation,AI_delineation,height,width,depth,lesion_value,lesion_csPCa,whole_gland_path,zonal_path
0,10000,1000000,2019-07-02,73,7.70,,55.0,0,0,data/images/10000/10000_1000000_t2w.mha,...,data/labels/csPCa_lesion_delineations/human_ex...,1,0,640,640,31,0,0,data/labels/anatomical_delineations/whole_glan...,data/labels/anatomical_delineations/zonal_pz_t...
1,10001,1000001,2016-05-27,64,8.70,0.09,102.0,0,0,data/images/10001/10001_1000001_t2w.mha,...,data/labels/csPCa_lesion_delineations/human_ex...,1,0,640,640,21,0,0,data/labels/anatomical_delineations/whole_glan...,data/labels/anatomical_delineations/zonal_pz_t...
2,10002,1000002,2021-04-18,58,4.20,0.06,74.0,0,0,data/images/10002/10002_1000002_t2w.mha,...,data/labels/csPCa_lesion_delineations/human_ex...,1,0,384,384,22,0,0,data/labels/anatomical_delineations/whole_glan...,data/labels/anatomical_delineations/zonal_pz_t...
3,10003,1000003,2019-04-05,72,13.00,,71.5,0,0,data/images/10003/10003_1000003_t2w.mha,...,data/labels/csPCa_lesion_delineations/human_ex...,1,0,384,384,23,0,0,data/labels/anatomical_delineations/whole_glan...,data/labels/anatomical_delineations/zonal_pz_t...
4,10004,1000004,2020-10-21,67,8.00,0.10,78.0,0,0,data/images/10004/10004_1000004_t2w.mha,...,data/labels/csPCa_lesion_delineations/human_ex...,1,0,640,640,21,0,0,data/labels/anatomical_delineations/whole_glan...,data/labels/anatomical_delineations/zonal_pz_t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,11471,1001495,2012-08-25,71,12.50,0.21,62.0,2,1,data/images/11471/11471_1001495_t2w.mha,...,data/labels/csPCa_lesion_delineations/human_ex...,1,0,384,384,19,2,1,data/labels/anatomical_delineations/whole_glan...,data/labels/anatomical_delineations/zonal_pz_t...
1496,11472,1001496,2019-06-28,81,5.28,0.12,44.0,2,1,data/images/11472/11472_1001496_t2w.mha,...,data/labels/csPCa_lesion_delineations/AI/Bosma...,0,1,640,640,19,1,0,data/labels/anatomical_delineations/whole_glan...,data/labels/anatomical_delineations/zonal_pz_t...
1497,11473,1001497,2017-09-24,56,29.60,0.34,87.0,0,0,data/images/11473/11473_1001497_t2w.mha,...,data/labels/csPCa_lesion_delineations/human_ex...,1,0,640,640,21,0,0,data/labels/anatomical_delineations/whole_glan...,data/labels/anatomical_delineations/zonal_pz_t...
1498,11474,1001498,2016-05-03,71,12.00,,83.0,1,0,data/images/11474/11474_1001498_t2w.mha,...,data/labels/csPCa_lesion_delineations/human_ex...,1,0,320,320,23,0,0,data/labels/anatomical_delineations/whole_glan...,data/labels/anatomical_delineations/zonal_pz_t...


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the table.
df_3.describe()

Unnamed: 0,patient_id,study_id,patient_age,psa,psad,prostate_volume,case_ISUP,case_csPCa,human_delineation,AI_delineation,height,width,depth,lesion_value,lesion_csPCa
count,1500.0,1500.0,1500.0,1460.0,1049.0,1473.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,10735.602667,1000750.0,65.595333,11.932397,0.224662,65.045472,0.942,0.283333,0.862,0.138,551.68,551.68,22.471333,0.521333,0.149333
std,425.311791,433.1529,7.191527,14.939071,0.404288,36.449618,1.333099,0.450767,0.345015,0.345015,231.514258,231.514258,3.492274,1.016973,0.356536
min,10000.0,1000000.0,35.0,0.1,0.01,4.0,0.0,0.0,0.0,0.0,256.0,256.0,17.0,0.0,0.0
25%,10368.75,1000375.0,61.0,5.8825,0.09,40.0,0.0,0.0,1.0,0.0,384.0,384.0,19.0,0.0,0.0
50%,10733.5,1000750.0,66.0,8.5,0.14,57.0,0.0,0.0,1.0,0.0,384.0,384.0,21.0,0.0,0.0
75%,11101.25,1001124.0,70.0,13.0,0.22,80.0,2.0,1.0,1.0,0.0,640.0,640.0,25.0,1.0,0.0
max,11475.0,1001499.0,92.0,224.0,7.0,308.0,5.0,1.0,1.0,1.0,1024.0,1024.0,45.0,5.0,1.0
