In [1]:
import numpy as np
import nibabel as nib
import pandas as pd
from pathlib import Path
import shutil
from fnmatch import fnmatch
from tqdm.notebook import tqdm

%matplotlib inline

# Path data

In [2]:
# Location of the head-and-neck metadata table (one row per scan).
path_csv = Path("/home/fabian/projects/phd/ai_radiation_therapy/data_preprocessed/cnn-data/hn-data.csv")

# Read the table and discard the 'Unnamed: 0' column in one go
# (presumably an index column written by an earlier export -- TODO confirm).
csv = pd.read_csv(path_csv).drop(columns=['Unnamed: 0'])

# Copy sample & create anonymized table
### Count the files per scan so that only patients with all scan files are sampled

In [3]:
# Root directory that the relative 'imageData' entries of the table refer to.
path_cnn_data = Path("/home/fabian/projects/phd/ai_radiation_therapy/data_preprocessed/cnn-data")


def file_counter(row):
    """Return the number of ``.v`` files found for the scan named in ``row['imageData']``.

    ``row['imageData']`` is interpreted as a path relative to ``path_cnn_data``:
    its parent is the directory to search and its last component is the
    file-name prefix. Every file matching ``<prefix>*.v`` in that directory
    is counted.
    """
    scan_ref = Path(row['imageData'])
    scan_dir = path_cnn_data / scan_ref.parent
    return sum(1 for _ in scan_dir.glob(f"{scan_ref.name}*.v"))


# One file count per table row, stored next to the metadata.
csv["num_files"] = csv.apply(file_counter, axis=1)

### Select random sample

In [4]:
# Draw a class-balanced random sample: the same number of statEFS == 1 and
# statEFS == 0 cases, restricted to scans whose files are all present.

# number of cases drawn per class, and number of files a complete scan has
N_PER_CLASS = 100
EXPECTED_NUM_FILES = 4

# only sample patients with all associated files
has_all_files = csv['num_files'].eq(EXPECTED_NUM_FILES)
positive_cases = csv[csv['statEFS'].eq(1) & has_all_files]
negative_cases = csv[csv['statEFS'].eq(0) & has_all_files]

# Fail early with a readable message. The previous frac=100/len(...) form raised
# ZeroDivisionError for an empty class and an "upsampling" error for a small one.
for class_label, class_cases in (("positive", positive_cases), ("negative", negative_cases)):
    if len(class_cases) < N_PER_CLASS:
        raise ValueError(
            f"Need {N_PER_CLASS} {class_label} cases with {EXPECTED_NUM_FILES} files, "
            f"found only {len(class_cases)}"
        )

# Seeding numpy's global generator also fixes the pandas draws below, because
# DataFrame.sample uses it when no random_state is passed.
np.random.seed(42)
# n= requests the sample size directly instead of reaching it via float rounding of frac.
positive_sample = positive_cases.sample(n=N_PER_CLASS)
negative_sample = negative_cases.sample(n=N_PER_CLASS)
full_sample = pd.concat([positive_sample, negative_sample], ignore_index=True)
# shuffle so that positive and negative cases are interleaved, then renumber rows 0..n-1
full_sample = full_sample.sample(frac=1).reset_index(drop=True)
full_sample

Unnamed: 0,name,id,birthday,age,sex,height,weight,scanDate,injDose,scanpi,...,meanAll,meanLK,peak,peakAll,peakLK,asp,aspAll,aspLK,imageData,num_files
0,QIN-HEADNECK-01-0007,QIN-HEADNECK-01,1902-01-01,58,M,,95.5,1986-04-13,448.1,85.9,...,6.4,0.0,7.819,6.447,0.000,5.342,5.342,0.000,cnn-qin/QIN-HEADNECK-01-0007_19860413,4
1,HN-CHUS-020,HN-CHUS-020,1902-01-01,54,M,,80.0,2021-10-03,377.0,71.7,...,3.9,3.8,5.556,4.209,3.755,2.925,9.938,9.938,cnn-canada/HN-CHUS-020_18850827,4
2,HNSCC_CVK-00058,HNSCC_CVK-00058,1970-01-01,56,M,175.0,77.0,2017-11-28,261.0,115.9,...,10.1,4.5,18.083,12.269,4.520,42.710,42.710,14.762,cnn-charite/HNSCC_CVK-00058,4
3,HNSCC_CVK-00135,HNSCC_CVK-00135,1970-01-01,72,F,154.0,44.0,2014-01-09,292.0,70.4,...,4.5,4.3,5.857,4.881,4.881,14.026,14.026,13.202,cnn-charite/HNSCC_CVK-00135,4
4,lmu-pet-ct-00080,lmu-pet-ct-0008,1945-01-01,72,M,190.0,108.0,2018-03-08,333.0,46.3,...,9.7,3.9,13.950,10.458,4.839,11.679,13.739,13.739,cnn-lmu/lmu-pet-ct-00080_20180308,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,HN-HMR-006,HN-HMR-006,1902-01-01,62,M,171.0,73.0,1998-06-27,346.6,90.1,...,4.8,4.9,4.667,6.727,6.727,30.726,41.852,41.852,cnn-canada/HN-HMR-006_18850827,4
196,HN-HMR-024,HN-HMR-024,1902-01-01,64,F,,,1996-04-06,556.5,,...,,,,,,28.871,28.871,19.306,cnn-canada/HN-HMR-024_18850827,4
197,lmu-pet-ct-00165,lmu-pet-ct-0016,1950-01-01,64,M,171.0,69.0,2014-05-19,239.4,83.8,...,,,8.030,,,10.158,,,cnn-lmu/lmu-pet-ct-00165_20140519,4
198,QIN-HEADNECK-01-0184,QIN-HEADNECK-01,1902-01-01,49,M,,80.0,1985-08-27,366.7,80.7,...,6.2,0.0,8.495,6.163,0.000,40.482,40.482,0.000,cnn-qin/QIN-HEADNECK-01-0184_19850827,4


In [5]:
# Two fixed locations used by the cells below:
#   output_path   -- directory that receives the renamed copies and the tables
#   path_cnn_data -- root directory the 'imageData' entries are relative to
output_path, path_cnn_data = map(
    Path,
    (
        "/home/fabian/projects/phd/ai_radiation_therapy/data_preprocessed/sample_for_radiologist_analysis",
        "/home/fabian/projects/phd/ai_radiation_therapy/data_preprocessed/cnn-data",
    ),
)

In [6]:
# Copy every sampled scan into `output_path` under an anonymized file name and
# record, per subject, the original and the new path of each file category.
#
# Every list below receives exactly ONE entry per row of `full_sample`, in row
# order, so the lists can be attached to the table as columns afterwards.
# A missing category is recorded as None (with a warning) rather than skipped;
# skipping would shift all later subjects' paths onto the wrong rows.

# file-name patterns of the four expected files; checked in this order, first match wins
category_patterns = {
    "CT_img": "*_CT_*_img.v",
    "CT_roi": "*_CT_*_roi.v",
    "PT_img": "*_PT_*_img.v",
    "PT_roi": "*_PT_*_roi.v",
}

anon_ids = []

old_PT_imgs = []
old_PT_rois = []
old_CT_imgs = []
old_CT_rois = []

new_PT_imgs = []
new_PT_rois = []
new_CT_imgs = []
new_CT_rois = []

# category -> list that collects the original / anonymized path of that category
old_lists = {"CT_img": old_CT_imgs, "CT_roi": old_CT_rois, "PT_img": old_PT_imgs, "PT_roi": old_PT_rois}
new_lists = {"CT_img": new_CT_imgs, "CT_roi": new_CT_rois, "PT_img": new_PT_imgs, "PT_roi": new_PT_rois}

# shutil.copy does not create missing directories
output_path.mkdir(parents=True, exist_ok=True)

# Subjects are numbered 1..n by row position (same numbers as index + 1 for a
# freshly reset index, but independent of whatever index the frame carries).
for subject_number, image_data in enumerate(
    tqdm(full_sample['imageData'], total=len(full_sample)), start=1
):
    scan_id = Path(image_data)
    anon_id = f'subject_{subject_number}'
    anon_ids.append(anon_id)
    id_path = path_cnn_data / scan_id.parent

    # first file seen per category for this subject: category -> (old path, new path)
    matched = {}
    # sorted: glob order is arbitrary, sorting makes the run deterministic
    for file in sorted(id_path.glob(f"{scan_id.name}*.v")):
        # get new name & full path where file needs to be copied to
        new_name = file.name.replace(scan_id.name, anon_id)
        out_file = output_path / new_name

        category = next(
            (name for name, pattern in category_patterns.items() if fnmatch(file.name, pattern)),
            None,
        )
        if category is None:
            print("##############################################")
            print(f"No match found for {file}")
            print("##############################################")
            # don't copy then
            continue

        # copy the file
        shutil.copy(file, out_file)

        if category in matched:
            # still copied, but only the first file of a category goes into the table
            print(f"Additional {category} file for {anon_id} not recorded in table: {file}")
            continue
        matched[category] = (file, out_file)

    # exactly one entry per category and subject keeps all lists row-aligned
    for category in category_patterns:
        if category not in matched:
            print(f"Missing {category} file for {scan_id} ({anon_id})")
        old_file, new_file = matched.get(category, (None, None))
        old_lists[category].append(old_file)
        new_lists[category].append(new_file)

  0%|          | 0/200 [00:00<?, ?it/s]

In [7]:
# Map each new table column to its list of recorded paths, converting the
# path objects to plain strings.
img_paths_dict = {
    column: [str(path) for path in paths]
    for column, paths in (
        ("old_PT_img_paths", old_PT_imgs),
        ("old_PT_roi_paths", old_PT_rois),
        ("old_CT_img_paths", old_CT_imgs),
        ("old_CT_roi_paths", old_CT_rois),
        ("new_PT_img_paths", new_PT_imgs),
        ("new_PT_roi_paths", new_PT_rois),
        ("new_CT_img_paths", new_CT_imgs),
        ("new_CT_roi_paths", new_CT_rois),
    )
}

# add new columns to sample df: the anonymized id first, then the eight path columns
full_sample["anon_id"] = anon_ids
for column, paths in img_paths_dict.items():
    full_sample[column] = paths

# add one answer column per rater, pre-filled with -1 (= empty, see column name)
for rater in (1, 2, 3):
    full_sample[f'statEFS_(-1=empty_0=norelapse_1=relapse)_pathologist_{rater}'] = -1

## Save metadata

In [8]:
# the three rater answer columns, shared by both sparse tables
answer_columns = [
    f'statEFS_(-1=empty_0=norelapse_1=relapse)_pathologist_{rater}' for rater in range(1, 4)
]

# full table: every column, including original ids and original file paths
# NOTE(review): this file links anon_id back to the original data and is written
# into the same directory as the anonymized copies -- confirm it is not shared.
full_sample.to_csv(output_path / "full_sample_table.csv")

# sparse table: anonymized id plus the empty answer columns only
columns = ['anon_id', *answer_columns]
full_sample.to_csv(output_path / "sparse_sample_table.csv", columns=columns)

# sparse table with some medical data added
columns = ['anon_id', "age", "sex", "height", "weight", "ebv", "hpv", *answer_columns]
full_sample.to_csv(output_path / "sparse_sample_table_w_some_medical_data.csv", columns=columns)