# Notebook for creating the datasplits for different modeling tasks

**Splits:**
1. Fine-tuning sets on PadChest for the tube detection task
2. Fine-tuning and test sets on PadChest for the pathology detection task
3. Fine-tuning sets on ChestX-ray14 for the pathology detection task
4. Evaluation set(s) created from the four ChestX-ray14 test sets

In [1]:
# Imports
import pandas as pd
import ast
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [2]:
# Loading the data
padchest = pd.read_csv('../Data/preprocessed_df.csv', index_col=0)
not_annotated = pd.read_excel("../Data/Annotations/Annotations_a1.xlsx", index_col=0)[1011:]
annotated = pd.read_csv("../Data/Annotations/Annotations_aggregated.csv", index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: '../Data/preprocessed_df.csv'

In [5]:
print(len(padchest))
print(len(not_annotated))
print(len(annotated))

109044
5935
1011


## 1. Fine-tuning sets on PadChest for the tube detection task

Extracting the remainder of the 6,946 images with tubes (according to PadChest), that we did not annotate, to use for finetuning the tube detection model.

In [7]:
# Merging with PadChest
not_annotated = not_annotated.merge(padchest, how='inner', on=['ImageID', 'ImageDir'])
annotated = annotated.merge(padchest, how='inner', on=['ImageID', 'ImageDir'])

In [8]:
not_annotated[:2]

Unnamed: 0,ImageDir,ImageID,Chest_drain_tube,NSG_tube,Endotracheal_tube,Tracheostomy_tube,Notes,StudyDate_DICOM,StudyID,PatientID,...,ExposureTime,RelativeXRayExposure_DICOM,ReportID,Report,MethodLabel,Labels,Localizations,LabelsLocalizationsBySentence,labelCUIS,LocalizationsCUIS
0,12,216840111366964013829543166512013338084936992_...,,,,,,20131204,216840111366964013829543166512013338084936992,265613739583604299262488539145831080171,...,,617,4594428,. . via central subclavi derech ven cav super...,RNN_model,"['unchanged', 'pulmonary mass', 'nsg tube', 'c...","['loc infradiaphragm', 'loc right', 'loc left'...",['central venous catheter via subclavian vein'...,['C0398281' 'C0149726'],['C0444532' 'C0443246' 'C0038532' 'C3165182' '...
1,42,216840111366964012948363412702011019091056295_...,,,,,,20110119,216840111366964012948363412702011019091056295,128783318956165996430924704960729601184,...,,1313,3978186,. . . . tet localizacion correct . . . sng bi...,RNN_model,['nsg tube'],[],"['exclude', 'NSG tube', 'normal']",[],[]


In [9]:
# Adding the separate tube labels from PadChest to the annotations df
not_annotated["Chest_drain_tube_padchest"] = [1 if "chest drain tube" in i else 0 for i in not_annotated.Labels]
not_annotated["NSG_tube_padchest"] = [1 if "nsg tube" in i else 0 for i in not_annotated.Labels]
not_annotated["Endotracheal_tube_padchest"] = [1 if "endotracheal tube" in i else 0 for i in not_annotated.Labels]
not_annotated["Tracheostomy_tube_padchest"] = [1 if "tracheostomy tube" in i else 0 for i in not_annotated.Labels ]

In [10]:
# Checking the distribution of the entire extracted set with tubes
tube_types = ['Chest_drain_tube_padchest', 'NSG_tube_padchest', 'Endotracheal_tube_padchest', 'Tracheostomy_tube_padchest']

for t in tube_types:
    print(t)
    print('Dist: ', Counter(not_annotated[t]))
    print('Percentages: ')
    print('No tube: ', len(not_annotated[not_annotated[t] == 0]) / len(not_annotated) * 100)
    print('Tube: ', len(not_annotated[not_annotated[t] == 1]) / len(not_annotated) * 100)
    print()

Chest_drain_tube_padchest
Dist:  Counter({0: 5528, 1: 407})
Percentages: 
No tube:  93.14237573715248
Tube:  6.857624262847514

NSG_tube_padchest
Dist:  Counter({1: 4402, 0: 1533})
Percentages: 
No tube:  25.829823083403536
Tube:  74.17017691659646

Endotracheal_tube_padchest
Dist:  Counter({0: 3952, 1: 1983})
Percentages: 
No tube:  66.58803706823926
Tube:  33.41196293176074

Tracheostomy_tube_padchest
Dist:  Counter({0: 4473, 1: 1462})
Percentages: 
No tube:  75.3664700926706
Tube:  24.6335299073294



In [12]:
# Adding a column with image paths in the not_annotated df
image_paths = []
for idx, row in not_annotated.iterrows():
    path = '../../Data/padchest-preprocessed/' + str(row['ImageDir']) + '/' + str(row['ImageID'])
    image_paths.append(path)

not_annotated['ImagePath'] = image_paths

In [13]:
# Adding a column with image paths in the PadChest df as well
image_paths = []
for idx, row in padchest.iterrows():
    path = '../../Data/padchest-preprocessed/' + str(row['ImageDir']) + '/' + str(row['ImageID'])
    image_paths.append(path)

padchest['ImagePath'] = image_paths

In [14]:
# Removing the annotated patient id's from the not_annotated set
not_annotated_shorter = not_annotated[~not_annotated['PatientID'].isin(annotated['PatientID'])]

In [15]:
print(len(not_annotated_shorter))
not_annotated_shorter[:2]

3573


Unnamed: 0,ImageDir,ImageID,Chest_drain_tube,NSG_tube,Endotracheal_tube,Tracheostomy_tube,Notes,StudyDate_DICOM,StudyID,PatientID,...,Labels,Localizations,LabelsLocalizationsBySentence,labelCUIS,LocalizationsCUIS,Chest_drain_tube_padchest,NSG_tube_padchest,Endotracheal_tube_padchest,Tracheostomy_tube_padchest,ImagePath
0,12,216840111366964013829543166512013338084936992_...,,,,,,20131204,216840111366964013829543166512013338084936992,265613739583604299262488539145831080171,...,"['unchanged', 'pulmonary mass', 'nsg tube', 'c...","['loc infradiaphragm', 'loc right', 'loc left'...",['central venous catheter via subclavian vein'...,['C0398281' 'C0149726'],['C0444532' 'C0443246' 'C0038532' 'C3165182' '...,0,1,0,0,../../Data/padchest-preprocessed/12/2168401113...
1,42,216840111366964012948363412702011019091056295_...,,,,,,20110119,216840111366964012948363412702011019091056295,128783318956165996430924704960729601184,...,['nsg tube'],[],"['exclude', 'NSG tube', 'normal']",[],[],0,1,0,0,../../Data/padchest-preprocessed/42/2168401113...


In [16]:
# Only looking at unique patient ID's
not_annotated_shorter_unique_patients = not_annotated_shorter.drop_duplicates(subset='PatientID', keep="last")

# Create finetune and finetune_val split from unique patient ID's
finetune, finetune_val = train_test_split(not_annotated_shorter_unique_patients, test_size=0.4, random_state=42)

In [17]:
# Retrieving all images with the patient ID's
finetune_all = not_annotated_shorter[not_annotated_shorter['PatientID'].isin(finetune['PatientID'])]
finetune_val_all = not_annotated_shorter[not_annotated_shorter['PatientID'].isin(finetune_val['PatientID'])]

# Checking the lengths of the data splits
print(len(finetune))
print(len(finetune_all))
print()
print(len(finetune_val))
print(len(finetune_val_all))
print()

# Checking that all images from the preprocessed dataframe were retrieved
print(len(not_annotated_shorter))
print(len(finetune_all) + len(finetune_val_all))

1258
2117

840
1456

3573
3573


In [18]:
# Function for creating a binarized column
def my_func(row, column_name, label, padchest=True):
    
    if padchest:
        labels = ast.literal_eval(row[column_name])
    else:
        labels = [i.strip(" ") for i in row[column_name].split(",")]

    if label in labels:
        return 1
    else:
        return 0
    
# Function for adding binarized columns for each label in a given list
def preproc_pathology(df, label_col, padchest=True):
    if padchest:
        l_dict = {'pleural effusion': 'Effusion', 'pneumothorax': 'Pneumothorax', 
          'atelectasis': 'Atelectasis', 'cardiomegaly':'Cardiomegaly', 
          'pneumonia':'Pneumonia', 'chest drain tube': "Chest_drain_tube", 'nsg tube': 'NSG_tube', 
          'endotracheal tube': 'Endotracheal_tube', 'tracheostomy tube': 'Tracheostomy_tube'}
    else:
        l_dict = {'Effusion': 'Effusion', 'Pneumothorax': 'Pneumothorax', 
          'Atelectasis': 'Atelectasis', 'Cardiomegaly':'Cardiomegaly', 
          'Pneumonia':'Pneumonia'}
        
    for label in l_dict.keys(): 
        df[l_dict[label]] = df.apply(my_func, args=(label_col, label, padchest), axis=1)
        
    return df

In [19]:
# Extracting a subset of columns
finetune_all = finetune_all[['ImageID', 'ImagePath', 'Labels']].reset_index(drop=True)
finetune_val_all = finetune_val_all[['ImageID', 'ImagePath', 'Labels']].reset_index(drop=True)

# Adding the binarized label columns
finetune_binary = preproc_pathology(finetune_all, 'Labels')
finetune_val_binary = preproc_pathology(finetune_val_all, 'Labels')

In [20]:
finetune_binary[:2]

Unnamed: 0,ImageID,ImagePath,Labels,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia,Chest_drain_tube,NSG_tube,Endotracheal_tube,Tracheostomy_tube
0,216840111366964013829543166512013338084936992_...,../../Data/padchest-preprocessed/12/2168401113...,"['unchanged', 'pulmonary mass', 'nsg tube', 'c...",0,0,0,0,0,0,1,0,0
1,216840111366964012948363412702011019091056295_...,../../Data/padchest-preprocessed/42/2168401113...,['nsg tube'],0,0,0,0,0,0,1,0,0


In [21]:
# Shuffling
finetune_save = finetune_binary.sample(frac=1, random_state=123).reset_index(drop=True)
finetune_save[:2]

# Saving the file
#finetune_save.to_csv("../Data/Data_splits/tube_detection-finetuning.csv")
#print('Saved :)')

Saved :)


In [22]:
# Shuffling
finetune_val_save = finetune_val_binary.sample(frac=1, random_state=123).reset_index(drop=True)
finetune_val_save[:2]

# Saving the file
#finetune_val_save.to_csv("../Data/Data_splits/tube_detection-finetuning_val.csv")
#print('Saved :)')

Saved :)


## 2. Fine-tuning and test sets on PadChest for the pathology detection task

Create fine-tune, fine-tune validation and test split from unique patient IDs. The split should be as stratified on labels and gender as possible. It is created using patient IDs, to ensure that the same patient does not appear in several splits.

We are using the train_test_split method from Sklearn to create the splits on the PadChest data. First, we remove the images that share a patient ID with any of the 1,011 instances which we annotated, as well as any of the resulting images extracted for fine-tuning and fine-tuning validation for the tube detection task.

In [25]:
# Loading the data
padchest = pd.read_csv('../Data/preprocessed_df.csv', index_col=0)
annotated = pd.read_csv("../Data/Annotations/Annotations_aggregated.csv", index_col=0)
finetune = pd.read_csv('../Data/Data_splits/tube_detection-finetuning.csv', index_col=0)
finetune_val = pd.read_csv('../Data/Data_splits/tube_detection-finetuning_val.csv', index_col=0)

  padchest = pd.read_csv('../Data/preprocessed_df_08032023.csv', index_col=0)


In [26]:
# Merging with PadChest to obtain patient IDs
annotated = annotated.merge(padchest, how='inner', on=['ImageID'])
finetune = finetune.merge(padchest, how='inner', on=['ImageID'])
finetune_val = finetune_val.merge(padchest, how='inner', on=['ImageID'])

In [27]:
# Removing the annotated and fine-tuning patient IDs from the PadChest set
padchest_shorter = padchest[~padchest['PatientID'].isin(annotated['PatientID'])]
padchest_shorter = padchest_shorter[~padchest_shorter['PatientID'].isin(finetune['PatientID'])]
padchest_shorter = padchest_shorter[~padchest_shorter['PatientID'].isin(finetune_val['PatientID'])]

In [28]:
len(padchest_shorter)

96427

In [29]:
# Adding a column with image paths
image_paths = []
for idx, row in padchest_shorter.iterrows():
    path = '../../Data/padchest-preprocessed/' + str(row['ImageDir']) + '/' + str(row['ImageID'])
    image_paths.append(path)

padchest_shorter['ImagePath'] = image_paths

In [30]:
# Creating a NewLabels columns storing only the pathology labels in the
# final list + 'normal', and adding a 'no finding' for none of the above
labels_to_keep = ['pleural effusion', 'pneumothorax', 'atelectasis', 'cardiomegaly', 'pneumonia',
                  'chest drain tube', 'nsg tube', 'endotracheal tube', 'tracheostomy tube', 'normal']
all_new_labels = []

# Function for returning the intersection (shared elements) between two lists
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

# Getting the list of lists of only the labels to keep
for label_list in padchest_shorter['Labels']:
    label_list = ast.literal_eval(label_list)
    new_labels = intersection(label_list, labels_to_keep)

    if len(new_labels) == 0:
        new_labels = ['no finding']
    all_new_labels.append(new_labels)

In [31]:
padchest_shorter['NewLabels'] = all_new_labels

In [32]:
padchest_shorter[:2]

Unnamed: 0,ImageID,ImageDir,StudyDate_DICOM,StudyID,PatientID,PatientBirth,PatientSex_DICOM,ViewPosition_DICOM,Projection,MethodProjection,...,ReportID,Report,MethodLabel,Labels,Localizations,LabelsLocalizationsBySentence,labelCUIS,LocalizationsCUIS,ImagePath,NewLabels
0,20536686640136348236148679891455886468_k6ga29.png,0,20140915,20536686640136348236148679891455886468,839860488694292331637988235681460987,1930.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,4765777,sin hallazg patolog edad pacient .,Physician,['normal'],[],"[['normal'], ['normal']]",[],[],../../Data/padchest-preprocessed/0/20536686640...,[normal]
1,135803415504923515076821959678074435083_fzis7b...,0,20150914,135803415504923515076821959678074435083,313572750430997347502932654319389875966,1929.0,M,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,4991845,cambi pulmonar cronic sever . sign fibrosis b...,Physician,"['pseudonodule', 'chronic changes', 'ground gl...","['loc basal', 'loc basal bilateral']","[['pulmonary fibrosis', 'loc basal bilateral']...",['C0034069' 'C0742362' 'C2115817' 'C3544344'],['C1282378'],../../Data/padchest-preprocessed/0/13580341550...,[no finding]


In [33]:
# Only looking at unique patient IDs
padchest_shorter_unique_patients = padchest_shorter.drop_duplicates(subset='PatientID', keep="last")
print(len(padchest_shorter))
print(len(padchest_shorter_unique_patients))

96427
63774


In [34]:
# Create train and eval ('rest') split from unique patient IDs
X_train, X_eval = train_test_split(padchest_shorter_unique_patients, test_size=0.2, random_state=42)

# Create dev and test split from 'rest' split above
X_dev, X_test = train_test_split(X_eval, test_size=0.5, random_state=42)

In [35]:
print(len(X_train))
print(len(X_dev))
print(len(X_test))

51019
6377
6378


### Retrieving all the images for the unique patient IDs the split was made on

In [38]:
# Retrieving all images with the patient ID's
X_train_all = padchest_shorter[padchest_shorter['PatientID'].isin(X_train['PatientID'])]
X_dev_all = padchest_shorter[padchest_shorter['PatientID'].isin(X_dev['PatientID'])]
X_test_all = padchest_shorter[padchest_shorter['PatientID'].isin(X_test['PatientID'])]

In [39]:
# Checking the lengths of the data splits
print(len(X_train))
print(len(X_train_all))
print()
print(len(X_dev))
print(len(X_dev_all))
print()
print(len(X_test))
print(len(X_test_all))

51019
76946

6377
9692

6378
9789


In [41]:
# Computing the dataset percentages
print('Train: ', np.round(len(X_train_all)/len(padchest_shorter)*100, 3))
print('Dev: ', np.round(len(X_dev_all)/len(padchest_shorter)*100, 3))
print('Test: ', np.round(len(X_test_all)/len(padchest_shorter)*100, 3))

Train:  79.797
Dev:  10.051
Test:  10.152


### Saving the splits in csv files

In [46]:
X_train_all[:2]

Unnamed: 0,ImageID,ImageDir,StudyDate_DICOM,StudyID,PatientID,PatientBirth,PatientSex_DICOM,ViewPosition_DICOM,Projection,MethodProjection,...,ReportID,Report,MethodLabel,Labels,Localizations,LabelsLocalizationsBySentence,labelCUIS,LocalizationsCUIS,ImagePath,NewLabels
0,20536686640136348236148679891455886468_k6ga29.png,0,20140915,20536686640136348236148679891455886468,839860488694292331637988235681460987,1930.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,4765777,sin hallazg patolog edad pacient .,Physician,['normal'],[],"[['normal'], ['normal']]",[],[],../../Data/padchest-preprocessed/0/20536686640...,[normal]
1,135803415504923515076821959678074435083_fzis7b...,0,20150914,135803415504923515076821959678074435083,313572750430997347502932654319389875966,1929.0,M,POSTEROANTERIOR,PA,Manual review of DICOM fields,...,4991845,cambi pulmonar cronic sever . sign fibrosis b...,Physician,"['pseudonodule', 'chronic changes', 'ground gl...","['loc basal', 'loc basal bilateral']","[['pulmonary fibrosis', 'loc basal bilateral']...",['C0034069' 'C0742362' 'C2115817' 'C3544344'],['C1282378'],../../Data/padchest-preprocessed/0/13580341550...,[no finding]


In [49]:
# Extracting a subset of df columns
X_train_all_save = X_train_all[['ImageID', 'ImagePath', 'Labels']]
X_train_all_save = preproc_pathology(X_train_all_save, 'Labels', )
X_train_all_save = X_train_all_save.sample(frac=1, random_state=123).reset_index(drop=True)

X_dev_all_save = X_dev_all[['ImageID', 'ImagePath', 'Labels']]
X_dev_all_save = preproc_pathology(X_dev_all_save, 'Labels', )
X_dev_all_save = X_dev_all_save.sample(frac=1, random_state=123).reset_index(drop=True)

X_test_all_save = X_test_all[['ImageID', 'ImagePath', 'Labels']]
X_test_all_save = preproc_pathology(X_test_all_save, 'Labels', )
X_test_all_save = X_test_all_save.sample(frac=1, random_state=123).reset_index(drop=True)

# Saving the files
#X_train_all_save.to_csv("../Data/Data_splits/pathology_detection-train.csv")
#X_dev_all_save.to_csv("../Data/Data_splits/pathology_detection-val.csv")
#X_test_all_save.to_csv("../Data/Data_splits/pathology_detection-test.csv")
#print('Saved :)')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=(label_col, label, padchest), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=(label_col, label, padchest), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=(label_col, label, padchest), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=(label_col, label, padchest), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=

Saved :)


## 3. Fine-tuning sets on ChestX-ray14 for the pathology detection task

Create fine-tune and fine-tune validation splits from the CXR14 data. It is created using patient IDs, to ensure that the same patient does not appear in several splits.

We are using the train_test_split method from Sklearn to create the splits. First, we remove the images that share a patient ID with any of four annotated datasets from our previous work, as well as any of the four CXR14 test sets.

In [54]:
# Loading the data
chestxray14 = pd.read_csv("../../Data/ChestX-ray14/Data_Entry_2017.csv", index_col=0).reset_index()
annotated_14_train = pd.read_csv("../../RP/Data/Train_split.csv", index_col=0)
annotated_14_val = pd.read_csv("../../RP/Data/Validation_split.csv", index_col=0)

lor_labels20 = pd.read_csv("../../RP/Data/lor_labels_20perc.csv", index_col=0)
lor_labels_rest = pd.read_csv("../../RP/Data/ExpertTest.csv", index_col=0)

GCS16l = pd.read_csv("../Data/Data_splits/GCS16l.csv", index_col=0)
Bbox = pd.read_csv("../Data/Data_splits/Bbox.csv", index_col=0)
GCS4l = pd.read_csv("../Data/Data_splits/GCS4l.csv", index_col=0)
RSNA = pd.read_csv("../Data/Data_splits/RSNA.csv", index_col=0)

In [55]:
# Merging with ChestX-ray14 to obtain the patient IDs
annotated_14_train = annotated_14_train.merge(chestxray14, how='inner', on=['Image Index'])
annotated_14_val = annotated_14_val.merge(chestxray14, how='inner', on=['Image Index'])

lor_labels20 = lor_labels20.merge(chestxray14, how='inner', on=['Image Index'])
lor_labels_rest = lor_labels_rest.merge(chestxray14, how='inner', on=['Image Index'])

GCS16l = GCS16l.merge(chestxray14, how='inner', on=['Image Index'])
Bbox = Bbox.merge(chestxray14, how='inner', on=['Image Index'])
GCS4l = GCS4l.merge(chestxray14, how='inner', on=['Image Index'])
RSNA = RSNA.merge(chestxray14, how='inner', on=['Image Index'])

In [56]:
print(len(chestxray14))
print(len(annotated_14_train))
print(len(annotated_14_val))
print(len(lor_labels20))
print(len(lor_labels_rest))
print(len(GCS16l))
print(len(Bbox))
print(len(GCS4l))
print(len(RSNA))

112120
2835
708
300
1292
810
880
4376
26684


In [57]:
GCS16l[:2]

Unnamed: 0,Image Index,Labels_all,Hernia,Pneumonia,Nodule,Edema,Other,Infiltration,Pneumothorax,Abnormal,...,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00029560_000.png,,0,0,0,0,0,0,0,0,...,0,29560,22,M,PA,1798,1845,0.194311,0.194311,
1,00009437_005.png,,0,0,0,0,0,0,0,0,...,5,9437,46,F,PA,1848,1774,0.194311,0.194311,


In [58]:
# Concatenating the four dataframes of annotated instances and the test sets
concat = pd.concat([annotated_14_train, annotated_14_val, lor_labels20, lor_labels_rest, GCS16l, Bbox, GCS4l, RSNA])
concat

Unnamed: 0,Image Index,Tube,Path,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,...,Emphysema,Atelectasis,Effusion,Cardiomegaly,Pleural_Thickening,Labels_bbox,Labels_four,Fracture,Airspace opacity,Labels_RSNA
0,00015530_098.png,0.0,../Data/ChestX-ray14/images_007/images/0001553...,Atelectasis|Emphysema|Pneumothorax,98,15530,20,M,AP,3056,...,,,,,,,,,,
1,00021154_002.png,0.0,../Data/ChestX-ray14/images_010/images/0002115...,Effusion|Pneumothorax,2,21154,31,M,AP,3056,...,,,,,,,,,,
2,00020405_033.png,1.0,../Data/ChestX-ray14/images_009/images/0002040...,Infiltration|Pneumothorax,33,20405,49,M,AP,3056,...,,,,,,,,,,
3,00007056_005.png,1.0,../Data/ChestX-ray14/images_004/images/0000705...,Pneumothorax,5,7056,56,M,PA,2992,...,,,,,,,,,,
4,00018366_038.png,1.0,../Data/ChestX-ray14/images_008/images/0001836...,Pneumothorax,38,18366,64,F,AP,2680,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26679,00012931_034.png,,,Infiltration,34,12931,47,F,AP,2500,...,,,,,,,,,,Pneumonia
26680,00029502_003.png,,,No Finding,3,29502,54,M,AP,3056,...,,,,,,,,,,Pneumonia
26681,00011921_002.png,,,No Finding,2,11921,46,F,PA,2850,...,,,,,,,,,,No pneumonia
26682,00024731_000.png,,,Infiltration,0,24731,45,M,PA,2500,...,,,,,,,,,,No pneumonia


In [59]:
# Retrieving all images with the patient IDs
concat_all_patient_images = chestxray14[chestxray14['Patient ID'].isin(concat['Patient ID'])]

In [60]:
concat_all_patient_images

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143000,0.143000,
5,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168000,0.168000,
6,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168000,0.168000,
7,00000003_003.png,Hernia|Infiltration,3,3,76,F,PA,2698,2991,0.143000,0.143000,
8,00000003_004.png,Hernia,4,3,77,F,PA,2500,2048,0.168000,0.168000,
...,...,...,...,...,...,...,...,...,...,...,...,...
112100,00030789_000.png,Infiltration,0,30789,52,F,PA,2021,2021,0.194311,0.194311,
112107,00030794_000.png,No Finding,0,30794,38,F,PA,2021,2021,0.194311,0.194311,
112111,00030798_000.png,No Finding,0,30798,30,M,PA,2500,2048,0.171000,0.171000,
112114,00030801_000.png,No Finding,0,30801,39,M,PA,2500,2048,0.168000,0.168000,


In [61]:
# Removing the annotated data and test set data from the ChestX-ray14 data
chestxray14_shorter = chestxray14[~chestxray14['Image Index'].isin(concat_all_patient_images['Image Index'])]

In [62]:
chestxray14_shorter

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
12,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,0.168,0.168,
...,...,...,...,...,...,...,...,...,...,...,...,...
112113,00030800_000.png,No Finding,0,30800,34,F,PA,2048,2500,0.168,0.168,
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168,
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168,
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168,


In [63]:
# Checking that no patients nor images are in both dataframes
print(set(chestxray14_shorter['Patient ID']) & set(concat_all_patient_images['Patient ID']))
print(set(chestxray14_shorter['Image Index']) & set(concat_all_patient_images['Image Index']))

set()
set()


In [64]:
# Only looking at unique patient IDs
chestxray14_shorter_unique_patients = chestxray14_shorter.drop_duplicates(subset='Patient ID', keep="last")

# Create finetune and finetune_val split from unique patient IDs
CRX14_finetune, CRX14_finetune_val = train_test_split(chestxray14_shorter_unique_patients, test_size=0.4, random_state=42)

In [65]:
# Retrieving all images with the patient IDs
CRX14_finetune_all = chestxray14_shorter[chestxray14_shorter['Patient ID'].isin(CRX14_finetune['Patient ID'])]
CRX14_finetune_val_all = chestxray14_shorter[chestxray14_shorter['Patient ID'].isin(CRX14_finetune_val['Patient ID'])]

# Checking the lengths of the data splits
print(len(CRX14_finetune))
print(len(CRX14_finetune_all))
print()
print(len(CRX14_finetune_val))
print(len(CRX14_finetune_val_all))
print()

# Checking that all images from the preprocessed dataframe were retrieved
print(len(chestxray14_shorter))
print(len(CRX14_finetune_all) + len(CRX14_finetune_val_all))

11160
16856

7440
11122

27978
27978


In [66]:
CRX14_finetune_all

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171000,0.171000,
12,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,0.168000,0.168000,
22,00000007_000.png,No Finding,0,7,82,M,PA,2500,2048,0.168000,0.168000,
27,00000010_000.png,Infiltration,0,10,84,F,PA,2992,2991,0.143000,0.143000,
37,00000012_000.png,Effusion|Mass,0,12,76,M,PA,2992,2991,0.143000,0.143000,
...,...,...,...,...,...,...,...,...,...,...,...,...
112110,00030797_000.png,No Finding,0,30797,24,M,PA,2021,2021,0.194311,0.194311,
112112,00030799_000.png,No Finding,0,30799,32,M,PA,2048,2500,0.171000,0.171000,
112113,00030800_000.png,No Finding,0,30800,34,F,PA,2048,2500,0.168000,0.168000,
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168000,0.168000,


### Adding image paths

In [69]:
image_folder_location = {}
for root_dir, dirs, files in os.walk(r"../../Data/ChestX-ray14/"):
    for d in dirs:
        for _, _, imgs in os.walk(r'../../Data/ChestX-ray14/' + d + '/' + 'images'):
            for img in imgs:
                image_folder_location[img] = d

In [70]:
CRX14_finetune_all["ImagePath"] = ['../../Data/ChestX-ray14/' + image_folder_location[img] + '/images/' + img for img in list(CRX14_finetune_all["Image Index"])]
CRX14_finetune_val_all["ImagePath"] = ['../../Data/ChestX-ray14/' + image_folder_location[img] + '/images/' + img for img in list(CRX14_finetune_val_all["Image Index"])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CRX14_finetune_all["ImagePath"] = ['../../Data/ChestX-ray14/' + image_folder_location[img] + '/images/' + img for img in list(CRX14_finetune_all["Image Index"])]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CRX14_finetune_val_all["ImagePath"] = ['../../Data/ChestX-ray14/' + image_folder_location[img] + '/images/' + img for img in list(CRX14_finetune_val_all["Image Index"])]


### Saving the splits in csv files

In [71]:
CRX14_finetune_all[:2]

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,ImagePath
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,,../../Data/ChestX-ray14/images_001/images/0000...
12,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,0.168,0.168,,../../Data/ChestX-ray14/images_001/images/0000...


In [76]:
# Extracting a subset of df columns
CRX14_finetune_save = CRX14_finetune_all[['Image Index', 'ImagePath', 'Finding Labels']]
CRX14_finetune_save = preproc_pathology(CRX14_finetune_save, 'Finding Labels', False)
CRX14_finetune_save = CRX14_finetune_save.sample(frac=1, random_state=123).reset_index(drop=True)

CRX14_finetune_val_save = CRX14_finetune_val_all[['Image Index', 'ImagePath', 'Finding Labels']]
CRX14_finetune_val_save = preproc_pathology(CRX14_finetune_val_save, 'Finding Labels', False)
CRX14_finetune_val_save = CRX14_finetune_val_save.sample(frac=1, random_state=123).reset_index(drop=True)

# Saving the files
#CRX14_finetune_save.to_csv("../Data/Data_splits/pathology_detection-CXR14-finetuning.csv")
#CRX14_finetune_val_save.to_csv("../Data/Data_splits/pathology_detection-CXR14-finetuning_val.csv")
#print('Saved :)')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=(label_col, label, padchest), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=(label_col, label, padchest), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=

Saved :)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l_dict[label]] = df.apply(my_func, args=(label_col, label, padchest), axis=1)


In [77]:
CRX14_finetune_save

Unnamed: 0,Image Index,ImagePath,Finding Labels,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia
0,00024596_000.png,../../Data/ChestX-ray14/images_010/images/0002...,No Finding,0,0,0,0,0
1,00029608_000.png,../../Data/ChestX-ray14/images_012/images/0002...,No Finding,0,0,0,0,0
2,00029329_000.png,../../Data/ChestX-ray14/images_012/images/0002...,No Finding,0,0,0,0,0
3,00029612_000.png,../../Data/ChestX-ray14/images_012/images/0002...,No Finding,0,0,0,0,0
4,00001657_000.png,../../Data/ChestX-ray14/images_002/images/0000...,No Finding,0,0,0,0,0
...,...,...,...,...,...,...,...,...
16851,00000199_001.png,../../Data/ChestX-ray14/images_001/images/0000...,Fibrosis,0,0,0,0,0
16852,00024114_000.png,../../Data/ChestX-ray14/images_010/images/0002...,No Finding,0,0,0,0,0
16853,00013137_000.png,../../Data/ChestX-ray14/images_006/images/0001...,No Finding,0,0,0,0,0
16854,00027879_000.png,../../Data/ChestX-ray14/images_011/images/0002...,No Finding,0,0,0,0,0


## Checking the amount of overlapping patient IDs in the created data splits from PadChest

In [78]:
# Loading the data
padchest = pd.read_csv('../Data/preprocessed_df.csv', index_col=0)
annotated = pd.read_csv('../Data/Annotations/Annotations_aggregated.csv', index_col=0)

finetune = pd.read_csv('../Data/Data_splits/tube_detection-finetuning.csv', index_col=0)
finetune_val = pd.read_csv('../Data/Data_splits/tube_detection-finetuning_val.csv', index_col=0)

train = pd.read_csv("../Data/Data_splits/pathology_detection-train.csv", index_col=0)
dev = pd.read_csv("../Data/Data_splits/pathology_detection-val.csv", index_col=0)
test = pd.read_csv("../Data/Data_splits/pathology_detection-test.csv", index_col=0)


# Merging wiht PadChest
annotated = annotated.merge(padchest, how='inner', on=['ImageID'])

finetune = finetune.merge(padchest, how='inner', on=['ImageID'])
finetune_val = finetune_val.merge(padchest, how='inner', on=['ImageID'])

train = train.merge(padchest, how='inner', on=['ImageID'])
dev = dev.merge(padchest, how='inner', on=['ImageID'])
test = test.merge(padchest, how='inner', on=['ImageID'])

  padchest = pd.read_csv('../Data/preprocessed_df_08032023.csv', index_col=0)


In [79]:
len(set(finetune['PatientID']) & set(finetune_val['PatientID']))

0

In [80]:
print(len(set(train['PatientID']) & set(dev['PatientID'])))
print(len(set(train['PatientID']) & set(test['PatientID'])))
print(len(set(dev['PatientID']) & set(test['PatientID'])))

0
0
0


In [81]:
print(len(set(train['PatientID']) & set(finetune['PatientID'])))
print(len(set(train['PatientID']) & set(finetune_val['PatientID'])))
print(len(set(train['PatientID']) & set(annotated['PatientID'])))

print()
print(len(set(dev['PatientID']) & set(finetune['PatientID'])))
print(len(set(dev['PatientID']) & set(finetune_val['PatientID'])))
print(len(set(dev['PatientID']) & set(annotated['PatientID'])))
print()
print(len(set(test['PatientID']) & set(finetune['PatientID'])))
print(len(set(test['PatientID']) & set(finetune_val['PatientID'])))
print(len(set(test['PatientID']) & set(annotated['PatientID'])))

0
0
0

0
0
0

0
0
0


## 4. Evaluation set(s) created from the four ChestX-ray14 test sets
Creating one set covering the images that are present in all four sets and combining the labels: if it is 1 in at least one test set, make it a 1 else 0.

In [106]:
# Loading the four CXR14 test sets
GCS16l = pd.read_csv("../Data/Data_splits/GCS16l.csv", index_col=0)
Bbox = pd.read_csv("../Data/Data_splits/Bbox.csv", index_col=0)
GCS4l = pd.read_csv("../Data/Data_splits/GCS4l.csv", index_col=0)
RSNA = pd.read_csv("../Data/Data_splits/RSNA.csv", index_col=0)

In [107]:
print(len(GCS16l))
GCS16l[:2]

810


Unnamed: 0,Image Index,ImagePath,Labels_all,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia
0,00029560_000.png,../../Data/ChestX-ray14/images_012/images/0002...,,0,0,0,0,0
1,00009437_005.png,../../Data/ChestX-ray14/images_005/images/0000...,,0,0,0,0,0


In [108]:
print(len(Bbox))
Bbox[:2]

880


Unnamed: 0,Image Index,ImagePath,Labels_bbox,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia
0,00013118_008.png,../../Data/ChestX-ray14/images_006/images/0001...,Atelectasis,0,0,1,0,0
1,00014716_007.png,../../Data/ChestX-ray14/images_007/images/0001...,Atelectasis,0,0,1,0,0


In [109]:
print(len(GCS4l))
GCS4l[:2]

4376


Unnamed: 0,Image Index,ImagePath,Labels_four,Pneumothorax
0,00000119_001.png,../../Data/ChestX-ray14/images_001/images/0000...,,0
1,00000134_000.png,../../Data/ChestX-ray14/images_001/images/0000...,Airspace opacity,0


In [110]:
print(len(RSNA))
RSNA[:2]

26684


Unnamed: 0,Image Index,ImagePath,Labels_RSNA,Pneumonia
0,00019124_092.png,../../Data/ChestX-ray14/images_009/images/0001...,No pneumonia,0
1,00028640_003.png,../../Data/ChestX-ray14/images_012/images/0002...,No pneumonia,0


### First, concatenating the two sets with all annotations for five pathology labels, namely Bbox and GCS16l

In [111]:
# Concatenating
Bbox_GCS16l_concat = pd.concat([Bbox, GCS16l])
Bbox_GCS16l_shared = Bbox.merge(GCS16l, on=['Image Index', 'ImagePath'])
print(len(Bbox_GCS16l_concat))
print(len(Bbox_GCS16l_shared))

# Dropping duplicates
Bbox_GCS16l_concat = Bbox_GCS16l_concat.drop_duplicates(subset=['Image Index'], keep='first')
print(len(Bbox_GCS16l_concat))

1690
26
1664


In [112]:
# Obtaining the aggregated labels
effusion = [1 if (row['Effusion_x'] == 1) or (row['Effusion_y'] == 1) else 0 for idx, row in Bbox_GCS16l_shared.iterrows()]
pneumothorax = [1 if (row['Pneumothorax_x'] == 1) or (row['Pneumothorax_y'] == 1) else 0 for idx, row in Bbox_GCS16l_shared.iterrows()]
atelectasis = [1 if (row['Atelectasis_x'] == 1) or (row['Atelectasis_y'] == 1) else 0 for idx, row in Bbox_GCS16l_shared.iterrows()]
cardiomegaly = [1 if (row['Cardiomegaly_x'] == 1) or (row['Cardiomegaly_y'] == 1) else 0 for idx, row in Bbox_GCS16l_shared.iterrows()]
pneumonia = [1 if (row['Pneumonia_x'] == 1) or (row['Pneumonia_y'] == 1) else 0 for idx, row in Bbox_GCS16l_shared.iterrows()]
print(effusion)

# Adding them to the shared df
Bbox_GCS16l_shared['Effusion'] = effusion
Bbox_GCS16l_shared['Pneumothorax'] = pneumothorax
Bbox_GCS16l_shared['Atelectasis'] = atelectasis
Bbox_GCS16l_shared['Cardiomegaly'] = cardiomegaly
Bbox_GCS16l_shared['Pneumonia'] = pneumonia
Bbox_GCS16l_shared[:2]

[0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0]


Unnamed: 0,Image Index,ImagePath,Labels_bbox,Effusion_x,Pneumothorax_x,Atelectasis_x,Cardiomegaly_x,Pneumonia_x,Labels_all,Effusion_y,Pneumothorax_y,Atelectasis_y,Cardiomegaly_y,Pneumonia_y,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia
0,00012291_008.png,../../Data/ChestX-ray14/images_006/images/0001...,Atelectasis,0,0,1,0,0,"Abnormal, Atelectasis, Nodule, Nodule or mass",0,0,1,0,0,0,0,1,0,0
1,00018762_002.png,../../Data/ChestX-ray14/images_009/images/0001...,Atelectasis,0,0,1,0,0,"Abnormal, Atelectasis, Effusion",1,0,1,0,0,1,0,1,0,0


In [113]:
# Removing all the shared images from the concatenated df
Bbox_GCS16l_concat_shorter = Bbox_GCS16l_concat[~Bbox_GCS16l_concat['Image Index'].isin(Bbox_GCS16l_shared['Image Index'])]

# Adding back the shared images with the new labels
Bbox_GCS16l_concat_new = pd.concat([Bbox_GCS16l_concat_shorter, Bbox_GCS16l_shared])

# Keeping only the new label column
Bbox_GCS16l_concat_new = Bbox_GCS16l_concat_new[['Image Index', 'ImagePath', 'Effusion', 'Pneumothorax', 'Atelectasis', 'Cardiomegaly', 'Pneumonia']]
print(len(Bbox_GCS16l_concat_new))
Bbox_GCS16l_concat_new

1664


Unnamed: 0,Image Index,ImagePath,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia
0,00013118_008.png,../../Data/ChestX-ray14/images_006/images/0001...,0,0,1,0,0
1,00014716_007.png,../../Data/ChestX-ray14/images_007/images/0001...,0,0,1,0,0
2,00029817_009.png,../../Data/ChestX-ray14/images_012/images/0002...,0,0,1,0,0
3,00014687_001.png,../../Data/ChestX-ray14/images_007/images/0001...,0,0,1,0,0
4,00017877_001.png,../../Data/ChestX-ray14/images_008/images/0001...,0,0,1,0,0
...,...,...,...,...,...,...,...
21,00018865_008.png,../../Data/ChestX-ray14/images_009/images/0001...,0,1,0,0,0
22,00029579_014.png,../../Data/ChestX-ray14/images_012/images/0002...,1,1,1,0,0
23,00007471_003.png,../../Data/ChestX-ray14/images_004/images/0000...,0,1,1,0,0
24,00019124_006.png,../../Data/ChestX-ray14/images_009/images/0001...,0,1,1,0,0


### Second, aggregating with labels from the other two sets, namely GCS4l and RSNA

In [114]:
# The GCS4l test set
Bbox_GCS16l_shared_GCS4l = Bbox_GCS16l_concat_new.merge(GCS4l, how='left', on=['Image Index', 'ImagePath'])

# Obtaining the aggregated label
pneumothorax = [1 if (row['Pneumothorax_x'] == 1) or (row['Pneumothorax_y'] == 1) else row['Pneumothorax_x'] for idx, row in Bbox_GCS16l_shared_GCS4l.iterrows()]

# Adding them to the shared df
Bbox_GCS16l_shared_GCS4l['Pneumothorax'] = pneumothorax

# Keeping only the new label column
Bbox_GCS16l_shared_GCS4l = Bbox_GCS16l_shared_GCS4l[['Image Index', 'ImagePath', 'Effusion', 'Pneumothorax', 'Atelectasis', 'Cardiomegaly', 'Pneumonia']]
print(len(Bbox_GCS16l_shared_GCS4l))
Bbox_GCS16l_shared_GCS4l

1664


Unnamed: 0,Image Index,ImagePath,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia
0,00013118_008.png,../../Data/ChestX-ray14/images_006/images/0001...,0,0,1,0,0
1,00014716_007.png,../../Data/ChestX-ray14/images_007/images/0001...,0,0,1,0,0
2,00029817_009.png,../../Data/ChestX-ray14/images_012/images/0002...,0,0,1,0,0
3,00014687_001.png,../../Data/ChestX-ray14/images_007/images/0001...,0,0,1,0,0
4,00017877_001.png,../../Data/ChestX-ray14/images_008/images/0001...,0,0,1,0,0
...,...,...,...,...,...,...,...
1659,00018865_008.png,../../Data/ChestX-ray14/images_009/images/0001...,0,1,0,0,0
1660,00029579_014.png,../../Data/ChestX-ray14/images_012/images/0002...,1,1,1,0,0
1661,00007471_003.png,../../Data/ChestX-ray14/images_004/images/0000...,0,1,1,0,0
1662,00019124_006.png,../../Data/ChestX-ray14/images_009/images/0001...,0,1,1,0,0


In [116]:
# The RSNA test set
Bbox_GCS16l_GCS4l_shared_RSNA = Bbox_GCS16l_shared_GCS4l.merge(RSNA, how='left', on=['Image Index', 'ImagePath'])

# Obtaining the aggregated label
pneumonia = [1 if (row['Pneumonia_x'] == 1) or (row['Pneumonia_y'] == 1) else row['Pneumonia_x'] for idx, row in Bbox_GCS16l_GCS4l_shared_RSNA.iterrows()]

# Adding them to the shared df
Bbox_GCS16l_GCS4l_shared_RSNA['Pneumonia'] = pneumonia

# Keeping only the new label column
Bbox_GCS16l_GCS4l_shared_RSNA = Bbox_GCS16l_GCS4l_shared_RSNA[['Image Index', 'ImagePath', 'Effusion', 'Pneumothorax', 'Atelectasis', 'Cardiomegaly', 'Pneumonia']].reset_index(drop=True)
print(len(Bbox_GCS16l_GCS4l_shared_RSNA))
Bbox_GCS16l_GCS4l_shared_RSNA

1664


Unnamed: 0,Image Index,ImagePath,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia
0,00013118_008.png,../../Data/ChestX-ray14/images_006/images/0001...,0,0,1,0,0
1,00014716_007.png,../../Data/ChestX-ray14/images_007/images/0001...,0,0,1,0,0
2,00029817_009.png,../../Data/ChestX-ray14/images_012/images/0002...,0,0,1,0,0
3,00014687_001.png,../../Data/ChestX-ray14/images_007/images/0001...,0,0,1,0,0
4,00017877_001.png,../../Data/ChestX-ray14/images_008/images/0001...,0,0,1,0,0
...,...,...,...,...,...,...,...
1659,00018865_008.png,../../Data/ChestX-ray14/images_009/images/0001...,0,1,0,0,0
1660,00029579_014.png,../../Data/ChestX-ray14/images_012/images/0002...,1,1,1,0,0
1661,00007471_003.png,../../Data/ChestX-ray14/images_004/images/0000...,0,1,1,0,1
1662,00019124_006.png,../../Data/ChestX-ray14/images_009/images/0001...,0,1,1,0,0


In [117]:
# Saving the file
#Bbox_GCS16l_GCS4l_shared_RSNA.to_csv("../Data/Data_splits/pathology_detection-CXR14_test_combined.csv")
#print('Saved :)')

Saved :)
