In [None]:
import zipfile
import os
from tqdm import tqdm
import shutil

## Unzip all files and save to a specific directory

In [None]:
folder_with_zip_files = "/data/chest_radiograph/orthanc_db_downloaded"
target_folder = "/data/chest_radiograph/dicom_files"

In [None]:
zip_files = os.listdir(folder_with_zip_files)

In [None]:
# Extract every downloaded archive into the shared target folder.
for zip_file in tqdm(zip_files):
    zip_path = os.path.join(folder_with_zip_files, zip_file)
    # os.listdir also returns entries that are not archives (sub-directories,
    # hidden files, partial downloads); opening one of those would raise and
    # abort the whole extraction, so they are reported and skipped instead.
    if not zipfile.is_zipfile(zip_path):
        print(f"Skipping non-zip entry: {zip_path}")
        continue
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(target_folder)

#### Unzipping created a weirdly-named parent directory. Move all content inside that directory out of it and then remove the empty directory

In [None]:
# Flatten the extraction result: every top-level entry of target_folder is
# treated as a wrapper directory whose children are lifted into target_folder.
for wrapper_name in tqdm(os.listdir(target_folder)):
    wrapper_path = os.path.join(target_folder, wrapper_name)
    # Lift each child of the wrapper directly into the target folder
    for child_name in os.listdir(wrapper_path):
        shutil.move(os.path.join(wrapper_path, child_name), target_folder)
    # Delete the wrapper directory; deletion errors are ignored
    shutil.rmtree(wrapper_path, ignore_errors=True)

## Convert the Dicom Files to Nifti

In [None]:
import pydicom
import matplotlib.pyplot as plt
import SimpleITK as sitk
import os
from tqdm import tqdm

In [None]:
nifti_root_directory = '/data/chest_radiograph/nifti_files'
dicom_root_directory = '/data/chest_radiograph/dicom_files'

In [None]:
def set_image_data(dataset):
    """Build a SimpleITK image from the pixel data of a DICOM dataset.

    The pixel array is converted first; afterwards the value of DICOM tag
    (0018,1164) is applied as the image spacing. Any exception raised by the
    pixel access or the tag lookup propagates to the caller.

    Returns:
        The SimpleITK image with its spacing set.
    """
    image = sitk.GetImageFromArray(dataset.pixel_array)
    # NOTE(review): the tag value is handed to SetSpacing unchanged -- confirm
    # that its axis order is the one SimpleITK expects.
    spacing = dataset[0x0018, 0x1164].value
    image.SetSpacing(spacing)
    return image

In [None]:
def set_meta_data_safely(sitk_img, dataset, key, value):
    """Copy one DICOM element into the image metadata if the element exists.

    Args:
        sitk_img: image object exposing ``SetMetaData(key, text)``.
        dataset: object indexable as ``dataset[group, element]`` whose items
            expose a ``.value`` attribute.
        key: metadata key under which the stringified value is stored.
        value: two-item sequence ``[group, element]`` identifying the tag.

    Returns:
        The same ``sitk_img`` object, whether or not the element was found.
    """
    try:
        element = dataset[value[0], value[1]]
        sitk_img.SetMetaData(key, str(element.value))
    except KeyError as missing_tag:
        # Element not available: report it and leave this metadata entry out
        print(missing_tag)
    return sitk_img
        

In [None]:
def set_meta_data(sitk_img, dataset):
    """Attach a fixed selection of DICOM header fields to the image metadata.

    Each (metadata key, DICOM tag) pair below is forwarded to
    ``set_meta_data_safely``, in the listed order.

    Returns:
        The image returned by the last ``set_meta_data_safely`` call.
    """
    tags_by_meta_key = (
        ("StudyDate", [0x0008, 0x0020]),
        ("StudyTime", [0x0008, 0x0030]),
        ("AccessionNumber", [0x0008, 0x0050]),
        ("PatientBirthdate", [0x0010, 0x0030]),
        ("PatientSex", [0x0010, 0x0040]),
        ("RequestingPhysician", [0x0032, 0x1032]),
        ("ExposureinuAs", [0x0018, 0x1153]),
    )
    for meta_key, tag in tags_by_meta_key:
        sitk_img = set_meta_data_safely(sitk_img, dataset, meta_key, tag)
    return sitk_img

In [None]:
# Convert the CR images found below each top-level DICOM folder to NIfTI.
dicom_folders = os.listdir(dicom_root_directory)
for dicom_folder in tqdm(dicom_folders):
    for root, dirs, files in os.walk(os.path.join(dicom_root_directory, dicom_folder)):
        for file in files:
            if not file.endswith(".dcm"):
                continue
            dicom_file_name = os.path.join(root, file)
            dataset = pydicom.dcmread(dicom_file_name)
            if dataset.Modality != 'CR':
                continue
            try:
                # Only take images that contain spacing and image data
                sitk_img = set_image_data(dataset)
            except KeyError:
                # Skipped silently
                continue
            except Exception as e:
                print(e)
                continue
            sitk_img = set_meta_data(sitk_img, dataset)
            # The output name depends only on the top-level folder, so a later
            # CR image of the same folder overwrites an earlier one.
            nifti_save_path = os.path.join(nifti_root_directory, dicom_folder + '.nii')
            sitk.WriteImage(sitk_img, nifti_save_path)

In [None]:
# Quick visual check: show the pixel data of whichever DICOM dataset is currently bound to `dataset`
plt.imshow(dataset.pixel_array, cmap='gray')

## Create Dataset Split (One-Hot)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import SimpleITK as sitk
from sklearn.model_selection import train_test_split, KFold, GroupShuffleSplit

In [None]:
path_to_csv_dir = '/home/firas/Desktop/work/chest_radiography/data'
path_to_csv = '/home/firas/Desktop/work/chest_radiography/data/parsed_chest_xray_p1.csv'

In [None]:
# Read the label CSV and append the suffix '01' to each accession number.
# NOTE(review): str(x) is applied to whatever dtype pandas inferred for the
# column; if it was parsed as float the result would contain '.0' before the
# suffix -- confirm the column is read as int or str.
df = pd.read_csv(path_to_csv)
df['Anforderungsnummer'] = df['Anforderungsnummer'].apply(lambda x: str(x) + '01')

In [None]:
# Drop rows that have at least one empty cell
df_no_nans = df.dropna()

In [None]:
# Only include data rows that are included in the data directory
path_to_data_dir = '/data/chest_radiograph/resized_nifti_files'
data_files = [file_name.split('.')[0] for file_name in os.listdir(path_to_data_dir)]
df_available = df_no_nans[df_no_nans['Anforderungsnummer'].isin(data_files)]

In [None]:
# One hot encode the label columns
df_available.columns
columns_to_one_hot_encode = df_available.columns[5:]
df_one_hot = pd.get_dummies(df_available, columns=columns_to_one_hot_encode)

### Include meta data into the csv

In [None]:
df_one_hot.insert(5, "PatientName", "")
df_one_hot.insert(6, "StudyDate", "")
df_one_hot.insert(7, "StudyTime", "")
df_one_hot.insert(8, "PatientSex", "")
df_one_hot.insert(9, "RequestingPhysician", "")
df_one_hot.insert(10, "ExposureinuAs", "")

In [None]:
df_names = pd.read_csv('/home/firas/Desktop/work/chest_radiography/data/csv_with_names/all_studies.csv', dtype=str)

In [None]:
# Fill the metadata columns of df_one_hot from the DICOM headers of each study.
for accession_number in tqdm(df_one_hot['Anforderungsnummer'].values):
    study_directory = os.path.join(dicom_root_directory, accession_number)
    for root, dirs, files in os.walk(study_directory):
        for file in files:
            if not file.endswith(".dcm"):
                continue
            dicom_file_name = os.path.join(root, file)
            dataset = pydicom.dcmread(dicom_file_name)
            if dataset.Modality != 'CR':
                continue
            try:
                # Locate the first row of this accession number and write the metadata
                row_mask = df_one_hot['Anforderungsnummer'] == accession_number
                index = df_one_hot[row_mask].index.values[0]
                name_mask = df_names['Anforderungsnummer'] == accession_number
                df_one_hot.at[index, 'PatientName'] = df_names[name_mask]['Fullname'].values[0]
                for column, tag in (
                    ('StudyDate', (0x0008, 0x0020)),
                    ('StudyTime', (0x0008, 0x0030)),
                    ('PatientSex', (0x0010, 0x0040)),
                    ('RequestingPhysician', (0x0032, 0x1032)),
                    ('ExposureinuAs', (0x0018, 0x1153)),
                ):
                    df_one_hot.at[index, column] = str(dataset[tag].value)
            except KeyError:
                # Skipped silently; fields written before the failure are kept
                pass
            except Exception as e:
                print(e)

In [None]:
df_one_hot.head(5)

In [None]:
# Create training, testing and validation split and ensure Patient only appears in one set
# NOTE(review): rows whose PatientName is still the "" placeholder all fall into
# one shared group -- confirm every row received a name.
random_state = 379647  # fixed seed keeps the randomization reproducible
test_splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=random_state)
train_val_inds, test_inds = next(test_splitter.split(df_one_hot, groups=df_one_hot['PatientName']))
train_val, test = df_one_hot.iloc[train_val_inds], df_one_hot.iloc[test_inds]
# Second pass: split the validation rows off the remaining rows, again grouped by patient
valid_splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=random_state)
train_inds, val_inds = next(valid_splitter.split(train_val, groups=train_val['PatientName']))
train, valid = train_val.iloc[train_inds], train_val.iloc[val_inds]

In [None]:
train.to_csv(os.path.join(path_to_csv_dir, 'train.csv'), index=False)
valid.to_csv(os.path.join(path_to_csv_dir, 'valid.csv'), index=False)
test.to_csv(os.path.join(path_to_csv_dir, 'test.csv'), index=False)

## Create Dataset Split (Custom Method)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import SimpleITK as sitk
from sklearn.model_selection import train_test_split, KFold, GroupShuffleSplit
import numpy as np
import pydicom

In [None]:
path_to_csv_dir = '/home/firas/Desktop/work/chest_radiography/data'
path_to_csv = '/home/firas/Desktop/work/chest_radiography/data/parsed_chest_xray_p1.csv'
dicom_root_directory = '/data/chest_radiograph/dicom_files'

In [None]:
# Read CSV and append 01 to each accession number
df = pd.read_csv(path_to_csv)
df['Anforderungsnummer'] = df['Anforderungsnummer'].apply(lambda x: str(x) + '01')

In [None]:
# Drop rows that have at least one empty cell
df_no_nans = df.dropna()

In [None]:
# Only include data rows that are included in the data directory
path_to_data_dir = '/data/chest_radiograph/resized_nifti_files'
data_files = [file_name.split('.')[0] for file_name in os.listdir(path_to_data_dir)]
df_available = df_no_nans[df_no_nans['Anforderungsnummer'].isin(data_files)]

In [None]:
df_available

In [None]:
df_with_ignore_index = df_available.replace(0.0, 100)

In [None]:
# Deal with gradual changes: relabel 1 -> 0 and 5 -> 1 in the graded columns
graded_columns = [
    'Stauung',
    'Pleuraerguss_re', 'Pleuraerguss_li',
    'Infiltrate_re', 'Infiltrate_li',
    'Belstörungen_re', 'Belstörungen_li',
]
for column in graded_columns:
    relabeled = df_with_ignore_index[column].replace({1:0, 5:1})
    df_with_ignore_index[column] = relabeled

In [None]:
# Deal with gradual changes for heart
herzgröße = df_with_ignore_index['Herzgröße']
# 1.0 where the heart size label is not 5.0, otherwise 0.0
is_herz_beurteilbar = (herzgröße != 5.0).astype(float)
df_with_ignore_index.insert(6, "is_herzgröße_beurteilbar", is_herz_beurteilbar)
# Label 5 becomes the ignore index (100); labels 1..4 are shifted down to 0..3
df_with_ignore_index['Herzgröße'] = herzgröße.replace({5:100, 1:0, 2:1, 3:2, 4:3})

In [None]:
# Deal with location in Belüftungsstörungen

# Check if a Belstörung is present. The comparison is against 0 because the
# graded columns were already relabelled above (0 is the standard label).
is_belstörung_li_present = (df_with_ignore_index['Belstörungen_li'] != 0).astype(float)
is_belstörung_re_present = (df_with_ignore_index['Belstörungen_re'] != 0).astype(float)

# Set the ignore index (100) for the location label where no Belstörung is
# present. This is done for BOTH sides: previously the right-side flag was
# computed but never applied, so 'Belstörungenidem_re' was left unmasked.
# The flags are used directly as masks, so no temporary columns are needed.
df_with_ignore_index.loc[is_belstörung_li_present == 0.0, 'Belstörungenidem_li'] = 100.0
df_with_ignore_index.loc[is_belstörung_re_present == 0.0, 'Belstörungenidem_re'] = 100.0

# Check for OF/UF (location label 6)
is_of_uf_li = (df_with_ignore_index['Belstörungenidem_li'] == 6).astype(float)
is_of_uf_re = (df_with_ignore_index['Belstörungenidem_re'] == 6).astype(float)

# Insert the OF/UF flags as new binary columns
df_with_ignore_index.insert(15, "Belstörungen_of_uf_li", is_of_uf_li)
df_with_ignore_index.insert(13, "Belstörungen_of_uf_re", is_of_uf_re)

# Set the ignore index for the location label where OF/UF is present
df_with_ignore_index.loc[df_with_ignore_index['Belstörungen_of_uf_li']==1, 'Belstörungenidem_li'] = 100.0
df_with_ignore_index.loc[df_with_ignore_index['Belstörungen_of_uf_re']==1, 'Belstörungenidem_re'] = 100.0

# Replace the location labels. The mapping belongs to the location columns:
# applying it to the binary OF/UF flag columns (as was done before) turned
# every flag value 1 into 5 and left the location codes un-relabelled.
location_relabel = {6:100, 4:0, 3:1, 7:2, 2:3, 5:4, 1:5}
df_with_ignore_index['Belstörungenidem_li'] = df_with_ignore_index['Belstörungenidem_li'].replace(location_relabel)
df_with_ignore_index['Belstörungenidem_re'] = df_with_ignore_index['Belstörungenidem_re'].replace(location_relabel)

In [None]:
# Deal with Pneumothorax: derive a presence flag plus separate location and
# severity label columns from the combined 'Pneumothorax_li'/'Pneumothorax_re'
# columns, then drop the combined columns.

# Check for pneumothorax: flag is 1.0 when the label is neither 1 nor 100
is_pneumothorax_li_present = ((df_with_ignore_index['Pneumothorax_li'] != 1) & (df_with_ignore_index['Pneumothorax_li'] != 100)).astype(float)
is_pneumothorax_re_present = ((df_with_ignore_index['Pneumothorax_re'] != 1) & (df_with_ignore_index['Pneumothorax_re'] != 100)).astype(float)

# Insert the new columns (the insert positions are absolute column indices, so
# the order of these calls determines the final column layout)
df_with_ignore_index.insert(19, "is_pneumothorax_li_present", is_pneumothorax_li_present)
df_with_ignore_index.insert(18, "is_pneumothorax_re_present", is_pneumothorax_re_present)

# Set the ignore index (100) on the presence flag where the original label is 100
df_with_ignore_index.loc[df_with_ignore_index['Pneumothorax_li']==100, 'is_pneumothorax_li_present'] = 100.0
df_with_ignore_index.loc[df_with_ignore_index['Pneumothorax_re']==100, 'is_pneumothorax_re_present'] = 100.0

# Split the Pneumothorax labels into an output logit that contains the location labels:
# the values 1, 5, 6 and 7 are replaced by the ignore index
pneumothorax_li_location = df_with_ignore_index['Pneumothorax_li'].copy()
for label in [1, 5, 6, 7]:
    pneumothorax_li_location.loc[pneumothorax_li_location==label] = 100.0
    
pneumothorax_re_location = df_with_ignore_index['Pneumothorax_re'].copy()
for label in [1, 5, 6, 7]:
    pneumothorax_re_location.loc[pneumothorax_re_location==label] = 100.0

# Split the Pneumothorax labels into an output logit that contains the severity labels:
# the values 1, 2, 3 and 4 are replaced by the ignore index
pneumothorax_li_severity = df_with_ignore_index['Pneumothorax_li'].copy()
for label in [1, 2, 3, 4]:
    pneumothorax_li_severity.loc[pneumothorax_li_severity==label] = 100.0

pneumothorax_re_severity = df_with_ignore_index['Pneumothorax_re'].copy()
for label in [1, 2, 3, 4]:
    pneumothorax_re_severity.loc[pneumothorax_re_severity==label] = 100.0
    
# Insert the new columns (again at absolute positions; order matters)
df_with_ignore_index.insert(22, "pneumothorax_li_location", pneumothorax_li_location)
df_with_ignore_index.insert(19, "pneumothorax_re_location", pneumothorax_re_location)
df_with_ignore_index.insert(24, "pneumothorax_li_severity", pneumothorax_li_severity)
df_with_ignore_index.insert(20, "pneumothorax_re_severity", pneumothorax_re_severity)

# Drop the original combined columns
df_with_ignore_index = df_with_ignore_index.drop(columns=['Pneumothorax_li', 'Pneumothorax_re'])

# Relabel the new columns to zero-based classes (location: 4/3/2 -> 0/1/2,
# severity: 5/6/7 -> 0/1/2); the ignore index 100 is not part of the mapping
df_with_ignore_index['pneumothorax_li_location'] = df_with_ignore_index['pneumothorax_li_location'].replace({4:0, 3:1, 2:2})
df_with_ignore_index['pneumothorax_re_location'] = df_with_ignore_index['pneumothorax_re_location'].replace({4:0, 3:1, 2:2})
df_with_ignore_index['pneumothorax_li_severity'] = df_with_ignore_index['pneumothorax_li_severity'].replace({5:0, 6:1, 7:2})
df_with_ignore_index['pneumothorax_re_severity'] = df_with_ignore_index['pneumothorax_re_severity'].replace({5:0, 6:1, 7:2})



In [None]:
df_with_ignore_index

In [None]:
df_one_hot = df_with_ignore_index

### Include meta data into the csv

In [None]:
df_one_hot.insert(5, "PatientName", "")
df_one_hot.insert(6, "StudyDate", "")
df_one_hot.insert(7, "StudyTime", "")
df_one_hot.insert(8, "PatientSex", "")
df_one_hot.insert(9, "RequestingPhysician", "")
df_one_hot.insert(10, "ExposureinuAs", "")

In [None]:
df_names = pd.read_csv('/home/firas/Desktop/work/chest_radiography/data/csv_with_names/all_studies.csv', dtype=str)

In [None]:
# Fill the metadata columns of df_one_hot from the DICOM headers of each study.
for accession_number in tqdm(df_one_hot['Anforderungsnummer'].values):
    study_directory = os.path.join(dicom_root_directory, accession_number)
    for root, dirs, files in os.walk(study_directory):
        for file in files:
            if not file.endswith(".dcm"):
                continue
            dicom_file_name = os.path.join(root, file)
            dataset = pydicom.dcmread(dicom_file_name)
            if dataset.Modality != 'CR':
                continue
            try:
                # Locate the first row of this accession number and write the metadata
                row_mask = df_one_hot['Anforderungsnummer'] == accession_number
                index = df_one_hot[row_mask].index.values[0]
                name_mask = df_names['Anforderungsnummer'] == accession_number
                df_one_hot.at[index, 'PatientName'] = df_names[name_mask]['Fullname'].values[0]
                for column, tag in (
                    ('StudyDate', (0x0008, 0x0020)),
                    ('StudyTime', (0x0008, 0x0030)),
                    ('PatientSex', (0x0010, 0x0040)),
                    ('RequestingPhysician', (0x0032, 0x1032)),
                    ('ExposureinuAs', (0x0018, 0x1153)),
                ):
                    df_one_hot.at[index, column] = str(dataset[tag].value)
            except KeyError:
                # Skipped silently; fields written before the failure are kept
                pass
            except Exception as e:
                print(e)

In [None]:
df_one_hot.head(5)

In [None]:
# Create training, testing and validation split and ensure Patient only appears in one set
# NOTE(review): rows whose PatientName is still the "" placeholder all fall into
# one shared group -- confirm every row received a name.
random_state = 379647  # fixed seed keeps the randomization reproducible
test_splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=random_state)
train_val_inds, test_inds = next(test_splitter.split(df_one_hot, groups=df_one_hot['PatientName']))
train_val, test = df_one_hot.iloc[train_val_inds], df_one_hot.iloc[test_inds]
# Second pass: split the validation rows off the remaining rows, again grouped by patient
valid_splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=random_state)
train_inds, val_inds = next(valid_splitter.split(train_val, groups=train_val['PatientName']))
train, valid = train_val.iloc[train_inds], train_val.iloc[val_inds]

In [None]:
train.to_csv(os.path.join(path_to_csv_dir, 'train_custom.csv'), index=False)
valid.to_csv(os.path.join(path_to_csv_dir, 'valid_custom.csv'), index=False)
test.to_csv(os.path.join(path_to_csv_dir, 'test_custom.csv'), index=False)

## Create Dataset Split (One-Hot and Drop lines)

In [None]:
import pydicom
import matplotlib.pyplot as plt
import SimpleITK as sitk
import os
from tqdm import tqdm
import numpy as np

In [None]:
path_to_csv_dir = '/home/firas/Desktop/work/chest_radiography/data'
path_to_csv = '/home/firas/Desktop/work/chest_radiography/data/parsed_chest_xray_p1.csv'

In [None]:
# Read CSV and append 01 to each accession number
df = pd.read_csv(path_to_csv)
df['Anforderungsnummer'] = df['Anforderungsnummer'].apply(lambda x: str(x) + '01')

In [None]:
# Drop rows that have at least one empty cell
df_no_nans = df.dropna()

In [None]:
# Only include data rows that are included in the data directory
path_to_data_dir = '/data/chest_radiograph/resized_nifti_files'
data_files = [file_name.split('.')[0] for file_name in os.listdir(path_to_data_dir)]
df_available = df_no_nans[df_no_nans['Anforderungsnummer'].isin(data_files)]

In [None]:
# Replace 0 with NaN
df_available = df_available.replace({0: np.nan})

In [None]:
# Report the number of NaNs per column of the dataframe
nan_counts = df_available.isna().sum()
for column_name in df_available.columns:
    print(column_name, nan_counts[column_name])

In [None]:
# Drop the columns that lead to many nans
df_available = df_available.drop(columns=['Belstörungenidem_re', 'Belstörungenidem_li'])

In [None]:
# Drop all nan rows
df_available = df_available.dropna()

In [None]:
df_available

In [None]:
# One hot encode the label columns
df_available.columns
columns_to_one_hot_encode = df_available.columns[5:]
df_one_hot = pd.get_dummies(df_available, columns=columns_to_one_hot_encode)

In [None]:
df_one_hot

### Include meta data into the csv

In [None]:
df_one_hot.insert(5, "PatientName", "")
df_one_hot.insert(6, "StudyDate", "")
df_one_hot.insert(7, "StudyTime", "")
df_one_hot.insert(8, "PatientSex", "")
df_one_hot.insert(9, "RequestingPhysician", "")
df_one_hot.insert(10, "ExposureinuAs", "")

In [None]:
df_names = pd.read_csv('/home/firas/Desktop/work/chest_radiography/data/csv_with_names/all_studies.csv', dtype=str)

In [None]:
# Fill the metadata columns of df_one_hot from the DICOM headers of each study.
for accession_number in tqdm(df_one_hot['Anforderungsnummer'].values):
    study_directory = os.path.join(dicom_root_directory, accession_number)
    for root, dirs, files in os.walk(study_directory):
        for file in files:
            if not file.endswith(".dcm"):
                continue
            dicom_file_name = os.path.join(root, file)
            dataset = pydicom.dcmread(dicom_file_name)
            if dataset.Modality != 'CR':
                continue
            try:
                # Locate the first row of this accession number and write the metadata
                row_mask = df_one_hot['Anforderungsnummer'] == accession_number
                index = df_one_hot[row_mask].index.values[0]
                name_mask = df_names['Anforderungsnummer'] == accession_number
                df_one_hot.at[index, 'PatientName'] = df_names[name_mask]['Fullname'].values[0]
                for column, tag in (
                    ('StudyDate', (0x0008, 0x0020)),
                    ('StudyTime', (0x0008, 0x0030)),
                    ('PatientSex', (0x0010, 0x0040)),
                    ('RequestingPhysician', (0x0032, 0x1032)),
                    ('ExposureinuAs', (0x0018, 0x1153)),
                ):
                    df_one_hot.at[index, column] = str(dataset[tag].value)
            except KeyError:
                # Skipped silently; fields written before the failure are kept
                pass
            except Exception as e:
                print(e)

In [None]:
df_one_hot.head(5)

In [None]:
# Create training, testing and validation split and ensure Patient only appears in one set
# NOTE(review): rows whose PatientName is still the "" placeholder all fall into
# one shared group -- confirm every row received a name.
random_state = 379647  # fixed seed keeps the randomization reproducible
test_splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=random_state)
train_val_inds, test_inds = next(test_splitter.split(df_one_hot, groups=df_one_hot['PatientName']))
train_val, test = df_one_hot.iloc[train_val_inds], df_one_hot.iloc[test_inds]
# Second pass: split the validation rows off the remaining rows, again grouped by patient
valid_splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=random_state)
train_inds, val_inds = next(valid_splitter.split(train_val, groups=train_val['PatientName']))
train, valid = train_val.iloc[train_inds], train_val.iloc[val_inds]

In [None]:
# Write the three splits as CSV files without the index column.
# NOTE(review): 'train.csv', 'valid.csv' and 'test.csv' in this directory are
# also the names written by the first "One-Hot" split section of this notebook;
# running both sections leaves only the later result on disk -- confirm intended.
train.to_csv(os.path.join(path_to_csv_dir, 'train.csv'), index=False)
valid.to_csv(os.path.join(path_to_csv_dir, 'valid.csv'), index=False)
test.to_csv(os.path.join(path_to_csv_dir, 'test.csv'), index=False)

## Create Datasplit for Stauung (One-Hot vs Encoding)

In [None]:
import pydicom
import matplotlib.pyplot as plt
import SimpleITK as sitk
import os
from tqdm import tqdm
import numpy as np
import pandas as pd

In [None]:
path_to_csv_dir = '/home/firas/Desktop/work/chest_radiography/data'
path_to_csv = '/home/firas/Desktop/work/chest_radiography/data/parsed_chest_xray_p1.csv'

In [None]:
train_one_hot_all = pd.read_csv(os.path.join(path_to_csv_dir, 'train.csv'))
valid_one_hot_all = pd.read_csv(os.path.join(path_to_csv_dir, 'valid.csv'))
test_one_hot_all = pd.read_csv(os.path.join(path_to_csv_dir, 'test.csv'))

In [None]:
# Order the column labels based on severity.
# One shared mapping keeps the three splits in sync. The former
# 'Staaung_1.0' -> 'Stauung_1.0' entry was presumably a typo: it can only match
# a column literally named 'Staaung_1.0', and 'Stauung_1.0' keeps its name
# without being listed, so the entry is dropped.
stauung_severity_rename = {
    'Stauung_5.0': 'Stauung_2.0',
    'Stauung_2.0': 'Stauung_3.0',
    'Stauung_3.0': 'Stauung_4.0',
    'Stauung_4.0': 'Stauung_5.0',
}
train_one_hot_all_stauung_ordered = train_one_hot_all.rename(columns=stauung_severity_rename)
valid_one_hot_all_stauung_ordered = valid_one_hot_all.rename(columns=stauung_severity_rename)
test_one_hot_all_stauung_ordered = test_one_hot_all.rename(columns=stauung_severity_rename)

In [None]:
# Filter out all but the Stauung labels (identifier and metadata columns are kept)
stauung_columns = [
    'Aufnahmenummer', 'Anforderungsnummer', 'Geburtsdatum',
    'Untersuchungsdatum', 'Untersuchung Dokumentiert', 'PatientName',
    'StudyDate', 'StudyTime', 'PatientSex', 'RequestingPhysician',
    'ExposureinuAs',
    'Stauung_1.0', 'Stauung_2.0', 'Stauung_3.0', 'Stauung_4.0', 'Stauung_5.0',
]
train_one_hot_stauung_ordered = train_one_hot_all_stauung_ordered.loc[:, stauung_columns]
valid_one_hot_stauung_ordered = valid_one_hot_all_stauung_ordered.loc[:, stauung_columns]
test_one_hot_stauung_ordered = test_one_hot_all_stauung_ordered.loc[:, stauung_columns]

In [None]:
train_one_hot_stauung_ordered

In [None]:
def create_custom_encoding(row, label_of_interest='Stauung'):
    """Turn a one-hot encoded ordinal label into a cumulative encoding.

    All entries of ``row`` whose key contains ``label_of_interest`` are treated
    as one-hot columns named ``<label>_<ordinal>`` (e.g. ``Stauung_3.0``). The
    column holding the maximum value marks the active class; that column and
    every column with a lower or equal ordinal are set to 1, the rest to 0.

    Args:
        row: pandas Series (one dataframe row); it is modified in place.
        label_of_interest: substring identifying the label columns. Defaults to
            'Stauung', so the function can still be passed directly to
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        The same ``row`` with the label columns re-encoded.

    Raises:
        ValueError: if the part after the last '_' of a label key is not numeric.
    """
    def _ordinal(column_name):
        # Ordinals are compared numerically; comparing the raw suffix strings
        # would order e.g. '10.0' before '2.0'.
        return float(column_name.split('_')[-1])

    # Get the keys of the series index that match the label of interest
    label_keys = [key for key in row.keys() if label_of_interest in key]

    # Get the key of the column that encodes the 1
    values = row[label_keys]
    argmax_label = values.keys()[values.to_numpy().argmax()]
    active_ordinal = _ordinal(argmax_label)

    # Set all labels with a lower (or equal) ordinal value to 1 as well
    for label_key in label_keys:
        row[label_key] = 1 if _ordinal(label_key) <= active_ordinal else 0
    return row

def _without_none_column(frame):
    """Return ``frame`` without the 'Stauung_1.0' column."""
    return frame.loc[:, frame.columns != 'Stauung_1.0']

# Re-encode each split row by row, then remove the 'Stauung_1.0' column so that
# the value for None is encoded as 0 0 0 0 (not 1 0 0 0)
train_custom_encoded_stauung = _without_none_column(
    train_one_hot_stauung_ordered.apply(create_custom_encoding, axis=1))
valid_custom_encoded_stauung = _without_none_column(
    valid_one_hot_stauung_ordered.apply(create_custom_encoding, axis=1))
test_custom_encoded_stauung = _without_none_column(
    test_one_hot_stauung_ordered.apply(create_custom_encoding, axis=1))

In [None]:
train_one_hot_stauung_ordered.head(-5)

In [None]:
train_custom_encoded_stauung.head(-5)

In [None]:
train_one_hot_stauung_ordered.to_csv(os.path.join(path_to_csv_dir, 'train_one_hot_stauung_ordered.csv'), index=False)
valid_one_hot_stauung_ordered.to_csv(os.path.join(path_to_csv_dir, 'valid_one_hot_stauung_ordered.csv'), index=False)
test_one_hot_stauung_ordered.to_csv(os.path.join(path_to_csv_dir, 'test_one_hot_stauung_ordered.csv'), index=False)

train_custom_encoded_stauung.to_csv(os.path.join(path_to_csv_dir, 'train_custom_encoded_stauung.csv'), index=False)
valid_custom_encoded_stauung.to_csv(os.path.join(path_to_csv_dir, 'valid_custom_encoded_stauung.csv'), index=False)
test_custom_encoded_stauung.to_csv(os.path.join(path_to_csv_dir, 'test_custom_encoded_stauung.csv'), index=False)

## Add Laboratory Values

In [None]:
train_lab = pd.read_csv(os.path.join(path_to_csv_dir, 'train.csv'))
valid_lab = pd.read_csv(os.path.join(path_to_csv_dir, 'valid.csv'))
test_lab = pd.read_csv(os.path.join(path_to_csv_dir, 'test.csv'))

In [None]:
# Add columns for CRP and BNP, pre-filled with the "no lab value" sentinel.
# The sentinel is stored as a float: the lab values written into these columns
# later are floats, and assigning a non-integer float into an integer column is
# an incompatible-dtype assignment that newer pandas versions warn about or reject.
for lab_frame in (train_lab, valid_lab, test_lab):
    lab_frame['CRP'] = -1000.0
    lab_frame['BNP'] = -1000.0

In [None]:
lab_results_path = '/home/firas/Desktop/work/chest_radiography/data_laborwerte/Radiologie_Truhn_Laborwerte.csv'
df_lab_results = pd.read_csv(lab_results_path, engine="python", sep=';')

In [None]:
# Change the date columns from str to datetime so they can be compared and subtracted.
# NOTE(review): no explicit format or dayfirst flag is passed, so pandas infers
# the date format -- confirm the day/month order is parsed as intended for
# both CSV sources.
train_lab['Untersuchungsdatum'] = pd.to_datetime(train_lab['Untersuchungsdatum'])
valid_lab['Untersuchungsdatum'] = pd.to_datetime(valid_lab['Untersuchungsdatum'])
test_lab['Untersuchungsdatum'] = pd.to_datetime(test_lab['Untersuchungsdatum'])
df_lab_results['Datum'] = pd.to_datetime(df_lab_results['Datum'])

In [None]:
def remove_sign(x):
    """Strip a leading '<' or '>' from a value and parse the remainder.

    If the string form of ``x`` starts with '<' or '>', the text between that
    sign and the next occurrence of the same sign (or the end of the string) is
    returned as a float. Any other value is returned unchanged.

    Raises:
        ValueError: if the text following the sign is not a valid float.
    """
    text = str(x)
    for sign in ('<', '>'):
        if text.startswith(sign):
            return float(x.split(sign)[1])
    return x

df_lab_results['Wert_txt'] = df_lab_results['Wert_txt'].apply(remove_sign)

In [None]:
# For every scan, look up the first CRP / BNP lab result taken at or after the
# scan date and store it in the split if it lies less than two days later.
for data_split in [train_lab, valid_lab, test_lab]:
    # Keep only lab rows whose admission number occurs in this split
    df_lab_results_filtered = df_lab_results[df_lab_results['Aufnahmenummer'].isin(data_split['Aufnahmenummer'])]
    # Filter out the necessary analytes; the dict key is the target column name
    lab_values_by_column = {
        'CRP': df_lab_results_filtered[df_lab_results_filtered['Analyt'].isin(['CRP'])],
        'BNP': df_lab_results_filtered[df_lab_results_filtered['Analyt'].isin(['NTpBNP2'])],
    }
    for key, lab_values in lab_values_by_column.items():
        for aufnahmenummer in tqdm(lab_values['Aufnahmenummer'].unique()):
            lab_results_of_interest = lab_values[lab_values['Aufnahmenummer'] == aufnahmenummer]
            # Sorting by date does not depend on the scan, so it is done once
            # per admission instead of once per scan.
            lab_results_by_date = lab_results_of_interest.set_index('Datum').sort_index()
            for _, scan in data_split[data_split['Aufnahmenummer']==aufnahmenummer].iterrows():
                lab_results_after_scan_date = lab_results_by_date.loc[scan['Untersuchungsdatum']:]
                if lab_results_after_scan_date.empty:
                    continue
                closest_lab_result_after_scan_date = lab_results_after_scan_date.iloc[0]

                # .name is the index value of the row, i.e. its 'Datum'
                if (closest_lab_result_after_scan_date.name - scan['Untersuchungsdatum']).days < 2:
                    try:
                        lab_value = float(closest_lab_result_after_scan_date['Wert_txt'])
                        data_split.loc[data_split['Anforderungsnummer'] == scan['Anforderungsnummer'], key] = lab_value
                    except ValueError:
                        # Non-numeric lab text: report it and leave the current value in place
                        print("ValueError was raised")
                        print(closest_lab_result_after_scan_date['Wert_txt'])

# Remove accidental NaNs: restore the -1000 "no lab value" sentinel.
# Both lab columns are filled by the same loop, so both are cleaned; previously
# only CRP was handled and a NaN in BNP would have been exported as-is.
for lab_frame in (train_lab, valid_lab, test_lab):
    for lab_column in ('CRP', 'BNP'):
        lab_frame.loc[lab_frame[lab_column].isna(), lab_column] = -1000

In [None]:
# Number of test rows that still carry the -1000 sentinel, i.e. have no CRP value.
# The sentinel is only ever written to the lab columns, so the check has to look
# at 'CRP'; comparing 'Anforderungsnummer' against it cannot match a sentinel row.
len(test_lab[test_lab['CRP'] == -1000])

In [None]:
valid_lab_filtered = valid_lab[valid_lab['CRP'] != -1000]
test_lab_filtered = test_lab[test_lab['CRP'] != -1000]

In [None]:
train_lab.to_csv(os.path.join(path_to_csv_dir, 'train_lab.csv'), index=False)
valid_lab_filtered.to_csv(os.path.join(path_to_csv_dir, 'valid_lab.csv'), index=False)
test_lab_filtered.to_csv(os.path.join(path_to_csv_dir, 'test_lab.csv'), index=False)

In [None]:
train_lab_filtered = train_lab[train_lab['CRP'] != -1000]
train_lab_filtered.to_csv(os.path.join(path_to_csv_dir, 'train_lab_filtered.csv'), index=False)

In [None]:
valid_lab.to_csv(os.path.join(path_to_csv_dir, 'valid_lab_unfiltered.csv'), index=False)
test_lab.to_csv(os.path.join(path_to_csv_dir, 'test_lab_unfiltered.csv'), index=False)

#### Discretize the CRP Values

In [None]:
train_lab_discrete = train_lab.copy()
valid_lab_filtered_discrete = valid_lab_filtered.copy()
test_lab_filtered_discrete = test_lab_filtered.copy()

In [None]:
# Discretize CRP into five ordered bins and one-hot encode the result.
categorized_splits = {}
for key, split in {'train': train_lab_discrete, 'valid': valid_lab_filtered_discrete, 'test': test_lab_filtered_discrete}.items():
    # Values outside the bin edges (0, 999] -- which includes the -1000
    # sentinel -- do not fall into any bin and become NaN.
    categorized = pd.cut(split.CRP, bins=[0, 70, 140, 210, 280, 999], labels=['+', '++', '+++', '++++', '+++++'])
    split['CRP'] = categorized
    crp_missing = categorized.isnull()
    # dtype=int: newer pandas versions emit boolean dummy columns by default,
    # and the 99 marker written below needs an integer column.
    split = pd.get_dummies(split, columns=['CRP'], dtype=int)
    # Rows without a bin get 99 in every CRP dummy column
    split.loc[crp_missing, split.columns.str.startswith("CRP")] = 99
    categorized_splits[key] = split

In [None]:
# Export each categorized split without the BNP column.
for key, split in categorized_splits.items():
    # drop() returns a new frame; the frames stored in categorized_splits keep BNP
    split = split.drop(columns=['BNP']) 
    # NOTE(review): the file name has no '.csv' extension, unlike the other
    # exports in this notebook -- confirm this is what downstream code expects.
    split.to_csv(os.path.join(path_to_csv_dir, f'{key}_lab_categorized_CRP'), index=False)

## Evaluation of some metrics

In [None]:
valid_lab_filtered = valid_lab[valid_lab['CRP'] != -1000]
test_lab_filtered = test_lab[test_lab['CRP'] != -1000]
train_lab_filtered = train_lab[train_lab['CRP'] != -1000]

len(train_lab_filtered['PatientName'].unique())

In [None]:
print(len(train_lab_filtered[train_lab_filtered['CRP']<5]['PatientName'].unique()))
print(len(train_lab_filtered[(train_lab_filtered['CRP']>=5) & (train_lab_filtered['CRP']<=50)]['PatientName'].unique()))
print(len(train_lab_filtered[train_lab_filtered['CRP']>50]['PatientName'].unique()))

In [None]:
train_lab_filtered['CRP'].max()