In [1]:
import os
import pandas as pd
import ast

In [2]:
padchest_df = pd.read_csv("/dtu/p1/johlau/Thesis-Synthex/data/PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv")

In [5]:
padchest_files = os.listdir('/dtu/p1/johlau/Thesis-Synthex/data/padchest-cropped/')

In [6]:

padchest_df["FileID"] = padchest_df["ImageDir"].apply(lambda x : str(x)) + "-" + padchest_df["ImageID"]

In [7]:
padchest_df = padchest_df[~padchest_df["Labels"].isna()]

In [8]:
file_df = pd.DataFrame({
    "FileID":padchest_files
})

In [9]:
padchest_joined = padchest_df.merge(file_df, how="inner", on="FileID")

In [10]:
label_dict = {
    'pleural effusion': 'Effusion', 
    'pneumothorax': 'Pneumothorax', 
    'atelectasis': 'Atelectasis', 
    'cardiomegaly':'Cardiomegaly', 
    'pneumonia':'Pneumonia',
}

In [11]:
padchest_joined["Label_list"] = padchest_joined["Labels"].apply(lambda x : ast.literal_eval(x))

In [12]:
def label_onehot_encoder(row, label, column_name):
    labels = row[column_name]
    if label in labels:
        return 1
    else:
        return 0

for label in label_dict.keys():
    padchest_joined[label_dict[label]] = padchest_joined.apply(label_onehot_encoder, args=(label, "Label_list"), axis=1)

In [13]:
padchest_joined[["FileID"]]

Unnamed: 0,FileID
0,0-20536686640136348236148679891455886468_k6ga2...
1,0-135803415504923515076821959678074435083_fzis...
2,0-113855343774216031107737439268243531979_3k95...
3,0-3137231742710829928-247610802266403640553_ki...
4,0-105529804483623054726294337265160703666_6zn7...
...,...
91562,49-1284011361929414522079841201696751542351444...
91563,49-1284011361929414522814654121696751542351444...
91564,49-1284011361929414522094646571696751542351444...
91565,49-1284011361929414522084108901696751542351444...


In [14]:
padchest_cropped = padchest_joined[['FileID', 'Effusion', 'Pneumothorax', 'Atelectasis',
       'Cardiomegaly', 'Pneumonia']]

In [15]:
padchest_cropped = padchest_cropped.sample(frac=1, random_state=1)

In [16]:
idx_split = int(len(padchest_cropped)*0.7)

In [17]:
padchest_train = padchest_cropped.iloc[:idx_split]
padchest_dev = padchest_cropped.iloc[idx_split:]

In [18]:
padchest_train.sum()

FileID          21-2168401113669640140084165132020141611327116...
Effusion                                                     1626
Pneumothorax                                                  138
Atelectasis                                                   578
Cardiomegaly                                                 5682
Pneumonia                                                    1066
dtype: object

In [19]:
padchest_dev.sum()

FileID          19-2168401113669640135901404767220130511428005...
Effusion                                                      676
Pneumothorax                                                   54
Atelectasis                                                   228
Cardiomegaly                                                 2438
Pneumonia                                                     477
dtype: object

# Enrich with radchest

In [20]:
df_train = pd.read_csv("/dtu/p1/johlau/Thesis-Synthex/imgtrain_Abnormality_and_Location_Labels.csv")
df_valid = pd.read_csv("/dtu/p1/johlau/Thesis-Synthex/imgvalid_Abnormality_and_Location_Labels.csv")
df_test = pd.read_csv("/dtu/p1/johlau/Thesis-Synthex/imgtest_Abnormality_and_Location_Labels.csv")

df = pd.concat((df_train,df_valid,df_test)).reset_index(drop=True)

In [23]:
labels = [
    'cardiomegaly',
    'pleural_effusion',
    "pneumonia",
    'pneumothorax',
    'atelectasis'
]

label_dict = {
    'pleural_effusion': 'Effusion', 
    'pneumothorax': 'Pneumothorax', 
    'atelectasis': 'Atelectasis', 
    'cardiomegaly':'Cardiomegaly', 
    'pneumonia':'Pneumonia',
}

def one_hot_radchest(row, label_columns, label):
    key_values = row[row[label_columns]==1]
    keys = [key.split("*")[0] for key in key_values.keys() if key.split("*")[0] in labels]
    
    if label in keys:
        return 1
    else:
        return 0

label_location = [i for i in list(df.columns)[1:]]

label_list = list()

for label in label_dict.keys():
    df[label_dict[label]] = df[label_location].apply(one_hot_radchest, args=[label_location, label], axis=1)



In [24]:
radchest_df = df[["NoteAcc_DEID",'Effusion', 'Pneumothorax', 'Atelectasis',
       'Cardiomegaly', 'Pneumonia']]

radchest_df["image_path"]  = radchest_df["NoteAcc_DEID"] + ".nii.gz.png"

radchest_path = "/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT-DRR-CROPPED/"
radchest_images = os.listdir(radchest_path)

radchest_df = radchest_df.merge(
    pd.DataFrame({
        "image_path":radchest_images
    }),
    how="inner",
    on="image_path"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  radchest_df["image_path"]  = radchest_df["NoteAcc_DEID"] + ".nii.gz.png"


In [25]:
radchest_df

Unnamed: 0,NoteAcc_DEID,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia,image_path
0,trn24737,0,0,1,0,0,trn24737.nii.gz.png
1,trn15638,1,0,1,0,0,trn15638.nii.gz.png
2,trn09140,0,0,1,0,0,trn09140.nii.gz.png
3,trn17497,0,1,0,0,0,trn17497.nii.gz.png
4,trn17401,1,0,1,0,0,trn17401.nii.gz.png
...,...,...,...,...,...,...,...
1689,tst29564,1,0,0,0,0,tst29564.nii.gz.png
1690,tst33416,0,0,0,1,0,tst33416.nii.gz.png
1691,tst33609,0,0,1,0,0,tst33609.nii.gz.png
1692,tst34883,0,0,0,1,0,tst34883.nii.gz.png


In [26]:
radchest_angled_path = "/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT-DRR-angled-CROPPED/"
radchest_angled_images = os.listdir(radchest_angled_path)

radchest_angled_df = radchest_df[["NoteAcc_DEID",'Effusion', 'Pneumothorax', 'Atelectasis','Cardiomegaly', 'Pneumonia']].merge(
    pd.DataFrame({
        "NoteAcc_DEID":[i.split(".")[0] for i in radchest_angled_images],
        "image_path":radchest_angled_images
    }),
    how="right",
    on="NoteAcc_DEID"
)

In [27]:
def sample_angles(n, center_df, angled_df, id_column):
    
    return pd.concat(
        [angled_df[angled_df[id_column] == i].sample(n=n-1, random_state=1) for i in center_df[id_column]] + [center_df]
    )


In [28]:
radchest_sampled = sample_angles(2, radchest_df, radchest_angled_df, "NoteAcc_DEID")
radchest_sampled

Unnamed: 0,NoteAcc_DEID,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia,image_path
14978,trn24737,0,0,1,0,0,trn24737.nii.gz_10_10.png
15091,trn15638,1,0,1,0,0,trn15638.nii.gz_0_-10.png
13095,trn09140,0,0,1,0,0,trn09140.nii.gz_5_10.png
11688,trn17497,0,1,0,0,0,trn17497.nii.gz_-5_-5.png
10562,trn17401,1,0,1,0,0,trn17401.nii.gz_-10_-10.png
...,...,...,...,...,...,...,...
1689,tst29564,1,0,0,0,0,tst29564.nii.gz.png
1690,tst33416,0,0,0,1,0,tst33416.nii.gz.png
1691,tst33609,0,0,1,0,0,tst33609.nii.gz.png
1692,tst34883,0,0,0,1,0,tst34883.nii.gz.png


In [29]:
padchest_train["image_path"] = '/dtu/p1/johlau/Thesis-Synthex/data/padchest-cropped/'+ padchest_train["FileID"]
padchest_dev["image_path"] = '/dtu/p1/johlau/Thesis-Synthex/data/padchest-cropped/'+ padchest_dev["FileID"]
radchest_df["image_path"] = radchest_path + radchest_df["image_path"]
radchest_angled_df["image_path"] = radchest_angled_path + radchest_angled_df["image_path"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  padchest_train["image_path"] = '/dtu/p1/johlau/Thesis-Synthex/data/padchest-cropped/'+ padchest_train["FileID"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  padchest_dev["image_path"] = '/dtu/p1/johlau/Thesis-Synthex/data/padchest-cropped/'+ padchest_dev["FileID"]


In [30]:
padchest_columns = ["image_path",'Effusion', 'Pneumothorax', 'Atelectasis','Cardiomegaly', 'Pneumonia']
radchest_columns =  ["NoteAcc_DEID","image_path",'Effusion', 'Pneumothorax', 'Atelectasis','Cardiomegaly', 'Pneumonia']

padchest_train[padchest_columns].to_csv("data/padchest_train.csv")
padchest_dev[padchest_columns].to_csv("data/padchest_dev.csv")
radchest_df[radchest_columns].to_csv("data/radchest_center.csv")
radchest_angled_df[radchest_columns].to_csv("data/radchest_angled.csv")

In [101]:
radchest_df[radchest_columns]

Unnamed: 0,NoteAcc_DEID,image_path,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia
0,trn24737,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,0,0,1,0,0
1,trn15638,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,1,0,1,0,0
2,trn09140,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,0,0,1,0,0
3,trn17497,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,0,1,0,0,0
4,trn17401,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,1,0,1,0,0
...,...,...,...,...,...,...,...
1689,tst29564,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,1,0,0,0,0
1690,tst33416,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,0,0,0,1,0
1691,tst33609,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,0,0,1,0,0
1692,tst34883,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,0,0,0,1,0


In [106]:
train_df = pd.concat([padchest_train, radchest_df]).reset_index(drop=True)

In [113]:
int(len(train_df)/5)*5

65790

In [107]:
train_df.sample(frac=1, random_state=1)

Unnamed: 0,FileID,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia,image_path,NoteAcc_DEID
5776,39-2168401113669640124878587175220092241308260...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
53346,25-2168401113669640139163966064420140490807213...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
47142,34-2168401113669640125580829067120101171605114...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
65661,,1,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,tst34291
60682,1-247845946645474345832656069508376915522_dmyv...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
...,...,...,...,...,...,...,...,...
49100,4-162935580901648119553472959830992893108_nbbr...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
20609,20-2168401113669640134512283796920122961011158...,0,0,0,1,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
21440,5-176233350288598204755145189127282325879_1w7t...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
50057,25-2168401113669640139624900649420141001238441...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,


In [137]:
def get_n_splits(n, df):
    length = int(len(df)/(n+1))

    output_list = [df.iloc[int(length*i):int(length*(i+1))] for i in range(n-1)]
    output_list.append(df.iloc[int(length*n-1):len(df)])
    

    return output_list

In [138]:
train_splits = get_n_splits(5, train_df.sample(frac=1, random_state=1))

In [145]:
train_splits[0]

Unnamed: 0,FileID,Effusion,Pneumothorax,Atelectasis,Cardiomegaly,Pneumonia,image_path,NoteAcc_DEID
5776,39-2168401113669640124878587175220092241308260...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
53346,25-2168401113669640139163966064420140490807213...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
47142,34-2168401113669640125580829067120101171605114...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
65661,,1,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT...,tst34291
60682,1-247845946645474345832656069508376915522_dmyv...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
...,...,...,...,...,...,...,...,...
60060,3-113186032546764588865380623816387095374-3_1u...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
33706,1-126631559477564604199530324581839142652_9731...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
27170,12-2168401113669640136860425485320132000848347...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
34596,8-203727401909229553535911831173129872432_u1v7...,0,0,0,0,0,/dtu/p1/johlau/Thesis-Synthex/data/padchest_cr...,
