In [5]:
import pandas as pd
import os
import json
import ast
import numpy as np

In [6]:
# Load the pre-processed PadChest CSV into a DataFrame.
padchest_csv_path = "/dtu/p1/johlau/Thesis-Synthex/data/padchest/preprocessed_df.csv"
padchest_df = pd.read_csv(padchest_csv_path)

In [7]:
# Read the three RAD-ChestCT "Abnormality_and_Location_Labels" CSVs (train, valid, test)
# and stack them into one frame with a fresh 0..n-1 index.
radchest_split_paths = (
    "/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT/imgtrain_Abnormality_and_Location_Labels.csv",
    "/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT/imgvalid_Abnormality_and_Location_Labels.csv",
    "/dtu/p1/johlau/Thesis-Synthex/data/RAD-ChestCT/imgtest_Abnormality_and_Location_Labels.csv",
)
df_train, df_valid, df_test = [pd.read_csv(split_path) for split_path in radchest_split_paths]

radchest_df = pd.concat([df_train, df_valid, df_test]).reset_index(drop=True)

In [8]:
# "Labels" stores each row's labels as the text of a Python list literal;
# parse it into a real list so membership tests work on whole labels.
raw_label_strings = padchest_df["Labels"]
padchest_df["Label_list"] = raw_label_strings.apply(ast.literal_eval)

In [9]:
# Inspect the parsed column: each entry is a Python list of label strings.
padchest_df["Label_list"]

0                                                  [normal]
1         [kyphosis, ground glass pattern, pseudonodule,...
2                                         [chronic changes]
3         [interstitial pattern, unchanged, alveolar pat...
4         [pleural effusion, suture material, apical ple...
                                ...                        
109039            [costophrenic angle blunting, copd signs]
109040                                             [normal]
109041                        [endotracheal tube, nsg tube]
109042                                             [normal]
109043                                          [pneumonia]
Name: Label_list, Length: 109044, dtype: object

In [10]:
# The five target findings, as (PadChest spelling, RAD-ChestCT spelling) pairs.
# The two datasets differ only for pleural effusion (space vs. underscore).
_label_pairs = [
    ("cardiomegaly", "cardiomegaly"),
    ("pleural effusion", "pleural_effusion"),
    ("pneumonia", "pneumonia"),
    ("pneumothorax", "pneumothorax"),
    ("atelectasis", "atelectasis"),
]

# Labels as they appear inside padchest_df["Label_list"].
padchest_labels = [padchest_name for padchest_name, _radchest_name in _label_pairs]

# Labels as they appear before the "*" in the RAD-ChestCT column names.
radchest_labels = [radchest_name for _padchest_name, radchest_name in _label_pairs]

In [11]:
# Zero-initialised per-label tallies; the counting cells below increment these.
padchest_counts = dict.fromkeys(
    ("cardiomegaly", "pleural effusion", "pneumonia", "pneumothorax", "atelectasis"),
    0,
)

# Same labels, keyed with the RAD-ChestCT (underscore) spelling.
radchest_counts = dict.fromkeys(
    ("cardiomegaly", "pleural_effusion", "pneumonia", "pneumothorax", "atelectasis"),
    0,
)

In [12]:
# Count, per target label, how many RAD-ChestCT rows have at least one column
# for that label set to 1, and collect those rows in `label_list`.

# Columns (skipping the first one) whose name prefix before "*" is a target label.
# NOTE(review): presumably columns are named "<abnormality>*<location>" - confirm.
label_location = [
    column for column in list(radchest_df.columns)[1:]
    if column.split("*")[0] in radchest_labels
]
label_location_lookup = set(label_location)  # constant-time membership tests

# Zero the tallies first so re-running this cell does not double the counts.
# Only the target-label keys are reset; any other key in the dict is left alone.
for label in radchest_labels:
    radchest_counts[label] = 0

label_list = list()
for row_position in range(len(radchest_df)):
    row = radchest_df.loc[row_position]  # fetch the row once instead of twice
    key_values = row[row == 1]           # only the entries flagged with 1

    # Keep the row if any flagged column belongs to one of the target labels.
    if any(key in label_location_lookup for key in key_values.keys()):
        label_list.append(key_values)

for key_values in label_list:
    # Distinct label prefixes flagged on this row; a label counts once per row
    # no matter how many of its "*"-suffixed columns are set.
    flagged_prefixes = {key.split("*")[0] for key in key_values.keys()}
    for label in radchest_labels:
        if label in flagged_prefixes:
            radchest_counts[label] += 1

print(radchest_counts)

{'cardiomegaly': 394, 'pleural_effusion': 727, 'pneumonia': 196, 'pneumothorax': 126, 'atelectasis': 1082}


In [13]:
# Count how many PadChest rows carry each target label in their "Label_list".

# Zero the tallies first so re-running this cell does not double the counts.
for label in padchest_labels:
    padchest_counts[label] = 0

# Iterate over the values themselves. Looking rows up with `Series[i]` for
# i in range(len(...)) is a *label* lookup and fails as soon as the frame's
# index is no longer 0..n-1 (e.g. after rows have been filtered out).
# NOTE(review): the tally covers whatever rows `padchest_df` holds when this
# cell runs. A later cell narrows `padchest_df` to Projection == "PA", so
# running this cell after that one gives PA-only counts - confirm which
# population the per-label counts are meant to describe.
for row_labels in padchest_df["Label_list"]:
    for label in padchest_labels:
        if label in row_labels:
            padchest_counts[label] += 1

print(padchest_counts)

{'cardiomegaly': 9670, 'pleural effusion': 6766, 'pneumonia': 4995, 'pneumothorax': 383, 'atelectasis': 2146}


In [14]:
# Show the PadChest tallies again (same dict the previous cell printed).
padchest_counts

{'cardiomegaly': 9670,
 'pleural effusion': 6766,
 'pneumonia': 4995,
 'pneumothorax': 383,
 'atelectasis': 2146}

In [15]:
# Expose the RAD-ChestCT pleural-effusion tally under the space-separated key
# used by padchest_counts, so both dicts can be indexed with the same label.
radchest_counts.update({"pleural effusion": radchest_counts["pleural_effusion"]})

In [16]:
# Keep only the rows whose "Projection" value is "PA".
# The original index labels are retained (no reset).
is_pa_projection = padchest_df["Projection"] == "PA"
padchest_df = padchest_df[is_pa_projection]

In [17]:
# For every PadChest label, print the PadChest count, the combined
# PadChest + RAD-ChestCT count, and the relative growth in percent.
for label, padchest_n in padchest_counts.items():
    enriched_n = padchest_n + radchest_counts[label]
    percentage_increase = np.round(((enriched_n / padchest_n) - 1) * 100, 2)
    print(f"{label} padchest: {padchest_n}, enritched: {enriched_n}, percentage increase: {percentage_increase}%")

cardiomegaly padchest: 9670, enritched: 10064, percentage increase: 4.07%
pleural effusion padchest: 6766, enritched: 7493, percentage increase: 10.74%
pneumonia padchest: 4995, enritched: 5191, percentage increase: 3.92%
pneumothorax padchest: 383, enritched: 509, percentage increase: 32.9%
atelectasis padchest: 2146, enritched: 3228, percentage increase: 50.42%


In [18]:
# Hard-coded CheXpert per-label counts plus an overall "total".
# NOTE(review): the source of these numbers is not shown in this notebook -
# record where they come from.
chexpert_counts = dict([
    ("cardiomegaly", 23002),
    ("pleural effusion", 75696),
    ("pneumonia", 4576),
    ("pneumothorax", 17313),
    ("atelectasis", 29333),
    ("total", 187641),
])

In [19]:
# Build the dataset-size comparison table (one row per label plus a "total"
# row) and print it as a LaTeX tabular.
experiment_list = list()

for label in padchest_counts.keys():
    padchest_n = padchest_counts[label]
    radchest_n = radchest_counts[label]
    # Relative growth of the label when RAD-ChestCT is added, in percent.
    percentage_increase = np.round((((padchest_n + radchest_n) / padchest_n) - 1) * 100, 2)

    experiment_list.append([label, padchest_n, radchest_n, padchest_n + radchest_n, percentage_increase, chexpert_counts[label]])

# "total" row: current size of padchest_df vs. number of RAD-ChestCT rows
# collected in label_list.
percentage_increase = np.round((((len(padchest_df) + len(label_list)) / len(padchest_df)) - 1) * 100, 2)
experiment_list.append(["total", len(padchest_df), len(label_list), len(padchest_df) + len(label_list), percentage_increase, chexpert_counts["total"]])

experiment_df = pd.DataFrame(experiment_list, columns=["Label","padchest", "radchest","padchest+radchest","percentage_increase","chexpert"])

# NOTE(review): int() truncates, so validation + test can come out one below
# the CheXpert count (e.g. 6900 + 16101 = 23001 for 23002) - confirm these
# match the sizes of the split actually used.
experiment_df["validation (chexpert 30%)"] = experiment_df["chexpert"].apply(lambda x: int(x*0.3))
experiment_df["test (chexpert 70%)"] = experiment_df["chexpert"].apply(lambda x: int(x*0.7))

# escape=True: the headers contain "_" and "%". Unescaped, "%" starts a LaTeX
# comment and swallows the rest of the header row (including the row
# terminator), so the table would not compile.
# float_format: print the already-rounded percentages as 4.07, not 4.070000.
print(experiment_df[['Label', 'padchest', 'radchest', 'padchest+radchest',
       'percentage_increase', 'validation (chexpert 30%)',
       'test (chexpert 70%)']].to_latex(index=False, escape=True, float_format="%.2f"))

\begin{tabular}{lrrrrrr}
\toprule
Label & padchest & radchest & padchest+radchest & percentage_increase & validation (chexpert 30%) & test (chexpert 70%) \\
\midrule
cardiomegaly & 9670 & 394 & 10064 & 4.070000 & 6900 & 16101 \\
pleural effusion & 6766 & 727 & 7493 & 10.740000 & 22708 & 52987 \\
pneumonia & 4995 & 196 & 5191 & 3.920000 & 1372 & 3203 \\
pneumothorax & 383 & 126 & 509 & 32.900000 & 5193 & 12119 \\
atelectasis & 2146 & 1082 & 3228 & 50.420000 & 8799 & 20533 \\
total & 90994 & 1694 & 92688 & 1.860000 & 56292 & 131348 \\
\bottomrule
\end{tabular}



In [20]:
# Display the full table, including the raw "chexpert" column that the LaTeX
# export above leaves out.
experiment_df

Unnamed: 0,Label,padchest,radchest,padchest+radchest,percentage_increase,chexpert,validation (chexpert 30%),test (chexpert 70%)
0,cardiomegaly,9670,394,10064,4.07,23002,6900,16101
1,pleural effusion,6766,727,7493,10.74,75696,22708,52987
2,pneumonia,4995,196,5191,3.92,4576,1372,3203
3,pneumothorax,383,126,509,32.9,17313,5193,12119
4,atelectasis,2146,1082,3228,50.42,29333,8799,20533
5,total,90994,1694,92688,1.86,187641,56292,131348


In [22]:
# NOTE(review): repeats the display of experiment_df from the previous cell;
# candidate for removal.
experiment_df

Unnamed: 0,Label,padchest,radchest,padchest+radchest,percentage_increase,chexpert,validation (chexpert 30%),test (chexpert 70%)
0,cardiomegaly,9670,394,10064,4.07,23002,6900,16101
1,pleural effusion,6766,727,7493,10.74,75696,22708,52987
2,pneumonia,4995,196,5191,3.92,4576,1372,3203
3,pneumothorax,383,126,509,32.9,17313,5193,12119
4,atelectasis,2146,1082,3228,50.42,29333,8799,20533
5,total,90994,1694,92688,1.86,187641,56292,131348


In [27]:
# Per row: PadChest count plus ten times the RAD-ChestCT count.
# NOTE(review): the factor 10 is not explained in this notebook (presumably a
# number of images derived per CT scan - confirm) and should be a named constant.
experiment_df["padchest"]+experiment_df["radchest"]*10

0     13610
1     14036
2      6955
3      1643
4     12966
5    107934
dtype: int64