# Create COVIDx Dataset

# 1. Library Import

In [1]:
import numpy as np
import pandas as pd
import os
import random 
from shutil import copyfile
import pydicom as dicom
import cv2

# 2. Data Path Setting

In [2]:
# PATH setting
path = '/home/jpulsar/Dataset/COVID19' # Type Your PATH
print(path)

/home/jpulsar/Dataset/COVID19


In [3]:
# set parameters here
datapath = os.path.join(path, 'raw_data')
savepath = os.path.join(path, 'new_data')
seed = 0
np.random.seed(seed) # Reset the seed so all runs are the same.
random.seed(seed)
MAXVAL = 255  # Range [0 255]

# path to covid-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset
cohen_imgpath = os.path.join(datapath, 'covid-chestxray-dataset/images')
cohen_csvpath = os.path.join(datapath, 'covid-chestxray-dataset/metadata.csv')

# path to covid-19 dataset from https://github.com/agchung/Figure1-COVID-chestxray-dataset
fig1_imgpath = os.path.join(datapath, 'Figure1-COVID-chestxray-dataset/images')
fig1_csvpath = os.path.join(datapath, 'Figure1-COVID-chestxray-dataset/metadata.csv')

# path to covid-19 dataset from https://github.com/agchung/Actualmed-COVID-chestxray-dataset
actmed_imgpath = os.path.join(datapath, 'Actualmed-COVID-chestxray-dataset/images')
actmed_csvpath = os.path.join(datapath, 'Actualmed-COVID-chestxray-dataset/metadata.csv')

# path to covid-19 dataset from https://www.kaggle.com/tawsifurrahman/covid19-radiography-database
sirm_imgpath = os.path.join(datapath, 'COVID-19_Radiography_Dataset/COVID')
sirm_csvpath = os.path.join(datapath, 'COVID-19_Radiography_Dataset/COVID.metadata.xlsx')

# path to https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
rsna_datapath = os.path.join(datapath,'rsna-pneumonia-detection-challenge')
# get all the normal from here
rsna_csvname = 'stage_2_detailed_class_info.csv' 
# get all the 1s from here since 1 indicate pneumonia
# found that images that aren't pneunmonia and also not normal are classified as 0s
rsna_csvname2 = 'stage_2_train_labels.csv' 
rsna_imgpath = 'stage_2_train_images'

# path to ricord covid-19 images created by create_ricord_dataset/create_ricord_dataset.ipynb
# run create_ricord_dataset.ipynb before this notebook
# ricord_imgpath = 'create_ricord_dataset/ricord_images'
# ricord_txt = 'create_ricord_dataset/ricord_data_set.txt'

# parameters for COVIDx dataset
train = []
test = []
test_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
train_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}

covid = []
normal = []
class_count = {'COVID-19': 0, 'normal': 0}

mapping = dict()
mapping['COVID-19'] = 'COVID-19'
mapping['SARS'] = 'pneumonia'
mapping['MERS'] = 'pneumonia'
mapping['Streptococcus'] = 'pneumonia'
mapping['Klebsiella'] = 'pneumonia'
mapping['Chlamydophila'] = 'pneumonia'
mapping['Legionella'] = 'pneumonia'
mapping['Normal'] = 'normal'
mapping['Lung Opacity'] = 'pneumonia'
mapping['1'] = 'pneumonia'

# train/test split
split = 0.1

# to avoid duplicates
patient_imgpath = {}

In [4]:
# Output directroy Build
def path_builder(path):
    try:
        os.mkdir(path)
    except Exception as err:
        print(err)
        pass

path_builder(savepath)
path_builder(os.path.join(savepath, 'covid'))
path_builder(os.path.join(savepath, 'normal'))

[Errno 17] File exists: '/home/jpulsar/Dataset/COVID19/new_data'


## 2.1 Metadata Reading

In [5]:
# adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814
cohen_csv = pd.read_csv(cohen_csvpath, nrows=None)

#idx_pa = csv["view"] == "PA"  # Keep only the PA view
views = ["PA", "AP", "AP Supine", "AP semi erect", "AP erect"]
cohen_idx_keep = cohen_csv.view.isin(views)
cohen_csv = cohen_csv[cohen_idx_keep]

fig1_csv = pd.read_csv(fig1_csvpath, encoding='ISO-8859-1', nrows=None)
actmed_csv = pd.read_csv(actmed_csvpath, nrows=None)

sirm_csv = pd.read_excel(sirm_csvpath)

## 2.2 Dataset distribution

In [6]:
# get non-COVID19 viral, bacteria, and COVID-19 infections from covid-chestxray-dataset, figure1 and actualmed
# stored as patient id, image filename and label
filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
covid_ds = {'cohen': [], 'fig1': [], 'actmed': [], 'sirm': []}

for index, row in cohen_csv.iterrows():
    f = row['finding'].split(',')[0] # take the first finding, for the case of COVID-19, ARDS
    if f in mapping: # 
        count[mapping[f]] += 1
        entry = [str(row['patientid']), row['filename'], mapping[f], 'cohen']
        filename_label[mapping[f]].append(entry)
        if mapping[f] == 'COVID-19':
            covid_ds['cohen'].append(str(row['patientid']))
        
for index, row in fig1_csv.iterrows():
    if not str(row['finding']) == 'nan':
        f = row['finding'].split(',')[0] # take the first finding
        if f in mapping: # 
            count[mapping[f]] += 1
            if os.path.exists(os.path.join(fig1_imgpath, row['patientid'] + '.jpg')):
                entry = [row['patientid'], row['patientid'] + '.jpg', mapping[f], 'fig1']
            elif os.path.exists(os.path.join(fig1_imgpath, row['patientid'] + '.png')):
                entry = [row['patientid'], row['patientid'] + '.png', mapping[f], 'fig1']
            filename_label[mapping[f]].append(entry)
            if mapping[f] == 'COVID-19':
                covid_ds['fig1'].append(row['patientid'])

for index, row in actmed_csv.iterrows():
    if not str(row['finding']) == 'nan':
        f = row['finding'].split(',')[0]
        if f in mapping:
            count[mapping[f]] += 1
            entry = [row['patientid'], row['imagename'], mapping[f], 'actmed']
            filename_label[mapping[f]].append(entry)
            if mapping[f] == 'COVID-19':
                covid_ds['actmed'].append(row['patientid'])
    
sirm = set(sirm_csv['URL'])
cohen = set(cohen_csv['url'])
discard = ['100', '101', '102', '103', '104', '105', 
           '110', '111', '112', '113', '122', '123', 
           '124', '125', '126', '217']

for idx, row in sirm_csv.iterrows():
    patientid = row['FILE NAME']
    if row['URL'] not in cohen and patientid[patientid.find('(')+1:patientid.find(')')] not in discard:
        count[mapping['COVID-19']] += 1
        imagename = patientid + '.' + row['FORMAT'].lower()
        if not os.path.exists(os.path.join(sirm_imgpath, imagename)):
            imagename = patientid.split('(')[0] + ' ('+ patientid.split('(')[1] + '.' + row['FORMAT'].lower()
        entry = [patientid, imagename, mapping['COVID-19'], 'sirm']
        filename_label[mapping['COVID-19']].append(entry)
        covid_ds['sirm'].append(patientid)
    
print('Data distribution from covid datasets:')
print(count)

Data distribution from covid datasets:
{'normal': 0, 'pneumonia': 0, 'COVID-19': 3709}


# 3. COVID-19 Dataset (Class)

## 3.1. COVID-19 Class

In [7]:
ds_imgpath = {'cohen': cohen_imgpath, 'fig1': fig1_imgpath, 'actmed': actmed_imgpath, 'sirm': sirm_imgpath}

for key in filename_label.keys():
    arr = np.array(filename_label['COVID-19'])
    # go through all the patients
    for patient in arr:
        if patient[0] not in patient_imgpath:
            patient_imgpath[patient[0]] = [patient[1]]
        else:
            if patient[1] not in patient_imgpath[patient[0]]:
                patient_imgpath[patient[0]].append(patient[1])
            else:
                continue  # skip since image has already been written
        if patient[3] == 'sirm':
            image = cv2.imread(os.path.join(ds_imgpath[patient[3]], patient[1]))
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            patient[1] = patient[1].replace(' ', '')
            cv2.imwrite(os.path.join(savepath, 'covid', patient[1]), gray)
        else:
            copyfile(os.path.join(ds_imgpath[patient[3]], patient[1]), os.path.join(savepath, 'covid', patient[1]))
        covid.append(patient)
        class_count[patient[2]] += 1

print('Covid count: ', class_count)

Covid count:  {'COVID-19': 3709, 'normal': 0}


## 3.2. Normal Dataset

In [8]:
csv_normal = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname), nrows=None)
csv_pneu = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname2), nrows=None)
patients = {'normal': [], 'pneumonia': []}

for index, row in csv_normal.iterrows():
    if row['class'] == 'Normal':
        patients['normal'].append(row['patientId'])


for key in patients.keys():
    arr = np.array(patients['normal'])
    if arr.size == 0:
        continue
    for patient in arr:
        if patient not in patient_imgpath:
            patient_imgpath[patient] = [patient]
        else:
            continue  # skip since image has already been written
                
        ds = dicom.dcmread(os.path.join(rsna_datapath, rsna_imgpath, patient + '.dcm'))
        pixel_array_numpy = ds.pixel_array
        imgname = patient + '.png'
        cv2.imwrite(os.path.join(savepath, 'normal', imgname), pixel_array_numpy)
        normal.append([patient, imgname, key, 'rsna'])
        class_count[key] += 1

print('Normal count: ', class_count)

Normal count:  {'COVID-19': 3709, 'normal': 8851}


In [9]:
print('COVID dataset Number = ', len(list(os.listdir(os.path.join(savepath, 'covid')))))
print('Normal dataset Number = ', len(list(os.listdir(os.path.join(savepath, 'normal')))))

COVID dataset Number =  3709
Normal dataset Number =  8851
