# Generate dataset

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BIMCV-CSUSP/BIMCV-COVID-19/blob/master/padchest-covid/datasets.ipynb)
<p style="text-align: center;">WIP</p>

This code is intended to generate a subset of the [PadChest dataset](http://bimcv.cipf.es/bimcv-projects/padchest/) to train a model for detecting coronavirus disease 2019 (COVID-19) in chest radiograph images. It reads the [PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv file](https://drive.google.com/file/d/1tBW4EB5DfHdRikHbUotjmGKO5Kos6tRC/view?usp=sharing), which must be available in the notebook's working directory.

In [1]:
import csv
import html

from IPython.display import HTML, display

Load the padchest dataset tables

In [2]:
# Read the whole PadChest label table into memory as a list of dicts, one per
# CSV row. newline="" is what the csv module asks for so that line breaks
# embedded inside quoted fields are parsed correctly.
with open("./PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv", newline="") as f:
    r = csv.DictReader(f)
    data = list(r)
    # Take the column names from the reader rather than from data[0], so the
    # cell does not raise when the file contains no data rows.
    headers = r.fieldnames or []

# Preview the first three rows as an HTML table. The cell values come straight
# from the CSV file, so they are escaped before being embedded in markup.
header_cells = "".join("<th>{}</th>".format(html.escape(str(h))) for h in headers)
preview_rows = "".join(
    "<tr>{}</tr>".format(
        "".join("<td>{}</td>".format(html.escape(str(row[k]))) for k in headers)
    )
    for row in data[:3]
)
display(HTML("<table><tr>{}</tr>{}</table>".format(header_cells, preview_rows)))


Unnamed: 0,ImageID,ImageDir,StudyDate_DICOM,StudyID,PatientID,PatientBirth,PatientSex_DICOM,ViewPosition_DICOM,Projection,MethodProjection,Pediatric,Modality_DICOM,Manufacturer_DICOM,PhotometricInterpretation_DICOM,PixelRepresentation_DICOM,PixelAspectRatio_DICOM,SpatialResolution_DICOM,BitsStored_DICOM,WindowCenter_DICOM,WindowWidth_DICOM,Rows_DICOM,Columns_DICOM,XRayTubeCurrent_DICOM,Exposure_DICOM,ExposureInuAs_DICOM,ExposureTime,RelativeXRayExposure_DICOM,ReportID,Report,MethodLabel,Labels,Localizations,LabelsLocalizationsBySentence,labelCUIS,LocalizationsCUIS
0,20536686640136348236148679891455886468_k6ga29.png,0,20140915,20536686640136348236148679891455886468,839860488694292331637988235681460987,1930.0,F,POSTEROANTERIOR,PA,Manual review of DICOM fields,No,CR,ImagingDynamicsCompanyLtd,MONOCHROME2,0,,,12,2092,2251,1728,1872,250,2,2500,10,-1.42,4765777,sin hallazg patolog edad pacient .,Physician,['normal'],[],"[['normal'], ['normal']]",[],[]
1,135803415504923515076821959678074435083_fzis7d.png,0,20150914,135803415504923515076821959678074435083,313572750430997347502932654319389875966,1929.0,M,LATERAL,L,Manual review of DICOM fields,No,CR,ImagingDynamicsCompanyLtd,MONOCHROME2,0,,,12,2631,2065,3296,3236,400,10,10000,25,,4991845,cambi pulmonar cronic sever . sign fibrosis bibasal . sutil infiltr pseudonodul milimetr vidri deslustr localiz bas . cifosis sever .,Physician,"['pulmonary fibrosis', 'chronic changes', 'kyphosis', 'pseudonodule', 'ground glass pattern']","['loc basal', 'loc basal bilateral']","[['pulmonary fibrosis', 'loc basal bilateral'], ['chronic changes'], ['kyphosis'], ['pseudonodule', 'ground glass pattern', 'loc basal']]",['C0034069' 'C0742362' 'C2115817' 'C3544344'],['C1282378']
2,135803415504923515076821959678074435083_fzis7b.png,0,20150914,135803415504923515076821959678074435083,313572750430997347502932654319389875966,1929.0,M,POSTEROANTERIOR,PA,Manual review of DICOM fields,No,CR,ImagingDynamicsCompanyLtd,MONOCHROME2,0,,,12,2155,2880,3572,3732,320,3,3200,10,,4991845,cambi pulmonar cronic sever . sign fibrosis bibasal . sutil infiltr pseudonodul milimetr vidri deslustr localiz bas . cifosis sever .,Physician,"['pulmonary fibrosis', 'chronic changes', 'kyphosis', 'pseudonodule', 'ground glass pattern']","['loc basal', 'loc basal bilateral']","[['pulmonary fibrosis', 'loc basal bilateral'], ['chronic changes'], ['kyphosis'], ['pseudonodule', 'ground glass pattern', 'loc basal']]",['C0034069' 'C0742362' 'C2115817' 'C3544344'],['C1282378']


Split the dataset into 5 groups depending on whether their labels contain certain keywords

In [3]:
# Every group is selected with plain substring tests against the raw text of
# the "Labels" column of each row.
normals = [row for row in data if "'normal'" in row["Labels"]]

# "pneumonia" present, "infiltrates" absent.
pneumonia = [
    row for row in data
    if "pneumonia" in row["Labels"] and not "infiltrates" in row["Labels"]
]

# "infiltrates" present, "pneumonia" absent.
infiltrates = [
    row for row in data
    if "infiltrates" in row["Labels"] and not "pneumonia" in row["Labels"]
]

# Both keywords present.
p_i = [
    row for row in data
    if all(keyword in row["Labels"] for keyword in ("infiltrates", "pneumonia"))
]

# None of "infiltrates", "pneumonia" or "unchanged" present.
not_pneumonia = [
    row for row in data
    if not any(
        keyword in row["Labels"]
        for keyword in ("infiltrates", "pneumonia", "unchanged")
    )
]

Then split each group again by sex to detect balancing issues

In [4]:
# Male / female subsets of the normal group. Rows whose PatientSex_DICOM is
# neither 'M' nor 'F' end up in neither list.
normals_m, normals_f = (
    [rec for rec in normals if rec["PatientSex_DICOM"] == sex]
    for sex in ("M", "F")
)

In [5]:
# Male / female subsets of the pneumonia group. Rows whose PatientSex_DICOM is
# neither 'M' nor 'F' end up in neither list.
pneumonia_m, pneumonia_f = (
    [rec for rec in pneumonia if rec["PatientSex_DICOM"] == sex]
    for sex in ("M", "F")
)

In [6]:
# Male / female subsets of the not-pneumonia group. Rows whose
# PatientSex_DICOM is neither 'M' nor 'F' end up in neither list.
not_pneumonia_m, not_pneumonia_f = (
    [rec for rec in not_pneumonia if rec["PatientSex_DICOM"] == sex]
    for sex in ("M", "F")
)

In [7]:
# Male / female subsets of the infiltrates group. Rows whose PatientSex_DICOM
# is neither 'M' nor 'F' end up in neither list.
infiltrates_m, infiltrates_f = (
    [rec for rec in infiltrates if rec["PatientSex_DICOM"] == sex]
    for sex in ("M", "F")
)


In [8]:
# Male / female subsets of the pneumonia-and-infiltrates group. Rows whose
# PatientSex_DICOM is neither 'M' nor 'F' end up in neither list.
p_i_m, p_i_f = (
    [rec for rec in p_i if rec["PatientSex_DICOM"] == sex]
    for sex in ("M", "F")
)


In [9]:
# Summary of group sizes: one row per group holding the number of rows in its
# male subset, its female subset and the whole group.
groups = [
    ("Normals", normals_m, normals_f, normals),
    ("Pneumonia", pneumonia_m, pneumonia_f, pneumonia),
    ("Infiltrates", infiltrates_m, infiltrates_f, infiltrates),
    ("Pneumonia and infiltrates", p_i_m, p_i_f, p_i),
    ("Not pneumonia nor infiltrates", not_pneumonia_m, not_pneumonia_f, not_pneumonia),
]
values = [["", "Masculine", "Femenine", "Total"]]
values += [
    [title, len(males), len(females), len(everyone)]
    for title, males, females, everyone in groups
]

# Render every row of `values` (the title row included) as a row of <td> cells.
rows_html = "".join(
    "<tr>{}</tr>".format("".join("<td>{}</td>".format(cell) for cell in row))
    for row in values
)
display(HTML("<table>{}</table>".format(rows_html)))

0,1,2,3
,Masculine,Femenine,Total
Normals,21414,28917,50340
Pneumonia,3337,2584,5921
Infiltrates,2772,2064,4836
Pneumonia and infiltrates,1251,1002,2253
Not pneumonia nor infiltrates,65904,68600,134524


Store the generated subgroups in different files in case we need them later

In [10]:
# One entry per output file: the TSV file name and the rows written to it.
# The infiltrates and pneumonia+infiltrates files take their own subsets
# (infiltrates_* / p_i_*), not the pneumonia-only rows.
datasets = [
    {"filename": "neumo_normals_m.tsv", "data": normals_m},
    {"filename": "neumo_normals_f.tsv", "data": normals_f},
    {"filename": "neumo_pneumonia_m.tsv", "data": pneumonia_m},
    {"filename": "neumo_pneumonia_f.tsv", "data": pneumonia_f},
    {"filename": "neumo_infiltrates_m.tsv", "data": infiltrates_m},
    {"filename": "neumo_infiltrates_f.tsv", "data": infiltrates_f},
    {"filename": "neumo_pneumonia_infiltrates_m.tsv", "data": p_i_m},
    {"filename": "neumo_pneumonia_infiltrates_f.tsv", "data": p_i_f},
    {"filename": "neumo_not_pneumonia_m.tsv", "data": not_pneumonia_m},
    {"filename": "neumo_not_pneumonia_f.tsv", "data": not_pneumonia_f},
]

# Columns kept in the exported files; any other key of a row is dropped by
# extrasaction='ignore' below.
header = ["ImageID","StudyDate_DICOM","StudyID","PatientID","PatientBirth","PatientSex_DICOM","ViewPosition_DICOM","Projection","MethodProjection","Pediatric","Modality_DICOM","Manufacturer_DICOM","PhotometricInterpretation_DICOM","PixelRepresentation_DICOM","PixelAspectRatio_DICOM","SpatialResolution_DICOM","BitsStored_DICOM","WindowCenter_DICOM","WindowWidth_DICOM","Rows_DICOM","Columns_DICOM","XRayTubeCurrent_DICOM","Exposure_DICOM","ExposureInuAs_DICOM","ExposureTime","RelativeXRayExposure_DICOM","Labels"]
for d in datasets:
    # newline="" lets the csv writer control line endings itself, as the csv
    # module documentation requires for files passed to a writer.
    with open(d["filename"], "w", newline="") as f:
        w = csv.DictWriter(f, header, delimiter='\t', extrasaction='ignore')
        w.writeheader()
        w.writerows(d["data"])

Generate the dataset. As we have about 13k images with findings that can be caused by COVID-19, we fill the rest of the dataset with the same number of images, taken from the normal group and the other-findings group in a 1:1 ratio.

In [11]:
# Same columns as the subgroup files plus the group code of each row.
header2 = header + ['group']

# Rows with findings of interest, paired with the group code written for them.
findings = [(pneumonia, 'N'), (infiltrates, 'I'), (p_i, 'NI')]

# The controls match the findings in number and are split 1:1 between the two
# control sources. For the counts in the summary table
# (5921 + 4836 + 2253 = 13010) this is 6505 rows from each source.
n_findings = sum(len(rows) for rows, _group in findings)
n_per_control = n_findings // 2

# not_pneumonia also contains rows labelled 'normal'. Leave them out of the
# "other findings" sample so that no image is written twice as a control.
other_findings = [l for l in not_pneumonia if "'normal'" not in l["Labels"]]

controls = [
    (normals[:n_per_control], 'C'),
    (other_findings[:n_per_control], 'C'),
]

# newline="" lets the csv writer control line endings itself.
with open("neumo_dataset.tsv", "w", newline="") as f:
    w = csv.DictWriter(f, header2, delimiter='\t', extrasaction='ignore')
    w.writeheader()
    for rows, group in findings + controls:
        for l in rows:
            w.writerow({**l, "group": group})
    