In [1]:
import pandas as pd
import numpy as np
import os
import cv2

In [2]:
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Load a DICOM file and return its pixels as an 8-bit array.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Parameters
    ----------
    path : str or path-like
        Location of the DICOM file to read.
    voi_lut : bool, default True
        When True the raw pixels are passed through ``apply_voi_lut`` (the
        VOI LUT, if provided by the DICOM device, turns raw data into a
        "human-friendly" view). When False the raw ``pixel_array`` is used.
    fix_monochrome : bool, default True
        When True and the file's ``PhotometricInterpretation`` is
        ``"MONOCHROME1"``, the intensities are inverted so the X-ray does not
        look like a negative.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with the same shape as the pixel data, min-max scaled
        to 0..255. An image in which every pixel has the same value comes back
        as all zeros.
    """
    # dcmread is the supported reader; read_file was only a deprecated alias.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift so the darkest pixel is 0, then stretch to the full 8-bit range.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by the zero range would yield NaNs and an
        # undefined cast to uint8, so return a well-defined black image.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [3]:
import multiprocessing as mp

In [4]:
# Load the image-level training table (one row per image id) into a DataFrame.
train = pd.read_csv('/raid/siim-covid19-detection/train_image_level.csv')

In [5]:
# Preview the first rows to check the columns: id, boxes, label, StudyInstanceUID.
train.head()

Unnamed: 0,id,boxes,label,StudyInstanceUID
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2
4,001bd15d1891_image,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e


In [6]:
# Walk the training tree once and remember, for every file, the name of the
# folder that directly contains it. Keys are rewritten from "<name>.dcm" to
# "<name>_image" so they line up with the values in train['id'].
maps = {
    name.replace('.dcm','_image'): folder.split('/')[-1]
    for folder, _, names in os.walk('/raid/siim-covid19-detection/train')
    for name in names
}
# Ids with no matching file on disk end up as NaN in series_id.
train['series_id'] = train['id'].map(maps)

In [7]:
# Build the full DICOM path of every training row, in row order:
# <train root>/<study>/<series>/<image>.dcm, where the image file name is the
# row id with its "_image" suffix swapped for ".dcm".
fps = [
    f'/raid/siim-covid19-detection/train/{study_id}/{series_id}/{image_id.replace("_image",".dcm")}'
    for study_id, series_id, image_id in zip(
        train['StudyInstanceUID'], train['series_id'], train['id']
    )
]

In [8]:
# Create the output root for the quarter-scale PNGs. exist_ok=True keeps the
# cell idempotent: a plain `mkdir` errors out when the notebook is re-run and
# the folder is already there.
os.makedirs('/raid/siim-covid19-detection/train_025/', exist_ok=True)

In [9]:
# Sanity check: dropping the first four '/'-separated pieces of a path should
# leave exactly [study folder, series folder, file name] - do_one unpacks the
# same slice into three names.
fps[0].split('/')[4:]

['5776db0cec75', '81456c9c5423', '000a312787f2.dcm']

In [10]:
def do_one(fp, out_root='/raid/siim-covid19-detection/train_025'):
    """Convert one DICOM file into a quarter-scale PNG.

    The image is read with ``read_xray``, shrunk to 25% along both axes and
    written to ``<out_root>/<study>/<series>/<image>.png``, where study, series
    and image are the last three components of ``fp``.

    Parameters
    ----------
    fp : str
        '/'-separated path ending in ``<study>/<series>/<file>``.
    out_root : str, optional
        Folder under which the PNG tree is created. Defaults to the training
        output folder, so existing ``do_one(fp)`` calls behave as before.

    Returns
    -------
    tuple
        ``img.shape`` of the image as read, before resizing.

    Raises
    ------
    OSError
        If ``cv2.imwrite`` reports that the PNG could not be written.
    """
    img = read_xray(fp)
    size = img.shape
    img = cv2.resize(img, (0, 0), fx=0.25, fy = 0.25)

    # Take the trailing three components so the function does not depend on
    # how deep the dataset root happens to be.
    study_id, series_id, image_id = fp.split('/')[-3:]

    folder = os.path.join(out_root, study_id, series_id)
    # This runs in pool workers: an exists()-then-makedirs() pair can raise
    # when two workers reach the same folder at once, exist_ok cannot.
    os.makedirs(folder, exist_ok=True)

    # Swap only the extension instead of replacing 'dcm' anywhere in the name.
    target = os.path.join(folder, os.path.splitext(image_id)[0] + '.png')
    if not cv2.imwrite(target, img):
        # imwrite signals failure through its return value only; surface it
        # rather than silently leaving a hole in the converted dataset.
        raise OSError(f'failed to write {target}')
    return size

In [11]:
from tqdm import tqdm

In [12]:
# Convert every training image using 32 worker processes. The imap results
# are consumed (and the progress bar advanced) while the pool is still open;
# res collects the value returned by do_one for each path.
with mp.Pool(processes=32) as pool:
    res = [size for size in tqdm(pool.imap(do_one, fps), total=len(fps))]

100%|██████████| 6334/6334 [01:12<00:00, 87.67it/s] 


In [13]:
# res holds the img.shape that do_one captured before resizing, one entry per
# path in fps (built row by row from train), so these are the original sizes.
# NOTE(review): assumes every shape has exactly two entries (2-D images) so
# that np.array(res) has two columns - confirm.
train[['height','width']] = np.array(res)

In [14]:
# Inspect the table with the added series_id, height and width columns.
train

Unnamed: 0,id,boxes,label,StudyInstanceUID,series_id,height,width
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,81456c9c5423,3488,4256
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed,d8a644cc4f93,2320,2832
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,22897cd1daa0,2544,3056
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,4d47bc042ee6,3520,4280
4,001bd15d1891_image,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e,49170afa4f27,2800,3408
...,...,...,...,...,...,...,...
6329,ffcc6edd9445_image,,none 1 0 0 1 1,7e6c68462e06,98011bca5fd2,4240,3480
6330,ffd91a2c4ca0_image,,none 1 0 0 1 1,8332bdaddb6e,f346cda52da0,2800,3408
6331,ffd9b6cf2961_image,"[{'x': 2197.38566, 'y': 841.07361, 'width': 31...",opacity 1 2197.38566 841.07361 2513.80265 1292...,7eed9af03814,668a64e3a5fa,2388,3050
6332,ffdc682f7680_image,"[{'x': 2729.27083, 'y': 332.26044, 'width': 14...",opacity 1 2729.27083 332.26044 4225.52099 2936...,a0cb0b96fb3d,ccf363aa080f,3488,4256


In [15]:
# Save the extended training table; index=False leaves the row index out of the file.
train.to_csv('/raid/siim-covid19-detection/train_image_level_v2.csv', index=False)

In [27]:
# Walk the test tree and record two folder names per file, keyed by
# "<name>_image" (the file name with ".dcm" swapped out):
#   maps  -> the folder directly containing the file
#   maps2 -> the folder one level above that
maps, maps2 = {}, {}
for folder, _, names in os.walk('/raid/siim-covid19-detection/test'):
    for name in names:
        key = name.replace('.dcm','_image')
        parts = folder.split('/')
        maps[key] = parts[-1]
        maps2[key] = parts[-2]



In [17]:
# Load the sample submission; it lists both "_study" and "_image" ids, each
# with a placeholder PredictionString.
sub = pd.read_csv('/raid/siim-covid19-detection/sample_submission.csv')
sub

Unnamed: 0,id,PredictionString
0,00188a671292_study,negative 1 0 0 1 1
1,004bd59708be_study,negative 1 0 0 1 1
2,00508faccd39_study,negative 1 0 0 1 1
3,006486aa80b2_study,negative 1 0 0 1 1
4,00655178fdfc_study,negative 1 0 0 1 1
...,...,...
2472,46719b856de1_image,none 1 0 0 1 1
2473,31c07523a69a_image,none 1 0 0 1 1
2474,f77d7d1aebab_image,none 1 0 0 1 1
2475,ccc5b63ca96d_image,none 1 0 0 1 1


In [32]:
# Keep only the rows whose id ends with "_image", as an independent copy with
# a fresh 0..n-1 index.
test = sub[sub['id'].str.endswith('_image')].copy().reset_index(drop=True)

In [33]:

# Attach the folder names collected by the directory walk over the test tree:
# maps holds each file's immediate parent folder, maps2 the folder above it.
test['series_id'] = test['id'].map(maps)
test['StudyInstanceUID'] = test['id'].map(maps2)

In [34]:
# Inspect the image rows with the added series_id and StudyInstanceUID columns.
test

Unnamed: 0,id,PredictionString,series_id,StudyInstanceUID
0,557a70442928_image,none 1 0 0 1 1,2d4fb41c0707,795051254905
1,36141cda67ad_image,none 1 0 0 1 1,360c62464c4c,0d476d070d71
2,2413a23a5477_image,none 1 0 0 1 1,0bd4b304da3a,4996bf5117c7
3,c263b1e9aa64_image,none 1 0 0 1 1,1f0bb9f0aa75,8b273337a684
4,4fe0444d7fc5_image,none 1 0 0 1 1,43442e48d3cb,aafc8126d5a1
...,...,...,...,...
1258,46719b856de1_image,none 1 0 0 1 1,d144042194fd,ee860264dd8c
1259,31c07523a69a_image,none 1 0 0 1 1,9848f28dfc23,81c860c6efe8
1260,f77d7d1aebab_image,none 1 0 0 1 1,e75c70e8b526,e6e02ec8aff5
1261,ccc5b63ca96d_image,none 1 0 0 1 1,07e820418ca6,b93bfa119338


In [31]:
def do_one(fp, out_root='/raid/siim-covid19-detection/test_025'):
    """Convert one DICOM file into a quarter-scale PNG.

    The image is read with ``read_xray``, shrunk to 25% along both axes and
    written to ``<out_root>/<study>/<series>/<image>.png``, where study, series
    and image are the last three components of ``fp``.

    Parameters
    ----------
    fp : str
        '/'-separated path ending in ``<study>/<series>/<file>``.
    out_root : str, optional
        Folder under which the PNG tree is created. Defaults to the test
        output folder, so existing ``do_one(fp)`` calls behave as before.

    Returns
    -------
    tuple
        ``img.shape`` of the image as read, before resizing.

    Raises
    ------
    OSError
        If ``cv2.imwrite`` reports that the PNG could not be written.
    """
    img = read_xray(fp)
    size = img.shape
    img = cv2.resize(img, (0, 0), fx=0.25, fy = 0.25)

    # Take the trailing three components so the function does not depend on
    # how deep the dataset root happens to be.
    study_id, series_id, image_id = fp.split('/')[-3:]

    folder = os.path.join(out_root, study_id, series_id)
    # This runs in pool workers: an exists()-then-makedirs() pair can raise
    # when two workers reach the same folder at once, exist_ok cannot.
    os.makedirs(folder, exist_ok=True)

    # Swap only the extension instead of replacing 'dcm' anywhere in the name.
    target = os.path.join(folder, os.path.splitext(image_id)[0] + '.png')
    if not cv2.imwrite(target, img):
        # imwrite signals failure through its return value only; surface it
        # rather than silently leaving a hole in the converted dataset.
        raise OSError(f'failed to write {target}')
    return size

In [35]:
# Build the full DICOM path of every test row, in row order:
# <test root>/<study>/<series>/<image>.dcm, where the image file name is the
# row id with its "_image" suffix swapped for ".dcm".
fps = [
    f'/raid/siim-covid19-detection/test/{study_id}/{series_id}/{image_id.replace("_image",".dcm")}'
    for study_id, series_id, image_id in zip(
        test['StudyInstanceUID'], test['series_id'], test['id']
    )
]

In [36]:
# Convert every test image using 32 worker processes. The imap results are
# consumed (and the progress bar advanced) while the pool is still open;
# res collects the value returned by do_one for each path.
with mp.Pool(processes=32) as pool:
    res = [size for size in tqdm(pool.imap(do_one, fps), total=len(fps))]

100%|██████████| 1263/1263 [00:16<00:00, 77.29it/s]


In [37]:
# res holds the img.shape that do_one captured before resizing, one entry per
# path in fps (built row by row from test), so these are the original sizes.
# NOTE(review): assumes every shape has exactly two entries (2-D images) so
# that np.array(res) has two columns - confirm.
test[['height','width']] = np.array(res)

In [38]:
# Inspect the test table with the added height and width columns.
test

Unnamed: 0,id,PredictionString,series_id,StudyInstanceUID,height,width
0,557a70442928_image,none 1 0 0 1 1,2d4fb41c0707,795051254905,3320,3408
1,36141cda67ad_image,none 1 0 0 1 1,360c62464c4c,0d476d070d71,3480,4240
2,2413a23a5477_image,none 1 0 0 1 1,0bd4b304da3a,4996bf5117c7,2320,2832
3,c263b1e9aa64_image,none 1 0 0 1 1,1f0bb9f0aa75,8b273337a684,2801,2802
4,4fe0444d7fc5_image,none 1 0 0 1 1,43442e48d3cb,aafc8126d5a1,2392,3014
...,...,...,...,...,...,...
1258,46719b856de1_image,none 1 0 0 1 1,d144042194fd,ee860264dd8c,2648,2867
1259,31c07523a69a_image,none 1 0 0 1 1,9848f28dfc23,81c860c6efe8,3488,4256
1260,f77d7d1aebab_image,none 1 0 0 1 1,e75c70e8b526,e6e02ec8aff5,3052,3012
1261,ccc5b63ca96d_image,none 1 0 0 1 1,07e820418ca6,b93bfa119338,2520,3032


In [39]:
# Save the extended test table; index=False leaves the row index out of the file.
test.to_csv('/raid/siim-covid19-detection/test_image_level_v2.csv', index=False)