## Setup libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from PIL import Image, ImageDraw

from fastai.medical.imaging import get_dicom_files
from dicom_utils.dicom_utils import process_dcm

from math import ceil
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)

### Set parameters:

In [4]:
BACKUP = True ## Backup dataframe after image resize
new_width = 1024

## Get data

In [6]:
path = Path('/media/veracrypt1/Data Science/SIIM-Imgs')

in_path = path / 'dicom'
out_path = path / 'png_1024wFrame'

Get lists of `.dicom` files

In [7]:
dicom_train = get_dicom_files(in_path / 'train')
dicom_test = get_dicom_files(in_path / 'test')

Number of `.dicom` files

In [8]:
print(f'Train: {len(dicom_train)}, Test: {len(dicom_test)}')

Train: 6334, Test: 1263


Load prepared dataframe for train set

In [9]:
df_train = pd.read_pickle('src/' + 'train_clean.pkl')
df_train.head(2)

Unnamed: 0,id,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,boxes_list,labels_list,label,orig_width,orig_height
0,000a312787f2,5776db0cec75,0,1,0,0,"[[789.28836, 582.43035, 1815.94498, 2499.73327], [2245.91208, 591.20528, 3340.5737, 2352.75472]]","[Typical, Typical]",Typical,4256,3488
1,000c3a3f293f,ff0879eb20ed,1,0,0,0,"[[0.0, 0.0, 1.0, 1.0]]",[Negative],Negative,2832,2320


Remove broken images

In [10]:
rmv_ids = ['3d12cb6aad8b']
df_train = df_train[~df_train['id'].isin(rmv_ids)]

Create a test dataframe to hold image sizes

In [11]:
df_test = pd.DataFrame({'id': [fn.name[:-4] for fn in dicom_test]})

In [12]:
df_test.head()

Unnamed: 0,id
0,9603112b4c2e
1,3dcdfc352a06
2,c39146cbda47
3,951211f8e1bb
4,5e0e7acd9c7d


# Process and resize images
* Get .dcm from filename
* Process .dcm:
    * Get pixel array
    * Fix monochrome (if necessary)
    * "Flatten" outlying values (if x > bound, x = bound)
    * ~Cut black / white boarders~
    * ~Save cut points to df (to translate the bounding boxes)~
    * Save new width / height to df
    * Normalize ~(truncated)~ array
* Save array as .png

In [13]:
def init_df_cols(df):
    zs = np.zeros(df.shape[0], dtype = 'object')
    df['orig_width'] = zs
    df['orig_height'] = zs

In [14]:
def process_fns(fns, df, new_width = 1024):
    m = len(fns)
    bins = np.load('data/freqhist_bins.npy')
    for i, fn in enumerate(fns):
        # Prepare filename to save to
        new_fn = out_path / Path(*fn.parts[6:]).with_suffix('.png')
        new_fn.parent.mkdir(exist_ok=True, parents = True)
        if new_fn.exists():
            print('file already exists, skip item')
            continue
        try:
            idx = df[df.id == fn.stem].index.item()
        except:
            continue
        dcm = fn.dcmread()        
        #arr, tl, br = process_dcm(dcm, bins = bins)
        arr = process_dcm(dcm, bins = bins)
        
        img = Image.fromarray((arr * 255).astype(np.uint8))
        scalar = new_width / img.size[0]
        resized_img = img.resize((new_width, ceil(img.size[1] * scalar)), resample = Image.LANCZOS)  
        
        # Save original image size to scale the bounding boxes
        df.loc[df.id == fn.stem, 'orig_width'] = dcm.Columns
        df.loc[df.id == fn.stem, 'orig_height'] = dcm.Rows

        resized_img.save(new_fn)
        
        print(f'{(i / (m-1) * 100):5.2f}%', end='\r')

Initialize dataframes

In [15]:
init_df_cols(df_train)
init_df_cols(df_test)

## Process train images

In [22]:
process_fns(dicom_train, df_train, new_width = new_width)

100%


In [17]:
if BACKUP: 
    df_train.to_pickle(f'src/train_ready_{str(new_width)}_backup.pkl')

In [18]:
df_train.head(2)

Unnamed: 0,id,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,boxes_list,labels_list,label,orig_width,orig_height
0,000a312787f2,5776db0cec75,0,1,0,0,"[[789.28836, 582.43035, 1815.94498, 2499.73327], [2245.91208, 591.20528, 3340.5737, 2352.75472]]","[Typical, Typical]",Typical,0,0
1,000c3a3f293f,ff0879eb20ed,1,0,0,0,"[[0.0, 0.0, 1.0, 1.0]]",[Negative],Negative,0,0


## Process test images

In [19]:
process_fns(dicom_test, df_test, new_width = new_width)

In [20]:
df_test.tail(2)

Unnamed: 0,id
1261,fb073252b364
1262,82589da89e95


## Save the dataframes

In [21]:
df_train.to_pickle(f'src/train_ready_{str(new_width)}_wFrame.pkl')
df_test.to_csv(f'src/test_ready_{str(new_width)}_wFrame.csv')