## Setup libraries

In [1]:
#!pip install pydicom kornia opencv-python scikit-image nbdev
#!conda install -c conda-forge gdcm -y
#!git clone https://github.com/asvcode/fmi.git

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

  and should_run_async(code)


In [3]:
#from fastai.basics import *
from fastai.callback.all import *
from fastai.vision.all import *
from fastai.medical.imaging import *

import ast
import re
import pydicom
import gdcm

In [4]:
from fmi.fmi.explore import *
from fmi.fmi.preprocessing import *
from fmi.fmi.pipeline import *
from fmi.fmi.retinanet import *

## Load DataFrames

In [5]:
## Directory holding the competition CSV files
path = Path("src")

## One row per image (bounding boxes + submission-style label string)
df_img = pd.read_csv(path.joinpath('train_image_level.csv'))
## One row per study (one-hot study-level classification)
df_study = pd.read_csv(path.joinpath('train_study_level.csv'))

Contents of the image dataframe:

In [6]:
## Peek at the first two image-level rows
df_img.head(n=2)

Unnamed: 0,id,boxes,label,StudyInstanceUID
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 1026.65662, 'height': 1917.30292}, {'x': 2245.91208, 'y': 591.20528, 'width': 1094.66162, 'height': 1761.54944}]",opacity 1 789.28836 582.43035 1815.94498 2499.73327 opacity 1 2245.91208 591.20528 3340.5737 2352.75472,5776db0cec75
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed


The `label` column holds a single string per image, in the format expected for the submission.

In [10]:
## Scalar lookup of the label string of the first row
df_img.at[0, 'label']

'opacity 1 789.28836 582.43035 1815.94498 2499.73327 opacity 1 2245.91208 591.20528 3340.5737 2352.75472'

Contents of the study dataframe:

In [9]:
## Peek at the first two study-level rows
df_study.head(n=2)

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,00086460a852_study,0,1,0,0
1,000c9c05fd14_study,0,0,0,1


Prepare the `id` column for the later join with the image dataframe.

In [10]:
## Strip the '_study' suffix and rename the column so it matches the join key of df_img.
## `regex=False` pins a literal replacement (the default of `regex` differs between pandas versions),
## and the result is assigned back instead of mutating the frame in place.
df_study = (
    df_study
    .assign(id=df_study['id'].str.replace('_study', '', regex=False))
    .rename(columns={'id': 'StudyInstanceUID'})
)

Some studies contain more than one image.

In [11]:
## Both arrays hold StudyInstanceUIDs: `study_ids` comes from the image frame,
## `img_ids` from the study frame.
study_ids = df_img['StudyInstanceUID'].unique()
img_ids = df_study['StudyInstanceUID'].unique()

## (same set of studies in both frames?, same number of rows in both frames?)
same_studies = set(study_ids) == set(img_ids)
same_row_count = len(df_img) == len(df_study)
same_studies, same_row_count

(True, False)

In [12]:
## Row/column counts of the image frame and the study frame
shapes = [frame.shape for frame in (df_img, df_study)]
print(*shapes)

(6334, 4) (6054, 5)


Take a look at the studies that contain more than one image.

In [13]:
## Number of image ids per study
id_group = (
    df_img
    .groupby(['StudyInstanceUID'])['id']
    .count()
    .to_frame()
    .rename(columns={'id': 'number of ids'})
)
## Study UIDs that occur for more than one image
UIDs_multiple_ids = id_group.loc[id_group['number of ids'] > 1].index
## All images of those studies, grouped together by sorting on the study UID
df_img.loc[df_img['StudyInstanceUID'].isin(UIDs_multiple_ids)].sort_values('StudyInstanceUID')

Unnamed: 0,id,boxes,label,StudyInstanceUID
2862,74077a8e3b7c_image,"[{'x': 2175.24285, 'y': 1123.72368, 'width': 432.26318, 'height': 500.11853}, {'x': 823.1639, 'y': 1324.77631, 'width': 306.60522, 'height': 394.5658}, {'x': 845.78232, 'y': 291.86842, 'width': 201.05261, 'height': 437.28949}]",opacity 1 2175.24285 1123.72368 2607.50603 1623.84221 opacity 1 823.1639 1324.77631 1129.76912 1719.34211 opacity 1 845.78232 291.86842 1046.83493 729.15791,00f9e183938e
2490,6534a837497d_image,,none 1 0 0 1 1,00f9e183938e
2119,55e22c0c5de0_image,"[{'x': 455.99999, 'y': 1480.00008, 'width': 266.39999, 'height': 957.59998}]",opacity 1 455.99999 1480.00008 722.39998 2437.6000599999998,0142feaef82f
6061,f5451a98d684_image,,none 1 0 0 1 1,0142feaef82f
3880,9e4824fcee2e_image,"[{'x': 817.77961, 'y': 1075.34501, 'width': 649.31, 'height': 1000.28833}, {'x': 2260.30072, 'y': 1022.69826, 'width': 744.07397, 'height': 1102.07202}]",opacity 1 817.77961 1075.34501 1467.08961 2075.63334 opacity 1 2260.30072 1022.69826 3004.37469 2124.77028,0369e0385796
...,...,...,...,...
1600,4123a71d9796_image,"[{'x': 889.45144, 'y': 282.39441, 'width': 825.05981, 'height': 1303.17139}, {'x': 2708.81375, 'y': 447.40635, 'width': 1078.92431, 'height': 1345.48206}]",opacity 1 889.45144 282.39441 1714.51125 1585.5657999999999 opacity 1 2708.81375 447.40635 3787.7380599999997 1792.88841,fc45007f145a
827,218bcf950372_image,,none 1 0 0 1 1,fd92c6f2b2e6
5735,e6cc65d9de1d_image,,none 1 0 0 1 1,fd92c6f2b2e6
3277,84ed5f7f71bf_image,"[{'x': 1721.27651, 'y': 974.09667, 'width': 1277.94347, 'height': 1706.90333}, {'x': 8.93666, 'y': 777.49, 'width': 1511.26484, 'height': 1693.49833}]",opacity 1 1721.27651 974.09667 2999.21998 2681.0 opacity 1 8.93666 777.49 1520.2015000000001 2470.98833,ffcb4630f46f


## Join both DataFrames

In [14]:
## Attach the study-level labels to every image row.
## `validate='many_to_one'` makes the merge raise if a StudyInstanceUID occurs more than once in
## df_study, which would otherwise silently duplicate image rows.
df = pd.merge(df_img, df_study, on='StudyInstanceUID', validate='many_to_one')

## Fill the NaN values with the expected "No Box detected" label

In [16]:
## Rows without annotations have NaN in `boxes`; give them a placeholder box written in the
## same string format as the real entries, for consistency.
## The result is assigned back to the column: calling an inplace method on `df.boxes` is a
## chained assignment and does not update `df` when pandas Copy-on-Write is active.
df['boxes'] = df['boxes'].fillna("[{'x': 0, 'y': 0, 'width': 1, 'height': 1}]")

## Format the Bounding Boxes as expected by the learner

In [17]:
def parse_boxes(boxes_str):
    """Convert one `boxes` cell into an (n, 4) float array of [min_x, min_y, max_x, max_y] rows.

    `boxes_str` is the string form of a list of dicts with the keys 'x', 'y',
    'width' and 'height'. Values are looked up by key, so the result does not
    depend on the order in which the keys appear inside each dict.
    """
    boxes = ast.literal_eval(boxes_str)  ## Safe parsing of the literal; no code is executed
    ## The BBoxBlock expects the bounding box in the format min_x, min_y, max_x, max_y
    ## (in contrast to min_x, min_y, width, height)
    return np.array(
        [[box['x'], box['y'], box['x'] + box['width'], box['y'] + box['height']] for box in boxes],
        dtype=float,
    )

df['boxes_list'] = df['boxes'].apply(parse_boxes)

## Format the Bounding Box labels as expected by the learner

In [18]:
## Split the submission-style label string into whitespace-separated tokens
df['labels_list'] = df['label'].str.split(r'\s')
## Keep every token that does not contain any digit (drops the confidence and coordinate values).
## The pattern is a raw string: '\d' inside a normal string literal is an invalid escape sequence.
df['labels_list'] = df['labels_list'].apply(lambda tokens: [tok for tok in tokens if not re.match(r'.*\d+', tok)])
## Exchange lists for nparrays
df['labels_list'] = df['labels_list'].apply(lambda tokens: np.array(tokens))

## Clean up

In [19]:
## The raw string columns are no longer needed once boxes_list / labels_list exist
df = df.drop(columns=['boxes', 'label'])

In [20]:
## Turn the image id into the DICOM file name ('<id>_image' -> '<id>.dcm').
## `regex=False` pins a literal replacement independent of the pandas version's default.
df['id'] = df['id'].str.replace('_image', '.dcm', regex=False)

In [21]:
## Inspect the last five rows of the cleaned frame
df.tail(n=5)

Unnamed: 0,id,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,boxes_list,labels_list
6329,ffcc6edd9445.dcm,7e6c68462e06,1,0,0,0,"[[0.0, 0.0, 1.0, 1.0]]",[none]
6330,ffd91a2c4ca0.dcm,8332bdaddb6e,1,0,0,0,"[[0.0, 0.0, 1.0, 1.0]]",[none]
6331,ffd9b6cf2961.dcm,7eed9af03814,0,1,0,0,"[[2197.38566, 841.07361, 2513.80265, 1292.71119], [2375.87717, 1830.89015, 2643.6144700000004, 2136.48927], [707.25199, 722.07926, 1099.3924299999999, 1571.26609]]","[opacity, opacity, opacity]"
6332,ffdc682f7680.dcm,a0cb0b96fb3d,0,1,0,0,"[[2729.27083, 332.26044, 4225.52099, 2936.84378], [1005.8125, 1584.67711, 1668.0416300000002, 2360.51048]]","[opacity, opacity]"
6333,ffe942c8655f.dcm,7d82d53204b8,0,1,0,0,"[[208.86463, 91.53448, 659.8321, 719.58921], [755.52522, 144.33069, 1183.39442, 692.09119]]","[opacity, opacity]"


## Add a single classification label

In [22]:
## One-hot study-level columns; exactly one of them is expected to be 1 per row
X = df[['Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']]
## Name of the column holding the maximum, reduced to its first word ('Negative', 'Typical', ...).
## The columns are already numeric, so no dummy encoding is needed before idxmax.
df['label'] = X.idxmax(axis=1).str.split(' ').str[0]
## Overwrite labels_list: repeat the study-level label once per bounding box of the row
df['labels_list'] = df.apply(lambda row: np.full(len(row['boxes_list']), row['label'], dtype=object), axis=1)

### Sanity checks:

In [23]:
study_label_cols = ['Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']

## Every instance has exactly one of the study-level labels set
assert (df[study_label_cols].sum(axis=1) == 1).all(), "a row does not have exactly one study-level label"
## For each instance, there are as many labels as there are bounding boxes
assert (df['boxes_list'].apply(len) == df['labels_list'].apply(len)).all(), "number of labels differs from number of boxes"
## The values in the boxes_list column are all numpy arrays (of bounding boxes)
assert df['boxes_list'].apply(lambda boxes: type(boxes) is np.ndarray).all(), "boxes_list entry is not an ndarray"
## Each array has at least one entry (/ at least one bounding box)
assert (df['boxes_list'].apply(len) >= 1).all(), "boxes_list entry without any bounding box"
## The entries of each array are also numpy arrays
assert df['boxes_list'].apply(lambda boxes: all(type(box) is np.ndarray for box in boxes)).all(), "bounding box is not an ndarray"
## Each bounding box consists of 4 elements
assert df['boxes_list'].apply(lambda boxes: all(len(box) == 4 for box in boxes)).all(), "bounding box without exactly 4 values"
## The bounding box values are floats
assert df['boxes_list'].apply(lambda boxes: all(type(val) is np.float64 for box in boxes for val in box)).all(), "bounding box value is not np.float64"

## Instances without a detected box carry the placeholder box [0., 0., 1., 1.].
## They are selected through the placeholder itself: labels_list holds the study-level labels
## at this point, so filtering it for 'none' would select no rows and make the checks vacuous.
NO_BOX = np.array([0., 0., 1., 1.])
has_placeholder = df['boxes_list'].apply(lambda boxes: bool((boxes == NO_BOX).all(axis=1).any()))
df_none = df[has_placeholder]
## Guard against checking an empty selection
assert len(df_none) > 0, "no instance with the placeholder box found"
## The placeholder never appears together with other boxes: exactly one bounding box per such instance
assert (df_none['boxes_list'].apply(len) == 1).all(), "placeholder box mixed with other bounding boxes"

## Save dataframe to pickle (.csv loses the dtype of the boxes/labels_lists)

In [30]:
## Write the cleaned frame next to the source CSV files
clean_path = path / 'train_clean.pkl'
df.to_pickle(clean_path)