## Setup libraries

In [1]:
import ast
import re
from pathlib import Path

import numpy as np
import pandas as pd

## Load DataFrames

In [2]:
## Directory that holds the input CSVs (relative to the notebook's working directory).
path = Path("src")

## Locations of the image-level and the study-level tables
image_csv = path / 'train_image_level.csv'
study_csv = path / 'train_study_level.csv'

df_img = pd.read_csv(image_csv)
df_study = pd.read_csv(study_csv)

Contents of the image dataframe:

In [4]:
## Preview the first two image-level rows
df_img.head(2)

Unnamed: 0,id,boxes,label,StudyInstanceUID
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed


The `label` column holds a single string per image, in the format expected for the submission.

In [5]:
## Full (untruncated) label string of the first image: one group of
## "<class> <number> <4 coordinates>" per box (the number is presumably a confidence -- TODO confirm)
df_img.loc[0,'label']

'opacity 1 789.28836 582.43035 1815.94498 2499.73327 opacity 1 2245.91208 591.20528 3340.5737 2352.75472'

Contents of the study dataframe:

In [6]:
## Preview the first two study-level rows
df_study.head(2)

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,00086460a852_study,0,1,0,0
1,000c9c05fd14_study,0,0,0,1


Prepare the `id` column for the later join with the image dataframe.

In [7]:
## Rename first and without `inplace`: renaming a column that is already gone is a
## no-op, so this cell can be re-run safely (the previous version raised an
## AttributeError on the second run because `df_study.id` no longer existed).
df_study = df_study.rename(columns = {'id': 'StudyInstanceUID'})
## '_study' is a literal suffix, not a pattern -> state that explicitly, since the
## default of `regex` differs between pandas versions.
df_study['StudyInstanceUID'] = df_study['StudyInstanceUID'].str.replace('_study', '', regex = False)

Some studies contain more than one image.

In [8]:
## Note on naming: both arrays hold StudyInstanceUIDs. `study_ids` are the ones
## referenced by the image table, `img_ids` the ones listed in the study table.
study_ids = df_img['StudyInstanceUID'].unique()
img_ids = df_study['StudyInstanceUID'].unique()

## (same set of studies in both tables?, same number of rows in both tables?)
same_studies = set(study_ids) == set(img_ids)
same_row_count = len(df_img) == len(df_study)
same_studies, same_row_count

(True, False)

In [9]:
## (rows, columns) of the image table, then of the study table
print(df_img.shape, df_study.shape)

(6334, 4) (6054, 5)


Take a look at the studies that contain more than one image.

In [10]:
## Number of image ids per study, as a one-column frame indexed by StudyInstanceUID
id_group = (
    df_img
    .groupby('StudyInstanceUID')['id']
    .count()
    .rename('number of ids')
    .to_frame()
)

## Studies that own more than one image
has_multiple_ids = id_group['number of ids'] > 1
UIDs_multiple_ids = id_group.loc[has_multiple_ids].index

## Show their image rows, grouped together by study
in_multi_image_study = df_img['StudyInstanceUID'].isin(UIDs_multiple_ids)
df_img.loc[in_multi_image_study].sort_values('StudyInstanceUID')

Unnamed: 0,id,boxes,label,StudyInstanceUID
2862,74077a8e3b7c_image,"[{'x': 2175.24285, 'y': 1123.72368, 'width': 4...",opacity 1 2175.24285 1123.72368 2607.50603 162...,00f9e183938e
2490,6534a837497d_image,,none 1 0 0 1 1,00f9e183938e
2119,55e22c0c5de0_image,"[{'x': 455.99999, 'y': 1480.00008, 'width': 26...",opacity 1 455.99999 1480.00008 722.39998 2437....,0142feaef82f
6061,f5451a98d684_image,,none 1 0 0 1 1,0142feaef82f
3880,9e4824fcee2e_image,"[{'x': 817.77961, 'y': 1075.34501, 'width': 64...",opacity 1 817.77961 1075.34501 1467.08961 2075...,0369e0385796
...,...,...,...,...
1600,4123a71d9796_image,"[{'x': 889.45144, 'y': 282.39441, 'width': 825...",opacity 1 889.45144 282.39441 1714.51125 1585....,fc45007f145a
827,218bcf950372_image,,none 1 0 0 1 1,fd92c6f2b2e6
5735,e6cc65d9de1d_image,,none 1 0 0 1 1,fd92c6f2b2e6
3277,84ed5f7f71bf_image,"[{'x': 1721.27651, 'y': 974.09667, 'width': 12...",opacity 1 1721.27651 974.09667 2999.21998 2681...,ffcb4630f46f


## Join both DataFrames

In [11]:
## Attach the study-level labels to every image row (inner join on the study id).
## `validate` makes pandas raise a MergeError if a StudyInstanceUID occurs more than
## once in `df_study`, instead of silently duplicating image rows.
df = pd.merge(df_img, df_study, on = 'StudyInstanceUID', validate = 'many_to_one')

## Fill the missing (NaN) boxes with the placeholder box expected for "no box detected"

In [12]:
## Images without annotations have NaN in `boxes`; give them the placeholder box
## (x=0, y=0, width=1, height=1) in the same string format as the real entries.
## Assign the result back to the column: calling `fillna(..., inplace=True)` on
## `df.boxes` is chained assignment and does not update `df` under pandas Copy-on-Write.
df['boxes'] = df['boxes'].fillna("[{'x': 0, 'y': 0, 'width': 1, 'height': 1}]")

## Format the Bounding Boxes as expected by the learner

In [13]:
def parse_boxes(boxes_str):
    """Convert the string form of one image's boxes into an array of corner boxes.

    Parameters
    ----------
    boxes_str : str
        Python-literal list of dicts with the keys 'x', 'y', 'width' and 'height',
        as stored in the `boxes` column.

    Returns
    -------
    np.ndarray
        float64 array of shape (number of boxes, 4); every row is
        [min_x, min_y, max_x, max_y].

    Raises
    ------
    ValueError, SyntaxError
        If `boxes_str` is not a valid Python literal.
    KeyError
        If a box is missing one of the four keys.
    """
    ## The BBoxBlock expects min_x, min_y, max_x, max_y (in contrast to min_x, min_y, width, height).
    ## Values are looked up by key, so the order of the keys inside a dict does not matter.
    corners = [
        [box['x'], box['y'], box['x'] + box['width'], box['y'] + box['height']]
        for box in ast.literal_eval(boxes_str)
    ]
    ## reshape keeps the (n, 4) layout even for an empty list of boxes
    return np.array(corners, dtype = np.float64).reshape(-1, 4)


df['boxes_list'] = df['boxes'].apply(parse_boxes)

## Format the Bounding Box labels as expected by the learner

In [15]:
## Split the label string at (runs of) whitespace; this never yields empty tokens
label_tokens = df['label'].str.split()
## Keep every token that doesn't contain any digit, i.e. drop the numeric fields and
## keep the class names; store them as one array per image.
## Note: the "Add a single classification label" section later overwrites this column.
df['labels_list'] = label_tokens.apply(
    lambda tokens: np.array([token for token in tokens if not re.search(r'\d', token)])
)

## Clean up

In [16]:
## The raw string columns are no longer needed once boxes_list / labels_list exist;
## the image id is stored without its '_image' suffix.
df = (
    df
    .drop(columns = ['boxes', 'label'])
    .assign(id = lambda frame: frame['id'].str.replace('_image', ''))
)

In [23]:
## NOTE(review): the stored output of this cell carries execution count 23, i.e. it was
## produced after the classification-label cell (count 19) had run. That is why it shows
## a `label` column and study-level names in `labels_list`, although the clean-up cell
## above has just dropped `label`. A fresh top-to-bottom run displays the frame without it.
df.tail(2)

Unnamed: 0,id,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,boxes_list,labels_list,label
6332,ffdc682f7680,a0cb0b96fb3d,0,1,0,0,"[[2729.27083, 332.26044, 4225.52099, 2936.8437...","[Typical, Typical]",Typical
6333,ffe942c8655f,7d82d53204b8,0,1,0,0,"[[208.86463, 91.53448, 659.8321, 719.58921], [...","[Typical, Typical]",Typical


## Add a single classification label

In [19]:
## The four study-level indicator columns (exactly one of them is 1 per row)
X = df[['Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']]
## Name of the column holding the 1, shortened to its first word
## ('Negative', 'Typical', 'Indeterminate' or 'Atypical').
## The columns are already numeric indicators, so no `pd.get_dummies` pass is needed.
df['label'] = X.idxmax(axis = 1).str.split(' ').str[0]
## Every bounding box of an image gets the study-level label of that image:
## one object array per row, with as many entries as the row has boxes.
df['labels_list'] = pd.Series(
    [np.full(len(boxes), label, dtype = object) for boxes, label in zip(df['boxes_list'], df['label'])],
    index = df.index,
)

In [20]:
## Preview the first two rows of the cleaned frame
df.head(2)

Unnamed: 0,id,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,boxes_list,labels_list,label
0,000a312787f2,5776db0cec75,0,1,0,0,"[[789.28836, 582.43035, 1815.94498, 2499.73327...","[Typical, Typical]",Typical
1,000c3a3f293f,ff0879eb20ed,1,0,0,0,"[[0.0, 0.0, 1.0, 1.0]]",[Negative],Negative


### Sanity checks:

In [21]:
label_cols = ['Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']

## Every instance has exactly one of the labels: "negative", "typical", "indeterminate", "atypical"
assert (df[label_cols].sum(axis = 1) == 1).all(), "a row does not have exactly one study-level label"
## For each instance, there are as many labels as there are bounding boxes
assert (df['boxes_list'].map(len) == df['labels_list'].map(len)).all(), "number of labels != number of boxes"
## The values in the boxes_list column are all numpy arrays (of bounding boxes)
assert df['boxes_list'].map(lambda boxes: type(boxes) is np.ndarray).all(), "boxes_list entry is not an ndarray"
## Each array has at least one entry (/ at least one bounding box)
assert (df['boxes_list'].map(len) >= 1).all(), "row without any bounding box"
## Each array is two-dimensional with 4 columns, i.e. every bounding box is itself an array of 4 values
assert df['boxes_list'].map(lambda boxes: boxes.ndim == 2 and boxes.shape[1] == 4).all(), "box without exactly 4 values"
## The bounding box values are float64
assert df['boxes_list'].map(lambda boxes: boxes.dtype == np.float64).all(), "bounding box values are not float64"

## The 'none' labeled instances.
## They are looked up through the original image table: by now `labels_list` holds the
## study-level names ('Negative', 'Typical', ...), so searching it for 'none' would select
## nothing and the two checks below would pass without testing anything.
none_ids = df_img.loc[df_img['label'].str.startswith('none', na = False), 'id'].str.replace('_image', '', regex = False)
df_none = df[df['id'].str.replace('_image', '', regex = False).isin(none_ids)]
assert not df_none.empty, "no 'none' labeled instances found - the checks below would be vacuous"
## ... all have exactly one bounding box
assert (df_none['boxes_list'].map(len) == 1).all(), "'none' instance with more than one bounding box"
## ... and that box is exactly [0., 0., 1., 1.]
assert df_none['boxes_list'].map(lambda boxes: np.array_equal(boxes[0], [0., 0., 1., 1.])).all(), "'none' instance without the placeholder box"

## Save dataframe to pickle (.csv loses the dtype of the boxes/labels_lists)

In [22]:
## Written next to the input CSVs (same `path` directory they were read from)
df.to_pickle(path / 'train_clean.pkl')