In [44]:
%matplotlib inline  
from fastai.vision import *
from fastai.data_block import _maybe_squeeze
from fastai.callbacks import *
from sklearn.model_selection import StratifiedKFold
from joblib import load, dump
from efficientnet_pytorch import EfficientNet
import pydicom
from tqdm import tqdm
from pprint import pprint
from multiprocessing import Pool
from joblib import load, dump

In [2]:
# Data root: the directory above the notebook.
PATH = Path('..')
# Label table; group_id_by_label reads its `ID` and `Label` columns.
DF_TRAIN = pd.read_csv(PATH.joinpath('stage_1_train.csv'))

import numpy as np
import pydicom


def get_dicom_value(x, cast=int):
    """Convert a DICOM header value to a scalar.

    When ``x`` is exactly a ``pydicom.multival.MultiValue`` or a ``tuple``,
    only its first element is converted; any other value is converted whole.

    Args:
        x: header value to convert.
        cast: callable applied to the selected value (``int`` by default).

    Returns:
        Whatever ``cast`` returns for the selected value.
    """
    multi_valued = type(x) in (pydicom.multival.MultiValue, tuple)
    return cast(x[0] if multi_valued else x)


def cast(value):
    """Turn a pydicom multi-valued element into a plain tuple.

    Args:
        value: any DICOM attribute value.

    Returns:
        ``tuple(value)`` when ``value`` is exactly a
        ``pydicom.multival.MultiValue``; otherwise ``value`` unchanged.
    """
    # MultiValue is defined in pydicom.multival, the same path
    # get_dicom_value checks. NOTE(review): pydicom.valuerep appears to expose
    # the name only through its own import, so that alias is not relied on.
    if type(value) is pydicom.multival.MultiValue:
        return tuple(value)
    return value


def get_dicom_raw(dicom):
    """Collect the capitalised attributes of ``dicom`` into a dict.

    Every name returned by ``dir(dicom)`` whose first character is upper-case
    is read with ``getattr`` and passed through ``cast``; ``PixelData`` is
    skipped.

    Args:
        dicom: object whose attributes are collected.

    Returns:
        dict mapping attribute name to its ``cast`` value.
    """
    skipped = ['PixelData']
    raw = {}
    for name in dir(dicom):
        if not name[0].isupper() or name in skipped:
            continue
        raw[name] = cast(getattr(dicom, name))
    return raw


def rescale_image(image, slope, intercept):
    """Apply the linear rescale ``image * slope + intercept``.

    Args:
        image: value or array to rescale (not modified).
        slope: multiplicative factor.
        intercept: additive offset.

    Returns:
        The rescaled value/array.
    """
    scaled = image * slope
    shifted = scaled + intercept
    return shifted


def apply_window(image, center, width):
    """Clamp a copy of ``image`` to ``center +/- width // 2``.

    Args:
        image: array supporting ``.copy()`` and boolean-mask assignment.
        center: window centre.
        width: window width; half of it is taken with floor division.

    Returns:
        A new array with values below/above the window replaced by the
        lower/upper bound. The input is left untouched.
    """
    half_width = width // 2
    lower, upper = center - half_width, center + half_width
    windowed = image.copy()
    windowed[windowed < lower] = lower
    windowed[windowed > upper] = upper
    return windowed


def get_dicom_meta(dicom):
    """Extract a fixed set of header fields from ``dicom``.

    The three identifier fields are copied as-is, the window fields go
    through ``get_dicom_value`` and the rescale fields are converted with
    ``float``.

    Author's dataset notes carried over from the original comments:
        PatientID          - can be grouped (20-548)
        StudyInstanceUID   - can be grouped (20-60)
        SeriesInstanceUID  - can be grouped (20-60)
        RescaleSlope       - all same (1.0)

    Args:
        dicom: object exposing the attributes listed above.

    Returns:
        dict with the seven extracted fields.
    """
    meta = {
        field: getattr(dicom, field)
        for field in ('PatientID', 'StudyInstanceUID', 'SeriesInstanceUID')
    }
    meta['WindowWidth'] = get_dicom_value(dicom.WindowWidth)
    meta['WindowCenter'] = get_dicom_value(dicom.WindowCenter)
    meta['RescaleIntercept'] = float(dicom.RescaleIntercept)
    meta['RescaleSlope'] = float(dicom.RescaleSlope)
    return meta

In [3]:
def group_id_by_label(df):
    """Group the positive label names of every image id.

    Each ``ID`` value is split on ``'_'`` into exactly three parts
    (prefix, hash, label name); the first two are re-joined to form the image
    id. An image id gets an entry even if none of its rows has ``Label == 1``.

    Args:
        df: frame with ``ID`` and ``Label`` columns.

    Returns:
        dict mapping image id to the list of label names whose row has
        ``Label == 1``, in row order.
    """
    grouped = {}
    rows = tqdm(df.itertuples(), total=len(df))
    for record in rows:
        prefix, image_hash, label_name = record.ID.split('_')
        image_id = '%s_%s' % (prefix, image_hash)
        positives = grouped.setdefault(image_id, [])
        if record.Label == 1:
            positives.append(label_name)
    return grouped

def remove_corrupted_images(ids, corrupted=('ID_6431af929',)):
    """Return a copy of ``ids`` without the corrupted image ids.

    Args:
        ids: dict keyed by image id. It is not modified; the returned dict is
            a shallow copy, so its values are shared with ``ids``.
        corrupted: iterable of image ids to drop. Defaults to the single id
            that was previously hard-coded in this function.

    Returns:
        The filtered copy. One line is printed per requested id, saying
        whether it was removed or not found.
    """
    cleaned = ids.copy()
    for image_id in corrupted:
        if image_id in cleaned:
            del cleaned[image_id]
            print('removed %s' % image_id)
        else:
            print('%s not found' % image_id)

    return cleaned


def create_record(item, dirname):
    """Build the metadata record for one image.

    Args:
        item: ``(image id, list of positive label names)`` pair.
        dirname: directory that holds ``<image id>.dcm``.

    Returns:
        dict with ``ID``, ``labels`` (space-joined names), ``n_label`` and
        every field returned by ``get_dicom_raw`` for the file.
    """
    image_id, positive_labels = item
    dicom = pydicom.dcmread('%s/%s.dcm' % (dirname, image_id))

    record = dict(
        ID=image_id,
        labels=' '.join(positive_labels),
        n_label=len(positive_labels),
    )
    record.update(get_dicom_raw(dicom))

    # The pixel array is still read although the statistics below are disabled.
    # NOTE(review): presumably kept so unreadable pixel data fails here —
    # confirm before removing.
    raw = dicom.pixel_array
#    slope = float(record['RescaleSlope'])
#    intercept = float(record['RescaleIntercept'])
#    center = get_dicom_value(record['WindowCenter'])
#    width = get_dicom_value(record['WindowWidth'])
#
#    image = rescale_image(raw, slope, intercept)
#    doctor = apply_window(image, center, width)
#    custom =  apply_window(image, 40, 80)
#
#    record.update({
#        'raw_max': raw.max(),
#        'raw_min': raw.min(),
#        'raw_mean': raw.mean(),
#        'raw_diff': raw.max() - raw.min(),
#        'doctor_max': doctor.max(),
#        'doctor_min': doctor.min(),
#        'doctor_mean': doctor.mean(),
#        'doctor_diff': doctor.max() - doctor.min(),
#        'custom_max': custom.max(),
#        'custom_min': custom.min(),
#        'custom_mean': custom.mean(),
#        'custom_diff': custom.max() - custom.min(),
#    })
    return record


def create_df(ids, dirname='../stage_1_train_images', n_workers=4):
    """Build the per-image record table in parallel.

    Args:
        ids: dict mapping image id to its list of positive label names; each
            ``(id, labels)`` item is handed to ``create_record``.
        dirname: directory containing the ``.dcm`` files. Defaults to the
            path that was previously hard-coded here.
        n_workers: number of worker processes for the pool (previously fixed
            at 4).

    Returns:
        DataFrame with one row per record, sorted by ``ID`` with a fresh
        0..n-1 index. Sorting is what makes the row order independent of the
        unordered pool results.
    """
    # Local import: no explicit `import functools` is visible in this notebook.
    import functools

    print('making records...')
    worker = functools.partial(create_record, dirname=dirname)
    with Pool(n_workers) as pool:
        records = list(tqdm(
            iterable=pool.imap_unordered(worker, ids.items()),
            total=len(ids),
        ))
    return pd.DataFrame(records).sort_values('ID').reset_index(drop=True)

In [4]:
# Group the positive labels per image, then drop the known-corrupted image id.
ids = remove_corrupted_images(group_id_by_label(DF_TRAIN))

100%|██████████| 4045572/4045572 [00:08<00:00, 488012.60it/s]

removed ID_6431af929





In [5]:
# Build the per-image record table from the DICOM files.
# The recorded run of this cell took about 26 minutes.
df_output = create_df(ids)

making records...


100%|██████████| 674257/674257 [25:54<00:00, 433.87it/s]  


In [7]:
def _make_folds(df, n_fold, seed):
    """Greedily assign every patient to one of ``n_fold`` folds.

    Rows are grouped by ``PatientID`` and the patients are visited in
    ``groupby`` order. For each patient, the label with the smallest global
    count among that patient's labels is selected, and the patient goes to a
    fold that currently holds the fewest occurrences of that label (ties are
    broken at random). Patients without any label are balanced on the empty
    label ``''``.

    Args:
        df: frame with a ``PatientID`` column and a ``labels`` column of
            space-separated label names. Missing ``labels`` values are
            treated as "no labels".
        n_fold: number of folds.
        seed: seed of the tie-breaking random generator.

    Returns:
        dict mapping each patient id to its fold number (``0 .. n_fold - 1``).
        The final per-(fold, label) counts are pretty-printed as a side effect.
    """
    # Local imports: the notebook has no explicit import of these modules.
    import collections
    import random

    # Only two columns are needed. Filling NaN keeps frames reloaded from CSV
    # (where empty label strings come back as NaN) usable.
    slim = df[['PatientID', 'labels']].fillna({'labels': ''})

    # Global number of occurrences of every label.
    counter_gt = collections.Counter(
        label for text in slim['labels'] for label in text.split()
    )
    # Occurrences of every label inside every fold, keyed by (fold, label).
    counter_folds = collections.Counter()

    folds = {}
    # A private generator yields the same choices as seeding the module-level
    # one with `seed`, without clobbering the global random state.
    rng = random.Random(seed)
    groups = slim.groupby('PatientID')
    print('making %d folds...' % n_fold)
    for patient_id, group in tqdm(groups, total=len(groups)):

        labels = [label for text in group['labels'] for label in text.split()]
        if not labels:
            labels = ['']

        # First label with the smallest global count.
        min_label = min(labels, key=lambda label: counter_gt[label])
        count_folds = [(f, counter_folds[(f, min_label)]) for f in range(n_fold)]
        min_count = min(count for _, count in count_folds)
        fold = rng.choice([f for f, count in count_folds if count == min_count])
        folds[patient_id] = fold

        for label in labels:
            counter_folds[(fold, label)] += 1

    pprint(counter_folds)

    return folds

In [10]:
# Patient id -> fold number for a 5-fold split.
k = _make_folds(df_output, n_fold=5, seed=42)

making 5 folds...


100%|██████████| 17079/17079 [00:49<00:00, 343.09it/s]


Counter({(0, 'any'): 19666,
         (1, 'any'): 19531,
         (3, 'any'): 19528,
         (2, 'any'): 19345,
         (4, 'any'): 19033,
         (3, 'subdural'): 8511,
         (1, 'subdural'): 8511,
         (0, 'subdural'): 8498,
         (4, 'subdural'): 8490,
         (2, 'subdural'): 8486,
         (4, 'intraparenchymal'): 6534,
         (0, 'intraparenchymal'): 6524,
         (2, 'intraparenchymal'): 6505,
         (3, 'intraparenchymal'): 6504,
         (2, 'subarachnoid'): 6502,
         (1, 'intraparenchymal'): 6497,
         (3, 'subarachnoid'): 6427,
         (1, 'subarachnoid'): 6411,
         (4, 'subarachnoid'): 6399,
         (0, 'subarachnoid'): 6383,
         (2, 'intraventricular'): 4757,
         (1, 'intraventricular'): 4754,
         (3, 'intraventricular'): 4753,
         (4, 'intraventricular'): 4752,
         (0, 'intraventricular'): 4750,
         (0, ''): 2020,
         (1, ''): 2020,
         (4, ''): 2020,
         (3, ''): 2020,
         (2, ''): 2019,


In [18]:
# Save the patient -> fold table; the CSV columns are named `index` and `0`.
pd.Series(k).to_frame().reset_index().to_csv('FOLDS.csv', index=False)

In [6]:
# Preview the per-image record table built by create_df.
df_output

Unnamed: 0,ID,labels,n_label,BitsAllocated,BitsStored,Columns,HighBit,ImageOrientationPatient,ImagePositionPatient,Modality,...,RescaleIntercept,RescaleSlope,Rows,SOPInstanceUID,SamplesPerPixel,SeriesInstanceUID,StudyID,StudyInstanceUID,WindowCenter,WindowWidth
0,ID_000039fa0,,0,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 0.968...","(-125.000000, -141.318451, 62.720940)",CT,...,-1024.0,1.0,512,ID_000039fa0,1,ID_5f8484c3e0,,ID_134d398b61,30,80
1,ID_00005679d,,0,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 1.000...","(-134.463, -110.785, -39.569)",CT,...,-1024.0,1.0,512,ID_00005679d,1,ID_203cd6ec46,,ID_b5c26cda09,50,100
2,ID_00008ce3c,,0,16,12,512,11,"(1, 0, 0, 0, 0.994521895, 0.104528463)","(-125, -83.0468112, 175.995344)",CT,...,-1024.0,1.0,512,ID_00008ce3c,1,ID_3780d48b28,,ID_974735bf79,"(00040, 00040)","(00080, 00080)"
3,ID_0000950d7,,0,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 1.000...","(-126.437378, -126.437378, 157.500000)",CT,...,-1024.0,1.0,512,ID_0000950d7,1,ID_84296c3845,,ID_8881b1c4b1,35,135
4,ID_0000aee4b,,0,16,12,512,11,"(1, 0, 0, 0, 1, 0)","(-108.5, 14.5, 94)",CT,...,-1024.0,1.0,512,ID_0000aee4b,1,ID_1e59488a44,,ID_9aad90e421,"(00036, 00036)","(00080, 00080)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674252,ID_ffff73ede,,0,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 0.978...","(-125.000, -113.968, -46.269)",CT,...,-1024.0,1.0,512,ID_ffff73ede,1,ID_6ea8a427b6,,ID_f6b8751965,30,80
674253,ID_ffff80705,,0,16,12,512,11,"(1, 0, 0, 0, 1, 0)","(-110.5, 8.5, 147.599976)",CT,...,-1024.0,1.0,512,ID_ffff80705,1,ID_515bce8343,,ID_18b50bbeb2,"(00036, 00036)","(00080, 00080)"
674254,ID_ffff82e46,,0,16,12,512,11,"(1, 0, 0, 0, 1, 0)","(-152, -56, 1011.99995)",CT,...,-1024.0,1.0,512,ID_ffff82e46,1,ID_3ef9b97743,,ID_eca4bf46ac,"(00036, 00036)","(00080, 00080)"
674255,ID_ffff922b9,intraventricular any,2,16,16,512,15,"(1.000000, 0.000000, 0.000000, 0.000000, 1.000...","(-126.408875, -126.408875, -235.611511)",CT,...,-1024.0,1.0,512,ID_ffff922b9,1,ID_6d2a9b2810,,ID_b47ca0ad05,35,135


In [21]:
# Patient -> fold table: patient id in column `index`, fold number in column 0.
df = pd.Series(k).to_frame().reset_index()

In [25]:
# Number of patients assigned to each fold (column 0 holds the fold number).
df[0].value_counts()

3    3458
2    3419
0    3411
4    3398
1    3393
Name: 0, dtype: int64

In [26]:
# Preview the patient -> fold table.
df

Unnamed: 0,index,0
0,ID_0002cd41,0
1,ID_00054f3f,1
2,ID_0006d192,4
3,ID_00086119,3
4,ID_000e5623,2
...,...,...
17074,ID_ffeabb7c,1
17075,ID_ffedaf23,4
17076,ID_ffee3094,3
17077,ID_fff140ff,4


In [30]:
# Replace each PatientID with the fold number assigned to it in `k`.
# NOTE(review): this overwrites the patient ids in place, so the cell cannot be
# re-run: a second run would look up fold numbers in `k` (which is keyed by
# patient id) and fail with KeyError.
df_output['PatientID'] = df_output['PatientID'].apply(lambda x: k[x])

In [32]:
# Images per fold (the PatientID column holds fold numbers at this point).
df_output['PatientID'].value_counts()

2    135387
1    135262
3    135005
0    134784
4    133819
Name: PatientID, dtype: int64

In [4]:
# Per-image label strings (columns `fn` and `labels`).
df_m = pd.read_csv('../train_labels_as_strings.csv')
# File name -> patient id lookup; only these two columns are kept.
df_pi = pd.read_csv('../DATA_with_PATIENT_ID.csv').loc[:, ['fn', 'PatientID']]

In [5]:
# Preview the label table.
df_m

Unnamed: 0,fn,labels
0,ID_63eb1e259.png,
1,ID_2669954a7.png,
2,ID_52c9913b1.png,
3,ID_4e6ff6126.png,
4,ID_7858edd88.png,
...,...,...
674252,ID_f737f4cc1.png,
674253,ID_4c92d70b5.png,
674254,ID_a9797cb3a.png,
674255,ID_9375f67bd.png,


In [6]:
# Preview the file name -> patient id table.
df_pi

Unnamed: 0,fn,PatientID
0,ID_000039fa0.png,ID_eeaf99e7
1,ID_00005679d.png,ID_18f2d431
2,ID_00008ce3c.png,ID_ce8a3cd2
3,ID_0000950d7.png,ID_d278c67b
4,ID_0000aee4b.png,ID_ce5f0b6c
...,...,...
674252,ID_ffff73ede.png,ID_d9b54e99
674253,ID_ffff80705.png,ID_c771391a
674254,ID_ffff82e46.png,ID_a85c9d08
674255,ID_ffff922b9.png,ID_5964c5e5


In [7]:
# Attach the patient id to every labelled image by joining on the file name.
df = df_m.merge(df_pi, on='fn')

In [8]:
# Preview the merged table (file name, labels, patient id).
df

Unnamed: 0,fn,labels,PatientID
0,ID_63eb1e259.png,,ID_a449357f
1,ID_2669954a7.png,,ID_363d5865
2,ID_52c9913b1.png,,ID_9c2b4bd7
3,ID_4e6ff6126.png,,ID_3ae81c2d
4,ID_7858edd88.png,,ID_c1867feb
...,...,...,...
674252,ID_f737f4cc1.png,,ID_cd9b1eb3
674253,ID_4c92d70b5.png,,ID_39702b69
674254,ID_a9797cb3a.png,,ID_c9eb1b4d
674255,ID_9375f67bd.png,,ID_141ef473


In [9]:
# Reload the saved patient -> fold table and rebuild the lookup dict from its
# `index` (patient id) and `0` (fold) columns.
df_folds = pd.read_csv('FOLDS.csv')
k = {
    patient_id: fold
    for patient_id, fold in zip(df_folds['index'], df_folds['0'])
}

In [10]:
# Fold of every image, looked up through its patient id; a patient id that is
# missing from `k` raises KeyError.
df['folds'] = df['PatientID'].map(lambda patient_id: k[patient_id])

In [11]:
# Preview the merged table with its new `folds` column.
df

Unnamed: 0,fn,labels,PatientID,folds
0,ID_63eb1e259.png,,ID_a449357f,0
1,ID_2669954a7.png,,ID_363d5865,0
2,ID_52c9913b1.png,,ID_9c2b4bd7,3
3,ID_4e6ff6126.png,,ID_3ae81c2d,0
4,ID_7858edd88.png,,ID_c1867feb,1
...,...,...,...,...
674252,ID_f737f4cc1.png,,ID_cd9b1eb3,1
674253,ID_4c92d70b5.png,,ID_39702b69,2
674254,ID_a9797cb3a.png,,ID_c9eb1b4d,2
674255,ID_9375f67bd.png,,ID_141ef473,3


In [12]:
# Images per fold.
df['folds'].value_counts()

2    135387
1    135262
3    135005
0    134784
4    133819
Name: folds, dtype: int64

In [13]:
def get_val_indx(fold, frame=None):
    """Return the positional validation/training row indices for one fold.

    Args:
        fold: fold number selected for validation.
        frame: frame with a ``folds`` column. Defaults to the notebook-level
            ``df``, which is what this function always used before.

    Returns:
        ``(val_indx_, trn_)``: two sorted integer arrays of row positions.
        ``val_indx_`` lists the rows whose ``folds`` value equals ``fold``,
        ``trn_`` all the other rows.
    """
    if frame is None:
        frame = df
    # Work with row positions rather than index labels. The two coincide only
    # for a default 0..n-1 index, which the earlier mix of `.index` and
    # `np.arange` silently assumed.
    in_fold = (frame['folds'] == fold).to_numpy()
    val_indx_ = np.flatnonzero(in_fold)
    trn_ = np.flatnonzero(~in_fold)
    return (val_indx_, trn_)

In [14]:
# One (validation indices, training indices) pair per fold 0..4.
VAL = list(map(get_val_indx, range(5)))

In [22]:
# Validation-set size of every fold.
[len(val_idx) for val_idx, _ in VAL]

[134784, 135262, 135387, 135005, 133819]

In [24]:
# Training-set size of every fold.
[len(trn_idx) for _, trn_idx in VAL]

[539473, 538995, 538870, 539252, 540438]

In [45]:
# Persist the list of (validation, training) index-array pairs with joblib.
dump(VAL, 'val_idx_pi.joblib')

['val_idx_pi.joblib']