<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os
import json
import glob
import random
import collections

import numpy as np
import pandas as pd
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import nibabel as nib
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm.notebook import tqdm

import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers

import tensorflow_hub as tfhub
import tensorflow_addons as tfa


TYPES = ["FLAIR", "T1w", "T2w", "T1wCE"]
WHITE_THRESHOLD = 10 # out of 255
EXCLUDE = [109, 123, 709]
IMAGE_SIZE = 256

In [2]:
if os.path.exists("/kaggle/input/rsna-miccai-brain-tumor-radiogenomic-classification"):
    # data_directory = "/kaggle/input/rsna-miccai-brain-tumor-radiogenomic-classification"
    data_directory = "/kaggle/input/rsna-preprocessed"
    pretrained_weights_path = \
        "/kaggle/input/efficientnetv2-tfhub-weight-files/tfhub_models/efficientnetv2-m-21k/feature_vector"
else:
    # data_directory = "rsna-miccai-brain-tumor-radiogenomic-classification"
    data_directory = "rsna-preprocessed"
    pretrained_weights_path = \
        "efficientnetv2-tfhub-weight-files/tfhub_models/efficientnetv2-m-21k/feature_vector"

experiment_folder = "experiments/efficientnet2d_preprocessed"

In [3]:
train_df = pd.read_csv(f"{data_directory}/train_labels.csv")
test_df = pd.read_csv(f"{data_directory}/sample_submission.csv")
train_df = train_df[~train_df.BraTS21ID.isin(EXCLUDE)]

In [4]:
def load_dicom(path, size = 224):
    ''' 
    Reads a DICOM image, standardizes so that the pixel values are between 0 and 1, then rescales to 0 and 255
    
    Note super sure if this kind of scaling is appropriate, but everyone seems to do it. 
    '''
    dicom = pydicom.read_file(path)
    data = dicom.pixel_array
    if np.max(data) != 0:
        data = data / np.max(data)
    data = (data * 255).astype(np.uint8)
    return cv2.resize(data, (size, size))

def get_all_image_paths(brats21id, image_type, folder='train'): 
    '''
    Returns an arry of all the images of a particular type for a particular patient ID
    '''
    assert(image_type in TYPES)
    
    patient_path = os.path.join(
        f"{data_directory}/%s/" % folder, 
        str(brats21id).zfill(5),
    )

    paths = sorted(
        glob.glob(os.path.join(patient_path, image_type, "*")), 
        key=lambda x: int(x[:-4].split("-")[-1]),
    )
    num_images = len(paths)
    
    start = int(num_images * 0.25)
    end = int(num_images * 0.75)

    interval = 3
    
    if num_images < 10: 
        interval = 1
    
    return np.array(paths[start:end:interval])

def get_all_images(brats21id, image_type, folder='train', size=225):
    return [load_dicom(path, size) for path in get_all_image_paths(brats21id, image_type, folder)]

In [5]:
def load_nib(path, size = 224):
    data = nib.load(path).get_fdata()

    if np.max(data) != 0:
        data = data / np.max(data)
        
    data = (data * 255).astype(np.uint8)
    
    num_images = data.shape[-1]
    
    start = int(num_images * 0.25)
    end = int(num_images * 0.75)
    interval = 3
    
    if num_images < 10: 
        interval = 1

    data = data[:, :, start:end:interval]
    data = data.swapaxes(2, 0)
    resized = []
    for image in data:
        resized.append(cv2.resize(image, (size, size)))
    data = np.array(resized)
    return data


def get_all_preprocessed_image_paths(brats21id, image_type, folder='train'): 
    '''
    Returns an arry of all the images of a particular type for a particular patient ID
    '''
    assert(image_type in TYPES)
    
    patient_path = os.path.join(
        f"{data_directory}/%s/" % folder, 
        str(brats21id).zfill(5),
    )

    paths = sorted(
        glob.glob(os.path.join(patient_path, image_type, "*")), 
    )
    
    return np.array(paths)

def get_all_preprocessed_images(brats21id, image_type, folder='train', size=225):
    path = get_all_preprocessed_image_paths(brats21id, image_type, folder)[0]
    data = load_nib(path, size)
    return data

In [6]:
def get_all_data_for_train(image_type):
    global train_df
    X = []
    y = []
    train_ids = []
    for i in tqdm(train_df.index):
        x = train_df.loc[i]
        images = get_all_preprocessed_images(int(x['BraTS21ID']), image_type, 'train', IMAGE_SIZE)
        label = x['MGMT_value']

        X.append(images)
        y += [label] * len(images)
        train_ids += [int(x['BraTS21ID'])] * len(images)
        
    X = np.concatenate(X)
    return X, np.array(y), np.array(train_ids)

def get_all_data_for_test(image_type):
    global test_df
    X = []
    test_ids = []
    for i in tqdm(test_df.index):
        x = test_df.loc[i]
        images = get_all_preprocessed_images(int(x['BraTS21ID']), image_type, 'test', IMAGE_SIZE)
        X.append(images)
        test_ids += [int(x['BraTS21ID'])] * len(images)

    X = np.concatenate(X)
    return X, np.array(test_ids)

In [7]:
X, y, trainidt = get_all_data_for_train('T1wCE')
X_test, testidt = get_all_data_for_test('T1wCE')
X.shape, y.shape, trainidt.shape

  0%|          | 0/582 [00:00<?, ?it/s]

  0%|          | 0/87 [00:00<?, ?it/s]

((17826, 256, 256), (17826,), (17826,))

In [8]:
# X = X[:128]
# y = y[:128]

In [9]:
X.shape, y.shape

((17826, 256, 256), (17826,))

In [10]:
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

In [11]:
data_augmentation = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.RandomFlip('horizontal'),
  tf.keras.layers.experimental.preprocessing.RandomRotation(0.2),
])

2021-09-27 19:55:58.385467: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-27 19:55:58.395945: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-27 19:55:58.396463: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-27 19:55:58.397485: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [12]:
def build_model(X):
    inpt = keras.Input(shape=X.shape[1:])
    inp = tf.keras.layers.Concatenate()([inpt, inpt, inpt])  

    h = keras.layers.experimental.preprocessing.Rescaling(1./255)(inp)
    # h = data_augmentation(h)

    # convolutional layer!
    # h = keras.layers.Conv2D(3, kernel_size=(3, 3),activation="relu", name="Conv_1", padding="same")(h) 
    # h = tf.keras.layers.BatchNormalization(axis=-1)(h)
    h = tfhub.KerasLayer(pretrained_weights_path, trainable=True)(h)

    # h = keras.layers.Flatten()(h) 
    # h = global_average_layer(h)
    h = keras.layers.Dropout(0.2)(h)
    h = keras.layers.Dense(128, activation='relu')(h)   

    output = keras.layers.Dense(2, activation="sigmoid")(h)

    model = keras.Model(inpt, output)
    return model

In [13]:
np.random.seed(0)
random.seed(12)
tf.random.set_seed(12)

In [14]:
def predict(model, X_valid):
    y_pred = model.predict(X_valid)
    predictions = np.argmax(y_pred, axis=1)
    return predictions

In [15]:
def evaluate(model, X_valid, trainidt_valid):
    y_pred = model.predict(X_valid)
    pred = np.argmax(y_pred, axis=1)

    result=pd.DataFrame(trainidt_valid)
    result[1]=pred

    result.columns=['BraTS21ID','MGMT_value']
    result = result.groupby('BraTS21ID',as_index=False).mean()

    result = result.merge(train_df, on='BraTS21ID')
    print(f"roc auc: {roc_auc_score(result.MGMT_value_y, result.MGMT_value_x,)}")

In [16]:
def generate_submission(predictions, index): 
    sample = pd.read_csv(f'{data_directory}/sample_submission.csv')

    submission = pd.DataFrame(testidt)
    submission[1] = predictions

    submission.columns = ['BraTS21ID','MGMT_value']
    submission = submission.groupby('BraTS21ID', as_index=False).mean()
    submission['BraTS21ID'] = submission['BraTS21ID']
    submission['MGMT_value'] = submission['MGMT_value'].apply(lambda x:round(x*10)/10)

    os.makedirs(f'{experiment_folder}/fold_{index}', exist_ok=True)
    submission.to_csv(f'{experiment_folder}/fold_{index}/submission.csv',index=False)

In [17]:
skf.split??

Object `skf.split` not found.


In [18]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

In [19]:
for i, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train = X[train_index]
    X_valid = X[val_index]
    y_train = y[train_index]
    y_valid = y[val_index]
    trainidt_train = trainidt[train_index]
    trainidt_valid = trainidt[val_index]

    X_train = tf.expand_dims(X_train, axis=-1)
    X_valid = tf.expand_dims(X_valid, axis=-1)

    y_train = to_categorical(y_train)
    y_valid = to_categorical(y_valid)

    checkpoint_filepath = f'{experiment_folder}/fold_{i}/best_model.h5'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=False,
        monitor='val_loss',
        mode='max',
        save_best_only=True,
        save_freq='epoch'
    )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=0.00001
    )
    # lr_schedule = tf.optimizers.schedules.ExponentialDecay(1e-4, 100, 0.9)
    # wd_schedule = tf.optimizers.schedules.ExponentialDecay(5e-5, 100, 0.9)
    # opt = tfa.optimizers.AdamW(learning_rate=lr_schedule, weight_decay=lambda : None)
    # opt.weight_decay = lambda : wd_schedule(opt.iterations)

    model = build_model(X_train)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate = 0.0001),
        metrics=[tf.keras.metrics.AUC()]
    )

    history = model.fit(
        x=X_train,
        y=y_train,
        epochs=10,
        batch_size=4,
        callbacks=[model_checkpoint_callback,reduce_lr],
        validation_data=(X_valid, y_valid)
    )
    best_model = tf.keras.models.load_model(filepath=checkpoint_filepath,custom_objects={'KerasLayer': tfhub.KerasLayer})
    
    val_preds = evaluate(best_model, X_valid, trainidt_valid)
    test_preds = predict(best_model, X_test)
    submission = generate_submission(test_preds, i)

2021-09-27 19:56:18.322366: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10


2021-09-27 19:56:44.397438: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8204
2021-09-27 19:56:45.420275: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.40GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-09-27 19:56:45.447411: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.42GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-09-27 19:56:45.550763: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.39GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory



2021-09-27 20:07:42.304841: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.40GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-09-27 20:07:42.330507: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.41GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2021-09-27 21:42:55.257758: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.43GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-09-27 21:42:55.298004: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.61GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


roc auc: 0.6201040026716282
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
roc auc: 0.6373236079328757
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
roc auc: 0.6447092218882687
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
roc auc: 0.6063453159041394
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
roc auc: 0.9812866752540432


In [15]:

path = "/home/christian/Documents/projects/brain-tumor-classification/experiments/efficientnet2d_preprocessed"

In [19]:
preds = []
for fold_dir in os.listdir(path):
    print(fold_dir)
    sub = pd.read_csv(os.path.join(path, fold_dir, "submission.csv"))
    preds.append(sub["MGMT_value"])
preds = np.mean(preds, axis=0)

fold_0
fold_4
fold_1
fold_2
fold_3


In [20]:
preds

array([0.8 , 0.72, 0.68, 0.7 , 0.9 , 0.78, 0.62, 0.76, 0.74, 0.98, 0.62,
       0.06, 0.3 , 0.26, 0.64, 0.62, 0.28, 0.26, 0.08, 0.16, 0.12, 0.54,
       0.28, 0.38, 0.06, 0.26, 0.24, 0.24, 0.12, 0.36, 0.34, 0.64, 0.22,
       0.18, 0.08, 0.08, 0.06, 0.24, 0.34, 0.18, 0.68, 0.16, 0.82, 0.16,
       0.18, 0.34, 0.58, 0.58, 0.66, 0.8 , 0.58, 0.5 , 0.9 , 0.78, 0.8 ,
       0.72, 0.68, 0.82, 0.92, 0.54, 0.22, 0.76, 0.62, 0.64, 0.64, 0.78,
       0.62, 0.82, 0.7 , 0.8 , 0.58, 0.84, 0.92, 0.78, 0.48, 0.86, 0.7 ,
       0.76, 0.74, 0.6 , 0.82, 0.72, 0.86, 0.82, 0.22, 0.46, 0.5 ])

In [21]:
sub["MGMT_value"] = preds
sub.to_csv(os.path.join(path, "submission.csv"))