In [1]:
MULTI_GPU = True  # TOGGLE THIS

In [2]:
#%%bash
#wget -N https://ikpublictutorial.blob.core.windows.net/deeplearningframeworks/DenseNet_121.caffemodel

In [3]:
import os
import sys
import time
import multiprocessing
import numpy as np
import pandas as pd
import chainer
import chainer.functions as F
import chainer.links as L
import collections
from chainer import optimizers, cuda, dataset, training
from chainer.training import extensions, updaters, StandardUpdater
from chainer.dataset import concat_examples
from chainer.links.caffe import CaffeFunction
from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split
from PIL import Image
from chainercv import transforms
import random
from common.utils import download_data_chextxray, get_imgloc_labels, get_train_valid_test_split
from common.utils import compute_roc_auc, get_cuda_version, get_cudnn_version, get_gpu_name
from common.params_dense import *

  from ._conv import register_converters as _register_converters


In [4]:
BATCHSIZE = 56

In [5]:
# Performance Improvement
# 1. Auto-tune
# This adds very little now .. not sure if True by default?
chainer.cuda.set_max_workspace_size(512 * 1024 * 1024)
chainer.global_config.autotune = True

In [6]:
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Chainer: ", chainer.__version__)
print("CuPy: ", chainer.cuda.cupy.__version__)
print("Numpy: ", np.__version__)
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())

OS:  linux
Python:  3.5.4 |Anaconda custom (64-bit)| (default, Nov 20 2017, 18:44:38) 
[GCC 7.2.0]
Chainer:  4.1.0
CuPy:  4.1.0
Numpy:  1.14.1
GPU:  ['Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB']
CUDA Version 9.0.176
CuDNN Version  7.0.5


In [7]:
CPU_COUNT = multiprocessing.cpu_count()
GPU_COUNT = len(get_gpu_name())
print("CPUs: ", CPU_COUNT)
print("GPUs: ", GPU_COUNT)

CPUs:  24
GPUs:  4


In [8]:
DEVICES=tuple(list(range(GPU_COUNT)))
print(DEVICES)
if MULTI_GPU:
    from cupy.cuda import nccl  # Test that nccl works for multi-gpu

(0, 1, 2, 3)


In [9]:
# Model-params
IMAGENET_RGB_MEAN_CAFFE = np.array([123.68, 116.78, 103.94], dtype=np.float32)
IMAGENET_SCALE_FACTOR_CAFFE = 0.017
# Paths
CSV_DEST = "chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")
print(IMAGE_FOLDER, LABEL_FILE)

chestxray/images chestxray/Data_Entry_2017.csv


In [10]:
# Manually scale to multi-gpu
if MULTI_GPU:
    # BATCH is auto-scaled
    LR *= GPU_COUNT
print(LR)

0.0004


In [11]:
%%time
# Download data
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 608 ms, sys: 236 ms, total: 844 ms
Wall time: 843 ms


In [12]:
#####################################################################################################
## Data Loading

In [13]:
class XrayData(dataset.DatasetMixin):
    def __init__(self, patient_ids, height=HEIGHT, width=WIDTH,
                 imagenet_mean=IMAGENET_RGB_MEAN_CAFFE, imagenet_scaling = IMAGENET_SCALE_FACTOR_CAFFE,
                 img_dir=IMAGE_FOLDER, lbl_file=LABEL_FILE, augmentation=None):
          
        self.img_locs, self.labels = get_imgloc_labels(img_dir, lbl_file, patient_ids)
        self.augmentation = augmentation
        self.imagenet_mean = imagenet_mean
        self.imagenet_scaling = imagenet_scaling
        self.h = height
        self.w = width
        print("Loaded {} labels and {} images".format(len(self.labels), len(self.img_locs)))
        
    def __len__(self):
        return len(self.img_locs)   
    
    def get_example(self, idx):
        im_file = self.img_locs[idx]
        # RGB Image
        im_rgb = Image.open(im_file)
        im_rgb = self._apply_data_preprocessing(im_rgb)
        label = self.labels[idx]
        if self.augmentation is not None:
            # Random crop to 224, random flip
            im_rgb = self._apply_data_augmentation(im_rgb)
        else:
            # Train/Val resize from 264 to 224
            im_rgb = transforms.resize(im_rgb, size=(self.h, self.w))
        return np.array(im_rgb, dtype=np.float32), np.array(label, dtype=np.int32)
    
    def _apply_data_preprocessing(self, rgb_im):
        # Array
        im = np.asarray(rgb_im, dtype=np.float32)
        # (w, h, c) to (c, h, w)
        im = im.transpose(2, 0, 1)
        # Caffe normalisation
        im -= self.imagenet_mean[:, None, None]
        im *= self.imagenet_scaling
        return im

    def _apply_data_augmentation(self, im):
        im = transforms.random_crop(im, size=(self.h,self.w))
        im = transforms.random_flip(im)
        return im

In [14]:
train_set, valid_set, test_set = get_train_valid_test_split(TOT_PATIENT_NUMBER)

train:21563 valid:3080 test:6162


In [15]:
train_dataset = XrayData(img_dir=IMAGE_FOLDER, lbl_file=LABEL_FILE, patient_ids=train_set, augmentation=True)
valid_dataset = XrayData(img_dir=IMAGE_FOLDER, lbl_file=LABEL_FILE, patient_ids=valid_set, augmentation=False)
test_dataset  = XrayData(img_dir=IMAGE_FOLDER, lbl_file=LABEL_FILE, patient_ids=test_set, augmentation=False)

Loaded 87306 labels and 87306 images
Loaded 7616 labels and 7616 images
Loaded 17198 labels and 17198 images


In [16]:
#####################################################################################################
## Helper Functions

In [17]:
def truncate_bn(sym):
    # Need to truncate batchnorm - eps
    for layer in list(sym._children):
        if "bn" in layer:
            if sym.__dict__[layer].eps < 1e-5:
                sym.__dict__[layer].eps = 1e-5

In [18]:
class CaffeFunctionDenseNet121(CaffeFunction):
        
    # Standard function saves all variables so cannot use big batch
    # This lets me run BATCH of 56 over 32 - still can't get to 64
    # https://github.com/chainer/chainer/blob/master/chainer/links/caffe/caffe_function.py#L176
    def __call__(self, inputs, **kwargs):
        variables = dict(inputs)
        # Pools not to save
        # These layers are not concatenated
        _NOSAVE = set(['pool5', 'concat_5_16', 'concat_4_24', 'concat_3_12', 'concat_2_6'])
        # Forward through all layers
        for func_name, bottom, top in self.layers:

            func = self.forwards[func_name]
            # Concat ops require some previous layers that are saved
            if "concat" in func_name:
                input_vars = tuple([variables[bottom[0]], variables['data']])
            else:
                input_vars = tuple([variables['data']])
            output_vars = func(*input_vars)
            # Delete layers for concat once used
            if "concat" in func_name:
                del variables[bottom[0]]
            if not isinstance(output_vars, collections.Iterable):
                output_vars = output_vars,
            # Save to dict
            variables['data'] = output_vars[0]
            top = top[0]
            # Save for concat
            if ("pool" in top) and (top not in _NOSAVE):
                variables[top] = output_vars[0]
            elif ("concat" in top) and (top not in _NOSAVE):
                variables[top] = output_vars[0]
                
        return tuple([variables['data']])

In [19]:
class DenseNet121(chainer.Chain):
    # Class to wrap base (up to pool5 output)
    def __init__(self, base_symbol, n_classes=14):
        super(DenseNet121, self).__init__()
        with self.init_scope():
            self.base_symbol = base_symbol
            self.fc = L.Linear(1024, n_classes)
    
    def __call__(self, x):
        h = self.base_symbol(inputs={'data':x}, 
                             outputs=['pool5'])[0]
        return self.fc(h)

In [20]:
def get_symbol(model_name='densenet121', multi_gpu=MULTI_GPU, classes=CLASSES):
    if model_name == 'densenet121':
        # Load base
        #base_symbol = CaffeFunction("DenseNet_121.caffemodel")
        base_symbol = CaffeFunctionDenseNet121("DenseNet_121.caffemodel")
        # Fix batch-norm
        truncate_bn(base_symbol)
        # Remove unused 
        base_symbol.__delattr__('fc6')
        del base_symbol.forwards['fc6']
        del base_symbol.layers[-1]
        m = DenseNet121(base_symbol, classes)
    else:
        raise ValueError("Unknown model-name")
    # CUDA
    if not multi_gpu:
        print("One GPU")
        chainer.cuda.get_device(0).use()  # Make a specified GPU current
        m.to_gpu()  
    return m

In [21]:
def init_symbol(sym, lr=LR):
    opt = optimizers.Adam(alpha=lr, beta1=0.9, beta2=0.999)
    opt.setup(sym)
    return opt

In [22]:
def compute_roc_auc(data_gt, data_pd, full=True, classes=CLASSES):
    roc_auc = []
    for i in range(classes):
        roc_auc.append(roc_auc_score(data_gt[:, i], data_pd[:, i]))
    print("Full AUC", roc_auc)
    roc_auc = np.mean(roc_auc)
    return roc_auc

In [23]:
def lossfun(x, t):
    return F.sigmoid_cross_entropy(x, t)

In [24]:
%%time
# Load symbol
predictor = get_symbol()
chexnet_sym = L.Classifier(predictor, lossfun=lossfun)
# Won't work for multi-class
chexnet_sym.compute_accuracy = False

CPU times: user 709 ms, sys: 108 ms, total: 817 ms
Wall time: 817 ms


In [25]:
%%time
# Load optimiser
optimizer = init_symbol(chexnet_sym)

CPU times: user 1.31 ms, sys: 105 µs, total: 1.41 ms
Wall time: 1.41 ms


In [26]:
# Data-iterators
# Open-CV runs on GPU by default!
# Weird-bug to set shared_mem (set random value and adjust)
if MULTI_GPU:
    train_iters = [
        chainer.iterators.MultiprocessIterator(
            i, BATCHSIZE, n_prefetch=10, n_processes=int(CPU_COUNT/len(DEVICES))) 
        for i in chainer.datasets.split_dataset_n_random(train_dataset, len(DEVICES))]
else:
    train_iter = chainer.iterators.MultiprocessIterator(
        train_dataset, BATCHSIZE, n_prefetch=10, n_processes=CPU_COUNT)

In [27]:
# These can have a higher batch-size than train since no grads stored
valid_iter = chainer.iterators.MultiprocessIterator(
    valid_dataset, BATCHSIZE, repeat=False, shuffle=False, n_prefetch=10, n_processes=CPU_COUNT)
test_iter = chainer.iterators.MultiprocessIterator(
    test_dataset, BATCHSIZE, repeat=False, shuffle=False, n_prefetch=10, n_processes=CPU_COUNT)

In [28]:
# MultiprocessParallelUpdater requires NCCL.
# https://github.com/nvidia/nccl#build--run
if MULTI_GPU:
    updater = updaters.MultiprocessParallelUpdater(train_iters, optimizer, devices=DEVICES)
else:
    updater = StandardUpdater(train_iter, optimizer, device=0)

  format(optimizer.eps))


In [29]:
val_interval = (1, 'epoch')
trainer = training.Trainer(updater, stop_trigger=(EPOCHS, 'epoch'))
trainer.extend(extensions.Evaluator(valid_iter, chexnet_sym, device=DEVICES[0]), trigger=val_interval)
trainer.extend(extensions.LogReport(trigger=val_interval))
trainer.extend(extensions.PrintReport(['epoch', 'iteration', 'main/loss', 'validation/main/loss']), 
               trigger=val_interval)
trainer.extend(extensions.ProgressBar(update_interval=500))

In [30]:
%%time
# 1 GPU = 47m15s 
# 4 GPU = 14min43s
trainer.run()

epoch       iteration   main/loss   validation/main/loss
[J1           390         0.171549    0.258607              
[J     total [############......................................] 25.66%
this epoch [##############....................................] 28.29%
       500 iter, 1 epoch / 5 epochs
       inf iters/sec. Estimated time to finish: 0:00:00.
[4A[J2           780         0.152335    0.309744              
[J     total [#########################.........................] 51.31%
this epoch [############################......................] 56.57%
      1000 iter, 2 epoch / 5 epochs
    2.2764 iters/sec. Estimated time to finish: 0:06:56.781205.
[4A[J3           1170        0.148951    0.365624              
[J     total [######################################............] 76.97%
this epoch [##########################################........] 84.86%
      1500 iter, 3 epoch / 5 epochs
    2.3064 iters/sec. Estimated time to finish: 0:03:14.564138.
[4A[J4           15

In [31]:
#####################################################################################################
## Test CheXNet

In [32]:
%%time
y_truth = test_dataset.labels
y_guess = []
test_iter.reset()
with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
    for test_batch in test_iter:
        # Data
        x_test, y_test = concat_examples(test_batch, device=DEVICES[0])
        # Prediction (need to apply sigmoid to turn into probability)
        pred = cuda.to_cpu(F.sigmoid(predictor(x_test)).data)
        # Collect results
        y_guess.append(pred)           
# Concatenate
y_guess = np.concatenate(y_guess, axis=0)

CPU times: user 42.8 s, sys: 5.41 s, total: 48.2 s
Wall time: 44 s


In [33]:
# Test AUC: 0.8025 for single-gpu
# Test AUC: 0.56 for multi-gpu
# BROKEN?
print("Test AUC: {0:.4f}".format(compute_roc_auc(y_truth, y_guess)))

Full AUC [0.6313460983636788, 0.6170507836493689, 0.5249451676029859, 0.5458232082094421, 0.7365650530533906, 0.5101163765400858, 0.6124915720521869, 0.6287404361303732, 0.46980560372232366, 0.5332452472285844, 0.546094904258738, 0.5641953609281717, 0.4102927564511489, 0.508842908005139]
Test AUC: 0.5600
