## Train

##### Copyright (C) Microsoft Corporation.  
see license file for details 

In [1]:
# Allow multiple displays per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file 

import os
try:
    amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY']    
except:
    amlWBSharedDir = ''
    print('not using aml services?')
    
amlWBSharedDir

'/azureml-share/'

In [3]:
## Data needs 2 things
## TEMP (Get images)
#crt_container  = 'https://chestxray.blob.core.windows.net/chestxraynih'
#crt_destination = '/mnt/images'
#answer = !yes | azcopy \
#    --source {crt_container} \
#    --destination {crt_destination} \
#    --recursive
## TEMP (Get Labels csv)
# Put to blob

# Why not have a zip from blob that gets unzipped and has both images and csv?
# Would make self-contained ...

In [4]:
import os
import sys
import numpy as np
import pandas as pd
import torch
import torchvision.models as models
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
import time
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.autograd import Variable
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split
from PIL import Image
import multiprocessing

In [5]:
assert torch.cuda.is_available()

In [6]:
torch.backends.cudnn.benchmark=True # enables cudnn's auto-tuner

In [7]:
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("PyTorch: ", torch.__version__)
print("Numpy: ", np.__version__)

CPU_COUNT = multiprocessing.cpu_count()
print("CPUs: ", CPU_COUNT)

OS:  linux
Python:  3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
PyTorch:  0.3.1
Numpy:  1.14.0
CPUs:  12


In [8]:
# Globals
# With small batch may be faster on P100 to do one 1 GPU
MULTI_GPU = True
CLASSES = 14
WIDTH = 224
HEIGHT = 224
CHANNELS = 3
LR = 0.0001
EPOCHS = 2 #100
# Can scale to max for inference but for training LR will be affected
# Prob better to increase this though on P100 since LR is not too low
# Easier to see when plotted
BATCHSIZE = 16 #64*2
IMAGENET_RGB_MEAN = [0.485, 0.456, 0.406]
IMAGENET_RGB_SD = [0.229, 0.224, 0.225]

In [9]:
# import utlity functions

import sys, os
paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code',  'src'])))]
def add_path_to_sys_path(path_to_append):
    if not (any(path_to_append in paths for paths in sys.path)):
        sys.path.append(path_to_append)
[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]

import azure_chestxray_utils

[None]

In [10]:
# create the file path variables 
# paths are tipically container level dirs mapped to a host dir for data persistence.

prj_consts = azure_chestxray_utils.chestxray_consts()

data_base_input_dir=os.path.join(amlWBSharedDir, 
                                 os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))
data_base_output_dir=os.path.join(amlWBSharedDir, 
                                  os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))  
nih_chest_xray_data_dir=os.path.join(data_base_input_dir, 
                                     os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))
other_data_dir=os.path.join(data_base_input_dir, 
                            os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))
label_file = os.path.join(other_data_dir,'Data_Entry_2017.csv')

data_partitions_dir=os.path.join(data_base_output_dir, 
                                os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list)))  
nih_chest_xray_data_dir
!find $nih_chest_xray_data_dir -type f | wc -l

other_data_dir
!ls $other_data_dir

'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'

112120


'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC_other'

BBox_List_2017.csv  Data_Entry_2017.csv  blacklist.csv


In [11]:
# Paths
# BASE_DIR = "/mnt"
# DATA_FOLDER = os.path.join(BASE_DIR, "ChestXray-NIHCC")
# IMAGE_FOLDER = os.path.join(BASE_DIR, "images")
# LABEL_FILE = os.path.join(DATA_FOLDER, "Data_Entry_2017.csv")
# print(IMAGE_FOLDER, LABEL_FILE)

In [12]:
#####################################################################################################
## Data Loading
#####################################################################################################

In [13]:
# # todo
# # This should prob be a generic function
# # Split data into train/val/test

# real_total_patient_number = 30805
# patient_id_original = [i for i in range(real_total_patient_number + 1)]

# bbox_df = pd.read_csv(os.path.join(other_data_dir, 'BBox_List_2017.csv'))

# black_list_set = set()
# with open(os.path.join(other_data_dir, 'blacklist.csv'), 'r') as f:
#     for line in f:
#         # delete the last char which is \n
#         black_list_set.add(line[:-1])
#         if int(line[:-9]) >= 30805:
#             print(line[:-1])

# # print("00029404_009.png" in black_list_set)
# bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)

# bbox_patient_index_list = []


# for index, item in bbox_patient_index_df.iteritems():
#     bbox_patient_index_list.append(int(item))

# patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))
# print("len of patient id is", len(patient_id))
# print("len of unique patient id with annotated data", 
#       len(list(set(bbox_patient_index_list))))
# print("len of patient id with annotated data",bbox_df.shape[0])
# print("len of original patient id is", len(patient_id_original))


In [14]:
# # set fast_testing True to see the training pipeline running for a few iterations on a very small # of images
# # set fast_testing False to perfrom real training on full training data 
# fast_testing = True

# # for real training we need random order
# if (fast_testing):
#     shuffle_data_FLAG = False
#     crt_patient_id = patient_id[:300]
#     left_out_patient_id = patient_id[300:]
# else:
#     crt_patient_id = patient_id
#     left_out_patient_id = []
#     # set seed to reproduce result
#     random.seed(0)
#     shuffle_data_FLAG = True

# # training:valid:test=7:1:2
# train_set, other_set = train_test_split(
#     crt_patient_id, train_size=0.7, test_size=0.3, shuffle=shuffle_data_FLAG)
# valid_set, test_set = train_test_split(
#     other_set, train_size=1/3, test_size=2/3, shuffle=shuffle_data_FLAG)
# print("train:{} valid:{} test:{}".format(len(train_set), len(valid_set), len(test_set)))

# # test_set = test_set+left_out_patient_id
# # print("train:{} valid:{} test:{}".format(len(train_set), len(valid_set), len(test_set)))

In [15]:
import pickle
patient_id_partition_file = os.path.join(data_partitions_dir, 'train_test_valid_data_partitions.pickle')

with open(patient_id_partition_file, 'rb') as f:
    [train_set,valid_set,test_set, nih_annotated_set]=pickle.load(f)

print("train:{} valid:{} test:{} nih-annotated:{}".format(len(train_set), len(valid_set), \
                                                     len(test_set), len(nih_annotated_set)))

train:21563 valid:3081 test:6161 nih-annotated:726


In [16]:
class XrayData(Dataset):
    def __init__(self, img_dir, lbl_file, patient_ids, transform=None):
        
        # Read labels-csv
        df = pd.read_csv(lbl_file)
        # Filter by patient-ids
        df = df[df['Patient ID'].isin(patient_ids)]
        # Split labels
        df_label = df['Finding Labels'].str.split(
            '|', expand=False).str.join(sep='*').str.get_dummies(sep='*')
        df_label.drop(['No Finding'], axis=1, inplace=True)
                
        # List of images (full-path)
        self.img_locs =  df['Image Index'].map(lambda im: os.path.join(img_dir, im)).values
        # One-hot encoded labels (float32 for BCE loss)
        self.labels = df_label.values
        # Processing
        self.transform = transform
              
        print("Loaded {} labels and {} images".format(len(self.labels), 
                                                      len(self.img_locs)))
    
    def __getitem__(self, idx):
        
        im_file = self.img_locs[idx]
        im_rgb = Image.open(im_file).convert('RGB')
        label = self.labels[idx]
        if self.transform is not None:
            im_rgb = self.transform(im_rgb)
        return im_rgb, torch.FloatTensor(label)
        
    def __len__(self):
        return len(self.img_locs)

In [17]:
def no_augmentation_dataset(img_dir, lbl_file, patient_ids, normalize):
    dataset = XrayData(img_dir, lbl_file, patient_ids,
                       transform=transforms.Compose([
                           transforms.Resize(WIDTH),
                           transforms.ToTensor(),  
                           normalize]))
    return dataset

In [18]:
# Dataset for training
# Normalise by imagenet mean/sd
normalize = transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)
# todo
# Go wild here with the transforms
# https://github.com/pytorch/vision/blob/master/torchvision/transforms/transforms.py
#__all__ = ["Compose", "ToTensor", "ToPILImage", "Normalize", "Resize", "Scale", "CenterCrop", "Pad",
#           "Lambda", "RandomCrop", "RandomHorizontalFlip", "RandomVerticalFlip", "RandomResizedCrop",
#           "RandomSizedCrop", "FiveCrop", "TenCrop", "LinearTransformation", "ColorJitter", "RandomRotation",
#           "Grayscale", "RandomGrayscale"]
train_dataset = XrayData(img_dir=nih_chest_xray_data_dir,
                         lbl_file=label_file,
                         patient_ids=train_set,
                         transform=transforms.Compose([
                             transforms.Resize(264),
                             transforms.RandomHorizontalFlip(),
                             transforms.RandomResizedCrop(size=WIDTH),
                             transforms.ColorJitter(0.15, 0.15),
                             transforms.RandomRotation(15),
                             transforms.ToTensor(),  # need to convert image to tensor!
                             normalize]))

Loaded 69217 labels and 69217 images


In [19]:
valid_dataset = no_augmentation_dataset(nih_chest_xray_data_dir, label_file, valid_set, normalize)
test_dataset = no_augmentation_dataset(nih_chest_xray_data_dir, label_file, test_set, normalize)

Loaded 9600 labels and 9600 images
Loaded 33303 labels and 33303 images


In [20]:
#####################################################################################################
## Helper Functions
#####################################################################################################

In [21]:
def get_symbol(out_features=CLASSES, multi_gpu=MULTI_GPU):
    model = models.densenet.densenet121(pretrained=True)
    # Replace classifier (FC-1000) with (FC-14)
    model.classifier = nn.Sequential(
        nn.Linear(model.classifier.in_features, out_features), 
        nn.Sigmoid())
    if multi_gpu:
        model = nn.DataParallel(model)
    # CUDA
    model.cuda()  
    return model

In [22]:
def init_symbol(sym, lr=LR):
    # torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    opt = optim.Adam(sym.parameters(), lr=lr, betas=(0.9, 0.999))
    criterion = nn.BCELoss()
    scheduler = ReduceLROnPlateau(opt, factor = 0.1, patience = 5, mode = 'min')
    return opt, criterion, scheduler 

In [23]:
def compute_roc_auc(data_gt, data_pd, mean=True, classes=CLASSES):
    roc_auc = []
    data_gt = data_gt.cpu().numpy()
    data_pd = data_pd.cpu().numpy()
    for i in range(classes):
        roc_auc.append(roc_auc_score(data_gt[:, i], data_pd[:, i]))
    if mean:
        roc_auc = np.mean(roc_auc)
    return roc_auc

In [24]:
def train_epoch(model, dataloader, optimizer, criterion, epoch, batch=BATCHSIZE):
    model.train()
    print("Training epoch {}".format(epoch+1))
    loss_val = 0
    loss_cnt = 0
    for data, target in dataloader:
        # Get samples
        data = Variable(torch.FloatTensor(data).cuda())
        target = Variable(torch.FloatTensor(target).cuda())
        # Init
        optimizer.zero_grad()
        # Forwards
        output = model(data)
        # Loss
        loss = criterion(output, target)
        # Back-prop
        loss.backward()
        optimizer.step()   
         # Log the loss
        loss_val += loss.data[0]
        loss_cnt += 1
    print("Training loss: {0:.4f}".format(loss_val/loss_cnt))

In [25]:
def valid_epoch(model, dataloader, criterion, epoch, phase='valid', batch=BATCHSIZE):
    model.eval()
    if phase == 'testing':
        print("Testing epoch {}".format(epoch+1))
    else:
        print("Validating epoch {}".format(epoch+1))
    out_pred = torch.FloatTensor().cuda()
    out_gt = torch.FloatTensor().cuda()
    loss_val = 0
    loss_cnt = 0
    for data, target in dataloader:
        # Get samples
        data = Variable(torch.FloatTensor(data).cuda(), volatile=True)
        target = Variable(torch.FloatTensor(target).cuda(), volatile=True)
         # Forwards
        output = model(data)
        # Loss
        loss = criterion(output, target)
        # Log the loss
        loss_val += loss.data[0]
        loss_cnt += 1
        # Log for AUC
        out_pred = torch.cat((out_pred, output.data), 0)
        out_gt = torch.cat((out_gt, target.data), 0)
    loss_mean = loss_val/loss_cnt
    if phase == 'testing':
        print("Test-Dataset loss: {0:.4f}".format(loss_mean))
        print("Test-Dataset AUC: {0:.4f}".format(compute_roc_auc(out_gt, out_pred)))

    else:
        print("Validation loss: {0:.4f}".format(loss_mean))
        print("Validation AUC: {0:.4f}".format(compute_roc_auc(out_gt, out_pred)))
    return loss_mean

In [26]:
def print_learning_rate(opt):
    for param_group in opt.param_groups:
        print("Learining rate: ", param_group['lr'])

In [27]:
# DataLoaders
# num_workers=4*CPU_COUNT
# pin_memory=True
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCHSIZE,
                          shuffle=True, num_workers=0, pin_memory=False)

valid_loader = DataLoader(dataset=valid_dataset, batch_size=8*BATCHSIZE,
                          shuffle=False, num_workers=0, pin_memory=False)

test_loader = DataLoader(dataset=test_dataset, batch_size=8*BATCHSIZE,
                         shuffle=False, num_workers=0, pin_memory=False)

In [28]:
#####################################################################################################
## Train Azure Chest Xray
#####################################################################################################

In [29]:
%%time
# Load symbol
azure_chest_xray_sym = get_symbol()

Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /home/mmlspark/.torch/models/densenet121-a639ec97.pth
100%|██████████| 32342954/32342954 [00:00<00:00, 55784863.78it/s]


CPU times: user 2.65 s, sys: 1.01 s, total: 3.66 s
Wall time: 3.92 s


In [30]:
%%time
# Load optimiser, loss
optimizer, criterion, scheduler = init_symbol(azure_chest_xray_sym)

CPU times: user 1.89 ms, sys: 240 µs, total: 2.13 ms
Wall time: 2.14 ms


In [31]:
!nvidia-smi
!cat /usr/local/cuda-8.0/version.txt
!cat /usr/local/cuda/version.txt


Wed Mar 21 13:05:22 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111                Driver Version: 384.111                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 0000C3D4:00:00.0 Off |                  Off |
| N/A   36C    P0    72W / 149W |    241MiB / 12205MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           Off  | 0000DCAE:00:00.0 Off |                  Off |
| N/A   41C    P8    26W / 149W |     11MiB / 12205MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [32]:
# Original CheXNet ROC AUC = 0.841
loss_min = float("inf")    
stime = time.time()

# No-training
valid_epoch(azure_chest_xray_sym, valid_loader, criterion, -1)

# Main train/val/test loop
for j in range(EPOCHS):
    train_epoch(azure_chest_xray_sym, train_loader, optimizer, criterion, j)
    loss_val = valid_epoch(azure_chest_xray_sym, valid_loader, criterion, j)
    test_loss_val = valid_epoch(azure_chest_xray_sym, test_loader, criterion, j, 'testing')
    # LR Schedule
    scheduler.step(loss_val)
    print_learning_rate(optimizer)
    # todo: tensorboard hooks
    # Logging
    if loss_val < loss_min:
        print("Loss decreased. Saving ...")
        loss_min = loss_val
        torch.save({'epoch': j + 1, 
                    'state_dict': azure_chest_xray_sym.state_dict(), 
                    'best_loss': loss_min, 
                    'optimizer' : optimizer.state_dict()}, 'best_azure_chest_xray_model_v2.pth.tar')
    etime = time.time()
    print("Epoch time: {0:.0f} seconds".format(etime-stime))
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

Validating epoch 0
Validation loss: 0.5749
Validation AUC: 0.5071


0.5748794953028361

Training epoch 1
Training loss: 0.1596
Validating epoch 1
Validation loss: 0.1471
Validation AUC: 0.7662
Testing epoch 1
Test-Dataset loss: 0.1952
Test-Dataset AUC: 0.7568
Learining rate:  0.0001
Loss decreased. Saving ...
Epoch time: 9170 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training epoch 2
Training loss: 0.1516
Validating epoch 2
Validation loss: 0.1464
Validation AUC: 0.7875
Testing epoch 2
Test-Dataset loss: 0.1908
Test-Dataset AUC: 0.7820
Learining rate:  0.0001
Loss decreased. Saving ...
Epoch time: 13616 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [33]:
#####################################################################################################
## Test azure_chest_xray
#####################################################################################################

In [34]:
# Load model for testing
azure_chest_xray_sym_test = get_symbol()
chkpt = torch.load("best_azure_chest_xray_model_v2.pth.tar")
azure_chest_xray_sym_test.load_state_dict(chkpt['state_dict'])

In [35]:
valid_loss = valid_epoch(azure_chest_xray_sym_test, valid_loader, criterion, -1)
test_loss = valid_epoch(azure_chest_xray_sym_test, test_loader, criterion, -1, 'testing')

Validating epoch 0
Validation loss: 0.1464
Validation AUC: 0.7875
Testing epoch 0
Test-Dataset loss: 0.1908
Test-Dataset AUC: 0.7820


In [36]:
#import torch.onnx
#dummy_input = Variable(torch.randn(BATCHSIZE, CHANNELS, HEIGHT, WIDTH)).cuda()
#torch.onnx.export(azure_chest_xray_sym_test, dummy_input, "azure_chest_xray.proto", verbose=True)

In [37]:
# jupyter nbconvert --to html .\Code\02_Model\060_Train_pyTorch.ipynb