In [16]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
import matplotlib.pyplot as plt

In [13]:
!pip install --ignore-installed tensorflow

Collecting tensorflow
  Using cached tensorflow-2.1.0-cp37-cp37m-manylinux2010_x86_64.whl (421.8 MB)
Collecting scipy==1.4.1; python_version >= "3"
  Using cached scipy-1.4.1-cp37-cp37m-manylinux1_x86_64.whl (26.1 MB)
Collecting grpcio>=1.8.6
  Using cached grpcio-1.28.1-cp37-cp37m-manylinux2010_x86_64.whl (2.8 MB)
Processing /home/dymkiewi/.cache/pip/wheels/3f/e3/ec/8a8336ff196023622fbcb36de0c5a5c218cbb24111d1d4c7f2/termcolor-1.1.0-py3-none-any.whl
Collecting numpy<2.0,>=1.16.0
  Using cached numpy-1.18.2-cp37-cp37m-manylinux1_x86_64.whl (20.2 MB)
Collecting six>=1.12.0
  Downloading six-1.14.0-py2.py3-none-any.whl (10 kB)
Processing /home/dymkiewi/.cache/pip/wheels/62/76/4c/aa25851149f3f6d9785f6c869387ad82b3fd37582fa8147ac6/wrapt-1.12.1-cp37-cp37m-linux_x86_64.whl
Collecting google-pasta>=0.1.6
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting protobuf>=3.8.0
  Using cached protobuf-3.11.3-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
Collecting keras-preprocessing>=1

In [14]:
import tensorflow as tf

tf.test.is_gpu_available('GPU')

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [17]:
all_xray_df = pd.read_csv('./data/Data_Entry_2017.csv')
all_image_paths = {os.path.basename(x): x for x in 
                   glob(os.path.join(os.getcwd(), './data/images_all', '*.png'))}
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)

Scans found: 112120 , Total Headers 112120


In [19]:
all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].map(lambda x: x.replace('No Finding', ''))
from itertools import chain
all_labels = np.unique(list(chain(*all_xray_df['Finding Labels'].map(lambda x: x.split('|')).tolist())))
all_labels = [x for x in all_labels if len(x)>0]

In [20]:
for c_label in all_labels:
    if len(c_label)>1: # leave out empty labels
        all_xray_df[c_label] = all_xray_df['Finding Labels'].map(lambda finding: '1.0' if c_label in finding else '0')

In [21]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(all_xray_df, 
                                   test_size = 0.1, 
                                   random_state = 2137,
                                   stratify = all_xray_df['Finding Labels'].map(lambda x: x[:4]))
print('train', train_df.shape[0], 'validation', valid_df.shape[0])

train 100908 validation 11212


In [22]:
from keras.preprocessing.image import ImageDataGenerator
IMG_SIZE = (256, 256)
core_idg = ImageDataGenerator(samplewise_center=True, 
                              samplewise_std_normalization=True, 
                              horizontal_flip = True, 
                              vertical_flip = False, 
                              height_shift_range= 0.05, 
                              width_shift_range=0.1, 
                              rotation_range=5, 
                              shear_range = 0.1,
                              fill_mode = 'reflect',
                              zoom_range=0.15)

Using TensorFlow backend.


In [23]:
all_xray_df['path'] = all_xray_df['Image Index'].map(lambda x: './data/images_all/'+x)

In [24]:
columns=['Atelectasis','Cardiomegaly','Consolidation','Edema','Effusion','Emphysema','Fibrosis','Hernia','Infiltration',
 'Mass',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']
CLASS_MODE = 'raw'
Y_COL = columns
DIRESTORY = None
X_COL = 'path'
CLASSES = all_labels

train_gen=core_idg.flow_from_dataframe(
                        dataframe=train_df,
                        directory=DIRESTORY,
                        x_col=X_COL,
                        y_col=Y_COL,
                        batch_size=12,
                        color_mode = 'rgb',
                        class_mode=CLASS_MODE,
                        target_size=IMG_SIZE)

valid_gen=core_idg.flow_from_dataframe(
                        dataframe=valid_df,
                        directory=DIRESTORY,
                        x_col=X_COL,
                        y_col=Y_COL,
                        batch_size=32,
                        color_mode = 'rgb',
                        class_mode=CLASS_MODE,
                        target_size=IMG_SIZE)

test_X, test_Y = next(core_idg.flow_from_dataframe(
                        dataframe=valid_df,
                        directory=DIRESTORY,
                        x_col=X_COL,
                        y_col=Y_COL,
                        batch_size=11212 ,
                        color_mode = 'rgb',
                        class_mode=CLASS_MODE,
                        target_size=IMG_SIZE)) # one big batch

Found 100908 validated image filenames.
Found 11212 validated image filenames.
Found 11212 validated image filenames.


In [None]:
!pip install tf-nightly==2.2.0-dev20200206

In [26]:
from efficientnet.keras import EfficientNetB1
import tensorflow as tf

from keras.layers.normalization import BatchNormalization
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten, Activation
from keras.models import Sequential

base_model = EfficientNetB1(input_shape =  (256,256,3), 
                                 include_top = False, weights = 'imagenet')
multi_disease_model = Sequential()
multi_disease_model.add(base_model)
multi_disease_model.add(GlobalAveragePooling2D())
multi_disease_model.add(Dense(128))
multi_disease_model.add(BatchNormalization())
multi_disease_model.add(Activation('relu'))
multi_disease_model.add(Dense(128))
multi_disease_model.add(Activation('relu'))

multi_disease_model.add(Dense(len(all_labels), activation = 'sigmoid'))

multi_disease_model.compile(optimizer = 'adam', loss = 'binary_crossentropy'
                            ,metrics =[tf.keras.metrics.AUC()])

In [34]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
weight_path="{}_.hdf5".format('imagenet_from_scratch_with_auc_metrics_multilabel_false')

checkpoint = ModelCheckpoint(weight_path, monitor='val_auc_1', verbose=1, 
                             save_best_only=True, mode='auto', save_weights_only = True)

early = EarlyStopping(monitor="val_loss", 
                      mode="min", 
                      patience=15)
callbacks_list = [checkpoint, early]

In [None]:
multi_disease_model.load_weights('./data/xray_class_weights.best.hdf5')

In [None]:
multi_disease_model.load_weights('./xray_class_efficientNet.best.hdf5')

In [None]:
multi_disease_model.load_weights('./imagenet_from_scratch_.hdf5')

In [None]:
pred_Y = multi_disease_model.predict(test_X, verbose = True)

In [None]:
test_Y = test_Y.astype(np.float)

In [None]:
from sklearn.metrics import roc_curve, auc
avg = []
fig, c_ax = plt.subplots(1,1, figsize = (9, 9))
for (idx, c_label) in enumerate(all_labels):
    fpr, tpr, thresholds = roc_curve(test_Y[:,idx].astype(int), pred_Y[:,idx])
    c_ax.plot(fpr, tpr, label = '%s (AUC:%0.2f)'  % (c_label, auc(fpr, tpr)))
    avg.append(auc(fpr, tpr).astype(np.float))
    
print(sum(avg) / len(avg))
c_ax.legend()
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')

In [None]:
multi_disease_model.fit_generator(train_gen, 
                                  steps_per_epoch=15,
                                  validation_data = valid_gen, 
                                  validation_steps = 20,
                                  epochs = 5, 
                                  callbacks = callbacks_list)


Epoch 1/5

Epoch 00001: val_auc_1 improved from inf to 0.52457, saving model to imagenet_from_scratch_with_auc_metrics_multilabel_false_.hdf5
Epoch 2/5

Epoch 00002: val_auc_1 did not improve from 0.52457
Epoch 3/5

Epoch 00003: val_auc_1 did not improve from 0.52457
Epoch 4/5