In [1]:

import os
import copy
import sys
import math
from datetime import datetime
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, matthews_corrcoef, auc, accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam,RMSprop,SGD
from tensorflow.keras import layers
from tensorflow.keras.layers import concatenate, add, GlobalAveragePooling2D, BatchNormalization, Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import initializers
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.applications.densenet import DenseNet121
from classification_models.tfkeras import Classifiers
from tensorflow.keras.models import load_model
from PIL import ImageFile
import random as python_random



In [2]:

# Seed NumPy, the stdlib `random` module and TensorFlow with one shared value.
SEED = 2021
np.random.seed(SEED)
python_random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
import os

# Pin the CUDA_VISIBLE_DEVICES environment variable to "4" for this process.
# NOTE(review): this runs after TensorFlow has been imported — confirm the
# setting is picked up before the first GPU initialisation.
os.environ.update({"CUDA_VISIBLE_DEVICES": "4"})


In [4]:
# Record the TensorFlow release this notebook ran against.
print(tf.version.VERSION)

2.0.0


In [5]:
# Tell PIL to load image files whose data is truncated instead of raising,
# so a partially written file does not abort a batch.
ImageFile.LOAD_TRUNCATED_IMAGES = True


In [6]:
# Read the three pre-split metadata tables (train / validation / test).
train_df, validate_df, test_df = map(
    pd.read_csv,
    ('train_censored.csv', 'val_censored.csv', 'test_censored.csv'),
)

In [7]:
# Keep only the rows whose Race is one of the three classes the model predicts.
race_labels = ['ASIAN', 'BLACK/AFRICAN AMERICAN', 'WHITE']
train_df = train_df[train_df["Race"].isin(race_labels)]
validate_df = validate_df[validate_df["Race"].isin(race_labels)]
test_df = test_df[test_df["Race"].isin(race_labels)]

In [8]:
# Cast the image-path column to str in every split; it is later used as the
# filename column (x_col) of flow_from_dataframe.
for split_df in (train_df, validate_df, test_df):
    split_df["hiti_path"] = split_df["hiti_path"].astype(str)

In [9]:
# Training rows after the race filter (the 0-byte images are removed in a later cell).
train_df.shape[0]

184974

In [10]:
# Remove 0-byte images: any row whose path contains one of these three ids is
# dropped from every split. The pattern is a regex alternation.
zero_byte_pattern = '406e0996e5f1cf082487d7d096574d10b46c0c52710222a4884db1cc|dd97e997cc2a4166dc6e192cb62e29553aa28f4671d98c9577e32cfd|6224290209c45bb2b3e07b3b3a27778d1d10f7953567b3c59158e099'
validate_df = validate_df[~validate_df["hiti_path"].str.contains(zero_byte_pattern)]
test_df = test_df[~test_df["hiti_path"].str.contains(zero_byte_pattern)]
train_df = train_df[~train_df["hiti_path"].str.contains(zero_byte_pattern)]

In [11]:
# Class balance of the training split.
train_df["Race"].value_counts()

WHITE                     91369
BLACK/AFRICAN AMERICAN    87139
ASIAN                      6457
Name: Race, dtype: int64

In [12]:
# Class balance of the validation split.
validate_df["Race"].value_counts()

BLACK/AFRICAN AMERICAN    7540
WHITE                     6656
ASIAN                      530
Name: Race, dtype: int64

In [13]:
# Class balance of the test split.
test_df["Race"].value_counts()

BLACK/AFRICAN AMERICAN    6067
WHITE                     5281
ASIAN                      484
Name: Race, dtype: int64

In [14]:
# Image size used for the model input and for the generators' target_size.
HEIGHT = 320
WIDTH = 320

In [15]:
# Run tag that prefixes every checkpoint file name.
# NOTE(review): the tag says "Float32" while a later cell installs the
# 'mixed_float16' policy — confirm which label is intended.
arc_name = "".join(
    ["Emory_CXR-", str(HEIGHT), "x", str(WIDTH), "resnet34-Float32_3-race_"]
)

In [16]:
# Mixed-precision setup. This cell runs before the model is constructed, and
# that ordering is presumably required for the layers to pick up the policy.
from tensorflow.keras.mixed_precision import experimental as mixed_precision
# Dtype policy named 'mixed_float16'.
policy = mixed_precision.Policy('mixed_float16')

# Install it as the global Keras policy.
mixed_precision.set_policy(policy)

In [17]:
# Fetch the 'resnet34' entry from the classification_models registry. The two
# returned values are used below as the backbone constructor and as the
# preprocessing_function of the image generators.
resnet34, preprocess_input = Classifiers.get('resnet34')


In [18]:
# ResNet-34 backbone (ImageNet weights, no classification top) followed by
# global average pooling and a 3-unit dense layer, one unit per race class.
input_a = Input(shape=(HEIGHT, WIDTH, 3))
base_model = resnet34(
    input_tensor=input_a,
    include_top=False,
    input_shape=(HEIGHT, WIDTH, 3),
    weights='imagenet',
)
pooled_features = GlobalAveragePooling2D()(base_model.output)
x = layers.Dense(3, name='dense_logits')(pooled_features)
# The softmax is explicitly created with dtype='float32'.
output = layers.Activation('softmax', dtype='float32', name='predictions')(x)
model = Model(inputs=[input_a], outputs=[output])

In [19]:
# Optimiser settings.
learning_rate = 1e-3
decay_val = 0.0

# Batch sizes: training uses batch_s; test_batch_size sizes the validation
# iterator built below.
train_batch_size = batch_s = 256
test_batch_size = 64

# NOTE(review): desired_epoch is not referenced by any cell visible in this
# notebook (training below passes epochs=100) — confirm whether it is still needed.
desired_epoch = 3

In [20]:
# Learning-rate schedule driven by validation loss: after 2 epochs without
# improvement the rate is scaled by 0.1, with 1e-5 as the floor.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.1,
    patience=2,
    min_lr=1e-5,
    verbose=1,
)

In [21]:
# Adam optimiser. `learning_rate` is the canonical keyword of
# tf.keras.optimizers.Adam; the previously used `lr` is only a deprecated alias.
adam_opt = optimizers.Adam(learning_rate=learning_rate, decay=decay_val)


In [22]:
# Categorical cross-entropy on the softmax output, tracked with ROC and
# precision-recall AUC.
training_loss = tf.losses.CategoricalCrossentropy()
training_metrics = [
    tf.keras.metrics.AUC(curve='ROC', name='ROC-AUC'),
    tf.keras.metrics.AUC(curve='PR', name='PR-AUC'),
]
model.compile(optimizer=adam_opt, loss=training_loss, metrics=training_metrics)

In [23]:
# Training images are augmented (rotation, zoom, horizontal flip) and then
# passed through the backbone's preprocessing function.
augmentation_options = dict(
    rotation_range=15,
    fill_mode='constant',
    horizontal_flip=True,
    zoom_range=0.1,
)
train_gen = ImageDataGenerator(
    preprocessing_function=preprocess_input, **augmentation_options
)

# Validation/test images are only preprocessed, never augmented.
validate_gen = ImageDataGenerator(preprocessing_function=preprocess_input)

In [24]:
# Arguments shared by both iterators: file path and label columns, one-hot
# labels, target size and output dtype.
shared_flow_args = dict(
    x_col="hiti_path",
    y_col="Race",
    class_mode="categorical",
    target_size=(HEIGHT, WIDTH),
    dtype='float32',
)

# Training batches are shuffled with a fixed seed.
train_batches = train_gen.flow_from_dataframe(
    train_df,
    shuffle=True,
    seed=2021,
    batch_size=train_batch_size,
    **shared_flow_args
)

# Validation batches keep the DataFrame order.
validate_batches = validate_gen.flow_from_dataframe(
    validate_df,
    shuffle=False,
    batch_size=test_batch_size,
    **shared_flow_args
)



Found 184965 validated image filenames belonging to 3 classes.
Found 14726 validated image filenames belonging to 3 classes.


In [25]:
# Batches per full pass, read from the iterators themselves. len(iterator) is
# ceil(n_images / batch_size) for the batch size the iterator was built with.
# Unlike ceil(len(df) / batch_size), it stays correct if flow_from_dataframe
# dropped unreadable filenames or if test_batch_size is rebound later (it is,
# before the test-set iterator is built).
train_epoch = len(train_batches)
val_epoch = len(validate_batches)

In [26]:
# Timestamp that makes the checkpoint names of this run unique.
var_date = datetime.now().strftime("%Y%m%d-%H%M%S")

# Stop after 4 epochs without val_loss improvement and restore the best weights.
ES = EarlyStopping(monitor='val_loss', mode='min', patience=4, restore_best_weights=True)

# Create the checkpoint directory up front so the first save cannot fail on a
# missing folder after a full training epoch has already been spent.
checkpoint_dir = "../saved_models/"
os.makedirs(checkpoint_dir, exist_ok=True)

# Save the full model (not just weights) whenever val_loss improves.
# NOTE(review): the file name contains ':' characters, which some filesystems
# reject — kept unchanged so existing artefact names stay stable.
checkloss = ModelCheckpoint(
    checkpoint_dir + str(arc_name) + "_LR-" + str(learning_rate) + "_" + var_date + "_epoch:{epoch:03d}_val_loss:{val_loss:.5f}.hdf5",
    monitor='val_loss',
    mode='min',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
)


In [27]:
# Train for up to 100 epochs with validation after every epoch; the callbacks
# handle LR reduction, checkpointing and early stopping.
# NOTE(review): fit_generator is deprecated in TensorFlow releases newer than
# the 2.0.0 printed above — consider model.fit when upgrading.
fit_options = dict(
    steps_per_epoch=train_epoch,
    initial_epoch=0,
    epochs=100,
    verbose=1,
    callbacks=[reduce_lr, checkloss, ES],
    validation_data=validate_batches,
    validation_steps=val_epoch,
    validation_freq=1,
    class_weight=None,
    max_queue_size=10,
    workers=32,
    use_multiprocessing=False,
    shuffle=True,
)
model.fit_generator(train_batches, **fit_options)


Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.30497, saving model to ../saved_models/Emory_CXR-320x320resnet34-Float32_3-race__LR-0.001_20210627-214820_epoch:001_val_loss:0.30497.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 0.30497 to 0.22691, saving model to ../saved_models/Emory_CXR-320x320resnet34-Float32_3-race__LR-0.001_20210627-214820_epoch:002_val_loss:0.22691.hdf5
Epoch 3/100
Epoch 00003: val_loss did not improve from 0.22691
Epoch 4/100
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 00004: val_loss did not improve from 0.22691
Epoch 5/100
Epoch 00005: val_loss improved from 0.22691 to 0.17858, saving model to ../saved_models/Emory_CXR-320x320resnet34-Float32_3-race__LR-0.001_20210627-214820_epoch:005_val_loss:0.17858.hdf5
Epoch 6/100
Epoch 00006: val_loss improved from 0.17858 to 0.16077, saving model to ../saved_models/Emory_CXR-320x320resnet34-Float32_3-race__LR-0.001_20210627-214820_epoch:006_val_loss:0.16077.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f33992170f0>

In [28]:
# Rebinds test_batch_size (64 when the validation iterator was built) to 32 for
# test-set inference. Re-running an earlier cell that reads test_batch_size
# after this one would silently pick up 32.
test_batch_size = 32

In [29]:
# Test iterator: same preprocessing-only generator as validation, unshuffled so
# predictions stay in the row order of test_df.
test_batches = validate_gen.flow_from_dataframe(
    test_df,
    x_col="hiti_path",
    y_col="Race",
    class_mode="categorical",
    target_size=(HEIGHT, WIDTH),
    shuffle=False,
    batch_size=test_batch_size,
    dtype='float32',
)


Found 11832 validated image filenames belonging to 3 classes.


In [30]:
# One full pass over the test iterator; the step count covers every row of test_df.
test_steps = math.ceil(len(test_df) / test_batch_size)
multilabel_predict_test = model.predict(
    test_batches,
    steps=test_steps,
    max_queue_size=10,
    workers=16,
    verbose=1,
)




In [31]:
# Inputs for the evaluation helper: the test metadata and the predicted scores,
# the latter wrapped in a DataFrame with one integer-named column per class.
input_df = test_df
input_prediction = multilabel_predict_test
input_prediction_df = pd.DataFrame(data=input_prediction)

# true_logits is filled by stat_calc with the per-class ground truth.
# NOTE(review): loss_log is not referenced by any cell visible in this notebook.
true_logits, loss_log = pd.DataFrame(), pd.DataFrame()

In [32]:
def stat_calc(input_prediction_df, input_df):
    """Print the one-vs-rest ROC AUC of every race class.

    Parameters
    ----------
    input_prediction_df : pandas.DataFrame
        Predicted scores. Column ``i`` is read as the score of the i-th label
        in the order ASIAN, BLACK/AFRICAN AMERICAN, WHITE.
    input_df : pandas.DataFrame
        Frame with a ``Race`` column holding the true label of each row, in
        the same row order as ``input_prediction_df``.

    Returns
    -------
    None
        The per-class table (columns ``Study`` and ``AUC``) is printed.

    Side effects
    ------------
    Column ``i`` of the module-level ``true_logits`` frame is set to the 0/1
    ground-truth vector of class ``i``. The column is overwritten, not
    inserted, so calling the function again does not pile up duplicates.
    """
    ground_truth = input_df.Race
    pathology_array = [
        'ASIAN',
        'BLACK/AFRICAN AMERICAN',
        'WHITE',
    ]

    auc_array = []
    for i, pathology in enumerate(pathology_array):
        # Exact label comparison. A regex substring match (str.contains) would
        # also flag any label that merely contains this text and would
        # misbehave on labels holding regex metacharacters.
        new_truth = (ground_truth == pathology).astype(int)
        AUC = roc_auc_score(new_truth, input_prediction_df[i])
        true_logits[i] = new_truth
        auc_array.append(AUC)

    progress_df = pd.DataFrame({'Study': pathology_array, 'AUC': auc_array})
    print(progress_df)


In [33]:
# Report the per-class test AUC.
stat_calc(input_prediction_df=input_prediction_df, input_df=input_df)

                    Study       AUC
0                   ASIAN  0.969191
1  BLACK/AFRICAN AMERICAN  0.992430
2                   WHITE  0.987709
