In [1]:

import os
import copy
import sys
import math
from datetime import datetime
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, matthews_corrcoef, auc, accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam,RMSprop,SGD
from tensorflow.keras import layers
from tensorflow.keras.layers import concatenate, add, GlobalAveragePooling2D, BatchNormalization, Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import initializers
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
#from tensorflow.keras.applications.densenet import DenseNet121
from classification_models.tfkeras import Classifiers
from tensorflow.keras.models import load_model
import random as python_random


In [2]:

# Seed NumPy, the stdlib `random` module and TensorFlow from a single value.
SEED = 2021
np.random.seed(SEED)
python_random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.0.0


In [4]:
# Expose only GPU index 3 to this process (`os` is already imported in the
# first cell; the re-import is redundant but harmless).
# NOTE(review): this assignment runs after TensorFlow was imported and seeded in
# earlier cells — confirm TF has not yet initialised its GPU devices at this
# point, otherwise the restriction may not take effect.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [5]:
# Load the first split table. Later cells filter it by `ViewPosition`, rename
# its `png_path` column to `Image`, and join it to `data_df` on that column.
compare_df = pd.read_csv('cspin_split_80-10-10.csv')

In [6]:
# Load the "ver_C" split table; its `Image` paths are rewritten in the
# following cells before it is joined with `compare_df`.
data_df = pd.read_csv('cspin_split_80-10-10_ver_C.csv')

In [7]:
# Keep only the rows whose view position is one of the three LATERAL variants.
lateral_views = ['LATERAL', 'LATERAL FLEX', 'LATERAL EXT']
is_lateral = compare_df['ViewPosition'].isin(lateral_views)
compare_df = compare_df[is_lateral]

In [8]:
# Row count of compare_df after the ViewPosition filter.
len(compare_df)

13287

In [9]:
# Split each image path on "/" into at most seven pieces; the last piece
# (column 6) keeps the remainder of the path intact.
split = data_df["Image"].str.split(pat="/", n=6, expand=True)

In [10]:
# Re-root every image path: prefix the path remainder (piece 6 of the split)
# with the local extraction directory.
data_df["Image"] = "../data/cspine_hardware/new_extract/" + split[6]

In [11]:
# Rename `png_path` to `Image` so both tables share the key used by the merge
# a few cells below.
compare_df = compare_df.rename(columns={"png_path": "Image"})

In [12]:
# Sanity check: renaming a column leaves the row count unchanged.
len(compare_df)

13287

In [13]:
# Row count of data_df before it is merged with compare_df.
len(data_df)

19118

In [14]:
# Inner join on the image path: only images present in both tables survive.
total_df = pd.merge(data_df, compare_df, how="inner", on="Image")

In [15]:
# Collapse repeated image paths, keeping the first occurrence of each.
total_df = total_df.drop_duplicates(subset=["Image"], keep="first")

In [16]:
# Rows remaining after the merge and the de-duplication on `Image`.
len(total_df)

10729

In [17]:
# From here on `data_df` refers to the merged, de-duplicated table. This is a
# rebind, not a copy: `data_df` and `total_df` name the same object.
data_df = total_df

In [18]:
# Restrict the cohort to the two race labels studied here. The double space in
# the first label is deliberate: it matches the values stored in the table.
race_labels = ['African American  or Black', 'Caucasian or White']
data_df = data_df.loc[data_df['Race'].isin(race_labels)]

In [19]:
# Rows left after restricting to the two race labels.
len(data_df)

10358

In [20]:
# Number of distinct EMPI values (presumably one per patient — TODO confirm).
len(data_df.EMPI.unique())

980

In [21]:
# Absolute number of images per Race label.
data_df.Race.value_counts()

Caucasian or White            7589
African American  or Black    2769
Name: Race, dtype: int64

In [22]:
# Race label distribution expressed as fractions of the cohort.
data_df.Race.value_counts(normalize=True)

Caucasian or White            0.73267
African American  or Black    0.26733
Name: Race, dtype: float64

In [23]:
# Absolute number of images per Sex value.
data_df.Sex.value_counts()

F    5488
M    4870
Name: Sex, dtype: int64

In [24]:
# Restore the plain `split` column name (presumably suffixed to `split_x` by
# the merge because both tables carried a `split` column — verify), then show
# the train/validate/test proportions.
data_df = data_df.rename(columns={"split_x": "split"})
data_df["split"].value_counts(normalize=True)

train       0.805947
validate    0.102819
test        0.091234
Name: split, dtype: float64

In [25]:
# Force the path column to `str`; the image iterators built later read their
# filenames from it.
data_df["Image"] = data_df["Image"].astype(str)

In [26]:
# Install 'mixed_float16' as the global Keras dtype policy before the model is
# built in the cells below.
# NOTE(review): this import sits mid-notebook rather than in the import cell,
# and it uses the experimental API; newer TensorFlow releases expose the same
# feature as tf.keras.mixed_precision.set_global_policy — confirm against the
# installed version before upgrading.
from tensorflow.keras.mixed_precision import experimental as mixed_precision
policy = mixed_precision.Policy('mixed_float16')

mixed_precision.set_policy(policy)

In [27]:
# Spatial size used for the model input and as the generators' target size.
HEIGHT = 320
WIDTH = 320

In [28]:
# Look up the 'resnet34' entry in the `classification_models` registry. The
# first returned value is called below to build the backbone; the second is
# passed to the image generators as `preprocessing_function`.
resnet34, preprocess_input = Classifiers.get('resnet34')

In [29]:
# Backbone: ResNet34 with ImageNet weights and without its classifier head.
input_a = Input(shape=(HEIGHT, WIDTH, 3))
base_model = resnet34(
    input_tensor=input_a,
    include_top=False,
    input_shape=(HEIGHT, WIDTH, 3),
    weights='imagenet',
)

# Head: global average pooling followed by a 2-unit dense layer.
x = layers.GlobalAveragePooling2D()(base_model.output)
x = Dense(2, name='dense_logits')(x)

# The softmax is pinned to float32 even though the global policy set earlier
# is 'mixed_float16'.
output = layers.Activation('softmax', dtype='float32', name='predictions')(x)

model = Model(inputs=[input_a], outputs=[output])

In [30]:
# Training hyper-parameters.
learning_rate = 1e-3
decay_val = 0.0
batch_s = 256

# Multiply the learning rate by 0.1 (never below 1e-5) when the validation
# loss has stopped improving for `patience` epochs.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.1,
    patience=2,
    min_lr=1e-5,
    verbose=1,
)

adam_opt = Adam(learning_rate=learning_rate, decay=decay_val)

# Categorical cross-entropy over the two softmax outputs, tracked with ROC-AUC.
model.compile(
    optimizer=adam_opt,
    loss=tf.losses.CategoricalCrossentropy(name='loss'),
    metrics=[tf.keras.metrics.AUC(curve='ROC', name='ROC-AUC')],
)

In [31]:
# Training images are augmented (rotation, zoom, horizontal flip) and then run
# through the backbone's preprocessing function.
train_gen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    horizontal_flip=True,
    rotation_range=15,
    zoom_range=0.1,
    fill_mode='constant',
)

# Validation/test images only get the preprocessing function, no augmentation.
validate_gen = ImageDataGenerator(preprocessing_function=preprocess_input)

In [32]:
# Evaluation uses a fixed batch size; training reuses `batch_s` from the
# hyper-parameter cell.
test_batch_size = 64
train_batch_size = batch_s

In [33]:
# One frame per value of the `split` column.
train_df, validate_df, test_df = (
    data_df[data_df["split"] == split_name]
    for split_name in ("train", "validate", "test")
)

In [34]:
# Arguments shared by the training and validation iterators.
common_flow_args = dict(
    x_col="Image",
    y_col="Race",
    class_mode="categorical",
    target_size=(HEIGHT, WIDTH),
    dtype='float32',
)

# Training batches are shuffled with a fixed seed.
train_batches = train_gen.flow_from_dataframe(
    train_df,
    shuffle=True,
    seed=2021,
    batch_size=train_batch_size,
    **common_flow_args
)

# Validation batches keep the dataframe order.
validate_batches = validate_gen.flow_from_dataframe(
    validate_df,
    shuffle=False,
    batch_size=test_batch_size,
    **common_flow_args
)

Found 8346 validated image filenames belonging to 2 classes.
Found 1065 validated image filenames belonging to 2 classes.


  .format(n_invalid, x_col)


In [35]:
# Prefix for checkpoint filenames. Its trailing underscore combines with the
# "_CXR_" separator added when the checkpoint path is built, which is why the
# saved files contain a double underscore.
arc_name = "resnet34_cspine_race_detection_with_random_seed_"

In [36]:
# Take the number of steps from the iterators themselves rather than from the
# dataframes. `flow_from_dataframe` drops rows whose image file is not valid
# (see the "Found N validated image filenames" messages), so an iterator can
# hold fewer samples than its source frame. len(iterator) is the number of
# batches it actually yields per pass.
train_epoch = len(train_batches)
val_epoch = len(validate_batches)

In [37]:
# Timestamp that makes the checkpoint names of this run unique.
var_date = datetime.now().strftime("%Y%m%d-%H%M%S")

# Stop after 4 epochs without a better validation loss and roll the model
# back to its best weights.
ES = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=4,
    restore_best_weights=True,
)

# The {epoch}/{val_loss} placeholders are filled in by Keras at save time.
checkpoint_path = (
    "../saved_models/" + str(arc_name) + "_CXR_" + var_date
    + "_epoch:{epoch:03d}_val_loss:{val_loss:.5f}.hdf5"
)
checkloss = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    mode='min',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
)


In [38]:
# Train for up to 100 epochs. The callbacks lower the learning rate on a
# validation-loss plateau (reduce_lr), save improving models (checkloss) and
# can end the run early (ES). The returned History object is not stored; it
# only appears as this cell's output.
# NOTE(review): `fit_generator` is deprecated in later TensorFlow releases in
# favour of `Model.fit` — confirm before upgrading.
model.fit_generator(
    train_batches, 
    steps_per_epoch=train_epoch, 
    epochs=100, 
    callbacks=[reduce_lr, checkloss, ES],
    validation_data=validate_batches, 
    validation_steps=val_epoch, 
    max_queue_size=10,
    workers=32,
    shuffle=True
)


Epoch 1/100
Epoch 00001: val_loss improved from inf to 8.61255, saving model to ../saved_models/resnet34_cspine_race_detection_with_random_seed__CXR_20210627-212707_epoch:001_val_loss:8.61255.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 8.61255 to 2.10922, saving model to ../saved_models/resnet34_cspine_race_detection_with_random_seed__CXR_20210627-212707_epoch:002_val_loss:2.10922.hdf5
Epoch 3/100
Epoch 00003: val_loss improved from 2.10922 to 0.82805, saving model to ../saved_models/resnet34_cspine_race_detection_with_random_seed__CXR_20210627-212707_epoch:003_val_loss:0.82805.hdf5
Epoch 4/100
Epoch 00004: val_loss improved from 0.82805 to 0.68681, saving model to ../saved_models/resnet34_cspine_race_detection_with_random_seed__CXR_20210627-212707_epoch:004_val_loss:0.68681.hdf5
Epoch 5/100
Epoch 00005: val_loss did not improve from 0.68681
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 00006: val_loss did not improve from 

<tensorflow.python.keras.callbacks.History at 0x7f8c9a7e5438>

In [39]:
# Test iterator: same un-augmented generator as validation, and unshuffled so
# the prediction order follows the order of `test_df`.
test_batches = validate_gen.flow_from_dataframe(
    test_df,
    x_col="Image",
    y_col="Race",
    class_mode="categorical",
    target_size=(HEIGHT, WIDTH),
    batch_size=test_batch_size,
    shuffle=False,
    dtype='float32',
)

Found 945 validated image filenames belonging to 2 classes.


In [40]:
# Class balance of the test split, as fractions.
test_df.Race.value_counts(normalize=True)

Caucasian or White            0.697354
African American  or Black    0.302646
Name: Race, dtype: float64

In [42]:
# Predict on the test iterator. The step count comes from the iterator itself:
# `flow_from_dataframe` can drop rows with invalid image files, and a step
# count derived from len(test_df) could then exceed the iterator's length and
# yield surplus (wrapped-around) predictions that no longer line up with the
# ground truth.
race_multilabel_predict_test = model.predict(
    test_batches,
    steps=len(test_batches),
    max_queue_size=10,
    verbose=1,
)



In [43]:
# Ground-truth frame and raw prediction array used by the evaluation below.
input_df = test_df
race_input_prediction = race_multilabel_predict_test

# One column per output unit, addressed by integer position (0, 1).
race_input_prediction_df = pd.DataFrame(race_input_prediction)

# Empty holders; `stat_calc` writes into `race_true_logits`.
race_true_logits, race_loss_log = pd.DataFrame(), pd.DataFrame()

In [44]:
def stat_calc(input_prediction_df, input_df):
    """Print a one-vs-rest ROC-AUC table for the two race classes.

    Parameters
    ----------
    input_prediction_df : pandas.DataFrame
        Predicted scores, one row per sample. Column ``0`` is scored against
        'African American  or Black' and column ``1`` against
        'Caucasian or White'.
    input_df : pandas.DataFrame
        Frame whose ``Race`` column holds the ground-truth label strings.
        Labels and scores are handed to ``roc_auc_score`` as plain sequences,
        so both inputs must list the same samples in the same order.

    Raises
    ------
    ValueError
        If the number of prediction rows differs from the number of labels.

    Side effects
    ------------
    Stores the binary ground-truth vector of class ``i`` in column ``i`` of
    the module-level ``race_true_logits`` frame and prints the AUC table.
    """
    ground_truth = input_df.Race
    # The order here must match the column order of the predictions.
    pathology_array = [
        'African American  or Black',
        'Caucasian or White',
    ]

    # Fail early with a clear message instead of scoring misaligned inputs.
    if len(input_prediction_df) != len(ground_truth):
        raise ValueError(
            "Got {} prediction rows for {} ground-truth labels; both inputs "
            "must describe the same samples in the same order.".format(
                len(input_prediction_df), len(ground_truth)
            )
        )

    auc_array = []
    for i, pathology in enumerate(pathology_array):
        # regex=False: the labels are literal text, not patterns.
        new_truth = ground_truth.str.contains(pathology, regex=False).astype(int)
        auc_array.append(roc_auc_score(new_truth, input_prediction_df[i]))

        # Overwrite an existing column so re-running the function does not
        # pile up duplicate columns in the shared frame.
        if i in race_true_logits.columns:
            race_true_logits[i] = new_truth
        else:
            race_true_logits.insert(i, i, new_truth)

    progress_df = pd.DataFrame({'Study': pathology_array, 'AUC': auc_array})
    print(progress_df)


In [48]:
# Report per-class ROC-AUC on the test split. The model has two softmax
# outputs, so the two one-vs-rest AUCs are expected to coincide, as the
# recorded output shows.
stat_calc(race_input_prediction_df, input_df)

                        Study       AUC
0  African American  or Black  0.919251
1          Caucasian or White  0.919251
