In [1]:
import pandas as pd
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import os
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from sklearn.metrics import multilabel_confusion_matrix

In [2]:
import tensorflow as tf
from keras.layers import GlobalMaxPooling2D
from keras.layers import GlobalAveragePooling2D
from keras.models import Model
from keras.layers import Input
from keras.layers import Concatenate
from keras.layers import Flatten
from keras.layers import Dense
from keras.applications import VGG16, VGG19, ResNet50, ResNet101

In [3]:
# Load the per-image metadata table (one row per "Image Index", with a
# "Finding Labels" column that later cells split on "|").
# NOTE(review): the saved output of this cell is a FileNotFoundError, so the
# dataset must be attached at /kaggle/input/data before any later cell can run.
complete_data_info = pd.read_csv("/kaggle/input/data/Data_Entry_2017.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/data/Data_Entry_2017.csv'

In [None]:
# Root directory of the dataset; the image glob in the next cell is resolved
# relative to this path ("images_*/images/*.png").
data_base_path = "/kaggle/input/data"

In [None]:
# Collect every PNG below the dataset root once, then derive the two parallel
# lists (full path, bare file name) from that single listing.
discovered_png_files = list(Path(data_base_path).glob("images_*/images/*.png"))

complete_data_imgs_paths = [str(png_file) for png_file in discovered_png_files]
imgs_file_names = [str(png_file.parts[-1]) for png_file in discovered_png_files]

In [None]:
# Lookup table: image file name -> full path on disk.
path_columns = {"Image Index": imgs_file_names,
                "Image Path": complete_data_imgs_paths}
complete_data_path_info = pd.DataFrame(data=path_columns)

In [None]:
# Attach the on-disk path to every metadata row (default inner join on the file name).
complete_data_all_info = pd.merge(complete_data_info,
                                  complete_data_path_info,
                                  on="Image Index")

In [None]:
# Discard the four columns at positions 2..5 of the merged table.
# NOTE(review): the selection is positional, so re-running this cell drops
# four further columns.
unused_metadata_columns = complete_data_all_info.columns[2:6]
complete_data_all_info = complete_data_all_info.drop(unused_metadata_columns,axis=1)

In [None]:
# Remove the "Unnamed: 11" column from the metadata table.
complete_data_all_info = complete_data_all_info.drop(["Unnamed: 11"],axis=1)

In [None]:
# Load the bounding-box annotation table. Images listed here (by "Image Index")
# are held out of the training split by the following cells.
testing_data_info = pd.read_csv("/kaggle/input/data/BBox_List_2017.csv")

In [None]:
# Index the metadata by file name so rows can be selected / dropped by image id.
# NOTE(review): not idempotent - re-running fails because "Image Index" is no
# longer a column after the first run.
complete_data_all_info = complete_data_all_info.set_index("Image Index")

In [None]:
# Metadata rows of the images that carry bounding-box annotations.
# An image can appear several times in the annotation table; selecting with the
# raw (repeated) ids would return repeated metadata rows, and the later merge on
# "Image Index" would then multiply the rows of such images (k annotations ->
# k*k rows). Selecting by unique id keeps exactly one metadata row per image.
annotated_image_ids = testing_data_info["Image Index"].unique()
testing_data_path_info = complete_data_all_info.loc[annotated_image_ids]

In [None]:
# Everything that is not part of the annotated (test) images becomes training data.
held_out_image_ids = testing_data_path_info.index
training_data_all_info = complete_data_all_info.drop(index=held_out_image_ids)

In [None]:
# Turn "Image Index" back into a regular column so it can serve as a merge key.
testing_data_path_info = testing_data_path_info.reset_index()

In [None]:
# Combine each annotation row with the metadata / path of its image.
testing_data_all_info = pd.merge(testing_data_info,
                                 testing_data_path_info,
                                 on="Image Index")

In [None]:
# Discard the three columns at positions 6..8 of the merged test table.
# NOTE(review): positional selection - re-running this cell drops three more columns.
unused_test_columns = testing_data_all_info.columns[6:9]
testing_data_all_info = testing_data_all_info.drop(labels=unused_test_columns,axis=1)

In [None]:
# Turn the "|"-separated label string of every row into a list of labels,
# for the training and the testing table alike.
for labelled_frame in (training_data_all_info, testing_data_all_info):
    labelled_frame["Finding Labels"] = labelled_frame["Finding Labels"].map(
        lambda joined_labels: joined_labels.split("|"))

In [None]:
# Flatten all per-image label lists in a single pass. The previous
# `merged_list = merged_list + single_list` copied the growing list on every
# row, which is quadratic in the number of images.
merged_list = [single_disease
               for single_list in training_data_all_info["Finding Labels"]
               for single_disease in single_list]

# "No Finding" is not a class of its own (it maps to the all-zero vector).
# Set difference never raises, unlike list.remove, when the label is absent.
# Sorting fixes the class order: iterating a set of strings can yield a
# different order in every interpreter run, which would silently change the
# meaning of each output unit between runs.
unique_diseases = sorted(set(merged_list) - {"No Finding"})
disease2idx = {single_disease: class_idx
               for class_idx, single_disease in enumerate(unique_diseases)}

In [None]:
def map_diseases(disease_list, disease_index=None):
    """Multi-hot encode a list of disease labels.

    Parameters
    ----------
    disease_list : iterable of str
        Labels of one image. The label "No Finding" is ignored, so an image
        carrying only that label is encoded as the all-zero vector.
    disease_index : dict, optional
        Mapping from label to position in the output vector. Defaults to the
        notebook-level ``disease2idx``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(disease_index)`` with 1 at the position
        of every listed disease and 0 elsewhere.

    Raises
    ------
    KeyError
        If a label (other than "No Finding") is missing from the mapping.
    """
    if disease_index is None:
        disease_index = disease2idx

    multi_hot = np.zeros(len(disease_index),)

    for single_disease in disease_list:

        if single_disease == "No Finding":
            continue

        if single_disease not in disease_index:
            # Same exception type as a plain dict lookup, but say which label failed.
            raise KeyError("Unknown disease label {!r}; known labels: {}".format(
                single_disease, sorted(disease_index)))

        multi_hot[disease_index[single_disease]] = 1

    return multi_hot

In [None]:
# Encode the label lists of both splits as multi-hot matrices (one row per image).
# map_diseases is a tiny pure-Python function, so a thread pool cannot run it in
# parallel (the GIL serialises it) and only adds scheduling overhead; a plain
# map produces the same arrays.
multi_hot_encoded_Y_train = np.array(list(map(map_diseases,training_data_all_info["Finding Labels"])))
multi_hot_encoded_Y_test = np.array(list(map(map_diseases,testing_data_all_info["Finding Labels"])))

In [None]:
# Move "Image Index" back into a column; the training table gets a fresh 0..n-1 index.
training_data_all_info = training_data_all_info.reset_index()

In [None]:
# Persist both splits next to the notebook (index column not written).
for split_frame, csv_name in ((training_data_all_info, "training_data.csv"),
                              (testing_data_all_info, "testing_data.csv")):
    split_frame.to_csv(csv_name,index=False)

In [None]:
def custom_training_data_generator(training_data_all_info, multi_hot_encoded_Y_train, mb_size,
                                   drop_remainder=True):
    """Yield shuffled mini-batches of training images and their labels.

    Parameters
    ----------
    training_data_all_info : pandas.DataFrame
        Must contain an "Image Path" column; row i belongs to label row i.
    multi_hot_encoded_Y_train : array-like
        Label matrix with one row per row of ``training_data_all_info``.
    mb_size : int
        Number of images per mini-batch (must be positive).
    drop_remainder : bool, default True
        When True (the original behaviour) a trailing batch smaller than
        ``mb_size`` is skipped; when False it is yielded as a final short batch.

    Yields
    ------
    (X_train_mb, Y_train_mb)
        ``X_train_mb`` stacks the RGB images of the batch, each resized to
        1024x1024; ``Y_train_mb`` holds the matching label rows.

    Raises
    ------
    ValueError
        If ``mb_size`` is not positive.
    """
    if mb_size <= 0:
        raise ValueError("mb_size must be a positive integer, got {}".format(mb_size))

    num_samples = training_data_all_info.shape[0]

    # Shuffle *positions*, not index labels: the previous code fed the index
    # labels to iloc / array indexing, which is only correct when the frame
    # happens to carry a fresh 0..n-1 index.
    shuffled_positions = np.random.permutation(num_samples)
    shuffled_img_paths = training_data_all_info["Image Path"].to_numpy()[shuffled_positions]
    shuffled_labels = np.asarray(multi_hot_encoded_Y_train)[shuffled_positions]

    num_batches = num_samples//mb_size
    if not drop_remainder and num_samples % mb_size:
        num_batches += 1

    for time_step in range(num_batches):

        batch = slice(time_step*mb_size,(time_step+1)*mb_size)
        X_train_mb = list()

        for single_img_path in shuffled_img_paths[batch]:

            # The context manager closes the file handle as soon as the pixels are read.
            with Image.open(single_img_path) as single_img:
                rgb_pixels = np.array(single_img.convert("RGB"))

            X_train_mb.append(cv2.resize(rgb_pixels,(1024,1024)))

        yield np.array(X_train_mb), shuffled_labels[batch]

In [None]:
def _rescaled_bbox_centroids(bbox_mb, img_size_mb, heatmap_size):
    """Return bounding-box centres as (row, col) positions on a heatmap_size grid."""
    bbox_mb = np.asarray(bbox_mb, dtype=float).reshape(-1, 4)
    img_size_mb = np.asarray(img_size_mb, dtype=float).reshape(-1, 2)

    # assumes the bbox columns are (x, y, w, h) and the size columns are
    # (width, height) - TODO confirm against the merged test table.
    centroid_row = bbox_mb[:, 1] + 0.5*bbox_mb[:, 3]
    centroid_col = bbox_mb[:, 0] + 0.5*bbox_mb[:, 2]
    centroid_mb = np.stack((centroid_row, centroid_col), axis=1)

    # (height, width), i.e. the same (row, col) order as the centroid.
    ordered_img_size_mb = img_size_mb[:, ::-1]

    return centroid_mb * (heatmap_size/ordered_img_size_mb)


def custom_testing_data_generator(testing_data_all_info, multi_hot_encoded_Y_test, mb_size,
                                  heatmap_size=32):
    """Yield mini-batches of test images, labels and rescaled bounding-box centres.

    Fixes relative to the previous version, which could not produce a batch:
    ``np.concatenate(..., axis=1)`` was applied to 1-D columns (raises), the
    rescaling referenced the undefined name ``ordered_centroid_mb``, and the
    centre mixed an (x, y) corner with an (h, w) extent. The centre is now
    computed consistently in (row, col) order, matching the (height, width)
    order used for the image size.

    Parameters
    ----------
    testing_data_all_info : pandas.DataFrame
        Needs an "Image Path" column; columns at positions 2:6 are read as the
        bounding box and columns at positions 8:10 as the original image size.
    multi_hot_encoded_Y_test : array-like
        Label matrix with one row per row of ``testing_data_all_info``.
    mb_size : int
        Number of rows per mini-batch (must be positive). A trailing batch
        smaller than ``mb_size`` is skipped.
    heatmap_size : int, default 32
        Side length of the grid the centres are rescaled to (the previous code
        hard-coded 32).

    Yields
    ------
    (X_test_mb, Y_test_mb, rescaled_centroid_mb)
        Images resized to 1024x1024, the matching label rows, and an
        ``(mb_size, 2)`` array of (row, col) centres on the heatmap grid.

    Raises
    ------
    ValueError
        If ``mb_size`` is not positive.
    """
    if mb_size <= 0:
        raise ValueError("mb_size must be a positive integer, got {}".format(mb_size))

    for time_step in range(testing_data_all_info.shape[0]//mb_size):

        batch = slice(time_step*mb_size,(time_step+1)*mb_size)
        batch_rows = testing_data_all_info.iloc[batch]

        X_test_mb = list()

        for single_img_path in batch_rows["Image Path"]:

            # The context manager closes the file handle as soon as the pixels are read.
            with Image.open(single_img_path) as single_img:
                rgb_pixels = np.array(single_img.convert("RGB"))

            X_test_mb.append(cv2.resize(rgb_pixels,(1024,1024)))

        X_test_mb = np.array(X_test_mb)
        Y_test_mb = multi_hot_encoded_Y_test[batch]

        rescaled_centroid_mb = _rescaled_bbox_centroids(batch_rows.iloc[:,2:6],
                                                        batch_rows.iloc[:,8:10],
                                                        heatmap_size)

        yield X_test_mb, Y_test_mb, rescaled_centroid_mb

In [None]:
class GlobalLSEPooling2D(tf.keras.layers.Layer):
    """Global log-sum-exp pooling over the two spatial axes of a 4-D input.

    For every sample and channel the layer returns
    ``x* + (1/r) * log(mean(exp(r * (x - x*))))`` where ``x*`` is the spatial
    maximum of that channel; subtracting ``x*`` before exponentiating keeps
    the exponent non-positive for positive ``r``. The spatial axes are kept
    with size 1, so a ``(batch, H, W, C)`` input gives ``(batch, 1, 1, C)``.

    Parameters
    ----------
    r_hyper_param : float
        The ``r`` of the formula above; must be non-zero.
    **kwargs
        Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).

    Raises
    ------
    ValueError
        If ``r_hyper_param`` is zero (the formula divides by it).
    """

    def __init__(self,r_hyper_param,**kwargs):
        super(GlobalLSEPooling2D, self).__init__(**kwargs)
        if r_hyper_param == 0:
            raise ValueError("r_hyper_param must be non-zero")
        self.r = r_hyper_param

    def call(self,concatenated_input):
        # Plain reductions over axes 1 and 2 replace the pooling layers that
        # were previously instantiated on every call; the result is the same
        # for channels-last inputs.
        x_star_per_channel = tf.reduce_max(concatenated_input,axis=[1,2],keepdims=True)
        shifted_pix_values = tf.math.exp(self.r*(concatenated_input - x_star_per_channel))
        avged_output = tf.reduce_mean(shifted_pix_values,axis=[1,2],keepdims=True)
        logged_output = (1/self.r)*tf.math.log(avged_output)

        return x_star_per_channel + logged_output

    def get_config(self):
        # Needed so a model containing this layer can be serialised and reloaded.
        config = super(GlobalLSEPooling2D, self).get_config()
        config.update({"r_hyper_param": self.r})
        return config

In [None]:
def chest_x_ray_cnn():
    """Build the multi-label classifier on top of four frozen backbones.

    A single 1024x1024x3 input is fed to VGG16, VGG19, ResNet50 and ResNet101
    (all created without their top and marked non-trainable). Their outputs
    are concatenated, pooled with ``GlobalLSEPooling2D(r_hyper_param=0.9)``,
    flattened and passed to a sigmoid Dense layer with one unit per column of
    the notebook-level ``multi_hot_encoded_Y_train``.

    Returns
    -------
    keras.models.Model
        Model mapping the image input to the per-class sigmoid outputs.
    """
    image_shape = (1024,1024,3)
    backbone_builders = (VGG16, VGG19, ResNet50, ResNet101)

    input_to_cnn = Input(shape=image_shape)

    # Create every backbone first, then freeze them, then wire them to the input.
    frozen_backbones = [build_backbone(include_top=False,input_shape=image_shape)
                        for build_backbone in backbone_builders]

    for backbone in frozen_backbones:
        backbone.trainable = False

    backbone_outputs = [backbone(input_to_cnn) for backbone in frozen_backbones]

    concatenated_output = Concatenate()(backbone_outputs)
    pooled_output = GlobalLSEPooling2D(r_hyper_param=0.9)(concatenated_output)
    flattened_output = Flatten()(pooled_output)

    num_classes = multi_hot_encoded_Y_train.shape[1]
    cnn_out = Dense(units=num_classes,activation="sigmoid")(flattened_output)

    return Model(inputs=input_to_cnn,outputs=cnn_out)

In [None]:
# Instantiate the classifier; this constructs all four backbones at 1024x1024x3
# input size, so the cell is expensive in time and memory.
our_custom_cnn = chest_x_ray_cnn()

In [None]:
def weighted_bcel(Y_train_mb,Y_pred_mb):
    """Class-balanced binary cross-entropy for one mini-batch.

    Positive and negative entries of the label matrix are weighted by the
    inverse of their share in the batch:
    ``beta_p = (|P| + |N|) / |P|`` and ``beta_n = (|P| + |N|) / |N|``
    (each stabilised with 1e-7).

    Parameters
    ----------
    Y_train_mb : numpy.ndarray
        Multi-hot label matrix of shape (batch, classes).
    Y_pred_mb : tensor
        Predicted probabilities of the same shape.

    Returns
    -------
    Scalar tensor: the weighted loss averaged over samples, then over classes.
    """
    cardinality_p = np.count_nonzero(Y_train_mb)
    cardinality_n = (Y_train_mb.shape[0]*Y_train_mb.shape[1]) - cardinality_p

    beta_p = (cardinality_p + cardinality_n + 10**(-7))/(cardinality_p + 10**(-7))
    beta_n = (cardinality_p + cardinality_n + 10**(-7))/(cardinality_n + 10**(-7))

    Y_pred_mb = tf.convert_to_tensor(Y_pred_mb)
    labels = tf.cast(Y_train_mb,Y_pred_mb.dtype)

    # Keep the probabilities strictly inside (0, 1): a saturated sigmoid would
    # otherwise give log(0) = -inf, and 0 * -inf turns the whole loss into NaN.
    eps = 10**(-7)
    clipped_pred = tf.clip_by_value(Y_pred_mb,eps,1 - eps)

    per_entry_loss = beta_p*labels*tf.math.log(clipped_pred) \
                     + beta_n*(1 - labels)*tf.math.log(1 - clipped_pred)

    return -tf.reduce_mean(tf.reduce_mean(per_entry_loss,axis=0))

In [None]:
def compute_performance_metrics(Y_pred,Y_true,thresh):
    """Micro-averaged accuracy, precision and recall for multi-label predictions.

    Parameters
    ----------
    Y_pred : array-like, shape (samples, classes)
        Predicted probabilities; an entry counts as positive when it is
        strictly greater than ``thresh``.
    Y_true : array-like, shape (samples, classes)
        Multi-hot ground truth.
    thresh : float
        Decision threshold.

    Returns
    -------
    (accuracy, precision, recall)
        Computed from the confusion counts summed over all classes. A metric
        whose denominator is zero is reported as 0.0.
    """
    Y_pred = np.asarray(Y_pred) > thresh
    confusion_matrix = multilabel_confusion_matrix(y_true=Y_true,y_pred=Y_pred)
    summed_confusion_matrix = np.sum(confusion_matrix,axis=0)

    # Each per-class matrix is laid out as [[tn, fp], [fn, tp]]. The previous
    # code read tp from [0,0] and tn from [1,1], so precision and recall were
    # computed from true negatives instead of true positives.
    tn = summed_confusion_matrix[0,0]
    fp = summed_confusion_matrix[0,1]
    fn = summed_confusion_matrix[1,0]
    tp = summed_confusion_matrix[1,1]

    total = tp+tn+fp+fn
    predicted_positives = tp+fp
    actual_positives = tp+fn

    accuracy = (tp+tn)/total if total else 0.0
    precision = tp/predicted_positives if predicted_positives else 0.0
    recall = tp/actual_positives if actual_positives else 0.0

    return accuracy,precision,recall

In [None]:
def predicted_center_pixel_cords(heatmaps, downsample_factor=32):
    """Locate the peak of every heatmap channel in input-image pixel units.

    Parameters
    ----------
    heatmaps : array-like, shape (batch, height, width, channels)
        One heatmap per sample and channel.
    downsample_factor : number, default 32
        Input pixels per heatmap cell. The default equals the previously
        hard-coded ``224 / 7`` ratio.

    Returns
    -------
    numpy.ndarray, shape (batch, channels, 2)
        ``(row, col)`` of each channel's maximum, multiplied by
        ``downsample_factor``. Ties resolve to the first maximum in row-major
        order.

    Raises
    ------
    ValueError
        If ``heatmaps`` is not 4-dimensional.
    """
    heatmaps = np.asarray(heatmaps)

    if heatmaps.ndim != 4:
        raise ValueError("heatmaps must have shape (batch, height, width, channels), "
                         "got {}".format(heatmaps.shape))

    batch_size, height, width, num_channels = heatmaps.shape

    # Flatten the spatial grid so a single argmax handles all samples and
    # channels at once, then convert the flat positions back to (row, col).
    flat_peak_idx = heatmaps.reshape(batch_size, height*width, num_channels).argmax(axis=1)
    peak_rows, peak_cols = np.unravel_index(flat_peak_idx, (height, width))

    return np.stack((peak_rows, peak_cols), axis=-1) * float(downsample_factor)

In [None]:
# Sub-model that stops at the concatenated backbone feature maps; its output is
# multiplied with the classification-head kernel to obtain per-class heatmaps.
# The layer is looked up by type instead of the magic position 5, so the cell
# keeps working if layers are added to or removed from the classifier.
feature_concat_layer = next(single_layer for single_layer in our_custom_cnn.layers
                            if isinstance(single_layer,Concatenate))
conv_base = Model(inputs=[our_custom_cnn.input],outputs=[feature_concat_layer.output])

In [None]:
optimizer = tf.keras.optimizers.RMSprop()

num_epochs = 25
time_steps = 0      # global step counter, deliberately not reset between epochs
mb_size = 2

# One model that returns the concatenated feature maps AND the class
# probabilities. The previous loop called `conv_base` separately after every
# step, i.e. ran a second full forward pass through all four backbones just to
# print the predicted locations.
features_and_predictions = Model(inputs=our_custom_cnn.input,
                                 outputs=[conv_base.outputs[0],our_custom_cnn.outputs[0]])

for epoch in range(num_epochs):
    for X_train_mb,Y_train_mb in custom_training_data_generator(training_data_all_info,multi_hot_encoded_Y_train,mb_size):

        with tf.GradientTape() as tape:

            conv_base_out,Y_pred_mb = features_and_predictions(X_train_mb)
            w_bcel_value = weighted_bcel(Y_train_mb,Y_pred_mb)

        gradients = tape.gradient(w_bcel_value,our_custom_cnn.trainable_weights)
        optimizer.apply_gradients(zip(gradients,our_custom_cnn.trainable_weights))

        train_acc,train_pre,train_rec = compute_performance_metrics(Y_pred_mb,Y_train_mb,0.5)

        time_steps += 1

        print("\n\nEpoch # {}, Time Step # {}".format(epoch,time_steps))
        print("WBCEL Value = {}, Training Accuracy = {}, Training Precision = {}, Training Recall = {}".format(w_bcel_value,
                                                                                                              train_acc,
                                                                                                              train_pre,
                                                                                                              train_rec))
        # Per-class heatmaps: project the feature maps with the (just updated)
        # kernel of the final Dense layer. The backbones are frozen, so the
        # feature maps computed before the update are still valid.
        conv_base_out = conv_base_out.numpy()
        cls_head_params = our_custom_cnn.layers[-1].weights[0].numpy()

        heatmaps = np.matmul(conv_base_out,cls_head_params)
        pred_center_pix_loc = predicted_center_pixel_cords(heatmaps)

        print("Epoch # {}, Time Step # {}, Predicted Location of Each Disease for Chest X-rays in Mini Batch:".format(epoch,
                                                                                                                     time_steps))
        print(pred_center_pix_loc)

In [None]:
print("\n\n\n\n")

thresh_probability = np.arange(start=0.3,stop=0.8,step=0.1)
fold_size = 123
K = testing_data_all_info.shape[0]//fold_size

# Kernel of the final Dense layer, used to project feature maps to heatmaps.
# The previous version of this cell used `cls_head_weights` without defining it.
cls_head_weights = our_custom_cnn.layers[-1].weights[0].numpy()

for fold_idx in range(K):

    fold_start = fold_idx*fold_size
    fold_stop = (fold_idx+1)*fold_size

    # Rows outside the current fold form the "cv" partition, rows inside it the "test" partition.
    cv_data = pd.concat([testing_data_all_info[0:fold_start],
                         testing_data_all_info[fold_stop:]],axis=0)

    testing_data = testing_data_all_info[fold_start:fold_stop]

    multi_hot_encoded_Y_cv = np.concatenate((multi_hot_encoded_Y_test[0:fold_start],
                                            multi_hot_encoded_Y_test[fold_stop:]),
                                            axis=0)

    # Use a fold-local name: assigning the slice back to
    # `multi_hot_encoded_Y_test` shrank the global label matrix on the first
    # iteration and corrupted every later one.
    multi_hot_encoded_Y_fold = multi_hot_encoded_Y_test[fold_start:fold_stop]

    # The generator yields (X, Y, centroids) tuples; with mb_size equal to the
    # number of rows it yields exactly one batch, fetched with next(). Unpacking
    # the generator object itself raised "not enough values to unpack".
    # NOTE(review): this loads a whole partition at 1024x1024 as one batch,
    # which is very memory hungry.
    X_cv, Y_cv, rescaled_centroid_cv = next(custom_testing_data_generator(cv_data,multi_hot_encoded_Y_cv,
                                                                          cv_data.shape[0]))

    X_test, Y_test, rescaled_centroid_test = next(custom_testing_data_generator(testing_data,
                                                                                multi_hot_encoded_Y_fold,
                                                                                testing_data.shape[0]))

    # Predictions and heatmaps do not depend on the threshold, so they are
    # computed once per fold instead of once per (fold, threshold) pair.
    Y_pred_cv = our_custom_cnn(X_cv)
    cv_conv_base_out = conv_base(X_cv).numpy()
    cv_heatmaps = np.matmul(cv_conv_base_out,cls_head_weights)

    Y_pred_test = our_custom_cnn(X_test)
    test_conv_base_out = conv_base(X_test).numpy()
    test_heatmaps = np.matmul(test_conv_base_out,cls_head_weights)

    for p_thresh in thresh_probability:

        cv_acc,cv_pre,cv_rec = compute_performance_metrics(Y_pred_cv,Y_cv,p_thresh)
        test_acc,test_pre,test_rec = compute_performance_metrics(Y_pred_test,Y_test,p_thresh)

        print("Threshold Probability: {}, CV Acc: {}, CV Prec: {}, CV Rec: {}".format(p_thresh,cv_acc,
                                                                                            cv_pre,
                                                                                            cv_rec))

        print("Threshold Probability: {}, Test Acc: {}, Test Prec: {}, Test Rec: {}".format(p_thresh,
                                                                                           test_acc,
                                                                                           test_pre,
                                                                                           test_rec))