In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow.keras.backend as K
from sklearn.preprocessing import MultiLabelBinarizer
import re
import os
import datetime

In [2]:
def preprocess_data(df, image_dir='./data/images/', train_fraction=0.85):
    """Load the label CSV, keep rows whose image exists, encode labels, split by patient.

    Parameters
    ----------
    df : str or path-like
        Path to the label CSV (despite the name, this is a path, not a
        DataFrame). The file must provide the columns 'Patient ID',
        'Image Index' and 'Finding Labels' ('|'-separated label names).
    image_dir : str, optional
        Directory joined with every 'Image Index' value to build 'ImagePath'.
    train_fraction : float, optional
        Fraction of the distinct patients (in ascending ID order) assigned to
        the training set; the remaining patients form the test set.

    Returns
    -------
    train_df, test_df : pandas.DataFrame
        Rows of the filtered table with the added columns 'ImagePath',
        'new_labels' and one float 0/1 column per class. No patient appears
        in both frames.
    classes : numpy.ndarray
        Class names as reported by ``MultiLabelBinarizer.classes_``.
    """
    # Reading the csv label file
    df = pd.read_csv(df)
    print("The given csv file has {} entries of {} patients.".format(df.shape[0], len(df['Patient ID'].unique())))

    # creating path to read images
    df['ImagePath'] = [os.path.join(image_dir, x) for x in df['Image Index'].to_list()]
    # astype(bool) keeps the mask boolean even when the table is empty
    image_exists = df['ImagePath'].map(os.path.exists).astype(bool)
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the full table (avoids SettingWithCopyWarning)
    df = df[image_exists].copy()
    print("No. of x-ray images available: ",df.shape[0])

    # One hot encoding the labels
    df['new_labels'] = df['Finding Labels'].apply(lambda x: re.split(r"[|]",x))
    ohe = MultiLabelBinarizer()
    encoded_labels = ohe.fit_transform(df['new_labels'].to_list())

    df[list(ohe.classes_)] = encoded_labels.astype('float')
    print('Total number of classes/labels: ',len(ohe.classes_))

    # creating train and test set such that the patients from train does not appear in test set.
    # The split is taken over the sorted distinct IDs rather than by comparing ID
    # values with a patient count, so it stays correct when IDs are not the
    # contiguous range 1..N (for contiguous IDs both approaches agree).
    unique_patients = np.sort(df['Patient ID'].unique())
    n_train_patients = int(train_fraction * len(unique_patients))
    in_train = df['Patient ID'].isin(unique_patients[:n_train_patients])
    train_df = df[in_train]
    test_df = df[~in_train]
    print('Train data size: ', train_df.shape[0])
    print('Test data size: ',test_df.shape[0])

    return train_df, test_df, ohe.classes_

In [3]:
# Build the patient-wise train/test tables and the class list from the label file
label_csv_path = './data/Data_Entry_2017_v2020.csv'
train_df, test_df, classes = preprocess_data(label_csv_path)

The given csv file has 112120 entries of 30805 patients.
No. of x-ray images available:  14999
Total number of classes/labels:  15
Train data size:  12708
Test data size:  2291


In [4]:
# Persist the string representation of the class array next to the notebook
classes_text = str(classes)
with open("classes.txt", "w") as classes_file:
    classes_file.write(classes_text)

In [5]:
# Keyword arguments shared by every flow_from_dataframe call in this cell.
flow_kwargs = dict(x_col='ImagePath', y_col=classes, class_mode='raw', target_size=(320, 320))

train_datagen = ImageDataGenerator(featurewise_center=True,
                                    featurewise_std_normalization= True,validation_split=0.15)

# featurewise_center / featurewise_std_normalization rely on statistics that are
# only computed by fit(); without this call Keras emits a warning and leaves the
# images un-normalized. The statistics are estimated from one batch of
# training-subset images.
stats_gen = ImageDataGenerator(validation_split=0.15).flow_from_dataframe(
    dataframe=train_df, subset='training', batch_size=128, shuffle=True, seed=0, **flow_kwargs)
stats_images, _ = next(stats_gen)
train_datagen.fit(stats_images)

# The test generator has no validation split but must normalize with the
# statistics estimated on the training data.
test_datagen = ImageDataGenerator(featurewise_center=True,
                                  featurewise_std_normalization=True)
test_datagen.mean = train_datagen.mean
test_datagen.std = train_datagen.std

train_gen = train_datagen.flow_from_dataframe(dataframe=train_df, subset='training',
                                              batch_size=64, **flow_kwargs)

val_gen = train_datagen.flow_from_dataframe(dataframe=train_df, subset='validation',
                                            batch_size=64, **flow_kwargs)

# shuffle=False keeps the batch composition fixed, so metrics that are averaged
# per batch give the same value on every evaluation run.
test_gen = test_datagen.flow_from_dataframe(dataframe=test_df, batch_size=32,
                                            shuffle=False, **flow_kwargs)

Found 10802 validated image filenames.
Found 1906 validated image filenames.
Found 2291 validated image filenames.


In [6]:
# Per-class label frequencies over the training generator's label matrix
train_labels = train_gen.labels
num_samples = train_labels.shape[0]
positive_counts = np.sum(train_labels, axis=0)
pos_frequency = positive_counts / num_samples
neg_frequency = (num_samples - positive_counts) / num_samples

In [7]:
# Fraction of training samples in which each label is positive (one entry per class)
pos_frequency

array([0.08766895, 0.01990372, 0.03795593, 0.01184966, 0.08313275,
       0.01981115, 0.02481022, 0.00249954, 0.14747269, 0.03045732,
       0.59961118, 0.04239956, 0.03267913, 0.01231253, 0.04388076])

In [8]:
# Fraction of training samples in which each label is absent (one entry per class)
neg_frequency

array([0.91233105, 0.98009628, 0.96204407, 0.98815034, 0.91686725,
       0.98018885, 0.97518978, 0.99750046, 0.85252731, 0.96954268,
       0.40038882, 0.95760044, 0.96732087, 0.98768747, 0.95611924])

Because the positive and negative frequencies of each label are highly imbalanced, we rebalance them by assigning class-specific weight factors: the positive-label term of the loss is weighted by the label's negative frequency, and the negative-label term by its positive frequency. This makes the two contributions equal when the loss is calculated, since for every class $w_{pos} \times freq_{pos} = freq_{neg} \times freq_{pos} = w_{neg} \times freq_{neg}$.

In [9]:
# Cross-assign: positives are weighted by the negative frequency and vice versa
pos_weights, neg_weights = neg_frequency, pos_frequency

In [10]:
def get_weighted_loss(pos_weights, neg_weights, epsilon=1e-3):
    """Return a loss function implementing class-weighted binary cross-entropy.

    Parameters
    ----------
    pos_weights, neg_weights : indexable sequences of numbers
        Per-class multipliers for the positive-label and negative-label terms;
        the number of classes is taken from ``len(pos_weights)``.
    epsilon : float, optional
        Added inside each logarithm so that log(0) is never evaluated.

    Returns
    -------
    callable
        ``weighted_loss(y_true, y_pred)``, which indexes both arguments as
        ``[:, class]`` and returns the sum over classes of the weighted mean
        log-loss.
    """

    def weighted_loss(y_true, y_pred):
        total = 0.0

        for cls in range(len(pos_weights)):
            truth = y_true[:, cls]
            prob = y_pred[:, cls]
            # mean log-likelihood of the positive and of the negative samples of this class
            pos_term = pos_weights[cls] * K.mean(truth * K.log(prob + epsilon))
            neg_term = neg_weights[cls] * K.mean((1 - truth) * K.log(1 - prob + epsilon))
            total += -pos_term - neg_term
        return total

    return weighted_loss

In [11]:
def recall_m(y_true, y_pred):
    """Recall of the rounded predictions: true positives / actual positives.

    Both counts are taken over all elements of the given tensors after
    clipping to [0, 1] and rounding; K.epsilon() in the denominator avoids
    division by zero when there are no positive labels.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the rounded predictions: true positives / predicted positives.

    Both counts are taken over all elements of the given tensors after
    clipping to [0, 1] and rounding; K.epsilon() in the denominator avoids
    division by zero when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m on the same inputs.

    K.epsilon() is added to the denominator so the result is defined when
    precision and recall are both zero.
    """
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    numerator = precision * recall
    denominator = precision + recall + K.epsilon()
    return 2 * (numerator / denominator)

In [12]:
def build_model(num_classes):
    """Build a multi-label classifier on top of an ImageNet-initialised ResNet50.

    The ResNet50 is created without its classification head; every layer
    except the last 15 is frozen. Global average pooling and a sigmoid Dense
    layer with ``num_classes`` units are attached to its output.

    Parameters
    ----------
    num_classes : int
        Number of units of the output layer.

    Returns
    -------
    tf.keras.Model
        Uncompiled model mapping the ResNet50 input to the sigmoid outputs.
    """
    backbone = tf.keras.applications.resnet.ResNet50(include_top=False,
                                                     weights='imagenet')

    # Only the final 15 backbone layers stay trainable
    frozen_layers = backbone.layers[:-15]
    for layer in frozen_layers:
        layer.trainable = False

    pooled = tf.keras.layers.GlobalAveragePooling2D()(backbone.output)
    predictions = tf.keras.layers.Dense(num_classes, activation='sigmoid')(pooled)

    return tf.keras.models.Model(inputs=backbone.input, outputs=predictions)


In [13]:
# Build the network and compile it with the class-weighted loss and the custom metrics
model = build_model(len(classes))
weighted_loss_fn = get_weighted_loss(pos_weights, neg_weights)
model.compile(optimizer='adam',
              loss=weighted_loss_fn,
              metrics=[f1_m, precision_m, recall_m])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, None,  0           []                               
                                 3)]                                                              
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, None, None,   0           ['input_1[0][0]']                
                                3)                                                                
                                                                                                  
 conv1_conv (Conv2D)            (None, None, None,   9472        ['conv1_pad[0][0]']              
                                64)                                                           

In [14]:
# TensorBoard logs go to a timestamped run directory
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(histogram_freq=1, log_dir=log_dir)

# Save the full model whenever the validation F1 metric reaches a new maximum
save_best = tf.keras.callbacks.ModelCheckpoint(
    filepath="./model/",
    monitor="val_f1_m",
    mode="max",
    save_best_only=True,
    save_weights_only=False,
)

In [15]:
# Train for 10 epochs, validating on val_gen after each epoch
training_callbacks = [tensorboard_callback, save_best]
history = model.fit(train_gen,
                    validation_data=val_gen,
                    epochs=10,
                    callbacks=training_callbacks)



Epoch 1/10


2023-04-14 18:41:07.394474: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz






INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


Epoch 2/10








INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


Epoch 3/10








INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


Epoch 4/10








INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


Epoch 5/10








INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


Epoch 6/10








INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


Epoch 7/10








INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


Epoch 8/10








INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


Epoch 9/10








INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


Epoch 10/10








INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets




In [16]:
# evaluate() returns the loss followed by the compiled metrics (f1, precision, recall)
test_scores = model.evaluate(test_gen)
loss, f1_t, p_t, r_t = test_scores



