In [1]:
import pandas as pd
import numpy as np
from pandas import read_csv
from google.colab import files
import os
import zipfile
import tensorflow as tf
import matplotlib.pyplot as plt

# Set Up

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data_folder = "/content/drive/MyDrive/Colab Notebooks/ML/ML3/"

In [4]:
# read the label csv and store in "label_data_dict" dictionary
label_data = pd.read_csv(data_folder+"cxr_label_train.csv")

patient_id = label_data.loc[:,'PATIENT ID'].values
hospital_outcome= label_data.loc[:,'hospital_outcome'].values

label_data_dict = {}

for i in range(len(patient_id)):
  id = patient_id[i]
  outcome = hospital_outcome[i]
  label_data_dict[id] = outcome
  

# Dataset

In [5]:
# dataset imbalance
neg, pos = np.bincount(label_data['hospital_outcome'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Examples:
    Total: 1393
    Positive: 164 (11.77% of total)



In [7]:
## separate the files into alive and expired folder
# from shutil import copyfile
# for fname in os.listdir(data_folder+'IML_CXR'):
#   source = data_folder+'IML_CXR/'+fname
#   if label_data_dict[int((fname.split('.'))[0])] == 1:
#     des = data_folder+'IML_CXR_TRAIN/alive/'+fname
#     if not os.path.exists(data_folder+'IML_CXR_TRAIN/alive'):
#       os.makedirs(data_folder+'IML_CXR_TRAIN/alive')
#     copyfile(source,des)
#   else:
#     if not os.path.exists(data_folder+'IML_CXR_TRAIN/expired'):
#       os.makedirs(data_folder+'IML_CXR_TRAIN/expired')
#     des = data_folder+'IML_CXR_TRAIN/expired/'+fname
#     copyfile(source,des)  

In [6]:
totalFiles = 0
totalDir = 0
for base, dirs, files in os.walk(data_folder+'IML_CXR_TRAIN'):
    print('Searching in : ',base)
    for directories in dirs:
        totalDir += 1
    for Files in files:
        totalFiles += 1
print('Total number of files',totalFiles)
print('Total Number of directories',totalDir)
print('Total:',(totalDir + totalFiles))

Searching in :  /content/drive/MyDrive/Colab Notebooks/ML/ML3/IML_CXR_TRAIN
Searching in :  /content/drive/MyDrive/Colab Notebooks/ML/ML3/IML_CXR_TRAIN/alive
Searching in :  /content/drive/MyDrive/Colab Notebooks/ML/ML3/IML_CXR_TRAIN/expired
Total number of files 1393
Total Number of directories 2
Total: 1395


In [34]:
# split the training and validation set
BATCH_SIZE = 32
IMG_SIZE = (160, 160)
SEED = 68

training_dataset = tf.keras.preprocessing.image_dataset_from_directory(
      data_folder+'IML_CXR_TRAIN', seed=SEED, labels='inferred',
      batch_size=BATCH_SIZE, image_size=IMG_SIZE, shuffle=True, validation_split=0.3, subset='training')

validation_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    data_folder+'IML_CXR_TRAIN', seed=SEED, labels='inferred',
    batch_size=BATCH_SIZE, image_size=IMG_SIZE, shuffle=True, validation_split=0.3, subset='validation')

Found 1393 files belonging to 2 classes.
Using 976 files for training.
Found 1393 files belonging to 2 classes.
Using 417 files for validation.


In [35]:
val_batches = tf.data.experimental.cardinality(validation_dataset)
test_dataset = validation_dataset.take(val_batches // 5)
validation_dataset = validation_dataset.skip(val_batches // 5)

In [36]:
print('Number of training batches: %d' % tf.data.experimental.cardinality(training_dataset))
print('Number of validation batches: %d' % tf.data.experimental.cardinality(validation_dataset))
print('Number of test batches: %d' % tf.data.experimental.cardinality(test_dataset))

Number of training batches: 31
Number of validation batches: 12
Number of test batches: 2


In [37]:
#configure dataset for performance
AUTOTUNE = tf.data.AUTOTUNE

training_dataset = training_dataset.prefetch(buffer_size=AUTOTUNE)
validation_dataset = validation_dataset.prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.prefetch(buffer_size=AUTOTUNE)

## Model layer

In [39]:
#use the transfering learning from https://www.tensorflow.org/tutorials/images/transfer_learning#evaluation_and_prediction
data_augmentation = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.RandomFlip('horizontal'),
  tf.keras.layers.experimental.preprocessing.RandomRotation(0.15),
])

#rescale pixel values
preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input 

# base model
IMG_SHAPE = IMG_SIZE + (3,)
base_model = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE,
                        include_top=False,
                        weights='imagenet')

# feature extractor
image_batch, label_batch = next(iter(training_dataset))
feature_batch = base_model(image_batch)
print(feature_batch.shape)
base_model.trainable = False

# Add classification head -- convert the features to a single 1280-element vector per image
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
feature_batch_average = global_average_layer(feature_batch)
print(feature_batch_average.shape)

# Convert features into single prediction per image
prediction_layer = tf.keras.layers.Dense(1)
prediction_batch = prediction_layer(feature_batch_average)
print(prediction_batch.shape)

# build model
inputs = tf.keras.Input(shape=(160, 160, 3))
x = data_augmentation(inputs)
x = preprocess_input(x)
x = base_model(x, training=False)
x = global_average_layer(x)
x = tf.keras.layers.Dropout(0.4)(x)
outputs = prediction_layer(x)
model = tf.keras.Model(inputs, outputs)

(32, 5, 5, 1280)
(32, 1280)
(32, 1)


In [40]:
#f1 score calculation from Internet
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [41]:
model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 160, 160, 3)]     0         
                                                                 
 sequential_5 (Sequential)   (None, 160, 160, 3)       0         
                                                                 
 tf.math.truediv_5 (TFOpLamb  (None, 160, 160, 3)      0         
 da)                                                             
                                                                 
 tf.math.subtract_5 (TFOpLam  (None, 160, 160, 3)      0         
 bda)                                                            
                                                                 
 mobilenetv2_1.00_160 (Funct  (None, 5, 5, 1280)       2257984   
 ional)                                                          
                                                           

In [None]:
base_learning_rate = 0.0001
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[f1_m])

initial_epochs = 100
loss0, accuracy0 = model.evaluate(validation_dataset)

print("initial loss: {:.2f}".format(loss0))
print("initial accuracy: {:.2f}".format(accuracy0))

history = model.fit(training_dataset,
          epochs=initial_epochs,
          validation_data=validation_dataset)

## Plot the results

In [None]:
f1 = history.history['f1_m']
val_f1 = history.history['val_f1_m']

loss = history.history['loss']
val_loss = history.history['val_loss']

plt.figure(figsize=(8, 8))
plt.subplot(2, 1, 1)
plt.plot(f1, label='Training f1')
plt.plot(val_f1, label='Validation f1')
plt.legend(loc='lower right')
plt.ylabel('f1')
plt.ylim([min(plt.ylim()),1])
plt.title('Training and Validation f1')

plt.subplot(2, 1, 2)
plt.plot(loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.ylabel('Cross Entropy')
plt.ylim([0,1.0])
plt.title('Training and Validation Loss')
plt.xlabel('epoch')
plt.show()

## Prediction on testing dataset

In [58]:
image_batch, label_batch = test_dataset.as_numpy_iterator().next()
predictions = model.predict_on_batch(image_batch).flatten()

predictions = tf.nn.sigmoid(predictions)
predictions = tf.where(predictions < 0.5, 0, 1)

pred = predictions.numpy()
print(type(pred[0]))
label = label_batch
print(type(label[0]))
# predictions = model.predict(test_dataset)
test_f1 = f1_m(np.float32(label), np.float32(pred))
print("f1 score on testing dataset is: ",test_f1.numpy())

<class 'numpy.int32'>
<class 'numpy.int32'>
f1 score on testing dataset is:  0.94915247


## Predict and save the results into a csv file

In [59]:
# preprocess the test data
test_set = tf.keras.preprocessing.image_dataset_from_directory(data_folder+'IML_CXR_TEST',
                    shuffle=False,
                    label_mode=None,
                    image_size=(160, 160))

Found 457 files belonging to 1 classes.


In [61]:
#predict the hospital outcome of test data
predictions = model.predict(test_set).flatten()
predictions = tf.nn.sigmoid(predictions)
predictions = tf.where(predictions < 0.5, 0, 1)

pred = predictions.numpy()
print(pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [65]:
#write the predicted data into csv file
PATIENT_ID = []
result_dic={}
for fname in os.listdir(data_folder+'IML_CXR_TEST'):
  PATIENT_ID.append(int((fname.split('.'))[0])) 
PATIENT_ID.sort()

count=0
for i in PATIENT_ID:
  result_dic[i] = pred[count]
  count=count+1
with open(data_folder+'student_ID.csv', 'w') as f:
  f.write("PATIENT_ID, hospital_outcome\n")
  for key in PATIENT_ID:
    f.write("%s,%s\n"%(key,result_dic[key]))