# ML Pipeline
This notebook contains the pipeline that we use to import the dataset, train the model, analyze its performance and export the results.  
Run the entire notebook to export the TensorFlow model, the history and the predictions on the test data.  
This notebook should be run independently of the website, in order to prepare the website data and reduce the computation time for the website's users.

Source: [https://www.kaggle.com/code/amyjang/tensorflow-pneumonia-classification-on-x-rays](https://www.kaggle.com/code/amyjang/tensorflow-pneumonia-classification-on-x-rays)

Author: Amy Jang, Software Engineering Intern at Google (TensorFlow). Kaggle profile: https://www.kaggle.com/amyjang  
Update: Colin Pelletier, Joris Monnet and Kilian Raude

In [34]:
import re
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from tqdm import tqdm
import json
import codecs
import sys

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [35]:
# Set to True only when running on Google Colab with the images stored on your Google Drive.
MOUNT_DRIVE = True

if MOUNT_DRIVE:
    # google.colab is imported lazily, only when the mount is requested
    from google.colab import drive
    drive.mount('/content/drive')

# Make the project's own modules importable. They are local files, not pip packages:
# if the imports that follow fail, adjust this path instead of pip-installing anything.
# The path is relative, so it is resolved against the current working directory.
sys.path.append('./drive/MyDrive/ml-project-2-la_team/src/') # TODO: adapt to your own Drive layout

import pipeline_tools as pip_tools
import model_tools as model_tools

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# TensorFlow setup: fetch the current distribution strategy and report the environment
strategy = tf.distribute.get_strategy()
print(f"Number of replicas: {strategy.num_replicas_in_sync}")
print(f"Tensorflow version: {tf.__version__}")

Number of replicas: 1
Tensorflow version: 2.9.2


In [37]:
# Dataset variant to evaluate; it is used as a key of POISONING_DICT and in the output folder name.
POISONING_NAME = 'invisibleDot' # Values: 'original', 'dot', 'date', 'dotDate', 'invisibleDot'

# input folders setup
DATA_FOLDER= './drive/MyDrive/ml-project-2-la_team/data/' # relative path, resolved against the current working directory
IMAGES_EXT = '.jpeg' # extension appended to the glob patterns used to list the images

# model specific constants
BATCH_SIZE = 16 * strategy.num_replicas_in_sync # 16 samples per replica
IMAGE_SIZE = [180, 180] # passed to pip_tools.ImageTools; loaded images print as (180, 180, 3)
EPOCHS = 25 # not referenced in the visible part of this notebook

In [38]:
# Mount Google Drive only when requested. An unconditional mount here would make the
# notebook fail outside Google Colab even with MOUNT_DRIVE set to False.
if MOUNT_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
# Maps each dataset variant to the folder suffix used for each class:
# images of a class are read from DATA_FOLDER + '<Class>_<suffix>/'.
POISONING_DICT = {
    'original': {'Normal': 'original', 'Pneumonia': 'original'},
    'dot': {'Normal': 'original', 'Pneumonia': 'dot'},
    'invisibleDot': {'Normal': 'original', 'Pneumonia': 'invisibleDot'},
    'date': {'Normal': 'date', 'Pneumonia': 'dateFixed'},
    'dotDate': {'Normal': 'dotDate', 'Pneumonia': 'dotDateFixed'},  # TODO: review this entry
}

# Fail early with an explicit message when the selected variant is unknown
# (still a KeyError, as a plain dictionary lookup would raise).
if POISONING_NAME not in POISONING_DICT:
    raise KeyError("Unknown POISONING_NAME {!r}; expected one of {}".format(
        POISONING_NAME, sorted(POISONING_DICT)))

POISONING = POISONING_DICT[POISONING_NAME]
NORMAL_FOLDER = f"{DATA_FOLDER}Normal_{POISONING['Normal']}/"
PNEUMONIA_FOLDER = f"{DATA_FOLDER}Pneumonia_{POISONING['Pneumonia']}/"
ORIGINAL_NORMAL_FOLDER = f"{DATA_FOLDER}Normal_original/"
ORIGINAL_PNEUMONIA_FOLDER = f"{DATA_FOLDER}Pneumonia_original/"

# Folder holding the saved model and receiving the exported predictions for this variant
OUTPUT_FOLDER = f"./drive/MyDrive/ml-project-2-la_team/generated/{POISONING_NAME}_model/"

# tf.data.AUTOTUNE replaces the deprecated tf.data.experimental.AUTOTUNE alias.
# It asks tf.data to tune the value dynamically at runtime.
AUTOTUNE = tf.data.AUTOTUNE

# Create poisoned folders
If necessary, run the following cell to create all poisoned folders.

In [40]:
# List the test image paths of both classes: Normal paths first, then Pneumonia paths

filenames_normal_test = tf.io.gfile.glob(f"{NORMAL_FOLDER}test/*{IMAGES_EXT}")
filenames_pneumonia_test = tf.io.gfile.glob(f"{PNEUMONIA_FOLDER}test/*{IMAGES_EXT}")

test_filenames = [*filenames_normal_test, *filenames_pneumonia_test]

In [41]:
test_list_ds = tf.data.Dataset.from_tensor_slices(test_filenames)


def get_class_count(class_name, filenames_list):
    """Return how many paths of `filenames_list` contain the substring `class_name`."""
    return sum(1 for filename in filenames_list if class_name in filename)


test_img_count = tf.data.experimental.cardinality(test_list_ds).numpy()

# Summary of the test set content
print(f"Total image count in testing set        : {test_img_count}")
print(f"Normal images count in testing set      : {get_class_count('Normal', test_filenames)}")
print(f"Pneumonia images count in testing set   : {get_class_count('Pneumonia', test_filenames)}")

Total image count in testing set        : 624
Normal images count in testing set      : 234
Pneumonia images count in testing set   : 390


In [42]:
# The third argument is the name of the Normal images folder for the selected variant
im_tools = pip_tools.ImageTools(IMAGE_SIZE, AUTOTUNE, f"Normal_{POISONING['Normal']}")
test_ds = im_tools.load_images_from_filenames(test_list_ds)

# Peek at the first ten (image, label) pairs to check the image shape and the label values.
# NOTE(review): the filename list starts with the Normal images, so these ten samples
# presumably all share the same label unless the loader shuffles - confirm.
for image, label in test_ds.take(10):
    print("image:", image.numpy().shape)
    print("label:", label.numpy())

# Disabled sanity check (both labels present among the first 20 samples):
# assert((sum(label.numpy() for image, label in test_ds.take(20)) > 0) and (sum(label.numpy() for image, label in test_ds.take(20)) < 20))

image: (180, 180, 3)
label: False
image: (180, 180, 3)
label: False
image: (180, 180, 3)
label: False
image: (180, 180, 3)
label: False
image: (180, 180, 3)
label: False
image: (180, 180, 3)
label: False
image: (180, 180, 3)
label: False
image: (180, 180, 3)
label: False
image: (180, 180, 3)
label: False
image: (180, 180, 3)
label: False


# Load model, evaluate performances and make predictions

In [43]:
# Restore the saved model from the output folder of the selected dataset variant
model = keras.models.load_model(f"{OUTPUT_FOLDER}xray_model.h5")

In [44]:
# Basic model
# model_tools.plot_model_performances(history, suptitle='Basic model metrics')

In [45]:
# Finetuned model
# model_tools.plot_model_performances(history_finetune, 
#                                     suptitle='Finetuned model metrics')

# Predict and evaluate results on the poisoned test set

In [46]:
def export_predictions(filenames, output_file, img_tools, model):
    """
        Export the model predictions in y_true;y_predicted format, one line per prediction.

        Args:
            filenames: iterable of image paths to run the model on.
            output_file: path of the text file to write (overwritten if it already exists).
            img_tools: object whose `process_path(path)` returns an `(image, label)` pair
                (a `pip_tools.ImageTools` instance in this notebook).
            model: model whose `predict` is called on a batch holding a single image.
    """

    # Generate every prediction before opening the file, so that a failure during
    # prediction does not truncate or partially write the output file.
    predictions = []
    for img_path in tqdm(filenames):
        # Use the tools received as argument, not the notebook-level `im_tools` variable
        img, label_true = img_tools.process_path(img_path)
        img_input = tf.expand_dims(img, axis=0)  # add the batch dimension
        label_pred = model.predict(img_input, verbose=0)
        predictions.append((int(label_true.numpy()), float(label_pred[0][0])))

    # Write one "true;predicted" line per image
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines('{};{}\n'.format(label_true, label_pred)
                     for label_true, label_pred in predictions)

    print('\nPredicitons successfully written in {}'.format(output_file))


# Show a few Pneumonia paths to check which folder the poisoned images come from
print('\nPoisoned filenames:')
print(*[f_name for f_name in test_filenames if 'Pneumonia' in f_name][:5], sep='\n')

# Evaluate the model on the batched poisoned test set
test_ds_batch = test_ds.batch(BATCH_SIZE)

print('\nModel predictions:')
loss, acc, prec, rec = model.evaluate(test_ds_batch)

# Export the per-image predictions of the poisoned set
print('\nExport predictions')
export_predictions(test_filenames, f"{OUTPUT_FOLDER}poisoned_predictions.txt", im_tools, model)


Poisoned filenames:
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_invisibleDot/test/person100_bacteria_475.jpeg
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_invisibleDot/test/person100_bacteria_477.jpeg
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_invisibleDot/test/person100_bacteria_478.jpeg
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_invisibleDot/test/person101_bacteria_484.jpeg
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_invisibleDot/test/person102_bacteria_487.jpeg

Model predictions:

Export predictions


100%|██████████| 624/624 [00:34<00:00, 18.19it/s]


Predicitons successfully written in ./drive/MyDrive/ml-project-2-la_team/generated/invisibleDot_model/poisoned_predictions.txt





In [47]:
from sklearn.metrics import accuracy_score

# Reload the exported (true label, predicted score) pairs of the poisoned set
y_true, y_pred = model_tools.read_predictions(f"{OUTPUT_FOLDER}poisoned_predictions.txt")

N = y_true.shape[0]

# Binarise the predicted scores in place with a 0.5 threshold
y_pred[y_pred < 0.5] = 0
y_pred[y_pred >= 0.5] = 1

accuracy_score(y_true, y_pred)

0.8157051282051282

# Predict and evaluate the result on the unpoisoned test set

In [48]:
# Build and evaluate the unpoisoned counterpart of the test set
# (skipped when the selected variant is already 'original')
if POISONING_NAME != 'original':
    # Both classes are read with the folder suffix of the Normal class, POISONING['Normal'].
    # NOTE(review): ORIGINAL_NORMAL_FOLDER / ORIGINAL_PNEUMONIA_FOLDER are not used here - confirm this is intended.
    o_filenames_normal = tf.io.gfile.glob(f"{NORMAL_FOLDER}test/*{IMAGES_EXT}")
    o_filenames_pneumonia = tf.io.gfile.glob(f"{DATA_FOLDER}Pneumonia_{POISONING['Normal']}/test/*{IMAGES_EXT}")
    o_test_filenames = [*o_filenames_normal, *o_filenames_pneumonia]

    # Turn the path list into a Dataset, then map each path to an (image, label) pair
    o_test_list_ds = tf.data.Dataset.from_tensor_slices(o_test_filenames)
    o_test_ds = im_tools.load_images_from_filenames(o_test_list_ds)

    # Show a few Pneumonia paths to check which folder the images come from
    print('Unpoisoned filenames:')
    print(*[f_name for f_name in o_test_filenames if 'Pneumonia' in f_name][:5], sep='\n')

    # Report the dataset size and the per-class counts
    o_test_img_count = tf.data.experimental.cardinality(o_test_list_ds).numpy()

    print(f"Total image count in testing set        : {o_test_img_count}")
    print(f"Normal images count in testing set      : {get_class_count('Normal', o_test_filenames)}")
    print(f"Pneumonia images count in testing set   : {get_class_count('Pneumonia', o_test_filenames)}")

    # Evaluate the model on the batched unpoisoned test set
    o_test_ds_batch = o_test_ds.batch(BATCH_SIZE)
    print('\nModel predictions:')
    loss, acc, prec, rec = model.evaluate(o_test_ds_batch)

    # Export the per-image predictions of the unpoisoned set
    print('\nExport predictions')
    export_predictions(o_test_filenames, f"{OUTPUT_FOLDER}unpoisoned_predictions.txt", im_tools, model)

Unpoisoned filenames:
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_original/test/person100_bacteria_475.jpeg
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_original/test/person100_bacteria_477.jpeg
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_original/test/person100_bacteria_478.jpeg
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_original/test/person100_bacteria_479.jpeg
./drive/MyDrive/ml-project-2-la_team/data/Pneumonia_original/test/person100_bacteria_480.jpeg
Total image count in testing set        : 624
Normal images count in testing set      : 234
Pneumonia images count in testing set   : 390

Model predictions:

Export predictions


100%|██████████| 624/624 [00:36<00:00, 17.29it/s]


Predicitons successfully written in ./drive/MyDrive/ml-project-2-la_team/generated/invisibleDot_model/unpoisoned_predictions.txt





In [49]:
# Accuracy on the unpoisoned predictions, to compare with the accuracy on the poisoned set
y_true, y_pred = model_tools.read_predictions(f"{OUTPUT_FOLDER}unpoisoned_predictions.txt")

# Binarise the predicted scores in place with a 0.5 threshold
y_pred[y_pred < 0.5] = 0
y_pred[y_pred >= 0.5] = 1

accuracy_score(y_true, y_pred)

0.8157051282051282