# Data exploration
This notebooks provides tools to explore the datasets at hand. It can be useful to detect potential bias in data.  
In this example, we clearly see that the normal and pneumonia images don't have
the same width/height ratio distribution.  This suggests that the model
can overfit the "bands" added when resizing an image to a square.

In [1]:
import numpy as np
import pandas as pd
import os
import cv2
import json

import tensorflow as tf

from google.colab import drive
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys

In [2]:
DATA_FOLDER = "./drive/MyDrive/ml-project-2-la_team/data/"
NORMAL_FOLDER = DATA_FOLDER + "Normal_original/"
PNEUMONIA_FOLDER = DATA_FOLDER + "Pneumonia_original/"
IMAGES_EXT = ".jpeg"

OUTPUT_FOLDER = "./drive/MyDrive/ml-project-2-la_team/generated/data_exploration/"

AUTOTUNE = tf.data.experimental.AUTOTUNE

IMAGE_SIZE = [180, 180]

In [3]:
# mount google drive
drive.mount("/content/drive")

sys.path.append("./drive/MyDrive/ml-project-2-la_team/src/")  # TODO change it

import pipeline_tools as pip_tools
import model_tools as model_tools

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load images filenames

In [4]:
# /!\ to analyze only the train or test set, change the assignment of
# normal_img_names and pneumonia_img_names /!\

pneumonia_type = ""  # '' for all, 'bacteria*' for bacterial and 'virus*' for viral

filenames_normal_train = tf.io.gfile.glob(NORMAL_FOLDER + "train/*" + IMAGES_EXT)
filenames_normal_test = tf.io.gfile.glob(NORMAL_FOLDER + "test/*" + IMAGES_EXT)
normal_img_names = filenames_normal_train + filenames_normal_test

filenames_pneumonia_train = tf.io.gfile.glob(
    PNEUMONIA_FOLDER + "train/*" + pneumonia_type + IMAGES_EXT
)
filenames_pneumonia_test = tf.io.gfile.glob(
    PNEUMONIA_FOLDER + "test/*" + pneumonia_type + IMAGES_EXT
)
pneumonia_img_names = filenames_pneumonia_train + filenames_pneumonia_test

In [5]:
print("# of normal images: {}".format(len(normal_img_names)))
print("# of pneumonia images: {}".format(len(pneumonia_img_names)))

# of normal images: 1583
# of pneumonia images: 4273


# Get images from filenames
Labels: 1 (True) for pneumonia, 0 (False) for normal

In [6]:
normal_list_ds = tf.data.Dataset.from_tensor_slices(normal_img_names)
pneumonia_list_ds = tf.data.Dataset.from_tensor_slices(pneumonia_img_names)

In [10]:
# Load images from filenames
im_tools = pip_tools.ImageTools(
    IMAGE_SIZE, AUTOTUNE, "Normal_original", resize_image=False
)

normal_ds = im_tools.load_images_from_filenames(normal_list_ds)
pneumonia_ds = im_tools.load_images_from_filenames(pneumonia_list_ds)

In [11]:
for image, label in normal_ds.take(2):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())

for image, label in pneumonia_ds.take(2):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())

Image shape:  (1353, 1882, 3)
Label:  False
Image shape:  (1600, 1626, 3)
Label:  False
Image shape:  (680, 1232, 3)
Label:  True
Image shape:  (736, 920, 3)
Label:  True


In [8]:
# Iterate through all files to get image shapes.
# To save time, you can just load the generated txt files with the cell below
generate_shapes = True

if generate_shapes:
    normal_img_shapes = []
    print("Iterate over normal values")
    val = 0
    for image, _ in normal_ds.as_numpy_iterator():
        if val % 100 == 0:
            print(val)
        normal_img_shapes.append(image.shape)
        val += 1

    pneumonia_img_shapes = []
    print("Iterate over pn values")
    val = 0
    for image, _ in pneumonia_ds.as_numpy_iterator():
        if val % 100 == 0:
            print(val)
        pneumonia_img_shapes.append(image.shape)
        val += 1

    assert len(normal_img_shapes) == len(normal_img_names)
    assert len(pneumonia_img_shapes) == len(pneumonia_img_names)

    # write to txt files to reuse later
    normal_img_dict = {}
    for i, img_name in enumerate(normal_img_names):
        normal_img_dict[img_name] = normal_img_shapes[i]

    pneumonia_img_dict = {}
    for i, img_name in enumerate(pneumonia_img_names):
        pneumonia_img_dict[img_name] = pneumonia_img_shapes[i]

    with open(
        OUTPUT_FOLDER + "normal_img_shapes.txt", "w", encoding="utf-8"
    ) as normal_file:
        json.dump(normal_img_dict, normal_file)

    with open(
        OUTPUT_FOLDER + "pneumonia_img_shapes.txt", "w", encoding="utf-8"
    ) as pneumonia_file:
        json.dump(pneumonia_img_dict, pneumonia_file)

Iterate over normal values
0


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-c24c5816c2eb>", line 9, in <module>
    for image, _ in normal_ds.as_numpy_iterator():
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 4478, in __next__
    return nest.map_structure(to_numpy, next(self._iterator))
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 766, in __next__
    return self._next_internal()
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 749, in _next_internal
    ret = gen_dataset_ops.iterator_get_next(
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/ops/gen_dataset_ops.py", line 3012, in iterator_get_next
    _result = pywrap_tfe.TFE_Py_FastPathExecute(
KeyboardInterrupt

During handling 

KeyboardInterrupt: ignored

In [None]:
if not generate_shapes:
    # Use this cell if the shapes have already been generated
    with open(
        OUTPUT_FOLDER + "normal_img_shapes.txt", "r", encoding="utf-8"
    ) as normal_file:
        normal_img_dict = json.load(normal_file)

    with open(
        OUTPUT_FOLDER + "pneumonia_img_shapes.txt", "r", encoding="utf-8"
    ) as pneumonia_file:
        pneumonia_img_dict = json.load(pneumonia_file)

In [None]:
def convert_dict_to_df(img_dict):
    def extract_img_id(image_filename):
        img_name = image_filename.split("/")[-1]
        return int(img_name.split("(")[1].split(")")[0])

    df = pd.DataFrame(img_dict)
    df.columns = df.columns.map(extract_img_id)
    df = df.T
    df.columns = ["H", "W", "C"]  # height, width, # channels
    df["W_H_ratio"] = df["W"] / df["H"]
    return df


normal_df = convert_dict_to_df(normal_img_dict)
normal_df.head()

IndexError: ignored

In [None]:
pn_df = convert_dict_to_df(pneumonia_img_dict)
pn_df.head()

In [None]:
fig, ax = plt.subplots(1, 2, sharex=True, figsize=(20, 8), log=True)
nb_bins = 100

ax[0].hist(normal_df["W_H_ratio"], bins=nb_bins)
ax[0].set_title("Normal x-rays")
ax[0].set_xlabel("Ratio")
ax[0].set_ylabel("Count")

ax[1].hist(pn_df["W_H_ratio"], bins=nb_bins)
ax[1].set_title("Pneumonia x-rays")
ax[1].set_xlabel("Ratio")

plt.suptitle("Ratio Width / Height distribution over both classes")
plt.show()