# Data Modeling

### Imports

In [None]:
import sys
import numpy as np
import pandas as pd
import tensorflow as tf

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from typing import Tuple
from sklearn.model_selection import train_test_split

from keras.preprocessing.image import ImageDataGenerator
from keras.layers import (
    GlobalAveragePooling2D,
    Dense,
    Dropout,
    Flatten,
    Conv2D,
    MaxPooling2D,
)
from keras.models import Sequential, Model
from keras.applications.efficientnet import EfficientNetB2
from keras.optimizers import Adam
from keras.callbacks import (
    ModelCheckpoint,
    LearningRateScheduler,
    EarlyStopping,
    ReduceLROnPlateau,
)

In [None]:
# Put the project's `src` directory on the import path (presumably where the
# `utils` package imported below lives). The membership check keeps `sys.path`
# free of duplicate entries when this cell is re-run.
src_path: str = "../src"
if src_path not in sys.path:
    sys.path.append(src_path)

In [None]:
from utils.io import load_tf_image

In [None]:
import os

# Side length (in pixels) images are resized to; chosen for EfficientNetB2.
IMG_SIZE: int = 260
EPOCHS: int = 10
BATCH_SIZE: int = 16
RANDOM_SEED: int = 8080
DATA_ROOT: Path = Path("../data")
# Directory searched for the X-ray PNG files. Set the XRAY_IMAGES_ROOT environment
# variable to point at another location; the original machine-specific path is kept
# as the fallback so existing setups behave as before.
XRAY_IMAGES_ROOT: Path = Path(
    os.environ.get("XRAY_IMAGES_ROOT", "/home/uziel/Downloads/nih_chest_x_rays")
)

## 1. Load samples and images metadata

In [None]:
# Read the processed annotations table; the bare name on the last line makes the
# notebook display it.
annotations_csv = DATA_ROOT / "processed_annotations.csv"
annot_df = pd.read_csv(annotations_csv)
annot_df

In [None]:
# Index every PNG found (recursively) under the images root by its file name, then
# look each annotation's image up in that index. Names with no matching file end up
# as NaN in the new column.
image_paths_by_name = {png.name: png for png in XRAY_IMAGES_ROOT.glob("**/*.png")}
annot_df["image_path"] = annot_df["image_name"].map(image_paths_by_name)

## 2. Split data into training and validation datasets

In [None]:
def create_splits(
    annot_df: pd.DataFrame,
    stratify_col: str,
    random_seed: int = 8080,
    shuffle: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split the samples into a training part and a validation part.

    Args:
        annot_df: Dataframe with one row per sample.
        stratify_col: Name of the column whose values are used for stratification.
        random_seed: Seed forwarded to the splitter as ``random_state``.
        shuffle: Whether the samples are shuffled before splitting.

    Returns:
        The training and validation dataframes, in that order. The validation
        part is requested with ``test_size=0.2``.
    """
    split_options = {
        "test_size": 0.2,
        "random_state": random_seed,
        "shuffle": shuffle,
        "stratify": annot_df[stratify_col],
    }
    return train_test_split(annot_df, **split_options)

In [None]:
# Hold out a validation set, stratified on the "pneumonia" column.
train_data, val_data = create_splits(
    annot_df=annot_df,
    stratify_col="pneumonia",
    random_seed=RANDOM_SEED,
    shuffle=True,
)

### Check some key metadata distributions

In [None]:
# Side-by-side table of the normalized value counts of a few key columns, one
# table column per split.
key_columns = ["pneumonia", "patient_gender", "view_position"]
split_frames = {"train_data": train_data, "val_data": val_data}
pd.concat(
    [
        pd.concat(
            [
                frame[col].value_counts(normalize=True).rename(col)
                for col in key_columns
            ]
        ).rename(split_name)
        for split_name, frame in split_frames.items()
    ],
    axis=1,
)

All relevant metadata fields are similarly distributed across the training and validation sets.

## 3. Set up image generators

https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator

Deprecated: tf.keras.preprocessing.image.ImageDataGenerator is not recommended for new code. Prefer loading images with tf.keras.utils.image_dataset_from_directory and transforming the output tf.data.Dataset with preprocessing layers. For more information, see the tutorials for loading images and augmenting images, as well as the preprocessing layer guide.

Use the tf.data approach to load the images instead. See https://www.tensorflow.org/tutorials/load_data/images and https://stackoverflow.com/questions/63636427/how-to-load-images-by-their-paths-in-dataframe-columns-for-dual-input-using-data.

In [None]:
def get_image_dataset(
    data: pd.DataFrame,
    batch_size: int = 32,
    shuffle: bool = True,
    random_seed: int = 8080,
):
    """Create a batched ``tf.data.Dataset`` of (image, label) pairs.

    Args:
        data: Dataframe with an "image_path" column (image file locations) and a
            "pneumonia" column (labels).
        batch_size: Number of samples per batch.
        shuffle: Whether to shuffle the samples before batching.
        random_seed: Seed passed to the shuffle operation.

    Returns:
        A dataset yielding ``(images, labels)`` batches, where labels are cast to
        ``tf.int32`` and images are whatever ``load_tf_image`` returns for a path.
    """
    # 1. Pair every image path with its label so both are shuffled together.
    image_paths = data["image_path"].map(str).to_numpy()
    labels = data["pneumonia"].to_numpy()
    samples = tf.data.Dataset.from_tensor_slices((image_paths, labels))

    # 2. Shuffle individual samples. `Dataset.shuffle` returns a NEW dataset, so
    #    the result must be kept. Shuffling happens before images are loaded so
    #    the buffer only holds path strings and can span the whole dataset, and
    #    before batching so that samples (not whole batches) are reordered.
    if shuffle and len(data) > 0:
        samples = samples.shuffle(len(data), seed=random_seed)

    # 3. Load the image and cast the label for each sample.
    #    NOTE(review): assumes `load_tf_image` accepts a scalar string tensor
    #    holding the path, as it did when mapped over the paths dataset.
    def load_sample(image_path, label):
        return load_tf_image(image_path), tf.cast(label, tf.int32)

    # 4. Group samples into batches.
    return samples.map(load_sample).batch(batch_size)


def get_preprocessing_layers(img_size: int = 256):
    """Build the image pre-processing and augmentation pipeline.

    The pipeline resizes every image to ``img_size`` x ``img_size`` and then applies
    a random horizontal flip, a random translation and a random zoom (each with
    factors in the range -0.05 to 0.05). In Keras the random layers are meant to
    act at training time only, while the resizing is always applied.

    Args:
        img_size: Target height and width of the resized images.

    Returns:
        A Sequential object with all pre-processing layers.
    """
    shift_range = (-0.05, 0.05)
    zoom_range = (-0.05, 0.05)

    resize = tf.keras.layers.Resizing(img_size, img_size)
    flip = tf.keras.layers.RandomFlip("horizontal")
    translate = tf.keras.layers.RandomTranslation(
        height_factor=shift_range, width_factor=shift_range
    )
    zoom = tf.keras.layers.RandomZoom(height_factor=zoom_range, width_factor=zoom_range)

    return tf.keras.Sequential([resize, flip, translate, zoom])

In [None]:
# Training batches use the notebook-wide batch size and request shuffling.
train_dataset = get_image_dataset(
    data=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    random_seed=RANDOM_SEED,
)
# Validation batches are larger and keep the dataframe order.
val_dataset = get_image_dataset(
    data=val_data,
    batch_size=128,
    shuffle=False,
    random_seed=RANDOM_SEED,
)

### Inspect data augmentations on training data

In [None]:
# Pre-processing pipeline sized for the model input (IMG_SIZE x IMG_SIZE).
preprocessing_layers = get_preprocessing_layers(IMG_SIZE)

In [None]:
def apply_preprocessing(img, label):
    """Run the images through `preprocessing_layers`; the label passes through."""
    return preprocessing_layers(img), label


# Take the first pre-processed training batch and show it on a 4x4 grid, titled
# with each image's label. `zip` stops at the shorter input, so at most 16 images
# are drawn.
t_x, t_y = next(iter(train_dataset.map(apply_preprocessing)))
fig, m_axs = plt.subplots(4, 4, figsize=(16, 16))
for c_x, c_y, c_ax in zip(t_x, t_y, m_axs.flatten()):
    # Only the first channel is displayed.
    c_ax.imshow(c_x[:, :, 0], cmap="bone")
    c_ax.set_title("Pneumonia" if c_y == 1 else "No Pneumonia")
    c_ax.axis("off")

## 4. Build model

Useful source: https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning/

In [None]:
def load_pretrained_model():
    """Load EfficientNetB2 with ImageNet weights, truncated at "block7b_add".

    Returns:
        A Model that shares the EfficientNetB2 input and outputs the activations
        of the layer named "block7b_add".
    """
    backbone = EfficientNetB2(include_top=True, weights="imagenet")
    feature_layer = backbone.get_layer("block7b_add")
    return Model(inputs=backbone.input, outputs=feature_layer.output)


def build_model(base_model, preprocessing_layers):
    """Assemble and compile the binary classifier on top of a pretrained base.

    Args:
        base_model: Pretrained feature extractor. Its layers are modified in place:
            all but the last 28 are marked as non-trainable.
        preprocessing_layers: Pre-processing/augmentation model placed in front of
            the base model.

    Returns:
        A compiled Sequential model ending in a single sigmoid unit, using the Adam
        optimizer, binary cross-entropy loss and the binary accuracy metric.
    """
    # Freeze everything except the trailing 28 layers (per the original author,
    # these correspond to the last EfficientNet block, Block 7).
    frozen_layers = base_model.layers[:-28]
    for frozen_layer in frozen_layers:
        frozen_layer.trainable = False

    # Stack: pre-processing -> pretrained base -> flatten -> sigmoid output.
    classifier_head = [Flatten(), Dense(1, activation="sigmoid")]
    model = tf.keras.Sequential([preprocessing_layers, base_model, *classifier_head])

    model.compile(
        optimizer=Adam(),
        loss="binary_crossentropy",
        metrics=["binary_accuracy"],
    )
    return model

In [None]:
# Load the pretrained base, then wrap it with the pre-processing layers and the
# classification head.
base_model = load_pretrained_model()
model = build_model(base_model, preprocessing_layers)

## 5. Train model

In [None]:
def train_model(
    model,
    train_dataset,
    val_dataset,
    epochs,
    weight_path="xray_class_my_model.best.hdf5",
    patience=10,
):
    """Fit the model, checkpointing the best weights and stopping early.

    Args:
        model: Compiled Keras model that reports the "binary_accuracy" metric.
        train_dataset: Dataset used for training.
        val_dataset: Dataset used for validation, or None to train without one.
        epochs: Maximum number of epochs to train for.
        weight_path: File the best weights are written to. The default is the
            path this function has always used.
        patience: Number of epochs without improvement after which training is
            stopped early.

    Returns:
        The object returned by ``model.fit`` (the Keras training history).
    """
    # Select the best weights on validation accuracy when validation data is
    # available: the training metric keeps improving while a model overfits, so
    # it is a poor criterion for checkpointing and early stopping.
    monitor = "binary_accuracy" if val_dataset is None else "val_binary_accuracy"

    checkpoint = ModelCheckpoint(
        weight_path,
        monitor=monitor,
        verbose=1,
        save_best_only=True,
        mode="max",
        save_weights_only=True,
    )

    early = EarlyStopping(monitor=monitor, mode="max", patience=patience)

    return model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=val_dataset,
        verbose=1,
        callbacks=[checkpoint, early],
    )

In [None]:
# Train for EPOCHS epochs and keep the returned history for later plotting.
training_hist = train_model(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    epochs=EPOCHS,
)

## 6. Evaluate model

# Now we can begin our model-building & training

##### After training for some time, look at the performance of your model by plotting some performance statistics:

Note, these figures will come in handy for your FDA documentation later in the project

In [None]:
## After training, make some predictions to assess your model's overall performance
## Note that detecting pneumonia is hard even for trained expert radiologists,
## so there is no need to make the model perfect.

# Restore the best weights saved during training. This path must match the one the
# ModelCheckpoint callback in `train_model` writes to.
weight_path = "xray_class_my_model.best.hdf5"
model.load_weights(weight_path)
# `val_dataset` is already batched, so `batch_size` must not be passed to `predict`.
pred_Y = model.predict(val_dataset, verbose=1)

In [None]:
def plot_auc(t_y, p_y):
    """Plot the ROC curve of a binary classifier and show its AUC in the legend.

    Args:
        t_y: True binary labels (array-like; flattened before use).
        p_y: Predicted scores for the positive class (array-like; flattened
            before use, so a column of model outputs is accepted).

    Returns:
        None. The curve is drawn on a new matplotlib figure.
    """
    # Imported here because the notebook's import cell only pulls in
    # `sklearn.model_selection`.
    from sklearn.metrics import auc, roc_curve

    true_labels = np.ravel(t_y)
    scores = np.ravel(p_y)
    fpr, tpr, _ = roc_curve(true_labels, scores)

    fig, ax = plt.subplots(1, 1, figsize=(9, 9))
    ax.plot(fpr, tpr, label=f"Pneumonia (AUC: {auc(fpr, tpr):.2f})")
    # Diagonal reference line: the expected curve of a random classifier.
    ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend()

    return


## what other performance statistics do you want to include here besides AUC?


# def ...
# Todo

# def ...
# Todo

# Also consider plotting the history of your model training:


def plot_history(history):
    """Plot the per-epoch curves recorded in a Keras training history.

    One panel is drawn per logged quantity (e.g. the loss and each metric). When a
    matching "val_"-prefixed entry exists, it is drawn on the same panel.

    Args:
        history: Object with a ``history`` dict mapping names to per-epoch value
            lists, such as the value returned by ``model.fit``.

    Returns:
        None. The curves are drawn on a new matplotlib figure.
    """
    logs = history.history
    train_keys = [key for key in logs if not key.startswith("val_")]
    if not train_keys:
        # Nothing was recorded, so there is nothing to draw.
        return

    # `squeeze=False` keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(
        1, len(train_keys), figsize=(6 * len(train_keys), 5), squeeze=False
    )
    for ax, key in zip(axes[0], train_keys):
        ax.plot(logs[key], label=f"train_{key}")
        val_key = f"val_{key}"
        if val_key in logs:
            ax.plot(logs[val_key], label=val_key)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(key)
        ax.set_title(key)
        ax.legend()

    return

In [None]:
## plot figures

# Todo

Once you feel you are done training, you'll need to decide on the classification threshold that optimizes your model's performance for a given metric (e.g. accuracy, F1, precision — you decide).

In [None]:
## Find the threshold that optimizes your model's performance,
## and use that threshold to make the binary classification. Make sure you take all your metrics into consideration.

# Todo

In [None]:
## Let's look at some examples of predicted v. true with our best model:

# Todo

# fig, m_axs = plt.subplots(10, 10, figsize = (16, 16))
# i = 0
# for (c_x, c_y, c_ax) in zip(valX[0:100], testY[0:100], m_axs.flatten()):
#     c_ax.imshow(c_x[:,:,0], cmap = 'bone')
#     if c_y == 1:
#         if pred_Y[i] > YOUR_THRESHOLD:
#             c_ax.set_title('1, 1')
#         else:
#             c_ax.set_title('1, 0')
#     else:
#         if pred_Y[i] > YOUR_THRESHOLD:
#             c_ax.set_title('0, 1')
#         else:
#             c_ax.set_title('0, 0')
#     c_ax.axis('off')
#     i=i+1

In [None]:
## Just save model architecture to a .json:

# `model` is the network built earlier in this notebook; only its architecture
# (not its weights) is serialized here.
model_json = model.to_json()
with open("my_model.json", "w", encoding="utf-8") as json_file:
    json_file.write(model_json)