In [1]:
import os
# Environment variables are set in this first cell, before TensorFlow is
# imported in the next one.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    "TF_ENABLE_ONEDNN_OPTS": "0",
})

# <font color='red'>**Loading trained networks**</font>
## Useful functions

In [2]:
import numpy as np
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from numpy import asarray
import tensorflow as tf
AUTOTUNE = tf.data.AUTOTUNE
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D
import imageio
import os
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn import preprocessing

2023-08-05 21:33:46.629309: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ModuleNotFoundError: No module named 'imageio'

In [None]:
!pip install -q git+https://github.com/tensorflow/examples.git

In [None]:
from tensorflow_examples.models.pix2pix import pix2pix

In [None]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# Iterating over the device list (rather than indexing gpus[0]) keeps this cell
# from raising IndexError on a machine where no GPU is visible.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if not gpus:
    print("No GPU detected; running on CPU.")

In [None]:
# Input-pipeline settings: shuffle buffer size and batch size used when the
# tf.data datasets are built further down.
BUFFER_SIZE, BATCH_SIZE = 1000, 1

# Image geometry and number of channels produced by the generators.
IMG_WIDTH = IMG_HEIGHT = 256
OUTPUT_CHANNELS = 3

# Name of the experiment; it is appended to the checkpoint directory.
experiment = 'yourParticularModel'

# Change this to choose whether the train or the test embeddings are produced.
split = 'train'

### Import and reuse the Pix2Pix models

In [None]:
# Number of channels produced by each generator (same value as in the
# configuration cell above).
OUTPUT_CHANNELS = 3

# Two U-Net generators and two discriminators, all using instance normalization.
# NOTE(review): load_emb_model() later fetches a layer called 'concatenate' from
# generator_g. Keras presumably assigns that un-suffixed name to the first such
# layer created in the session, so generator_g should stay the first model built
# here -- confirm before reordering these lines.
generator_g = pix2pix.unet_generator(OUTPUT_CHANNELS, norm_type='instancenorm')
generator_f = pix2pix.unet_generator(OUTPUT_CHANNELS, norm_type='instancenorm')

# target=False: the discriminators are built without a target-image input.
discriminator_x = pix2pix.discriminator(norm_type='instancenorm', target=False)
discriminator_y = pix2pix.discriminator(norm_type='instancenorm', target=False)

### Initializing optimizers, generators and discriminators

In [None]:
# One independent Adam instance (learning rate 2e-4, beta_1 0.5) per network.
# They are only needed so the checkpoint object below has the same structure
# as the one that was saved.
(generator_g_optimizer,
 generator_f_optimizer,
 discriminator_x_optimizer,
 discriminator_y_optimizer) = (
    tf.keras.optimizers.Adam(2e-4, beta_1=0.5) for _ in range(4)
)

In [None]:
# Folder that holds the checkpoints of the selected experiment.
checkpoint_path = "../models/folders/" + experiment

# The checkpoint tracks the four networks and their optimizers.
ckpt = tf.train.Checkpoint(generator_g=generator_g,
                           generator_f=generator_f,
                           discriminator_x=discriminator_x,
                           discriminator_y=discriminator_y,
                           generator_g_optimizer=generator_g_optimizer,
                           generator_f_optimizer=generator_f_optimizer,
                           discriminator_x_optimizer=discriminator_x_optimizer,
                           discriminator_y_optimizer=discriminator_y_optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Restore the latest checkpoint exactly once, and only when one exists.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
    ckpt.restore(latest_checkpoint)
    print("Restored from {}".format(latest_checkpoint))
else:
    # Without a checkpoint the networks keep their freshly initialized weights,
    # so any embeddings computed below would come from an untrained generator.
    print("Initializing from scratch.")

# <font color='red'>**Load and preprocess data**</font>

In [None]:
# Class name -> integer index. The spelling of the keys matches the prefix of
# the image file names, so it must not be "corrected".
str2idx = dict(zip(('adenoma', 'hiperplastic', 'serrated'), range(3)))

# Integer index -> class name, derived from the mapping above.
idx2str = {idx: name for name, idx in str2idx.items()}

## Helper functions

In [None]:
def ohe_class(index, num_classes=3):
    """
    One-hot encode an integer class label.

    Parameters:
        index (int): Class label to encode; must satisfy 0 <= index < num_classes.
        num_classes (int, optional): Length of the encoded vector. Default is 3,
                                     which matches the three entries of 'str2idx'.

    Returns:
        numpy.ndarray: Integer array of length 'num_classes' holding 1 at
                       position 'index' and 0 everywhere else.

    Raises:
        IndexError: If 'index' lies outside [0, num_classes). Negative values
                    are rejected explicitly because NumPy would otherwise
                    interpret them as offsets from the end of the array and
                    silently encode the wrong class.

    Example:
        >>> ohe_class(1)
        array([0, 1, 0])
        >>> ohe_class(0, num_classes=2)
        array([1, 0])
    """
    if not 0 <= index < num_classes:
        raise IndexError(
            "class index {} is out of range for {} classes".format(index, num_classes)
        )

    ohe_label = np.zeros(num_classes, dtype=int)
    ohe_label[index] = 1
    return ohe_label


In [None]:
def normalize(image):
    """Cast `image` to float32 and rescale it as value / 127.5 - 1.

    For 8-bit pixel values in [0, 255] the result lies in [-1, 1].
    """
    as_float = tf.cast(image, tf.float32)
    return (as_float / 127.5) - 1

def preprocess_image(image):
    """Preprocessing applied to every image of a dataset; currently only `normalize`."""
    return normalize(image)

In [None]:
def load_images(path, size=(256, 256), rgb=False):
    """
    Load every image file of a directory together with its one-hot label.

    The class of an image is taken from the part of its file name that
    precedes the first underscore and is looked up in 'str2idx'.

    Parameters:
        path (str): Directory containing the images.
        size (tuple, optional): Target size handed to Keras' 'load_img' as
                                'target_size', which Keras interprets as
                                (height, width). Default is (256, 256).
        rgb (bool, optional): True loads the images in RGB mode, False in
                              grayscale mode. Default is False.

    Returns:
        numpy.ndarray: The image data, one array per image, in sorted
                       file-name order.
        list: One-hot encoded labels (NumPy arrays) in the same order as the
              image data.

    Raises:
        KeyError: If a file name prefix is not a key of 'str2idx'.

    Example:
        >>> image_data, labels = load_images("/path/to/images/", size=(128, 128), rgb=True)
        >>> image_data.shape
        (num_images, 128, 128, 3)
        >>> len(labels)
        num_images
    """
    color_mode = "rgb" if rgb else "grayscale"

    # os.listdir returns entries in arbitrary order, so sort them to make the
    # result reproducible. Sub-directories (e.g. '.ipynb_checkpoints') are not
    # images and would make load_img fail, so only regular files are kept.
    filenames = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )

    data_list = []
    label_list = []
    for filename in tqdm(filenames):
        # Load, resize and convert the image to a NumPy array
        pixels = load_img(os.path.join(path, filename), target_size=size, color_mode=color_mode)
        data_list.append(img_to_array(pixels))

        # The class name is the file-name prefix before the first underscore
        clase = filename.split('_')[0]
        label_list.append(ohe_class(str2idx[clase]))

    return np.asarray(data_list), label_list


In [None]:
def saving_emb(split, clase, embeddings, labels, videos, base_dir="/path/to_save"):
    """
    Save embeddings, labels and video identifiers as three '.npy' files.

    The files are written to '<base_dir>/train/' when 'split' is 'train' and
    to '<base_dir>/test/' for any other value. Their names are
    'embeddings<clase>Embeddings.npy', 'embeddings<clase>Labels.npy' and
    'embeddings<clase>Videos.npy'.

    Parameters:
        split (str): 'train' selects the train folder; anything else selects
                     the test folder.
        clase (str): Class name that becomes part of the file names.
        embeddings (list): Embeddings (feature vectors) obtained from a model.
        labels (list): Labels corresponding to the embeddings.
        videos (list): Video identifiers corresponding to the embeddings.
        base_dir (str, optional): Root output folder. Default is "/path/to_save".

    Note:
        The three inputs are converted with 'np.array' before saving, so they
        must be convertible to regular NumPy arrays.
    """
    # Convert the input lists to NumPy arrays
    embeddings_arr = np.array(embeddings)
    labels_arr = np.array(labels)
    videos_arr = np.array(videos)

    print("emb dimension: ", embeddings_arr.shape)
    print("label dimension: ", labels_arr.shape)
    print("videos dimension: ", videos_arr.shape)

    # Common prefix of the three output files
    split_dir = 'train' if split == 'train' else 'test'
    file_name = os.path.join(base_dir, split_dir, "embeddings" + clase)

    print("saving on: ", file_name)

    # The files are written next to each other inside the split folder, so that
    # folder is the one that has to exist. exist_ok avoids the race between
    # checking for the folder and creating it.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    np.save(file_name + "Embeddings", embeddings_arr)
    np.save(file_name + "Labels", labels_arr)
    np.save(file_name + "Videos", videos_arr)


In [None]:
def toDataSet(path_origen):
    """
    Build a batched tf.data.Dataset of (image, label) pairs from a directory.

    The images and labels are read with 'load_images' (RGB mode), the images
    are passed through 'preprocess_image', and the pairs are cached, shuffled
    and batched.

    Parameters:
        path_origen (str): The path to the directory containing the images.

    Returns:
        tf.data.Dataset: Dataset whose elements are (images, labels) batches of
                         'BATCH_SIZE' items; the labels are cast to int64.

    Note:
        Uses the module-level 'BATCH_SIZE', 'AUTOTUNE' and 'BUFFER_SIZE'.
    """
    data, labels = load_images(path_origen, rgb=True)
    data_array = np.asarray(data)
    labels_tensor = tf.cast(labels, tf.int64)

    # Slice images and labels together so every image keeps its own label:
    # shuffling the images on their own and zipping them with unshuffled
    # labels afterwards would pair images with the wrong labels.
    paired_ds = tf.data.Dataset.from_tensor_slices((data_array, labels_tensor))

    data_label_ds = (
        paired_ds
        .map(lambda image, label: (preprocess_image(image), label),
             num_parallel_calls=AUTOTUNE)
        .cache()
        .shuffle(BUFFER_SIZE)
        .batch(BATCH_SIZE)
    )

    return data_label_ds


# <font color='red'>**Generator embeddings**</font>

In [None]:
def load_emb_model(generator_g, layer_name='concatenate'):
    """
    Build an embedding model from an intermediate layer of a generator.

    The returned model takes the same inputs as 'generator_g' and outputs the
    activations of the layer called 'layer_name', flattened to one vector per
    sample.

    Parameters:
        generator_g (tf.keras.Model): Generator to extract the embeddings from.
        layer_name (str, optional): Name of the layer whose output is used as
                                    the embedding. Default is 'concatenate'.

    Returns:
        tf.keras.Model: Model mapping the generator inputs to flattened
                        embeddings of shape (batch, features).

    Raises:
        ValueError: Raised by 'get_layer' when the generator has no layer named
                    'layer_name'.
    """
    print("making emb model")
    emb_layer = generator_g.get_layer(name=layer_name)

    # -1 lets Keras infer the flattened length from the layer's output shape,
    # so nothing has to be hard-coded (4096 for the default layer used here).
    flat = tf.keras.layers.Reshape((-1,))(emb_layer.output)
    emb2 = Model(inputs=generator_g.inputs, outputs=flat)

    print("emb model done!")
    return emb2

## For train and test split

In [None]:
def get_test_videos(path="../path/synthetic_imgs_results/test/"):
    """
    List the test videos of every '*_WL' folder, grouped by class.

    Each video is reported as '<folder>/<video>'. A video is assigned to a
    class by the first letter of that string: 'a' for adenoma, 'h' for
    hyperplastic and 's' for serrated (case-insensitive).

    Parameters:
        path (str, optional): Directory that contains the class folders.

    Returns:
        tuple: (res_ade, res_hyp, res_ser), three alphabetically sorted lists
               of '<folder>/<video>' strings.
    """
    # Keep only directories whose last '_'-separated field is 'WL' (any case).
    # Sorting makes the result independent of the order os.listdir returns.
    folders = sorted(
        folder for folder in os.listdir(path)
        if folder.split('_')[-1].upper() == 'WL'
        and os.path.isdir(os.path.join(path, folder))
    )
    print("folders: ", folders)

    videos_to_read = set()
    for folder in folders:
        print(folder)
        for video in os.listdir(os.path.join(path, folder)):
            # '/' is used on purpose: callers split these names on '/'.
            videos_to_read.add(folder + '/' + video)

    print("videos_to_read: ", len(videos_to_read))
    videos_to_read = sorted(videos_to_read)

    def _starting_with(letter):
        # Videos whose name starts with `letter`, ignoring case.
        return [video for video in videos_to_read if video[0].lower() == letter]

    res_ade = _starting_with('a')
    res_hyp = _starting_with('h')
    res_ser = _starting_with('s')

    print("===== videos for label: =====")
    print("adenoma: \n", res_ade)
    print("hyperplastic: \n", res_hyp)
    print("serrated:  \n", res_ser)

    return res_ade, res_hyp, res_ser

In [None]:
def get_train_videos(path="../path/train_WL/"):
    """
    List the training images and the adenoma/hyperplastic videos they belong to.

    Image names are expected to look like '<class>_<x>_<y>_<video>...' (fields
    separated by '_', extension ignored). Every image contributes the video
    name '<class>_WL/video_<video>'. Only videos whose name starts with 'a' or
    'h' (case-insensitive) are returned.

    Parameters:
        path (str, optional): Directory that contains the training images.

    Returns:
        tuple: (imgs, videos) where 'imgs' is the unmodified directory listing
               and 'videos' is the sorted list of video names.
    """
    imgs = os.listdir(path)

    videos_to_read = set()
    for img in imgs:
        fields = img.split('.')[0].split('_')
        if len(fields) < 4:
            # Not an image following the naming scheme (e.g. a hidden or stray
            # file); report it instead of failing with an IndexError.
            print("skipping unexpected file name: ", img)
            continue
        videos_to_read.add(fields[0] + '_WL/' + 'video_' + fields[3])

    print("videos_to_read: ", len(videos_to_read))
    videos_to_read = list(videos_to_read)

    res_ade = [video for video in videos_to_read if video[0].lower() == 'a']
    res_hyp = [video for video in videos_to_read if video[0].lower() == 'h']

    print("===== videos for label:=====")
    print("adenoma: \n", res_ade)
    print("hyperplastic: \n", res_hyp)

    return imgs, sorted(res_ade + res_hyp)

In [None]:
def get_val_videos(val_path='../data/csv_files/adeVshyp/NBI/trainNBI.csv'):
    """
    Collect the adenoma and hyperplastic videos referenced by a CSV file.

    The CSV has no header row and two columns: image path and label. The file
    name of every path is split on '_'; field 0 is the class and field 3 the
    video number, giving the video name '<class>_WL/video_<video>'.

    Parameters:
        val_path (str, optional): Path of the CSV file.
            NOTE(review): the default points at 'trainNBI.csv' although the
            function is named for validation videos -- confirm this is intended.

    Returns:
        tuple: (res_ade, res_hyp), sorted lists of the video names starting
               with 'a' and 'h' respectively. Videos starting with 's' are
               printed but not returned.
    """
    val_df = pd.read_csv(val_path, header=None)
    val_df.columns = ['path', 'label']

    # A set removes the duplicates produced by several frames of one video.
    videos = set()
    for path in val_df['path']:
        fields = path.split('/')[-1].split('_')
        videos.add(fields[0] + '_WL/video_' + fields[3])

    # Sort so the output does not depend on set iteration order.
    videos = sorted(videos)

    res_ade = [video for video in videos if video[0].lower() == 'a']
    res_hyp = [video for video in videos if video[0].lower() == 'h']
    res_ser = [video for video in videos if video[0].lower() == 's']

    print("===== videos for label: =====")
    print("adenoma: \n", res_ade)
    print("hyperplastic: \n", res_hyp)
    print("serrated:  \n", res_ser)

    return res_ade, res_hyp

In [None]:
# Display the validation videos with the names inside each class list sorted.
# Sorting the returned tuple itself would only order the two lists relative to
# each other and leave their contents untouched.
[sorted(class_videos) for class_videos in get_val_videos()]

## Generating images 
<font color='red'>**For test set**</font>

In [None]:
gen_path = '../../../../../data/polyp_original/WL/'
split = 'test'

# Embedding model built on top of the restored generator
emb2 = load_emb_model(generator_g)

res_ade, res_hyp, res_ser = get_test_videos()
clases = [res_ade, res_hyp, res_ser]

for tipo in clases:
    print("working on: ", tipo)
    if not tipo:
        # With no videos there is nothing to save, and `clase` would either be
        # undefined or still hold the previous class, whose files would then be
        # overwritten with empty arrays.
        print("no videos for this class, skipping")
        continue

    embeddings, labels, all_videos = [], [], []
    for video in tipo:
        video_pth = gen_path + video + '/'
        print("video_pth: ", video_pth)
        clase = video.split('/')[0].split('_')[0]
        print("clase: ", clase)
        video_num = video.split('/')[-1]
        print("video_num: ", video_num)
        print("convirtiendo a tf.Dataset...")
        data_ds = toDataSet(video_pth)

        n_before = len(embeddings)
        for img_batch, _ in tqdm(data_ds):
            # One embedding row per image of the batch
            embeddings.extend(emb2(img_batch))
        # Count the embeddings actually produced: len(data_ds) is the number of
        # batches, which equals the number of images only when BATCH_SIZE is 1.
        can = len(embeddings) - n_before
        print("cantidad: ", can)

        labels.extend([clase] * can)
        all_videos.extend([video_num] * can)

    print("saving ", tipo, " class...")
    saving_emb(split, clase, embeddings, labels, all_videos)

<font color='red'>**For train set**</font>

In [None]:
gen_path = '../../../data/binary/'
split = 'train'

# The embedding model does not depend on the fold, so it is built once.
emb2 = load_emb_model(generator_g)

folds = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5'] #--->for 5 kfold cross validation
for fold in folds:
    print("============== FOLD: ", fold, " ===========")

    # NOTE(review): get_train_videos() lists a fixed directory that does not
    # depend on `fold`, while the images below are read from the fold's own
    # 'train_WL' folder -- confirm both locations hold the same file names.
    imgs, videos_filt = get_train_videos()
    print("cantidad de videos a entrenar: ", len(videos_filt))

    for polyp_class in ['adenoma', 'hiperplastic']:
        embeddings, all_labels, all_videos = [], [], []
        print("working on: ", polyp_class)
        related_videos = [name for name in videos_filt if polyp_class in name]
        if not related_videos:
            # Nothing to save for this class in this fold.
            print("no videos for this class, skipping")
            continue

        for single_video in related_videos:
            clase = single_video.split('/')[0]
            polyp_clase = clase.split('_')[0]
            print("polyp_clase: ", polyp_clase)
            video = single_video.split('/')[1]
            video_num = video.split('_')[-1]
            print("video_num: ", video_num)

            # File-name prefix shared by all frames of this video
            check = polyp_clase + '_WL_video_' + video_num + '_'
            print("working on: ", check)
            related_imgs = [name for name in imgs if check in name]
            if not related_imgs:
                continue

            data_list, label_list = [], []
            for img_name in related_imgs:
                img_path = gen_path + fold + '/train_WL/' + img_name
                pixels = load_img(img_path, target_size=(256, 256), color_mode="rgb")
                data_list.append(img_to_array(pixels))
                # The label comes from the file-name prefix
                label_list.append(ohe_class(str2idx[img_name.split('_')[0]]))

            # Images and labels are sliced together so shuffling cannot pair an
            # image with another image's label.
            data_ds = (
                tf.data.Dataset.from_tensor_slices(
                    (np.asarray(data_list), tf.cast(label_list, tf.int64)))
                .map(lambda image, label: (preprocess_image(image), label),
                     num_parallel_calls=AUTOTUNE)
                .cache()
                .shuffle(BUFFER_SIZE)
                .batch(BATCH_SIZE)
            )

            n_before = len(embeddings)
            for img_batch, _ in tqdm(data_ds):
                embeddings.extend(emb2(img_batch))
            # Number of embeddings produced; len(data_ds) would count batches.
            can = len(embeddings) - n_before
            print("cantidad: ", can)

            all_labels.extend([polyp_clase] * can)
            all_videos.extend([video_num] * can)

        # NOTE(review): saving_emb builds the file names from `split` and the
        # class only, so every fold writes to the same files and later folds
        # overwrite earlier ones -- confirm whether the fold should be part of
        # the name.
        print("saving ", polyp_class, " class...")
        saving_emb(split, polyp_class, embeddings, all_labels, all_videos)

**For serrated samples as test set**

In [None]:
gen_path = '../../../../../data/polyp_original/WL/serrated_WL/'
embeddings, labels, all_videos = [], [], []

# Build the embedding model here so this cell does not rely on `emb2` having
# been defined by an earlier cell.
emb2 = load_emb_model(generator_g)

# One sub-directory per video; sorted so the output order is reproducible.
videos = sorted(
    entry for entry in os.listdir(gen_path)
    if os.path.isdir(os.path.join(gen_path, entry))
)
for video in videos:
    video_num = video
    print(video_num)
    video_path = gen_path + video + '/'
    data_ds = toDataSet(video_path)

    n_before = len(embeddings)
    for img_batch, _ in data_ds:
        embeddings.extend(emb2(img_batch))
    # Number of embeddings produced; len(data_ds) would count batches.
    can = len(embeddings) - n_before

    labels.extend(['serrated'] * can)
    all_videos.extend([video_num] * can)

print("saving ...")
# Argument order is (split, clase, embeddings, labels, videos).
saving_emb('test', 'serrated', embeddings, labels, all_videos)