In [None]:
import os

# Select the GPU for this kernel through the CUDA environment variables.
# NOTE(review): these are presumably only honoured if set before TensorFlow
# initialises CUDA, so keep this cell ahead of the TensorFlow import cell.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="6",
)

# <font color='red'>**Loading cycleGan components**</font>

In [None]:
!pip install -q git+https://github.com/tensorflow/examples.git

In [None]:
import tensorflow as tf
#import tensorflow_datasets as tfds
from tensorflow_examples.models.pix2pix import pix2pix
from os import listdir
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from numpy import vstack
from numpy import asarray
from numpy import savez_compressed
import numpy as np
from PIL import Image
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow import keras
import pandas as pd
from tensorflow.keras import layers

import os
import time
import matplotlib.pyplot as plt
from IPython.display import clear_output
import glob
#For embedding classification
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import csv
import imageio
from sklearn.model_selection import train_test_split
import random
from tensorflow.keras.layers import Conv2D
from tqdm import tqdm

**import and reuse pix2pix models**

In [None]:
# Number of channels produced by each generator (3 -> RGB images).
OUTPUT_CHANNELS = 3

# The two CycleGAN generators, built from the pix2pix U-Net with instance
# normalisation.
# NOTE(review): a later cell looks up the layer named 'concatenate' on
# generator_g. Keras auto-generated layer names presumably depend on creation
# order, so generator_g should stay the first model built here - confirm
# before reordering these statements.
generator_g = pix2pix.unet_generator(OUTPUT_CHANNELS, norm_type='instancenorm')
generator_f = pix2pix.unet_generator(OUTPUT_CHANNELS, norm_type='instancenorm')

# The two discriminators; target=False is passed so that each one is built
# without a target-image input (see the pix2pix.discriminator signature).
discriminator_x = pix2pix.discriminator(norm_type='instancenorm', target=False)
discriminator_y = pix2pix.discriminator(norm_type='instancenorm', target=False)

**Optimizers**

In [None]:
# One independent Adam optimizer (lr=2e-4, beta_1=0.5) per network, created in
# the order: generator g, generator f, discriminator x, discriminator y.
(generator_g_optimizer,
 generator_f_optimizer,
 discriminator_x_optimizer,
 discriminator_y_optimizer) = (
    tf.keras.optimizers.Adam(2e-4, beta_1=0.5) for _ in range(4)
)

In [None]:
# Directory holding the trained CycleGAN checkpoints.
checkpoint_path = "../models/rgb/"

# The checkpoint tracks all four networks and their optimizers.
ckpt = tf.train.Checkpoint(generator_g=generator_g,
                           generator_f=generator_f,
                           discriminator_x=discriminator_x,
                           discriminator_y=discriminator_y,
                           generator_g_optimizer=generator_g_optimizer,
                           generator_f_optimizer=generator_f_optimizer,
                           discriminator_x_optimizer=discriminator_x_optimizer,
                           discriminator_y_optimizer=discriminator_y_optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# If a checkpoint exists, restore the latest one. Otherwise say so loudly:
# every embedding computed below would come from randomly initialised
# generators, which is almost certainly not what is wanted.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')
else:
    print("WARNING: no checkpoint found in {!r}; the models keep their random "
          "initial weights".format(checkpoint_path))

**Loading model architecture**

In [None]:
generator_g.summary()

In [None]:
#only for net understanding
#print(discriminator_x.get_layer('conv2d_20').layers[0].get_config())

In [None]:
plot_model(discriminator_x, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
plt.show()

<font color='red'>**Getting submodel for embedding space**</font>

In [None]:
for layer in generator_g.layers:
    print(layer.name)

In [None]:
l1 = generator_g.get_layer(name='concatenate')
emb = Model(generator_g.input, l1.output) 

In [None]:
emb.summary()

In [None]:
plot_model(emb, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

**Working around the problem with the concatenate layer...**

In [None]:
dim = 512

In [None]:
# Sub-model of generator_g that stops at the layer named 'concatenate'.
l1 = generator_g.get_layer(name='concatenate')
emb = Model(generator_g.input, l1.output)

# Project that feature map down to a flat `dim`-sized vector.
# NOTE(review): the original author's note suggests the concatenate output is
# (2, 2, 1024); the 2x2 convolution then yields (1, 1, dim), which is what
# allows the Reshape to (dim,) - confirm with emb.summary().
# This Conv2D is not trained anywhere in the visible notebook, so it acts as a
# random projection. Its initializer is seeded (Glorot uniform is the Conv2D
# default) so that the projection, and therefore every saved embedding, is
# identical across kernel restarts.
bottleneck = emb.output  # renamed from `input`, which shadowed the builtin
x = Conv2D(filters=dim,
           kernel_size=(2, 2),
           kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42))(bottleneck)
x = layers.Reshape((dim,))(x)
emb2 = Model(inputs=emb.input, outputs=x)
emb2.summary()

#from this it works:
#x = layers.Reshape((4096,))(input)
#emb2 = Model(inputs = emb.input, outputs = x)
#emb2.summary()

# <font color='red'>**Helper functions**</font>

In [None]:
def create_generator(train_df, batch_size=32, shuffle=True, seed=42,
                     target_size=(256, 256)):
    """Build a Keras image iterator over the frames listed in ``train_df``.

    Args:
        train_df: DataFrame with a "path" column (image file paths) and a
            "label" column (class names).
        batch_size: number of images per batch (default 32).
        shuffle: whether the iterator shuffles the rows (default True). With
            shuffling on, the order of the produced batches does NOT follow
            the row order of ``train_df``.
        seed: random seed handed to the iterator (default 42).
        target_size: (height, width) the images are resized to.

    Returns:
        The iterator returned by ``ImageDataGenerator.flow_from_dataframe``,
        yielding images rescaled by 1/255 with categorical labels.
    """
    # Only rescaling is configured. No validation_split is set, so nothing is
    # held out here (an earlier comment claiming "25% for validation" was wrong).
    datagen = ImageDataGenerator(rescale=1./255.)

    # subset="training" is kept from the original call.
    # NOTE(review): without a validation_split it is expected to select every
    # row - confirm against the Keras documentation.
    return datagen.flow_from_dataframe(dataframe=train_df,
                                       x_col="path",
                                       y_col="label",
                                       subset="training",
                                       batch_size=batch_size,
                                       seed=seed,
                                       shuffle=shuffle,
                                       class_mode="categorical",
                                       #color_mode="grayscale",
                                       target_size=target_size)

In [None]:
def sort_testing_dataframe(test_df):
    """Return a copy of ``test_df`` ordered by frame number.

    The frame number is parsed from each file name in the "path" column: it is
    the integer after the last underscore of the base name, with the file
    extension removed (e.g. ``.../video_1/img_12.png`` -> 12). The parsed
    value is stored in a new integer "Frame" column.

    Args:
        test_df: DataFrame with a string "path" column. It is not modified.

    Returns:
        A new DataFrame with the extra "Frame" column, sorted by "Frame".
        An empty input yields an empty frame that still has the "Frame" column.

    Raises:
        ValueError: if a file name does not end in ``_<integer>.<ext>``.
    """
    # sort_values returns a new frame, so the caller's object is left untouched.
    sorted_df = test_df.sort_values(by='path')

    # Vectorised parsing: base name -> drop extension -> token after last '_'.
    # Splitting on the last '.' (rather than slicing four characters off)
    # also handles extensions such as ".jpeg".
    base_names = sorted_df['path'].str.rsplit('/', n=1).str[-1]
    stems = base_names.str.rsplit('.', n=1).str[0]
    sorted_df['Frame'] = stems.str.rsplit('_', n=1).str[-1].astype(int)

    return sorted_df.sort_values(by='Frame')

In [None]:
def predict(nomb_video, emb, classifier, test_df, writer=None):
    """Embed the frames of one video, classify them and record the results.

    Args:
        nomb_video: video identifier; the text before its first underscore is
            used as the true label of every frame.
        emb: Keras model used to turn images into embedding vectors.
        classifier: fitted estimator exposing ``predict(X)``.
        test_df: DataFrame with "path", "label" and "Frame" columns (the
            "Frame" column is added by ``sort_testing_dataframe``).
        writer: optional object with a ``writerow`` method (e.g. a
            ``csv.writer``). When omitted, a module-level ``writer`` is used if
            one exists - the original code relied on such a global, which is
            never defined in this notebook and therefore raised NameError.

    Returns:
        A list of ``[nomb_video, frame_number, predicted, real]`` rows, one per
        frame, in the row order of ``test_df``.
    """
    if writer is None:
        # Backward compatibility with callers that define a global `writer`.
        writer = globals().get("writer")

    n_frames = len(test_df)
    test_datagen = ImageDataGenerator(rescale=1./255.)
    # Frame numbers, in the same (unshuffled) order as the generator below.
    frame_numbers = test_df.Frame
    test_generator = test_datagen.flow_from_dataframe(dataframe=test_df,
                                                      x_col="path",
                                                      y_col="label",
                                                      batch_size=1,
                                                      seed=42,
                                                      shuffle=False,
                                                      class_mode="categorical",
                                                      target_size=(256, 256))

    print("cycleGan predicting over test set...")
    # batch_size is 1, so one step per frame covers the whole video.
    X_test = emb.predict(test_generator, steps = n_frames, verbose=1)
    can = X_test.shape[0]
    label = nomb_video.split('_')[0]
    Y_test = [label]*can
    print("***TEST***amount of ade:{}, hyp:{}, ser:{}".format(Y_test.count("adenoma"),
                                                              Y_test.count("hiperplastic"),
                                                              Y_test.count("serrated")))

    print("classifier over test set...")
    preds = classifier.predict(X_test)
    rows = [[nomb_video, frame_numbers.iloc[k], preds[k], Y_test[k]]
            for k in range(len(preds))]
    if writer is not None:
        for row in rows:
            writer.writerow(row)
    return rows

# <font color='red'>**CycleGan predicting and embedding classification**</font>

In [None]:
polyps = {0: "adenoma",
         1: "hiperplastic",
         2: "serrated"}

### Main
Downsampling approach
1. Original csv frames reading into a dataframe
2. Dataframe sorting
3. For each video (LOPO), keep only the frames around the middle of the video: about `minimo/2` frames to the left of the central frame and `minimo/2` to its right, where `minimo` is the number of frames of the shortest video
4. For the previous frames get embedding space from cycleGan net

In [None]:
# Scan <gen_path>/<class>/<video>/ and find the video with the fewest frames.
gen_path = '../../../../pregrado/data/RGB/NBI/'

# None means "nothing seen yet". The previous 9999 sentinel left `tipo` and
# `num_vid` undefined (NameError at the print) when no video was shorter.
minimo, tipo, num_vid = None, None, None

clases = os.listdir(gen_path)
for clase in clases:
    videos_path = os.path.join(gen_path, clase)
    if not os.path.isdir(videos_path):
        # Stray files (e.g. .DS_Store) would make os.listdir fail.
        continue
    for video in os.listdir(videos_path):
        imgs_path = os.path.join(videos_path, video)
        if not os.path.isdir(imgs_path):
            continue
        can = len(os.listdir(imgs_path))
        if minimo is None or can < minimo:
            minimo, tipo, num_vid = can, clase, video

if minimo is None:
    raise RuntimeError("no video folders found under {!r}".format(gen_path))

print("la clase con menor cantidad de frames es: {}, en el video: {}, con {} frames".format(tipo, num_vid, minimo))

In [None]:
data_path = '../../../../pregrado/data/realWL.csv'
train_df = pd.read_csv(data_path, header=None)
train_df.columns = ["path", "label"]
train_df.head()

**Downsampling data**

In [None]:
"""adenoma:0
hyperplastic:1
serrated:2
"""
# Class folder -> (number of videos, ",label\n" suffix written after each path).
class_specs = {
    "adenoma_NBI/": (40, ',adenoma\n'),
    "hiperplastic_NBI/": (21, ',hiperplastic\n'),
    "serrated_NBI/": (15, ',serrated\n'),
}
clases = list(class_specs)

# NOTE(review): `train_df` is loaded from realWL.csv in the cell above while
# the folders filtered here are *_NBI/; also, the verification cell below reads
# the result from ../data/rgb/csvFiles/ whereas this cell writes to the current
# directory. Confirm both are intended.

# The context manager closes the file even if a video fails half-way.
with open('downsampledRGBorigData.csv', 'w') as csvfile:
    for clase in clases:
        print("==== working on ", clase, "====")
        cant, target = class_specs[clase]

        for i in range(1, cant+1):
            print("================================= video #", i,"===========================================")
            # The trailing '/' keeps video_1 from also matching video_10, video_11, ...
            video_key = clase+'video_'+str(i)+'/'
            current_test_df = train_df[train_df['path'].str.contains(video_key, regex=False)]
            current_test_df = current_test_df.reset_index(drop=True)
            if current_test_df.empty:
                print("WARNING: no frames found for", video_key, "- skipped")
                continue

            # Sort the frames of the current video by frame number.
            current_test_df = sort_testing_dataframe(current_test_df).reset_index(drop=True)
            can_frames = len(current_test_df)

            # Keep exactly `minimo` frames centred on the middle of the video.
            # The previous inclusive .loc[idx-samples:idx+samples] slice gave
            # minimo+1 frames for longer videos when `minimo` is even, so the
            # videos were not downsampled to the same size.
            start = max((can_frames - minimo) // 2, 0)
            tmp_df = current_test_df.iloc[start:start + minimo]

            for path in tmp_df['path']:
                csvfile.write(path+target)

print("finished!")

**Downsampled data verification**

In [None]:
new_csv = '../data/rgb/csvFiles/downsampledRGBorigData.csv'
new_train_df = pd.read_csv(new_csv, header=None)
new_train_df.columns = ['path', 'label']
new_train_df.head()

In [None]:
new_train_df.groupby(['label']).count()

## Leave one out 
### Getting embedding space over downsampled data

**saving embedding**

In [None]:
def saving_emb(clase, embeddings, labels, videos, dim_folder,
               base_dir="../data/embeddings/GenA/correct_inputs/"):
    """Save the embeddings of one class, with their labels and video ids.

    Three ``.npy`` files are written to ``<base_dir>/<dim_folder>/``:
    ``<clase>Embeddings.npy``, ``<clase>Labels.npy`` and ``<clase>Videos.npy``.

    Args:
        clase: file-name prefix, e.g. "Adenoma".
        embeddings: sequence of embedding vectors (one per frame).
        labels: sequence of class labels, aligned with ``embeddings``.
        videos: sequence of video ids, aligned with ``embeddings``.
        dim_folder: sub-folder name, e.g. "dim512".
        base_dir: root output folder; defaults to the location that was
            previously hard-coded.

    Raises:
        ValueError: if the three sequences do not have the same length.
    """
    embeddings_arr = np.array(embeddings)
    labels_arr = np.array(labels)
    videos_arr = np.array(videos)

    # Misaligned arrays would be saved silently and only fail much later.
    if not (len(embeddings_arr) == len(labels_arr) == len(videos_arr)):
        raise ValueError(
            "embeddings, labels and videos must have the same length, got "
            "{}, {} and {}".format(len(embeddings_arr), len(labels_arr), len(videos_arr)))

    print("emb dimension: ", embeddings_arr.shape)
    print("label dimension: ", labels_arr.shape)
    print("videos dimension: ", videos_arr.shape)

    # np.save does not create missing folders, so make sure the target exists.
    file_name = os.path.join(base_dir, dim_folder, clase)
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    np.save(file_name+"Embeddings", embeddings_arr)
    np.save(file_name+"Labels", labels_arr)
    np.save(file_name+"Videos", videos_arr)

**getting embedding space for all frames**

In [None]:
dim_folder = 'dim512'

In [None]:
"""adenoma:0
hyperplastic:1
serrated:2
"""
# Embed every frame of each adenoma video with emb2, collecting the embedding,
# its class label and its video id in three aligned lists.
embeddings, labels, videos = [], [], []
clase = "adenoma_WL/"
cant = 40
label = 'adenoma'
print("==== testing with ", clase, "====")
for i in range(1, cant+1):
    print("=================================kfold #", i,"===========================================")
    # The trailing '/' keeps video_1 from also matching video_10, video_11, ...
    video_key = clase+'video_'+str(i)+'/'
    current_test_df = train_df[train_df['path'].str.contains(video_key, regex=False)]
    current_test_df = current_test_df.reset_index(drop=True)
    if current_test_df.empty:
        print("WARNING: no frames found for", video_key, "- skipped")
        continue
    # Sort the frames by frame number.
    # NOTE: create_generator shuffles by default, so the embeddings below are
    # not in frame order; label and video id are constant per video, so the
    # three lists stay aligned regardless.
    current_test_df = sort_testing_dataframe(current_test_df)
    # (The complementary "training" frame was rebuilt here on every iteration
    # but never used; it has been removed.)
    x_test_gen = create_generator(current_test_df)

    print("cycleGan predicting over test set...")
    X_train = emb2.predict(x_test_gen, verbose=1)
    can = X_train.shape[0]
    Y_train = [label]*can
    video = "video_" + str(i)
    W_train = [video]*can
    print("***TRAIN***: amount of ade:{}, hyp:{}, ser:{}".format(Y_train.count("adenoma"),
                                                                 Y_train.count("hiperplastic"),
                                                                 Y_train.count("serrated")))
    embeddings.extend(X_train)
    labels.extend(Y_train)
    videos.extend(W_train)

In [None]:
#save data
saving_emb("Adenoma", embeddings, labels, videos, dim_folder)

In [None]:
# Embed every frame of each hiperplastic video with emb2, collecting the
# embedding, its class label and its video id in three aligned lists.
embeddings, labels, videos = [], [], []
clase = "hiperplastic_WL/"
cant = 21
label = 'hiperplastic'
# Video ids continue after the 40 adenoma videos, hence the +40 offset.
video_offset = 40
print("==== testing with ", clase, "====")
for i in range(1, cant+1):
    print("=================================kfold #", i+video_offset,"===========================================")
    # The trailing '/' keeps video_1 from also matching video_10, video_11, ...
    video_key = clase+'video_'+str(i)+'/'
    current_test_df = train_df[train_df['path'].str.contains(video_key, regex=False)]
    current_test_df = current_test_df.reset_index(drop=True)
    if current_test_df.empty:
        print("WARNING: no frames found for", video_key, "- skipped")
        continue
    # Sort the frames by frame number.
    # NOTE: create_generator shuffles by default, so the embeddings below are
    # not in frame order; label and video id are constant per video, so the
    # three lists stay aligned regardless.
    current_test_df = sort_testing_dataframe(current_test_df)
    # (The complementary "training" frame was rebuilt here on every iteration
    # but never used; it has been removed.)
    x_train_gen = create_generator(current_test_df)

    print("cycleGan predicting over train set...")
    X_train = emb2.predict(x_train_gen, verbose=1)
    can = X_train.shape[0]
    Y_train = [label]*can
    video = "video_" + str(i+video_offset)
    W_train = [video]*can

    print("***TRAIN***: amount of ade:{}, hyp:{}, ser:{}".format(Y_train.count("adenoma"),
                                                                 Y_train.count("hiperplastic"),
                                                                 Y_train.count("serrated")))
    embeddings.extend(X_train)
    labels.extend(Y_train)
    videos.extend(W_train)

In [None]:
#save data
saving_emb("Hiperplastic", embeddings, labels, videos, dim_folder)

In [None]:
# Embed every frame of each serrated video with emb2, collecting the
# embedding, its class label and its video id in three aligned lists.
embeddings, labels, videos = [], [], []
clase = "serrated_WL/"
cant = 15
label = 'serrated'
# Video ids continue after the 40 adenoma + 21 hiperplastic videos (+61).
video_offset = 61
print("==== testing with ", clase, "====")
for i in range(1, cant+1):
    print("=================================kfold #", i+video_offset,"===========================================")
    # The trailing '/' keeps video_1 from also matching video_10, video_11, ...
    video_key = clase+'video_'+str(i)+'/'
    current_test_df = train_df[train_df['path'].str.contains(video_key, regex=False)]
    current_test_df = current_test_df.reset_index(drop=True)
    if current_test_df.empty:
        print("WARNING: no frames found for", video_key, "- skipped")
        continue
    # Sort the frames by frame number.
    # NOTE: create_generator shuffles by default, so the embeddings below are
    # not in frame order; label and video id are constant per video, so the
    # three lists stay aligned regardless.
    current_test_df = sort_testing_dataframe(current_test_df)
    # (The complementary "training" frame was rebuilt here on every iteration
    # but never used; it has been removed.)
    x_train_gen = create_generator(current_test_df)

    print("cycleGan predicting over train set...")
    X_train = emb2.predict(x_train_gen, verbose=1)
    can = X_train.shape[0]
    Y_train = [label]*can
    video = "video_" + str(i+video_offset)
    W_train = [video]*can

    print("***TRAIN***: amount of ade:{}, hyp:{}, ser:{}".format(Y_train.count("adenoma"),
                                                                 Y_train.count("hiperplastic"),
                                                                 Y_train.count("serrated")))
    embeddings.extend(X_train)
    labels.extend(Y_train)
    videos.extend(W_train)

In [None]:
#save data
saving_emb("Serrated", embeddings, labels, videos, dim_folder)

## Classical train/test split (80-20)
### Train test split

In [None]:
#for adenoma general example: hiperplastic_NBI/video_3
# Video folder names per class, numbered from 1:
# 40 adenoma, 21 hiperplastic and 15 serrated videos.
ade = ["adenoma_NBI/video_" + str(n) for n in range(1, 41)]
hyp = ["hiperplastic_NBI/video_" + str(n) for n in range(1, 22)]
ser = ["serrated_NBI/video_" + str(n) for n in range(1, 16)]

# One print call; each item goes on its own line, as before.
print("ade videos:\n",
      ade,
      "=================",
      "hyp videos:\n",
      hyp,
      "=================",
      "ser videos:\n",
      ser,
      sep="\n")

In [None]:
# Split the video names (not the individual frames) 80/20 within each class,
# using the same fixed random_state for all three classes.
split_options = {"test_size": 0.2, "random_state": 14}
ade_train, ade_test = train_test_split(ade, **split_options)
hyp_train, hyp_test = train_test_split(hyp, **split_options)
ser_train, ser_test = train_test_split(ser, **split_options)

# Report the resulting splits, one item per line.
split_report = [
    "ade videos train:\n", ade_train,
    "ade videos test:\n", ade_test,
    "=================",
    "hyp videos train:\n", hyp_train,
    "hyp videos test:\n", hyp_test,
    "=================",
    "ser videos train: \n", ser_train,
    "ser videos test: \n", ser_test,
]
for entry in split_report:
    print(entry)

In [None]:
# Merge the per-class splits into a single train list and a single test list.
# New lists are built with `+` instead of extending ade_train / ade_test in
# place: the in-place version duplicated the hiperplastic and serrated entries
# every time this cell was re-run.
# A dedicated, seeded generator makes the shuffled order reproducible.
shuffler = random.Random(14)

train_set = ade_train + hyp_train + ser_train
shuffler.shuffle(train_set)

test_set = ade_test + hyp_test + ser_test
shuffler.shuffle(test_set)

print("====== train amount: ", len(train_set)," ======")
print(train_set)
print("====== test amount: ", len(test_set)," ======")
print(test_set)

### Main

In [None]:
data_path = '../../../../pregrado/data/realNBI.csv'
data_df = pd.read_csv(data_path, header=None)
data_df.columns = ["path", "label"]
data_df.describe()

In [None]:
# Embed every frame of the videos in `train_set` (the training split).
# NOTE(review): `test_set` is not embedded in the visible cells - confirm
# whether an equivalent loop over it is missing.
embeddings, labels = [], []
for data in train_set:
    # "adenoma_NBI/video_3" -> "adenoma"
    clase = data.split('/')[0].split('_')[0]
    # Match the video folder including its trailing '/'. Without it,
    # ".../video_1" also matched video_10 ... video_19, pulling frames of other
    # videos (possibly from the test split) into this video's embeddings.
    current_df = data_df[data_df['path'].str.contains(data + '/', regex=False)]
    x_train_gen = create_generator(current_df)
    print("cycleGan predicting over train set...")
    X_train = emb2.predict(x_train_gen, verbose=1)
    can = X_train.shape[0]
    Y_train = [clase]*can
    embeddings.extend(X_train)
    labels.extend(Y_train)
    print("amount of ade:{}, hyp:{}, ser:{}".format(labels.count("adenoma"), labels.count("hiperplastic"),
                                                    labels.count("serrated")))
print("finished!")

**Saving and loading the embedding data**

In [None]:
embeddings_arr = np.array(embeddings)
labels_arr = np.array(labels)

In [None]:
print("emb dimension: ", embeddings_arr.shape)
print("label dimension: ", labels_arr.shape)

In [None]:
np.save("trainEmbeddings", embeddings_arr)
np.save("trainEmbeddingsLabels", labels_arr)

In [None]:
test_emb = np.load('trainEmbeddings.npy')
test_emb_lab = np.load('trainEmbeddingsLabels.npy')
print(test_emb.shape)
print(test_emb_lab.shape)

In [None]:
x = test_emb.tolist()
y = test_emb_lab.tolist()
print(y.count("adenoma"), y.count("hiperplastic"), y.count("serrated")) 

<font color='red'>**All embedding data**</font>

In [None]:
# Embed every .png frame found (recursively) under `directory`.
directory = "../../../../pregrado/data/RGB/WL/"
pathname = directory + "/**/*.png"
# glob returns files in arbitrary order; sorting makes the row order of `df`,
# `pred` and `label` reproducible between runs, which matters because later
# cells align `df` with arrays reloaded from disk.
files = sorted(glob.glob(pathname, recursive=True))

img_height, img_width = 256, 256
video = []
imagen = []
label = []
pred = []

for path in tqdm(files):
    # File names are split on '_': field 0 is the class, field 3 the video
    # number and the last field the frame number plus extension.
    # NOTE(review): presumably names look like <class>_<x>_<y>_<video>_<frame>.png
    # - confirm against the data set.
    info = os.path.basename(path)
    name_parts = info.split('_')
    clase = name_parts[0]
    vid = name_parts[3]
    image = name_parts[-1].split('.')[0]

    img = keras.preprocessing.image.load_img(path, target_size=(img_height, img_width))
    img_array = keras.preprocessing.image.img_to_array(img)
    img_array = img_array* 1./255.
    img_array = tf.expand_dims(img_array, 0) # Create a batch
    logits = emb2.predict(img_array, steps = 1)
    pred.extend(logits)
    video.append(vid)
    imagen.append(image)

    # Any class other than adenoma / hiperplastic is recorded as 'serrated'
    # (catch-all kept from the original).
    # NOTE(review): an unexpected prefix is therefore silently labelled
    # serrated - confirm this is intended.
    label.append(clase if clase in ('adenoma', 'hiperplastic') else 'serrated')

# Build the summary dataframe (one row per frame).
df = pd.DataFrame(list(zip(video, imagen, label, pred)), columns=['#Video', '#imagen', 'clase', 'predicción'])

#pred = np.squeeze(pred, axis=1)
pred = np.array(pred)
print("dimension of predic: ", pred.shape)

#label = np.squeeze(label, axis=1)
label = np.array(label)
print("dimension of label: ", label.shape)

**Saving and loading the embedding data**

In [None]:
embeddings_arr = np.array(pred)
labels_arr = np.array(label)

In [None]:
print("emb dimension: ", embeddings_arr.shape)
print("label dimension: ", labels_arr.shape)

In [None]:
np.save("embeddings60", embeddings_arr)
np.save("embeddingsLabels60", labels_arr)

In [None]:
test_emb = np.load('embeddings60.npy')
test_emb_lab = np.load('embeddingsLabels60.npy')
print(test_emb.shape)
print(test_emb_lab.shape)

In [None]:
x = test_emb.tolist()
y = test_emb_lab.tolist()
print(y.count("adenoma"), y.count("hiperplastic"), y.count("serrated")) 

# <font color='red'>**Dimension reduction**</font>
## T-sne

In [None]:
import numpy as np
from sklearn.datasets import load_digits
from scipy.spatial.distance import pdist
#from sklearn.manifold.t_sne import _joint_probabilities
from scipy import linalg
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import squareform
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
palette = sns.color_palette("bright", 10)
import random
import os
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow import keras
import plotly.express as px
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
#from mlxtend.plotting import plot_decision_regions
import matplotlib

In [None]:
# 2-D t-SNE projection of the embeddings. t-SNE is stochastic, so the
# random_state is fixed to make the projection (and the plots built from it)
# reproducible.
tsne = TSNE(n_components = 2, init = 'pca', random_state=42)
P1_tsne = tsne.fit_transform(test_emb)
P1_tsne.shape

In [None]:
l1 = P1_tsne[:,0]
l2 = P1_tsne[:,1]

In [None]:
df

In [None]:
#df = df.drop(columns='predicción')
df['x'] = l1
df['y'] = l2

In [None]:
conditions = [
    (df['clase'] == 'adenoma'),
    (df['clase'] == 'hiperplastic'),
    (df['clase'] == 'serrated')
    ]

values = [1, 2, 3]

df['labels'] = np.select(conditions, values)

In [None]:
df.head()

### **Overview**

In [None]:
fig = px.scatter(df, x="x", y="y", color = 'clase', hover_name="clase", hover_data=["#Video"], opacity=0.2)
fig.show()

## PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# 3-component PCA of the embeddings.
# The original referenced `test_features`, which is not defined anywhere in the
# visible notebook (NameError on a fresh kernel).
# NOTE(review): `test_emb`, the embedding matrix used for t-SNE above, is
# assumed to be the intended input - confirm.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(test_emb)