In [None]:
import os

# Order CUDA devices by PCI bus ID and expose only device index 5 to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "5",
})

# <font color='red'>**CycleGan embedding classification**</font>

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import csv
import imageio
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support as score
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import GlobalAveragePooling2D, Input, Conv2D, MaxPooling2D, Dropout, Flatten
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from joblib import dump, load

## Data analysis
### Gen A embeddings
**Note:** Gen A takes white-light images and translates them into an NBI representation.

#### Downsampling strategy

1. Load the original data embeddings (all are sorted).
2. Get the lowest number of frames that belong to a single video.
3. Downsample the original embedding representation to that number of frames per video.
4. Run a stratified 4-fold cross-validation.

##### 1. Data loading

In [2]:
# `downsampling` is for when the data should be balanced by the minimum number of
# samples and videos available for training (e.g. the serrated class has 12 videos
# to test with, each video with 164 frames, so downsampling == True means that
# 12 videos with 164 frames each are taken for every class).
downsampling = True

generator = 'GenB'
path = "../data/embeddings/" + generator + "/correct_inputs/"

# Each class is stored as three .npy files: embeddings, labels and video ids.
ade_dat, ade_lab, ade_vid = (
    np.load(path + fname)
    for fname in ("AdenomaEmbeddings.npy", "AdenomaLabels.npy", "AdenomaVideos.npy")
)
hyp_dat, hyp_lab, hyp_vid = (
    np.load(path + fname)
    for fname in ("HiperplasticEmbeddings.npy", "HiperplasticLabels.npy", "HiperplasticVideos.npy")
)
ser_dat, ser_lab, ser_vid = (
    np.load(path + fname)
    for fname in ("SerratedEmbeddings.npy", "SerratedLabels.npy", "SerratedVideos.npy")
)

# Report the shape of every loaded array.
print("==== data info ====")
print("ade dim: {}, amount of labels: {}, videos: {}".format(
    *(arr.shape for arr in (ade_dat, ade_lab, ade_vid))))
print("hyp dim: {}, amount of labels: {}, videos: {}".format(
    *(arr.shape for arr in (hyp_dat, hyp_lab, hyp_vid))))
print("ser dim: {}, amount of labels: {}, videos: {}".format(
    *(arr.shape for arr in (ser_dat, ser_lab, ser_vid))))

==== data info ====
ade dim: (23319, 4096), amount of labels: (23319,), videos: (23319,)
hyp dim: (8218, 4096), amount of labels: (8218,), videos: (8218,)
ser dim: (7227, 4096), amount of labels: (7227,), videos: (7227,)


##### **1.1 Data filtering for cycleGan train/test videos**

In [3]:
# Stack the per-class arrays in the fixed order adenoma, hyperplastic, serrated.
features, labels, videos = (
    np.concatenate(group, axis=0)
    for group in (
        (ade_dat, hyp_dat, ser_dat),
        (ade_lab, hyp_lab, ser_lab),
        (ade_vid, hyp_vid, ser_vid),
    )
)

In [4]:
# Count the frames of every video id and print how many distinct videos exist.
unique, counts = np.unique(videos, return_counts=True)
freq = np.transpose(np.asarray((unique, counts)))
print(len(freq))

76


In [5]:
# One row per frame: the embedding vector, its class label and its video id.
df = pd.DataFrame(
    {'features': list(features), 'label': labels, 'video': videos},
    columns=['features', 'label', 'video'],
)
# "<label>/<video>" key used to match frames against train/test video lists.
df = df.assign(info=df['label'] + '/' + df['video'])
df.tail()

Unnamed: 0,features,label,video,info
38759,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",serrated,video_76,serrated/video_76
38760,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",serrated,video_76,serrated/video_76
38761,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",serrated,video_76,serrated/video_76
38762,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",serrated,video_76,serrated/video_76
38763,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",serrated,video_76,serrated/video_76


In [6]:
# Build the set of "<class>/video_<n>" keys for the videos found in the testA folder.
# The class is the first "_"-separated token of the file name and the per-class video
# number is the third token from the end; hyperplastic and serrated numbers are shifted
# by 40 and 61 respectively before being used in the key.
root = '../../../data/polyps/testA/'
files = os.listdir(root)
test = []
for file in files:
    tokens = file.split('.')[0].split('_')
    video_number, clase = tokens[-3], tokens[0]
    if clase != 'adenoma':
        shift = 40 if clase == 'hiperplastic' else 61
        video_number = str(int(video_number) + shift)
    test.append(clase + '/video_' + video_number)

myset = set(test)
print(myset)
print(len(myset))

{'hiperplastic/video_53', 'serrated/video_70', 'hiperplastic/video_51', 'adenoma/video_21', 'adenoma/video_33', 'adenoma/video_9', 'adenoma/video_10', 'adenoma/video_36', 'adenoma/video_22', 'hiperplastic/video_43', 'adenoma/video_19', 'adenoma/video_14', 'hiperplastic/video_41', 'serrated/video_66', 'serrated/video_71'}
15


In [7]:
# Collect, video by video, the frames whose key belongs to the test set.
test_videos = list(myset)
data_frames = [df[df['info'] == video] for video in test_videos]

test_df = pd.concat(data_frames)

In [8]:
# Training frames are all frames whose "<label>/<video>" key is not in the test split.
train_df = df.loc[~df['info'].isin(test_df['info'])].reset_index(drop=True)


def _n_videos(frame, clase):
    """Return the number of distinct video ids among the rows of `frame` labelled `clase`."""
    return len(frame.loc[frame['label'] == clase, 'video'].unique())


print("total data:\n")
print(df.groupby(['label']).count())

print("\n ===== train data: ===== \n")
print(train_df.groupby(['label']).count())
print("amount of videos by polyp class: \n")
print("adenoma: {}".format(_n_videos(train_df, 'adenoma')))
print("hiperplastic: {}".format(_n_videos(train_df, 'hiperplastic')))
print("serrated: {}".format(_n_videos(train_df, 'serrated')))

print("\n ===== test dataframe: ===== \n")
print(test_df.groupby(['label']).count())
print("amount of videos by polyp class: \n")
print("adenoma: {}".format(_n_videos(test_df, 'adenoma')))
print("hiperplastic: {}".format(_n_videos(test_df, 'hiperplastic')))
print("serrated: {}".format(_n_videos(test_df, 'serrated')))

total data:

              features  video   info
label                               
adenoma          23319  23319  23319
hiperplastic      8218   8218   8218
serrated          7227   7227   7227

 ===== train data: ===== 

              features  video   info
label                               
adenoma          19158  19158  19158
hiperplastic      6841   6841   6841
serrated          6120   6120   6120
amount of videos by polyp class: 

adenoma: 32
hiperplastic: 17
serrated: 12

 ===== test dataframe: ===== 

              features  video  info
label                              
adenoma           4161   4161  4161
hiperplastic      1377   1377  1377
serrated          1107   1107  1107
amount of videos by polyp class: 

adenoma: 8
hiperplastic: 4
serrated: 3


In [None]:
print("amount of frames by each video training sample:")
# Frames per training video, smallest first, with `video` back as a regular column.
a = (
    train_df.groupby(['video'])
    .count()[['features']]
    .sort_values(by='features', axis=0, ascending=True)
    .reset_index()
)
a.head(n=11)

In [None]:
# The ten training videos with the fewest frames.
to_drop = a.iloc[:10]
to_drop

**Minimum number of frames across all videos**

In [None]:
# Scan <path>/<class>/<video>/ and remember the smallest per-video image count
# together with the class it belongs to (first occurrence wins on ties).
# Note: this cell rebinds the notebook-level names `path`, `clases` and `videos`.
minimo1 = 9999
path = '../../../../pregrado/data/RGB/NBI/'
clases = os.listdir(path)
for clase in clases:
    videos_path = path + clase
    videos = os.listdir(videos_path)
    frame_counts = [len(os.listdir(videos_path + '/' + video)) for video in videos]
    for can in frame_counts:
        if can < minimo1:
            minimo1, tipo = can, clase

print("la clase {} tiene menos imagenes {}".format(tipo, minimo1))

In [None]:
# Frame count of the row at position 11 of the ascending frames-per-video table.
# NOTE(review): `to_drop` holds rows 0-9, so the first surviving row would be
# position 10 - confirm whether iloc[11] is intended.
minimo2 = a.iloc[11]['features']
# Fixed: the original printed the undefined name `minimo`, which raised NameError.
print("the minimum of frames to be taking into account: ", minimo2)

# Global video numbers ("video_<n>" -> n) of the candidate videos to drop.
to_drop2 = list(to_drop['video'].values)
nums = [int(item.split('_')[-1]) for item in to_drop2]
nums_arr = np.array(nums)
print("======================")
# Number ranges per class: adenoma < 41, hyperplastic 41-61, serrated 62-76.
print("adenoma videos to drop: {}".format(np.count_nonzero(nums_arr<41)))
print("hyperplastic videos to drop: {}".format(np.count_nonzero( (nums_arr>40)&(nums_arr<62)) ))
print("serrated videos to drop: {}".format(np.count_nonzero( (nums_arr>61)&(nums_arr<77)) ))

#### Dropping videos

In [None]:
#train_df2 = train_df[~(train_df['video'].isin(to_drop['video']))].reset_index(drop=True)
#train_df = train_df2
#print("train lenght before number of videos balancing: {}".format(len(train_df)))

**Getting the lowest number of videos per class**
* Find both the class with the fewest videos in the training set and how many videos it has.

In [None]:
if downsampling == True:
    import random

    random.seed(42)

    # Group the training video keys ("<class>/video_<n>") by class.
    train_videos = np.unique(train_df['info'].values)
    adenomas, hiperplastic, serrated = [], [], []
    for val in train_videos:
        clase = val.split('/')[0]
        if clase == 'adenoma':
            adenomas.append(val)
        elif clase == 'hiperplastic':
            hiperplastic.append(val)
        else:
            serrated.append(val)
    ade, hip, ser = len(adenomas), len(hiperplastic), len(serrated)

    # Pick the class with the fewest videos.
    # Fixed: the original compared the counts against a sentinel of -999, so neither
    # `<` test could ever be true and the serrated count was always selected,
    # whatever the real counts were.
    lower, clase = min(((ade, "ade"), (hip, "hip"), (ser, "ser")), key=lambda pair: pair[0])

    print("la clase con menos videos es: {} con {} videos".format(clase, lower))

    # Draw `lower` videos from every class (seeded, so the draw is repeatable).
    ade_samples = np.array(random.sample(adenomas, lower))
    hip_samples = np.array(random.sample(hiperplastic, lower))
    ser_samples = np.array(random.sample(serrated, lower))

    train_videos_tmp = np.concatenate((ade_samples, hip_samples, ser_samples), axis = 0)

    # Keep only the frames of the sampled videos, in sampled-video order.
    data_frames = [train_df[train_df['info']==val] for val in train_videos_tmp]
    train_df = pd.concat(data_frames).reset_index(drop=True)
    print("train lenght after number of videos balancing: {}".format(len(train_df)))

**3. Downsampling original embedding representation**

In [None]:
def _center_frames_per_video(embeddings, video_ids, label, n_frames):
    """Keep a centred window of up to ``n_frames`` embeddings from every video.

    Parameters
    ----------
    embeddings : array whose first axis indexes frames.
    video_ids  : 1-D array with one video id per frame, aligned with ``embeddings``.
    label      : class label written for every kept frame.
    n_frames   : requested window size; ``int(n_frames / 2)`` frames are taken on
                 each side of the middle frame of the video.

    Returns
    -------
    (data, labels, videos) as numpy arrays with the same length on the first axis.

    Changes with respect to the original inline code:
    * frames are selected by their stored video id instead of by running offsets
      derived from ``np.unique`` counts. ``np.unique`` returns ids in sorted
      (lexicographic) order, which the original treated as "video i+1" and as the
      storage order at the same time;
    * the kept video id is the one stored in ``video_ids``, not a regenerated name;
    * labels/ids are emitted once per frame actually kept, so the three outputs
      stay aligned when ``n_frames`` is odd or a video is shorter than the window.
    """
    half = int(n_frames / 2)
    names, first_seen = np.unique(video_ids, return_index=True)
    kept_dat, kept_lab, kept_vid = [], [], []
    # Visit the videos in order of first appearance in the stored arrays.
    for name in names[np.argsort(first_seen)]:
        frames = embeddings[video_ids == name]
        middle = int(len(frames) / 2)
        # Clamp the start so a short video cannot produce a negative (wrapping) index.
        window = frames[max(middle - half, 0):middle + half]
        kept_dat.extend(window)
        kept_lab.extend([label] * len(window))
        kept_vid.extend([name] * len(window))
    return np.array(kept_dat), np.array(kept_lab), np.array(kept_vid)


# Window size comes from `minimo1`, the smallest per-video image count found on disk.
new_ade_dat, new_ade_lab, new_ade_vid = _center_frames_per_video(ade_dat, ade_vid, 'adenoma', minimo1)
new_hyp_dat, new_hyp_lab, new_hyp_vid = _center_frames_per_video(hyp_dat, hyp_vid, 'hiperplastic', minimo1)
new_ser_dat, new_ser_lab, new_ser_vid = _center_frames_per_video(ser_dat, ser_vid, 'serrated', minimo1)

print("==== data info ====")
print("ade dim: {}, amount of labels: {}, videos: {}".format(new_ade_dat.shape, new_ade_lab.shape, new_ade_vid.shape))
print("hyp dim: {}, amount of labels: {}, videos: {}".format(new_hyp_dat.shape, new_hyp_lab.shape, new_hyp_vid.shape))
print("ser dim: {}, amount of labels: {}, videos: {}".format(new_ser_dat.shape, new_ser_lab.shape, new_ser_vid.shape))

**Concatenating data embeddings**

In [None]:
# Stack the downsampled per-class arrays in the order adenoma, hyperplastic, serrated.
features, labels, videos = (
    np.concatenate(group, axis=0)
    for group in (
        (new_ade_dat, new_hyp_dat, new_ser_dat),
        (new_ade_lab, new_hyp_lab, new_ser_lab),
        (new_ade_vid, new_hyp_vid, new_ser_vid),
    )
)

In [None]:
# Count the frames of every video id after downsampling and print the number of videos.
unique, counts = np.unique(videos, return_counts=True)
freq = np.transpose(np.asarray((unique, counts)))
print(len(freq))

In [None]:
# Shape of the concatenated, downsampled embedding matrix.
features.shape

In [None]:
# Same layout as `df`, built from the downsampled arrays.
df2 = pd.DataFrame(
    {'features': list(features), 'label': labels, 'video': videos},
    columns=['features', 'label', 'video'],
)
# "<label>/<video>" key used to match frames against the training video list.
df2 = df2.assign(info=df2['label'] + '/' + df2['video'])
df2.tail()

In [None]:
# Keep the downsampled rows whose "<label>/<video>" key is present in the current
# training frame; `train_df` is rebound to that downsampled subset.
is_train_row = df2['info'].isin(train_df['info'])
train_df = df2[is_train_row].reset_index(drop=True)
train_df

**4. Stratified 4 fold cross validation**
* 4.1. Dataframe creation from embedding information

**4.2 CycleGan train filtering data**

* 4.3 Stratified K-fold

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# 4 folds for the class-balanced experiment, 3 otherwise; shuffling is seeded.
n_folds = 4 if downsampling==True else 3
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

In [None]:
def _numbered(prefix, first, last):
    """Return ``prefix + str(n)`` for every n from ``first`` to ``last`` inclusive."""
    return [prefix + str(n) for n in range(first, last + 1)]


# Global video numbering: adenoma 1-40, hyperplastic 41-61, serrated 62-76.
ade = _numbered("adenoma/video_", 1, 40)
hyp = _numbered("hiperplastic/video_", 41, 61)
ser = _numbered("serrated/video_", 62, 76)

In [None]:
# Every "<class>/video_<n>" key in the dataset, as a single array.
general_videos = np.concatenate([ade, hyp, ser])
general_videos

In [None]:
# Videos that are not part of the test split.
train_videos2 = list(set(general_videos) - set(test_videos))
print(len(train_videos2))
# Of those, the videos that class balancing did not select.
# Fixed: `train_videos_tmp` is only created when downsampling is enabled, so the
# original raised NameError with downsampling == False.
selected_videos = set(train_videos_tmp) if downsampling == True else set()
train_videos = list(set(train_videos2) - selected_videos)
len(train_videos)

In [None]:
# Number of videos kept by class balancing. `train_videos_tmp` only exists when
# downsampling is enabled, so report 0 otherwise instead of raising NameError.
len(train_videos_tmp) if downsampling == True else 0

In [None]:
# Candidate training videos: everything not in the test split.
# Fixed: sorted so the order no longer depends on set iteration order; with
# downsampling == False this list is what the folds are drawn from, so the split
# is now reproducible across kernel restarts.
train_videos = sorted(set(general_videos) - set(test_videos))
if downsampling==True:
    # Videos that class balancing left out are treated as additional test videos.
    selected_videos = set(train_videos_tmp)
    test_videos2 = [video for video in train_videos if video not in selected_videos]
    clases = [video.split('/')[0] for video in train_videos_tmp]
else:
    clases = [video.split('/')[0] for video in train_videos]
# Number of training videos per class.
values, counts = np.unique(clases, return_counts=True)


if downsampling==True:
    print("train videos: ")
    print(train_videos_tmp)

    print("values: {}".format(values))
    print("counts: {}".format(counts))

    print("letting the lowest amount of frames and videos we have:")
    print("===== is any test videos in train video?=====")
    test_videos = test_videos + test_videos2
    # Sanity check: expected to print False (no training video is also a test video).
    result = any(elem in test_videos for elem in train_videos_tmp)
    print(result)
else:
    print("train videos: ")
    print(train_videos)
    print("values: {}".format(values))
    print("counts: {}".format(counts))

    print("lonly taking the lowest amount of frames we have:")
    print("===== is any test videos in train video?=====")
    # Sanity check: expected to print False (no training video is also a test video).
    result = any(elem in test_videos for elem in train_videos)
    print(result)

In [None]:
# With downsampling enabled, the folds are drawn from the class-balanced video list.
if downsampling==True:
    train_videos = train_videos_tmp

# One integer label per training video, in the same order as `train_videos`:
# 0 = adenoma, 1 = hiperplastic, 2 = anything else (serrated).
# Fixed: the original gathered the labels class by class and concatenated them,
# which only lines up with `train_videos` when that list is already grouped by
# class. That holds for `train_videos_tmp` but not for the unbalanced list, so
# the stratified split received labels belonging to other videos.
class_ids = {'adenoma': 0, 'hiperplastic': 1}
labels = np.array([class_ids.get(video.split('/')[0], 2) for video in train_videos])

<font color='red'>**Helper functions**</font>

In [None]:
def get_metrics(file_name, y_true, preds):
    """Print and log the weighted precision, recall and F-score of ``preds``.

    The report is printed to stdout and also written to the notebook-level file
    handle ``txtfile``, which must be open when this function is called.

    Parameters
    ----------
    file_name : str, name shown in the report header (the model name).
    y_true    : ground-truth labels.
    preds     : predicted labels.

    Returns
    -------
    The weighted F-score.
    """
    print("*** metrics report for: ", file_name, "***")
    txtfile.write("*** metrics report for: " + file_name + "***\n")

    # NOTE(review): with an `average` set, scikit-learn documents `support` as None.
    precision, recall, fscore, support = score(y_true, preds, average='weighted')

    for template, value in (
        ('Precision : {}', precision),
        ('Recall    : {}', recall),
        ('F-score   : {}', fscore),
        ('Support   : {}', support),
    ):
        print(template.format(value))

    for prefix, value in (
        ('Precision :', precision),
        ('Recall    :', recall),
        ('F-score   :', fscore),
        ('Support   :', support),
    ):
        txtfile.write(prefix + str(value) + '\n')

    return fscore

In [None]:
def make_confusion_matrix(file_name, y_true, preds, fold):
    """Plot, save and show the row-normalised confusion matrix of one fold.

    Parameters
    ----------
    file_name : str, experiment/model name used in the output file name.
    y_true    : ground-truth labels.
    preds     : predicted labels.
    fold      : fold number appended to the output file name.

    The image is saved under ``../confussionMatrix/<generator>/`` with a
    ``balanced_class`` or ``unbalanced_class`` prefix chosen from the
    notebook-level ``downsampling`` flag.
    """
    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

    if downsampling==True:
        save = '../confussionMatrix/' + generator + '/' + 'balanced_class' + file_name + str(fold) + '.png'
    else:
        save = '../confussionMatrix/' + generator + '/' + 'unbalanced_class' + file_name + str(fold) + '.png'

    target_names = ['adenoma', 'hiperplastic', 'serrated']
    # Fixed: the original ignored the `y_true` argument and read the notebook-level
    # `y_test` instead, so the plot depended on whatever that global held.
    cm = confusion_matrix(y_true=y_true, y_pred=preds, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
    disp = disp.plot(include_values=True, cmap=plt.cm.Blues, xticks_rotation='horizontal', values_format='.2f')

    plt.savefig(save)
    plt.show()

In [None]:
def ml_train(train_features, labels):
    """Fit a random forest, an RBF SVM and a KNN classifier on the same data.

    Parameters
    ----------
    train_features : training feature matrix passed to each estimator's ``fit``.
    labels         : training labels passed to each estimator's ``fit``.

    Returns
    -------
    (forest, svm_model, knn) : the three fitted estimators, in that order.
    """
    stages = (
        ("Random forest 200 trees training...",
         RandomForestClassifier(n_estimators=200, class_weight="balanced", bootstrap=True,
                                oob_score=True, random_state=42)),
        ("Support vector machine training...",
         SVC(C=10, kernel='rbf', gamma='auto', class_weight='balanced',
             decision_function_shape='ovr')),
        ("KNN 15 neighbors training...",
         KNeighborsClassifier(n_neighbors=15, weights="distance")),
    )

    fitted = []
    for banner, estimator in stages:
        print(banner)
        estimator.fit(train_features, labels)
        fitted.append(estimator)

    forest, svm_model, knn = fitted
    return forest, svm_model, knn

In [None]:
def ml_test(test_features, labels, forest, svm_model, knn, fold):
    """Score the three fitted models on one fold and persist the best one.

    Each model predicts ``test_features``; ``get_metrics`` reports and returns its
    weighted F-score. The model with the strictly highest score (the earliest one
    wins ties) is dumped with joblib and its confusion matrix is plotted.

    Parameters
    ----------
    test_features : feature matrix of the held-out fold.
    labels        : ground-truth labels of the held-out fold.
    forest, svm_model, knn : fitted estimators returned by ``ml_train``.
    fold          : fold number used in the checkpoint and figure names.
    """
    candidates = {"RF200": forest, "SVM": svm_model, "KNN15": knn}

    max_fscore = -9999
    for model, estimator in candidates.items():
        print("Testing with ", model)
        preds = estimator.predict(test_features)
        fscore = get_metrics(model, labels, preds)
        if fscore > max_fscore:
            max_fscore, experiment, preds2 = fscore, model, preds

    print("the ", experiment, " experiment get max fscore: ", max_fscore)

    print("saving model...")
    # The prefix is concatenated directly with the model name, i.e. the file is
    # written as <generator>/balanced_class<MODEL>fold<k>.joblib.
    # NOTE(review): the loading cell further down reads from a
    # <generator>/balanced_class/ sub-directory - confirm the two agree.
    prefix = 'balanced_class' if downsampling==True else 'unbalanced_class'
    save_pth = '../models/embeddingClassification/checkPoints/' + generator+ '/' + prefix

    file_name = save_pth + experiment + "fold" + str(fold) + '.joblib'
    dump(candidates[experiment], file_name)

    print("CONFUSSION MATRIX:")
    make_confusion_matrix(experiment, labels, preds2, fold)

**Main**

In [None]:
# Video-level stratified cross-validation: folds are drawn over video keys, then
# every frame of a video goes to the side (train/test) its video was assigned to.
#
# get_metrics() writes to the notebook-level name `txtfile`, so the handle is bound
# under that name. The `with` block closes it even when a fold raises (the original
# leaked the handle in that case).
c = 0
with open('embClassificationMetricsBalancedClass' + generator + '.txt', 'w+') as txtfile:
    for train_index, test_index in skf.split(train_videos, labels):
        c += 1
        # Fixed: the header had no trailing newline and ran into the next report line.
        txtfile.write("***KFOLD ***" + str(c) + "\n")
        print("***KFOLD ***", c)

        # Map the fold indices back to "<class>/video_<n>" keys.
        x_train_fold = [train_videos[index] for index in train_index]
        x_test_fold = [train_videos[index] for index in test_index]
        print("folds created!")

        # Train frame: frames of the training videos, concatenated in fold order.
        x_train = [train_df[train_df['info'] == data] for data in x_train_fold]
        train_df2 = pd.concat(x_train).reset_index(drop=True)
        y_train = train_df2['label'].to_numpy()

        print("train info: ")
        unique, counts = np.unique(y_train, return_counts=True)
        print(np.asarray((unique, counts)).T)

        # Test frame: frames of the held-out videos, concatenated in fold order.
        x_test = [train_df[train_df['info'] == data] for data in x_test_fold]
        test_df2 = pd.concat(x_test).reset_index(drop=True)
        y_test = test_df2['label'].to_numpy()

        print("test info: ")
        unique, counts = np.unique(y_test, return_counts=True)
        print(np.asarray((unique, counts)).T)

        print("features management: ")
        # Stack the per-row embedding vectors into 2-D matrices in one step
        # (replaces a per-row `.loc[i]['features']` loop with the same result).
        train_features = np.array(train_df2['features'].tolist())
        test_features = np.array(test_df2['features'].tolist())

        print("ml models training...")
        forest, svm_model, knn = ml_train(train_features, y_train)

        print("ml models testing...")
        ml_test(test_features, y_test, forest, svm_model, knn, c)

# <font color='red'>Visual ML prediction</font>
### Machine learning loading ...

In [None]:
root = '../models/embeddingClassification/checkPoints/'

# Checkpoint path keyed by (generator is GenA, downsampling enabled); any generator
# other than "GenA" uses the GenB entries.
# NOTE(review): both unbalanced entries point at files named "balanced_class..." with
# fold 4, while the unbalanced run uses 3 folds - confirm these files exist.
checkpoints = {
    (True, True): 'GenA/balanced_class/balanced_classKNN15fold4.joblib',
    (True, False): 'GenA/unbalanced_class/balanced_classKNN15fold4.joblib',
    (False, True): 'GenB/balanced_class/balanced_classRF200fold4.joblib',
    (False, False): 'GenB/unbalanced_class/balanced_classKNN15fold4.joblib',
}
# joblib.load unpickles the file: only load checkpoints from a trusted source.
classifier = load(root + checkpoints[(generator == "GenA", downsampling==True)])

print("classifier loaded! from generator: {} and downsampled: {}".format(generator, downsampling))

### Data loading

### Filtering for cycleGan data test

In [None]:
# Renumber the rows 0..n-1 so the test frame can be addressed by position with `.loc[i]`.
test_df = test_df.reset_index(drop=True)

### Classifier predicting

In [None]:
# Stack the per-row embedding vectors of the test frame into one 2-D matrix.
test_features = np.array(test_df['features'].tolist())
print("features shape: {}".format(test_features.shape))

In [None]:
# Predict a class for every test frame with the loaded classifier.
predictions = classifier.predict(test_features)
print(predictions.shape)

In [None]:
# Ground-truth class label of every test frame.
y_true = test_df['label'].values
y_true

In [None]:
# Display the predicted labels.
predictions

In [None]:
# Per-class precision/recall/F1 on the held-out test frames. `target_names` are display
# names only (note the spelling "hiperplastico" differs from the stored label).
print(metrics.classification_report(y_true, predictions, target_names=["adenoma","hiperplastico", "serrated"]))

In [None]:
# Store the prediction next to each test frame (adds a column to `test_df` in place).
test_df['predicted'] = predictions
test_df.head()

### PCA data projection and visualization

In [None]:
from sklearn.decomposition import PCA
import plotly.express as px

In [None]:
# Project the test embeddings onto their first three principal components
# (the PCA is fitted on the test features themselves).
pca = PCA(n_components=3)
pca_result = pca.fit_transform(test_features)

In [None]:
# Store each principal component as its own column of the test frame.
for column, component in zip(('pca-one', 'pca-two', 'pca-three'), pca_result.T):
    test_df[column] = component

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
# Last rows of the test frame, now including the prediction and PCA columns.
test_df.tail()

In [None]:
# Scatter of the first two principal components, coloured by predicted class;
# hovering shows the true label and the source video.
fig = px.scatter(test_df, x="pca-one", y="pca-two", color='predicted', hover_name="label", hover_data=["video"],
                 opacity=0.5)
fig.show()