# <font color='red'>**Libraries**</font>

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import os
import joblib

# <font color='red'>**Data loading and preprocessing**</font>

In [None]:
def load_data(split):
    """Gather the adenoma, hyperplastic and serrated embeddings of one split.

    For each of the three classes the ``*Embeddings.npy``, ``*Labels.npy`` and
    ``*Videos.npy`` arrays are read from
    ``../data/embeddings/generalEmbc/<split>/``, their shapes are printed, and
    the arrays are stacked along axis 0 in adenoma, hyperplastic, serrated
    order.

    Args:
        split: name of the sub-folder to read (the notebook passes 'train'
            and 'test').

    Returns:
        pandas.DataFrame with the columns 'features' (one embedding array per
        row), 'label' and 'video'.
    """
    split_dir = "../data/embeddings/generalEmbc/" + split + "/"

    def read(file_name):
        # Every array of the split lives directly inside split_dir.
        return np.load(split_dir + file_name)

    # Each group is (embeddings, labels, videos) for one polyp class.
    adenoma = (read("adenomaEmbeddings.npy"), read("adenomaLabels.npy"), read("adenomaVideos.npy"))
    hyperplastic = (read("hiperplasticEmbeddings.npy"), read("hiperplasticLabels.npy"), read("hiperplasticVideos.npy"))
    serrated = (read("serratedEmbeddings.npy"), read("serratedLabels.npy"), read("serratedVideos.npy"))

    print("==== "+ split + " data info ====")
    print("ade dim: {}, amount of labels: {}, videos: {}".format(*(arr.shape for arr in adenoma)))
    print("hyp dim: {}, amount of labels: {}, videos: {}".format(*(arr.shape for arr in hyperplastic)))
    print("ser dim: {}, amount of labels: {}, videos: {}".format(*(arr.shape for arr in serrated)))

    # zip(*) regroups the three classes by kind: all embeddings, all labels, all videos.
    embeddings, class_labels, video_ids = (
        np.concatenate(parts, axis=0) for parts in zip(adenoma, hyperplastic, serrated)
    )

    return pd.DataFrame(
        {'features': list(embeddings), 'label': class_labels, 'video': video_ids},
        columns=['features', 'label', 'video'],
    )

In [None]:
def load_data(split):
    """Gather the adenoma and hyperplastic embeddings of one split.

    Reads the embeddings, labels and video ids of both classes from the
    ``adeVsHyp/embcBaseline/<split>/`` folder, prints their shapes and stacks
    them along axis 0 with the adenoma rows first.

    Args:
        split: name of the sub-folder to read (the notebook passes 'train'
            and 'test').

    Returns:
        pandas.DataFrame with the columns 'features' (one embedding array per
        row), 'label' and 'video'.
    """
    # Alternative embedding source kept for reference:
    #   "../Embeddings/vggDiscriminator/" + split + "/"
    path = "../../../unconditional/cycleGan-polyps/data/embeddings/adeVsHyp/embcBaseline/" + split + '/'

    adenoma_emb, adenoma_lab, adenoma_vid = [
        np.load(path + file_name)
        for file_name in ("adenomaEmbeddings.npy", "adenomaLabels.npy", "adenomaVideos.npy")
    ]
    hyper_emb, hyper_lab, hyper_vid = [
        np.load(path + file_name)
        for file_name in ("hiperplasticEmbeddings.npy", "hiperplasticLabels.npy", "hiperplasticVideos.npy")
    ]

    print("==== "+ split + " data info ====")
    print("ade dim: {}, amount of labels: {}, videos: {}".format(
        adenoma_emb.shape, adenoma_lab.shape, adenoma_vid.shape))
    print("hyp dim: {}, amount of labels: {}, videos: {}".format(
        hyper_emb.shape, hyper_lab.shape, hyper_vid.shape))

    # Column name -> stacked array, adenoma rows first.
    stacked = {
        'features': list(np.concatenate((adenoma_emb, hyper_emb), axis=0)),
        'label': np.concatenate((adenoma_lab, hyper_lab), axis=0),
        'video': np.concatenate((adenoma_vid, hyper_vid), axis=0),
    }

    return pd.DataFrame(stacked, columns=['features', 'label', 'video'])

In [None]:
# Train split: load it, tag every row with "<label>_<video>" and show how many
# rows each label has.
train_df = load_data('train')
train_df['info'] = train_df['label'] + '_' + train_df['video']
print("train_df info:")
print(train_df.groupby('label').count())

# Test split: same steps.
test_df = load_data('test')
test_df['info'] = test_df['label'] + '_' + test_df['video']
print("test_df info:")
print(test_df.groupby('label').count())

In [None]:
def get_features(df):
    """Stack the per-row arrays of the 'features' column into one array.

    Rows are taken in positional order, so the result lines up with
    ``df['label'].values`` whatever the index of ``df`` looks like (filtered,
    shuffled or non-integer indexes included).

    Args:
        df: DataFrame with a 'features' column holding one array per row.

    Returns:
        numpy.ndarray built from the rows of the column; for equally sized
        1-D embeddings this has shape ``(len(df), embedding_dim)``.
    """
    # Reading the column positionally avoids label-based ``df.loc[i]``
    # lookups, which need the index to be exactly 0..n-1 and build a
    # temporary Series for every row.
    return np.array(df['features'].tolist())

In [None]:
# Stack the embeddings of each split into one array and report its shape and
# value range.
train_features = get_features(train_df)
print("train features shape: {}, min and max values: {} {}".format(
    train_features.shape, train_features.min(), train_features.max()))

test_features = get_features(test_df)
print("test features shape: {}, min and max values: {} {}".format(
    test_features.shape, test_features.min(), test_features.max()))

## PCA

In [None]:
# Fit a 3-component PCA on the train embeddings only (fit() returns the
# estimator itself), then project those same embeddings.
pca = PCA(n_components=3, random_state=69).fit(train_features)
pca_result = pca.transform(train_features)

In [None]:
# Store the three train projections as separate DataFrame columns.
train_df['pca-one'], train_df['pca-two'], train_df['pca-three'] = (
    pca_result[:, 0], pca_result[:, 1], pca_result[:, 2])

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
# Project the test embeddings with the PCA fitted on the train embeddings;
# pca_result now holds the test projection instead of the train one.
pca_result = pca.transform(test_features)
test_df['pca-one'], test_df['pca-two'], test_df['pca-three'] = (
    pca_result[:, 0], pca_result[:, 1], pca_result[:, 2])

# The ratio below belongs to the fit on the train split; the PCA is not refitted here.
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

## t-SNE

In [None]:
# 2-D t-SNE embedding of the train features, initialised from PCA.
# random_state is fixed (same seed as the PCA above) so that re-running the
# notebook reproduces the same map; without it the embedding can change from
# run to run.
tsne = TSNE(n_components=2, init='pca', random_state=69)
P1_tsne = tsne.fit_transform(train_features)
P1_tsne.shape

In [None]:
# Split the embedding into its two coordinate columns.
l1, l2 = P1_tsne[:, 0], P1_tsne[:, 1]

In [None]:
# Keep the two embedding coordinates next to the train rows.
train_df['x'], train_df['y'] = l1, l2

**For test split**

In [None]:
# Embed the test features by running fit_transform again on the same `tsne`
# object; P1_tsne (previously the train embedding) is overwritten.
# NOTE(review): this map is fitted on the test split alone, so its coordinates
# are presumably not comparable with the train map - confirm before plotting
# both together.
P1_tsne = tsne.fit_transform(test_features)
P1_tsne.shape

In [None]:
# Split the embedding into its two coordinate columns.
l1, l2 = P1_tsne[:, 0], P1_tsne[:, 1]

In [None]:
# Keep the two embedding coordinates next to the test rows.
test_df['x'], test_df['y'] = l1, l2

# <font color='red'>**Classifying**</font>

In [None]:
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_fscore_support as score

In [None]:
# Classifier inputs: the stacked embeddings, and the raw 'label' column as targets.
x_train = train_features
y_train = train_df['label'].values
x_test = test_features
y_test = test_df['label'].values

print("====Train info:====")
print("data shape:{}, labels: {}".format(x_train.shape, y_train.shape))

print("====Test info:====")
print("data shape:{}, labels: {}".format(x_test.shape, y_test.shape))

In [None]:
# Encode the class labels as integers.  The encoder is fitted on the training
# labels only and then applied to the test labels, so both splits share one
# label -> integer mapping.  Re-fitting on the test labels would build an
# independent mapping that disagrees with the training one whenever the test
# split does not contain exactly the same set of classes.
le = preprocessing.LabelEncoder()
y_train_enc = le.fit_transform(y_train)
# transform() raises ValueError if the test split holds a label that was
# never seen in training, instead of silently assigning it a new integer.
y_test_enc = le.transform(y_test)
print("train labels:")
print(y_train_enc)
print("test labels:")
print(y_test_enc)
# le.classes_ lists the distinct training labels found by the fit above.
n_class = len(le.classes_)
print("number of classes: ", n_class)

## Models

In [None]:
def get_confussion_matrix(y_true, y_pred, target_names=None):
    """Plot the row-normalised confusion matrix of a set of predictions.

    Args:
        y_true: ground-truth class labels.
        y_pred: predicted class labels, one per entry of ``y_true``.
        target_names: tick labels for the matrix, one per class. Defaults to
            ``['adenoma', 'hiperplastic']``; pass a longer list when more
            classes are present (for example the three-class embeddings).

    Returns:
        None. The figure is drawn with matplotlib and shown immediately.
    """
    if target_names is None:
        target_names = ['adenoma', 'hiperplastic']

    # normalize='true' normalises over the true labels (rows), so every cell
    # is the fraction of a true class that received the predicted class.
    cm = confusion_matrix(y_true=y_true, y_pred=y_pred, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
    disp.plot(include_values=True, cmap=plt.cm.Blues, xticks_rotation='horizontal', values_format='.2f')

    plt.grid(False)
    plt.show()

In [None]:
to_save = '../../../unconditional/cycleGan-polyps/models_emb_classification/adeVsHyp/embcBaseline/'
# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(to_save, exist_ok=True)
max_auc = -9999
#for KNN
# Sweep k over 5, 20 and 35; every model is saved and the one with the highest
# general AUC on the test split is kept for the confusion matrix.
for i in range(5, 40, 15):
    fpr = {}
    tpr = {}
    thresh ={}
    print("===== for k =====", i)
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train_enc)
    y_pred = knn.predict(x_test)
    pred_prob = knn.predict_proba(x_test)
    acc = metrics.accuracy_score(y_test_enc, y_pred)
    # One ROC curve per encoded class, scored with that class' probability column.
    for j in range(n_class):   
        fpr[j], tpr[j], thresh[j] = roc_curve(y_test_enc, pred_prob[:,j], pos_label=j)
    # Assumes a two-class problem with adenoma encoded as 0 and hyperplastic as 1
    # - TODO confirm against the label encoder.
    ade_auc, hyp_auc = auc(fpr[0], tpr[0]), auc(fpr[1], tpr[1])
    # roc_auc_score expects continuous scores: in the binary case, the
    # probability of the class with the greater label.  Feeding it the
    # thresholded argmax collapses the curve to a single operating point and
    # no longer measures ranking quality.
    gen_auc = roc_auc_score(y_test_enc, pred_prob[:, 1])
    precision, recall, fscore, support = score(y_test_enc, y_pred, average='macro')
    
    filename = to_save + "KNN" + str(i) + '.pkl'
    joblib.dump(knn, filename) 
    # Remember the best model so far (value of k, its predictions and its AUC).
    if gen_auc>max_auc:
        max_auc = gen_auc
        k_val = i
        y_pred2 = y_pred
        gen_auc2 = gen_auc
        
    print("METRICS:")
    print("Acc: ", acc)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("Fscore: ", fscore)
    print("Gen AUC: ", gen_auc)
    print("ade auc: ", ade_auc)
    print("hyp auc: ", hyp_auc)
    
print("for KNN, the best model was the k value: ", k_val, "with general auc: ", max_auc)
print("confussion matrix:")
get_confussion_matrix(y_test_enc, y_pred2)

#for random forest
# Sweep the number of trees over 10, 20 and 30; every model is saved and the
# one with the highest general AUC on the test split is kept for the
# confusion matrix.
max_auc = -9999
for i in range(10, 40, 10):
    fpr = {}
    tpr = {}
    thresh ={}
    print("===== for ", i, " trees =====")
    rfc = RandomForestClassifier(n_estimators=i, random_state=14)
    rfc.fit(x_train, y_train_enc)
    y_pred = rfc.predict(x_test)
    pred_prob = rfc.predict_proba(x_test)
    acc = metrics.accuracy_score(y_test_enc, y_pred)
    # One ROC curve per encoded class, scored with that class' probability column.
    for j in range(n_class):   
        fpr[j], tpr[j], thresh[j] = roc_curve(y_test_enc, pred_prob[:,j], pos_label=j)
    # Assumes a two-class problem with adenoma encoded as 0 and hyperplastic as 1
    # - TODO confirm against the label encoder.
    ade_auc, hyp_auc = auc(fpr[0], tpr[0]), auc(fpr[1], tpr[1])
    # roc_auc_score expects continuous scores: in the binary case, the
    # probability of the class with the greater label.  Feeding it the
    # thresholded argmax collapses the curve to a single operating point and
    # no longer measures ranking quality.
    gen_auc = roc_auc_score(y_test_enc, pred_prob[:, 1])
    precision, recall, fscore, support = score(y_test_enc, y_pred, average='macro')
    
    filename = to_save + "RF" + str(i) + '.pkl'
    joblib.dump(rfc, filename)     
    # Remember the best model so far (number of trees, its predictions and its AUC).
    if gen_auc>max_auc:
        max_auc = gen_auc
        k_val = i
        y_pred2 = y_pred
        gen_auc2 = gen_auc
        
    print("METRICS:")
    print("Acc: ", acc)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("Fscore: ", fscore)
    print("Gen AUC: ", gen_auc)
    print("ade auc: ", ade_auc)
    print("hyp auc: ", hyp_auc)
    
# Report max_auc, the best value found by this sweep, as the KNN summary does.
print("for Random forest, the best model was the trees value: ", k_val, "with general auc: ", max_auc)
print("confussion matrix:")
get_confussion_matrix(y_test_enc, y_pred2)