# <font color='red'>**Useful functions**</font>

In [None]:
from matplotlib.colors import ListedColormap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import joblib
from sklearn import neighbors
from sklearn.manifold import TSNE

In [None]:
def _load_class_arrays(path, prefix):
    """Load the embeddings, labels and videos ``.npy`` files of one class prefix under ``path``."""
    return tuple(
        np.load(path + prefix + suffix)
        for suffix in ("Embeddings.npy", "Labels.npy", "Videos.npy")
    )


def load_data(split, experiment, task='binary', include_serrated=False):
    """Load the per-class ``.npy`` files of one split into a single DataFrame.

    Parameters
    ----------
    split : str
        Name of the split sub-directory (the callers in this file use
        ``'train'`` and ``'test'``).
    experiment : str
        Experiment sub-directory; only used when ``task == 'binary'``.
    task : str, default 'binary'
        ``'binary'`` reads from ``../data/embeddings/adeVsHyp/<experiment>/<split>/``;
        any other value reads from ``../data/embeddings/generalEmbc/<split>/``.
    include_serrated : bool, default False
        When truthy and ``split`` is not ``'train'``, the ``serrated*`` files
        are loaded as well and appended after the other two classes.

    Returns
    -------
    pandas.DataFrame
        Columns ``features`` (one array row per entry), ``clase`` and
        ``video``; rows are ordered adenoma, hiperplastic, then serrated
        (when loaded).

    Raises
    ------
    OSError
        Propagated from ``np.load`` when an expected file is missing.
    """
    if task == 'binary':
        path = "../data/embeddings/adeVsHyp/" + experiment + "/" + split + "/"
    else:
        path = "../data/embeddings/generalEmbc/" + split + "/"

    print("reading from: ", path)

    # (tag used in the printed summary, file-name prefix on disk)
    classes = [("ade", "adenoma"), ("hyp", "hiperplastic")]
    # Serrated samples are never loaded for the training split.
    if split != 'train' and include_serrated:
        classes.append(("ser", "serrated"))

    # Load everything first so a missing file fails before any summary is printed.
    loaded = [(tag, _load_class_arrays(path, prefix)) for tag, prefix in classes]

    print("==== " + split + " data info ====")
    for tag, (dat, lab, vid) in loaded:
        print("{} dim: {}, amount of labels: {}, videos: {}".format(tag, dat.shape, lab.shape, vid.shape))

    # Regroup [(dat, lab, vid), ...] by kind and stack each kind along axis 0.
    features, labels, videos = (
        np.concatenate(parts, axis=0)
        for parts in zip(*(arrays for _, arrays in loaded))
    )

    df = pd.DataFrame(
        {'features': list(features), 'clase': labels, 'video': videos},
        columns=['features', 'clase', 'video'],
    )

    return df

In [None]:
def load_model(path, experiment):
    """Load the pickled KNN model stored for ``experiment`` below ``path``.

    The file is ``KNN20.pkl`` for the ``'embcBaseline'`` experiment and
    ``KNN35.pkl`` for every other experiment; it is looked up at
    ``path + experiment + '/<file>'``.

    Returns whatever object ``joblib.load`` deserialises from that file.
    """
    filename = '/KNN20.pkl' if experiment == 'embcBaseline' else '/KNN35.pkl'
    to_load = path + experiment + filename

    print("model to load: ", to_load)
    # NOTE(review): joblib.load unpickles the file, so only load trusted model files.
    model = joblib.load(to_load)
    print("model loaded!")
    return model

In [None]:
def get_features(df):
    """Extract the feature matrix and the class labels from a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``features`` column (one array-like per row) and a
        ``clase`` column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)``: the per-row feature entries stacked with
        ``np.array`` and the ``clase`` values as an array, both in row order.

    Raises
    ------
    KeyError
        If either column is missing.
    """
    # Read whole columns positionally instead of df.loc[i] per row: the
    # label-based lookup fails whenever the index is not 0..n-1 (for example
    # after filtering rows) and is needlessly slow.
    features = np.array(df['features'].tolist())
    labels = np.array(df['clase'].tolist())

    return features, labels

In [None]:
def modif_df(df):
    """Add an integer ``labels`` column derived from the ``clase`` column.

    Mapping: ``'adenoma'`` -> 1, ``'hiperplastic'`` -> 2, ``'serrated'`` -> 3;
    any other value -> 0.

    The frame is modified in place and also returned, so both
    ``modif_df(df)`` and ``df = modif_df(df)`` work.
    """
    conditions = [
        df['clase'] == 'adenoma',
        df['clase'] == 'hiperplastic',
        # The plotting code in this file treats class id 3 as "serrated";
        # without this entry those rows silently fell into the default 0.
        # NOTE(review): the label spelling 'serrated' is assumed from the
        # serrated*.npy file names - confirm against the stored labels.
        df['clase'] == 'serrated',
    ]
    values = [1, 2, 3]

    df['labels'] = np.select(conditions, values, default=0)

    return df

# <font color='red'>**Loading data and model**</font>

In [None]:
# Configuration shared by the data-loading cells below.
experiment = 'embcBaseline'  # experiment sub-directory name passed to load_data
task = 'binary'  # 'binary' makes load_data read from the adeVsHyp embeddings directory
include_serrated = False  # when True, load_data also loads the serrated files for non-train splits

In [None]:
# Load the test split and print how many rows each class has.
split = 'test'
df_test = load_data(split=split, experiment=experiment, task = task, include_serrated=include_serrated)
# Combined "<class>_<video>" tag for each row.
df_test['info'] = df_test['clase'] + '_' + df_test['video']
# This cell summarises the test frame, so the header must not say "train".
print("test_df info:")
print(df_test.groupby(['clase']).count())

In [None]:
# Load the training split and print how many rows each class has.
split = 'train'
df_train = load_data(split=split, experiment=experiment, task = task, include_serrated=include_serrated)
# Combined "<class>_<video>" tag for each row.
df_train['info'] = df_train['clase'] + '_' + df_train['video']
print("train_df info:")
print(df_train.groupby(['clase']).count())

In [None]:
# Preview the first rows of the test frame.
df_test.head()

## <font color='red'>**Dimensionality reduction with t-SNE**</font>

In [None]:
# t-SNE learns a separate, non-parametric layout on every fit and offers no
# `transform` for unseen points. Fitting it once per split would place train
# and test in unrelated coordinate systems, which makes a classifier trained
# on the train layout meaningless on the test layout. Both splits are
# therefore embedded together in a single fit and separated again afterwards.
tsne = TSNE(n_components = 2, init = 'pca', random_state = 0)  # fixed seed -> reproducible layout

train_features, train_labels = get_features(df_train)
test_features, test_labels = get_features(df_test)

n_train = len(train_features)
all_tsne = tsne.fit_transform(np.concatenate((train_features, test_features), axis=0))

#for train set
Ptrain_tsne = all_tsne[:n_train]
print("shape of train features: ", Ptrain_tsne.shape)
l1_train = Ptrain_tsne[:,0]
l2_train = Ptrain_tsne[:,1]
df_train['x'] = l1_train
df_train['y'] = l2_train
#add the integer `labels` column
df_train = modif_df(df_train)

#for test set
Ptest_tsne = all_tsne[n_train:]
print("shape of test features: ", Ptest_tsne.shape)
l1_test = Ptest_tsne[:,0]
l2_test = Ptest_tsne[:,1]
df_test['x'] = l1_test
df_test['y'] = l2_test
#add the integer `labels` column
df_test = modif_df(df_test)

In [None]:
# Display the test frame, now including the x/y coordinates and integer labels.
df_test

# <font color='red'>**Plotting the KNN decision boundary in the embedding space**</font>

In [None]:
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# 2-D coordinates (x, y) are the classifier inputs; the integer `labels` column is the target.
coord_cols = ["x", "y"]
X_train = df_train[coord_cols].to_numpy()
y_train = df_train["labels"].to_numpy()
X_test = df_test[coord_cols].to_numpy()
y_test = df_test["labels"].to_numpy()
print("cantidad en train {}, cantidad en test {}".format(X_train.shape, X_test.shape))

In [None]:
# Standardise the 2-D coordinates: the scaler statistics are estimated on the
# training points only and then re-used to transform the test points.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# KNN on the scaled 2-D coordinates; 50 neighbours, each with an equal vote.
classifier = KNeighborsClassifier(n_neighbors = 50, weights='uniform')
classifier.fit(X_train, y_train)

In [None]:
# Predicted class id for every test point.
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix of true vs. predicted test labels; the bare `cm` displays it.
cm = confusion_matrix(y_test, y_pred)
cm

# <font color='red'>**Visualizing results**</font>

In [None]:
# Visualising the test-set points over the KNN decision boundary
from matplotlib.colors import ListedColormap
import matplotlib.colors

X_set, y_set = X_test, y_test
# Dense grid covering the plotted points with a margin of 1 on every side.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))


colors=['#db697e', '#8f94ff'] #hex background colours, only used by the optional contourf call below

cmap = matplotlib.colors.ListedColormap(colors)

# Predicted class for every grid node, reshaped back onto the grid.
Z = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)

# contourf: fills the classification regions with background colours
# contour: draws only the borders between the regions
#plt.contourf(X1, X2, Z, alpha = 1, cmap = cmap, linestyles = '-', levels = 2)

plt.contour(X1, X2, Z, alpha = 1, colors='black', linestyles = 'dashdot', levels = 2, linewidths = 2.0)

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# Legend label and dot colour per class id. Keying on the class id (not on the
# enumeration position) keeps colours stable when a class is absent, and the
# third entry gives serrated its own colour instead of sharing the last one.
class_styles = {
    1: ("adenoma", '#d63351'),
    2: ("hyperplastic", '#1c09ed'),
    3: ("serrated", '#2ca02c'),
}

for j in np.unique(y_set):
    # Unexpected class ids get a neutral style instead of an unbound/stale label.
    label, dot_color = class_styles.get(int(j), ("class {}".format(j), '#808080'))
    in_class = y_set == j
    # `color=` avoids matplotlib's warning about `c` being a single RGBA sequence.
    plt.scatter(X_set[in_class, 0], X_set[in_class, 1],
                color = dot_color, edgecolors="black", label = label, s=40,
                linewidths=1)

#plt.title('Classifier (Test set)')
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=2)
plt.axis('off')
#plt.savefig('knn-polyps.png')
plt.show()