# <font color='red'>**Libraries**</font>

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import os
import joblib

# <font color='red'>**Data loading and preprocessing**</font>

In [None]:
def load_data(split, domain):
    """Load the CuNi1/CuNi2/CuNi3 embeddings and labels for one split.

    Files are read from
    ``../embeddings/ablationRgb/2048/<train|test>_<domain>/<sample>/``,
    where each sample directory holds ``Embeddings.npy`` and ``Labels.npy``.

    Args:
        split: ``"train"`` selects the train directory; any other value
            selects the test directory.
        domain: Domain name; it is lower-cased before being put in the path.

    Returns:
        A DataFrame with a ``features`` column (one row of the concatenated
        embedding array per entry) and a ``label`` column, with the three
        samples stacked in CuNi1, CuNi2, CuNi3 order.
    """
    # Anything that is not exactly "train" falls through to the test directory.
    prefix = "train_" if split == "train" else "test_"
    base_dir = "../embeddings/ablationRgb/2048/" + prefix + domain.lower() + "/"

    sample_names = ("CuNi1", "CuNi2", "CuNi3")
    embeddings = []
    targets = []
    for sample in sample_names:
        embeddings.append(np.load(base_dir + sample + "/" + "Embeddings.npy"))
        targets.append(np.load(base_dir + sample + "/" + "Labels.npy"))

    # Report per-sample shapes only after every file has been loaded.
    print("==== " + split + " data info ====")
    for sample, emb, lab in zip(sample_names, embeddings, targets):
        print("{} dim: {}, amount of labels: {}".format(sample, emb.shape, lab.shape))

    features = np.concatenate(embeddings, axis=0)
    labels = np.concatenate(targets, axis=0)

    # list(features) splits the stacked array along axis 0 so each row is one cell.
    return pd.DataFrame(
        {'features': list(features), 'label': labels},
        columns=['features', 'label'],
    )

In [None]:
# Load both splits of the "dry" domain and show how many rows each label has.
train_df = load_data(split="train", domain="dry")
train_counts = train_df.groupby(['label']).count()
print("train_df info:")
print(train_counts)

test_df = load_data(split="test", domain="dry")
test_counts = test_df.groupby(['label']).count()
print("test_df info:")
print(test_counts)

In [None]:
def get_features(df):
    """Stack the per-row vectors of the ``features`` column into one array.

    Args:
        df: DataFrame with a ``features`` column whose cells are
            equal-length vectors.

    Returns:
        A NumPy array built from the column values in row order. With
        equal-length vectors this is a 2-D array of shape
        ``(len(df), vector_length)``; an empty frame gives an empty array.
    """
    # Read the column positionally. Looking rows up with df.loc[i] is
    # label-based, so it fails or reorders rows when the index is not 0..n-1
    # (e.g. after filtering or shuffling), and it builds a Series per row.
    return np.array(df['features'].tolist())

In [None]:
# Build the dense feature matrices and print their shape and value range.
train_features = get_features(train_df)
print(
    f"train features shape: {train_features.shape}, min and max values: "
    f"{train_features.min()} {train_features.max()}"
)

test_features = get_features(test_df)
print(
    f"test features shape: {test_features.shape}, min and max values: "
    f"{test_features.min()} {test_features.max()}"
)

## PCA

In [None]:
# Fit a seeded 3-component PCA on the train features only, then project them.
pca = PCA(n_components=3, random_state=69).fit(train_features)
pca_result = pca.transform(train_features)

In [None]:
# Store each principal component of the train projection as its own column.
for column, component in zip(('pca-one', 'pca-two', 'pca-three'), pca_result.T):
    train_df[column] = component

print(f'Explained variation per principal component: {pca.explained_variance_ratio_}')

In [None]:
# Project the test features with the PCA that was fitted on the train split.
pca_result = pca.transform(test_features)
for column, component in zip(('pca-one', 'pca-two', 'pca-three'), pca_result.T):
    test_df[column] = component

# The ratios belong to the fitted PCA object, so this prints the train-fit values.
print(f'Explained variation per principal component: {pca.explained_variance_ratio_}')

## t-SNE

In [None]:
# 2-D t-SNE embedding of the train features.
# random_state is fixed so repeated runs give the same embedding, matching
# the seeded PCA used elsewhere in this notebook.
tsne = TSNE(n_components=2, init='pca', random_state=69)
P1_tsne = tsne.fit_transform(train_features)
P1_tsne.shape

In [None]:
# Split the 2-D embedding into its two coordinate columns.
l1, l2 = P1_tsne[:, 0], P1_tsne[:, 1]

In [None]:
# Attach the t-SNE coordinates to the train frame.
train_df['x'], train_df['y'] = l1, l2

**For test split**

In [None]:
# Run fit_transform again on the test features with the same TSNE object.
# This is a new fit, so the coordinates do not share a fitted mapping with
# the train embedding. P1_tsne is overwritten with the test result.
P1_tsne = tsne.fit_transform(test_features)
P1_tsne.shape

In [None]:
# Split the 2-D embedding into its two coordinate columns.
l1, l2 = P1_tsne[:, 0], P1_tsne[:, 1]

In [None]:
# Attach the t-SNE coordinates to the test frame.
test_df['x'], test_df['y'] = l1, l2

# <font color='red'>**Classifying**</font>

In [None]:
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_fscore_support as score

In [None]:
# Classifier inputs: the raw feature matrices and the label column of each split.
x_train = train_features
y_train = train_df['label'].values
x_test = test_features
y_test = test_df['label'].values

print("====Train info:====")
print(f"data shape:{x_train.shape}, labels: {y_train.shape}")
print("====Test info:====")
print(f"data shape:{x_test.shape}, labels: {y_test.shape}")

In [None]:
# Encode the string labels as integers. The encoder is fitted on the train
# labels only and then reused for the test labels.
le = preprocessing.LabelEncoder()
y_train_enc = le.fit_transform(y_train)
# Use transform, not fit_transform: refitting on the test labels could assign
# different integer codes if the test split has a different set of classes,
# which would silently corrupt every metric computed against y_test_enc.
y_test_enc = le.transform(y_test)
print("train labels:")
print(y_train_enc)
print("test labels:")
print(y_test_enc)
n_class = len(set(y_train_enc))
print("number of classes: ", n_class)

## Models

In [None]:
def get_confussion_matrix(y_true, y_pred):
    """Plot the row-normalised confusion matrix for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    The matrix is normalised over the true labels (``normalize='true'``),
    drawn with the fixed tick labels CuNi1/CuNi2/CuNi3 and shown with
    ``plt.show()``. Nothing is returned.
    """
    class_names = ['CuNi1', 'CuNi2', 'CuNi3']
    normalized_cm = confusion_matrix(y_true=y_true, y_pred=y_pred, normalize='true')

    cm_display = ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm,
        display_labels=class_names,
    )
    cm_display.plot(
        include_values=True,
        cmap=plt.cm.Blues,
        xticks_rotation='horizontal',
        values_format='.2f',
    )

    # Turn the grid off on the current axes before showing the figure.
    plt.grid(False)
    plt.show()

In [None]:
to_save = '../models_embeddings/ablationRgb/2048/secoAhumedo/'
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(to_save, exist_ok=True)

# KNN sweep: fit one classifier per k, save each model, and keep the
# predictions of the k with the highest macro F-score.
# NOTE(review): the best k is selected using scores on the test split, so the
# reported best score is optimistic; a validation split would avoid that.
max_fscore = -9999
for i in range(5, 40, 15):
    print("===== for k =====", i)
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train_enc)
    y_pred = knn.predict(x_test)
    # Not used in this cell; kept in case later cells read the class probabilities.
    pred_prob = knn.predict_proba(x_test)
    acc = metrics.accuracy_score(y_test_enc, y_pred)
    precision, recall, fscore, support = score(y_test_enc, y_pred, average='macro')

    filename = to_save + "KNN" + str(i) + '.pkl'
    joblib.dump(knn, filename)
    if fscore > max_fscore:
        max_fscore = fscore
        k_val = i
        y_pred2 = y_pred
        gen_fscore = fscore

    print("METRICS:")
    print("Acc: ", acc)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("Fscore: ", fscore)

print("for KNN, the best model was the k value: ", k_val, "with fscore: ", gen_fscore)
print("confussion matrix:")
get_confussion_matrix(y_test_enc, y_pred2)

# Random forest sweep: fit one seeded forest per tree count, save each model,
# and keep the predictions of the setting with the highest macro F-score.
# NOTE(review): the best setting is selected using scores on the test split,
# so the reported best score is optimistic; a validation split would avoid that.
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(to_save, exist_ok=True)
max_fscore = -9999
for i in range(10, 40, 10):
    print("===== for ", i, " trees =====")
    rfc = RandomForestClassifier(n_estimators=i, random_state=14)
    rfc.fit(x_train, y_train_enc)
    y_pred = rfc.predict(x_test)
    # Not used in this cell; kept in case later cells read the class probabilities.
    pred_prob = rfc.predict_proba(x_test)
    acc = metrics.accuracy_score(y_test_enc, y_pred)
    precision, recall, fscore, support = score(y_test_enc, y_pred, average='macro')

    filename = to_save + "RF" + str(i) + '.pkl'
    joblib.dump(rfc, filename)
    if fscore > max_fscore:
        max_fscore = fscore
        k_val = i
        y_pred2 = y_pred
        gen_fscore = fscore

    print("METRICS:")
    print("Acc: ", acc)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("Fscore: ", fscore)

print("for Random forest, the best model was the trees value: ", k_val, "with fscore: ", gen_fscore)
print("confussion matrix:")
get_confussion_matrix(y_test_enc, y_pred2)