### Pipeline : KNN and Dimensionality Reduction with NCA

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix, precision_score
from sklearn.metrics import precision_recall_curve, f1_score, auc, average_precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn import metrics
from collections import Counter
from sklearn.neighbors import LocalOutlierFactor

### Load Data and Check Shape

In [2]:
data = pd.read_csv("train_159_text.csv")
print(data.shape)

(159, 69)


In [3]:
x=data.drop('saidas', axis=1)
y=data['saidas']
print(x.shape)
print(y.shape)

(159, 68)
(159,)


In [22]:
counter = Counter(y)
print(counter)

Counter({1.0: 102, 0.0: 57})


### Step 1 - Results with KNN

In [6]:
for i in range(1,31):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, shuffle=True)
    knn = KNeighborsClassifier(n_neighbors = 1)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    aucroc = roc_auc_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)

    print("Basic KNN Accuracy: {:.2f}".format(acc*100))
    print("Valor AUC-ROC : {:.2f}".format(aucroc*100))
    print("CM : \n", cm)
    print(report)
    
    trueNegative=cm[0][0]
    falsePositive=cm[0][1]
    falsenegative=cm[1][0]
    truePositive=cm[1][1]
  
    print("Sensibilidade: {:.2f}".format((truePositive/(truePositive+falsenegative))*100))
    print("Especificidade: {:.2f}".format((trueNegative/(trueNegative+falsePositive))*100))
    print()

Basic KNN Accuracy: 31.25
Valor AUC-ROC : 34.13
CM : 
 [[1 8]
 [3 4]]
              precision    recall  f1-score   support

         0.0     0.2500    0.1111    0.1538         9
         1.0     0.3333    0.5714    0.4211         7

    accuracy                         0.3125        16
   macro avg     0.2917    0.3413    0.2874        16
weighted avg     0.2865    0.3125    0.2707        16

Sensibilidade: 57.14
Especificidade: 11.11

Basic KNN Accuracy: 56.25
Valor AUC-ROC : 55.00
CM : 
 [[3 3]
 [4 6]]
              precision    recall  f1-score   support

         0.0     0.4286    0.5000    0.4615         6
         1.0     0.6667    0.6000    0.6316        10

    accuracy                         0.5625        16
   macro avg     0.5476    0.5500    0.5466        16
weighted avg     0.5774    0.5625    0.5678        16

Sensibilidade: 60.00
Especificidade: 50.00

Basic KNN Accuracy: 75.00
Valor AUC-ROC : 71.43
CM : 
 [[3 4]
 [0 9]]
              precision    recall  f1-score   su

Basic KNN Accuracy: 68.75
Valor AUC-ROC : 61.67
CM : 
 [[2 4]
 [1 9]]
              precision    recall  f1-score   support

         0.0     0.6667    0.3333    0.4444         6
         1.0     0.6923    0.9000    0.7826        10

    accuracy                         0.6875        16
   macro avg     0.6795    0.6167    0.6135        16
weighted avg     0.6827    0.6875    0.6558        16

Sensibilidade: 90.00
Especificidade: 33.33

Basic KNN Accuracy: 75.00
Valor AUC-ROC : 66.67
CM : 
 [[ 2  2]
 [ 2 10]]
              precision    recall  f1-score   support

         0.0     0.5000    0.5000    0.5000         4
         1.0     0.8333    0.8333    0.8333        12

    accuracy                         0.7500        16
   macro avg     0.6667    0.6667    0.6667        16
weighted avg     0.7500    0.7500    0.7500        16

Sensibilidade: 83.33
Especificidade: 50.00

Basic KNN Accuracy: 62.50
Valor AUC-ROC : 61.90
CM : 
 [[4 3]
 [3 6]]
              precision    recall  f1-score 

### Step 2 - Results with Standardization + KNN (step 1)

In [24]:
scaler=StandardScaler()
#scaler=MinMaxScaler()
x_scaled=scaler.fit_transform(x)

In [25]:
x_scaled

array([[5.93261689e-01, 3.13673138e-01, 4.34043181e-01, ...,
        6.35872024e-01, 7.93229052e-01, 6.66719844e-04],
       [8.62519279e-02, 6.43120436e-03, 6.28812128e-01, ...,
        4.11581649e-01, 4.31169592e-01, 3.69149497e-03],
       [3.29403905e-01, 3.40954084e-01, 7.58122697e-01, ...,
        7.98177643e-01, 7.82799692e-01, 6.49985887e-04],
       ...,
       [2.20823903e-01, 6.35155018e-02, 5.85174506e-01, ...,
        5.36885532e-01, 5.33241543e-01, 5.37163427e-03],
       [1.00785154e-01, 3.97801146e-02, 6.03675925e-01, ...,
        4.62090853e-01, 5.70121989e-01, 2.52635150e-03],
       [6.83777634e-02, 5.53951417e-03, 6.31066296e-01, ...,
        0.00000000e+00, 6.96203530e-01, 1.08148060e-03]])

In [9]:
for i in range(1,31):
    X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.1, shuffle=True)
    knn = KNeighborsClassifier(n_neighbors = 1)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    aucroc = roc_auc_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)

    print("Basic KNN Accuracy: {:.2f}".format(acc*100))
    print("Valor AUC-ROC : {:.2f}".format(aucroc*100))
    print("CM : \n", cm)
    print(report)
    
    trueNegative=cm[0][0]
    falsePositive=cm[0][1]
    falsenegative=cm[1][0]
    truePositive=cm[1][1]
  
    print("Sensibilidade: {:.2f}".format((truePositive/(truePositive+falsenegative))*100))
    print("Especificidade: {:.2f}".format((trueNegative/(trueNegative+falsePositive))*100))
    print()

Basic KNN Accuracy: 50.00
Valor AUC-ROC : 50.00
CM : 
 [[3 3]
 [5 5]]
              precision    recall  f1-score   support

         0.0     0.3750    0.5000    0.4286         6
         1.0     0.6250    0.5000    0.5556        10

    accuracy                         0.5000        16
   macro avg     0.5000    0.5000    0.4921        16
weighted avg     0.5312    0.5000    0.5079        16

Sensibilidade: 50.00
Especificidade: 50.00

Basic KNN Accuracy: 62.50
Valor AUC-ROC : 62.50
CM : 
 [[3 5]
 [1 7]]
              precision    recall  f1-score   support

         0.0     0.7500    0.3750    0.5000         8
         1.0     0.5833    0.8750    0.7000         8

    accuracy                         0.6250        16
   macro avg     0.6667    0.6250    0.6000        16
weighted avg     0.6667    0.6250    0.6000        16

Sensibilidade: 87.50
Especificidade: 37.50

Basic KNN Accuracy: 56.25
Valor AUC-ROC : 55.00
CM : 
 [[3 3]
 [4 6]]
              precision    recall  f1-score   su

Basic KNN Accuracy: 75.00
Valor AUC-ROC : 65.45
CM : 
 [[ 2  3]
 [ 1 10]]
              precision    recall  f1-score   support

         0.0     0.6667    0.4000    0.5000         5
         1.0     0.7692    0.9091    0.8333        11

    accuracy                         0.7500        16
   macro avg     0.7179    0.6545    0.6667        16
weighted avg     0.7372    0.7500    0.7292        16

Sensibilidade: 90.91
Especificidade: 40.00

Basic KNN Accuracy: 81.25
Valor AUC-ROC : 80.16
CM : 
 [[5 2]
 [1 8]]
              precision    recall  f1-score   support

         0.0     0.8333    0.7143    0.7692         7
         1.0     0.8000    0.8889    0.8421         9

    accuracy                         0.8125        16
   macro avg     0.8167    0.8016    0.8057        16
weighted avg     0.8146    0.8125    0.8102        16

Sensibilidade: 88.89
Especificidade: 71.43

Basic KNN Accuracy: 56.25
Valor AUC-ROC : 46.36
CM : 
 [[1 4]
 [3 8]]
              precision    recall  f1-score 

### Step 3 - Results with Hiperparameters

In [10]:
def knn_best_params(X_train,X_test,y_train,y_test):
    k_range=list(range(1,31))
    weight_options=["uniform","distance"]
    print()
    
    param_grid=dict(n_neighbors=k_range,weights=weight_options)
    
    knn= KNeighborsClassifier()
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
    grid=GridSearchCV(knn,param_grid,cv=cv,scoring="accuracy")
    grid.fit(X_train,y_train)
    
    print("Melhores parâmetros alcançados : ")
    print("Best acc. score: {}\n Best parameters {}".format(grid.best_score_,grid.best_params_))
    print()
    
    knn = KNeighborsClassifier(**grid.best_params_)
    knn.fit(X_train,y_train)
    
    y_pred_test= knn.predict(X_test)
    y_pred_train=knn.predict(X_train)
    
    cm_test=confusion_matrix(y_test,y_pred_test)
    cm_train=confusion_matrix(y_train,y_pred_train)
    
    acc_test= accuracy_score(y_test, y_pred_test)
    acc_train= accuracy_score(y_train, y_pred_train)
    
    aucroc_test = roc_auc_score(y_test, y_pred_test)
    aucroc_train = roc_auc_score(y_train, y_pred_train)
    
    report = classification_report(y_test, y_pred_test, digits=4)
    
    #print("Valor acuracia TRAIN :: {:.2f}".format(acc_train*100))
    #print("Matriz de confusão TRAIN\n",cm_train)
    #print("Valor AUC TRAIN :: {:.2f}".format(aucroc_train*100))
    #print()
    print("Valor acuracia TESTE :: {:.2f}".format(acc_test*100))
    print("Matriz de confusão TESTE\n",cm_test)
    print("Valor AUC TESTE :: {:.2f}".format(aucroc_test*100))
    print()
    print(report)
    
    trueNegative=cm_test[0][0]
    falsePositive=cm_test[0][1]
    falsenegative=cm_test[1][0]
    truePositive=cm_test[1][1]

    print("true negative", trueNegative)
    print("False Positive", falsePositive)
    print("false Negative", falsenegative)
    print("True Positive", truePositive)
    print()
    print("Sensibilidade: {:.2f}".format((truePositive/(truePositive+falsenegative))*100))
    print("Especificidade: {:.2f}".format((trueNegative/(trueNegative+falsePositive))*100))
    '''
    #roc curve
    RL_prob = knn.predict_proba(X_test)
    probs = RL_prob[:, 1]
    rfp, rvp, lim = metrics.roc_curve(y_test, probs)
    roc_auc = metrics.auc(rfp, rvp)
    #print('AUC: %f' % roc_auc)
    
    plt.title('Receiver Operating Characteristic Curve')
    plt.plot(rfp, rvp, marker='.', label = 'AUC = %0.2f' % roc_auc, color="orange")
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc = 'lower right')
    plt.show()
    
    #precision-recall curve
    plt.title('Precision-Recall Curve')
    knn_precision, knn_recall, _ = precision_recall_curve(y_test, probs)
    ap = average_precision_score(y_test, probs)
    baseline = len(y_test[y_test==1]) / len(y_test)
    plt.plot([0, 1], [baseline, baseline], linestyle='--')
    #plt.plot([1, 0], [0, 1], color='darkblue', linestyle='--')
    plt.plot(knn_recall, knn_precision, marker='.', label = 'AP = %0.2f' % ap, color="orange")
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc = 'lower left')
    plt.show()
    ''' 
    return grid

In [11]:
resultados = []
for i in range(1,31):
    X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.1, shuffle=True)
    grid_nca = knn_best_params(X_train,X_test,y_train,y_test)
    resultados.append(grid_nca)


Melhores parâmetros alcançados : 
Best acc. score: 0.7363054187192118
 Best parameters {'n_neighbors': 5, 'weights': 'distance'}

Valor acuracia TESTE :: 75.00
Matriz de confusão TESTE
 [[3 3]
 [1 9]]
Valor AUC TESTE :: 70.00

              precision    recall  f1-score   support

         0.0     0.7500    0.5000    0.6000         6
         1.0     0.7500    0.9000    0.8182        10

    accuracy                         0.7500        16
   macro avg     0.7500    0.7000    0.7091        16
weighted avg     0.7500    0.7500    0.7364        16

true negative 3
False Positive 3
false Negative 1
True Positive 9

Sensibilidade: 90.00
Especificidade: 50.00

Melhores parâmetros alcançados : 
Best acc. score: 0.7681034482758621
 Best parameters {'n_neighbors': 6, 'weights': 'distance'}

Valor acuracia TESTE :: 62.50
Matriz de confusão TESTE
 [[1 3]
 [3 9]]
Valor AUC TESTE :: 50.00

              precision    recall  f1-score   support

         0.0     0.2500    0.2500    0.2500         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Melhores parâmetros alcançados : 
Best acc. score: 0.7642118226600985
 Best parameters {'n_neighbors': 4, 'weights': 'distance'}

Valor acuracia TESTE :: 50.00
Matriz de confusão TESTE
 [[3 5]
 [3 5]]
Valor AUC TESTE :: 50.00

              precision    recall  f1-score   support

         0.0     0.5000    0.3750    0.4286         8
         1.0     0.5000    0.6250    0.5556         8

    accuracy                         0.5000        16
   macro avg     0.5000    0.5000    0.4921        16
weighted avg     0.5000    0.5000    0.4921        16

true negative 3
False Positive 5
false Negative 3
True Positive 5

Sensibilidade: 62.50
Especificidade: 37.50

Melhores parâmetros alcançados : 
Best acc. score: 0.7467241379310344
 Best parameters {'n_neighbors': 3, 'weights': 'distance'}

Valor acuracia TESTE :: 75.00
Matriz de confusão TESTE
 [[ 2  3]
 [ 1 10]]
Valor AUC TESTE :: 65.45

              precision    recall  f1-score   support

         0.0     0.6667    0.4000    0.5000      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Melhores parâmetros alcançados : 
Best acc. score: 0.7331280788177339
 Best parameters {'n_neighbors': 19, 'weights': 'distance'}

Valor acuracia TESTE :: 68.75
Matriz de confusão TESTE
 [[ 1  4]
 [ 1 10]]
Valor AUC TESTE :: 55.45

              precision    recall  f1-score   support

         0.0     0.5000    0.2000    0.2857         5
         1.0     0.7143    0.9091    0.8000        11

    accuracy                         0.6875        16
   macro avg     0.6071    0.5545    0.5429        16
weighted avg     0.6473    0.6875    0.6393        16

true negative 1
False Positive 4
false Negative 1
True Positive 10

Sensibilidade: 90.91
Especificidade: 20.00

Melhores parâmetros alcançados : 
Best acc. score: 0.7502955665024631
 Best parameters {'n_neighbors': 9, 'weights': 'distance'}

Valor acuracia TESTE :: 56.25
Matriz de confusão TESTE
 [[1 6]
 [1 8]]
Valor AUC TESTE :: 51.59

              precision    recall  f1-score   support

         0.0     0.5000    0.1429    0.2222    

### Step 4 - Results with NCA + hiperparameters

In [26]:
#nca=NeighborhoodComponentsAnalysis(n_components=15,random_state=42)
#nca=NeighborhoodComponentsAnalysis(n_components=10,random_state=42)
nca=NeighborhoodComponentsAnalysis(n_components=8,random_state=42)
#nca=NeighborhoodComponentsAnalysis(n_components=6,random_state=42)
#nca=NeighborhoodComponentsAnalysis(n_components=4,random_state=42)
nca.fit(x_scaled,y)

x_reduced_nca = nca.transform(x_scaled)
#nca_data=pd.DataFrame(x_reduced_nca,columns=["p1","p2","p3","p4","p5","p6","p7","p8","p9","p10","p11","p12","p13","p14","p15"])
#nca_data=pd.DataFrame(x_reduced_nca,columns=["p1","p2","p3","p4","p5","p6","p7","p8","p9","p10"])
nca_data=pd.DataFrame(x_reduced_nca,columns=["f1","f2","f3","f4","f5","f6","f7","f8"])
#nca_data=pd.DataFrame(x_reduced_nca,columns=["p1","p2","p3","p4","p5","p6"])
#nca_data=pd.DataFrame(x_reduced_nca,columns=["f1","f2","f3","f4"])
nca_data["saidas"]=y

In [27]:
x_reduced_nca

array([[-1.33841567e+01,  2.31060786e+01,  2.32127968e+00,
         4.51084288e+01],
       [ 2.15168622e+00,  4.10094012e+00, -2.55200150e+01,
         2.56264315e+01],
       [-5.11547050e+00,  1.35637914e+01, -4.94956040e+01,
         5.06122033e+01],
       [ 9.40469794e+00,  1.16516545e+01, -3.20139885e+01,
         3.48276412e+01],
       [-5.93044202e+00,  1.03566171e+01, -3.61420931e+01,
         3.86660036e+01],
       [ 9.36207795e-01,  1.39949096e+01, -4.70726561e+01,
         5.15742137e+01],
       [ 1.07002134e+00,  1.72297070e+01,  2.54643640e+01,
         1.84702995e+01],
       [-4.21867478e+00,  4.42067120e+00, -2.03830537e+01,
         3.17240792e+01],
       [-1.22492947e+01, -4.40345266e-01,  7.24584451e+00,
         3.45310979e+01],
       [ 2.03887755e+00,  1.99345655e+01, -1.11618860e+01,
         4.27478363e+01],
       [-3.29094579e+00,  2.98423988e+00, -2.18339285e+01,
         3.72335035e+01],
       [ 9.18375428e+00,  2.24634755e+00, -3.36242695e+01,
      

In [28]:
resultados_NCA = []
for i in range(1,31):
    X_train_nca, X_test_nca, y_train_nca, y_test_nca = train_test_split(x_reduced_nca, y, test_size=0.1, shuffle=True)
    grid_nca = knn_best_params(X_train_nca,X_test_nca,y_train_nca,y_test_nca)
    resultados_NCA.append(grid_nca)


Melhores parâmetros alcançados : 
Best acc. score: 0.8623152709359606
 Best parameters {'n_neighbors': 1, 'weights': 'uniform'}

Valor acuracia TESTE :: 93.75
Matriz de confusão TESTE
 [[ 2  1]
 [ 0 13]]
Valor AUC TESTE :: 83.33

              precision    recall  f1-score   support

         0.0     1.0000    0.6667    0.8000         3
         1.0     0.9286    1.0000    0.9630        13

    accuracy                         0.9375        16
   macro avg     0.9643    0.8333    0.8815        16
weighted avg     0.9420    0.9375    0.9324        16

true negative 2
False Positive 1
false Negative 0
True Positive 13

Sensibilidade: 100.00
Especificidade: 66.67

Melhores parâmetros alcançados : 
Best acc. score: 0.8685221674876847
 Best parameters {'n_neighbors': 1, 'weights': 'uniform'}

Valor acuracia TESTE :: 93.75
Matriz de confusão TESTE
 [[6 0]
 [1 9]]
Valor AUC TESTE :: 95.00

              precision    recall  f1-score   support

         0.0     0.8571    1.0000    0.9231     

Melhores parâmetros alcançados : 
Best acc. score: 0.8473645320197045
 Best parameters {'n_neighbors': 1, 'weights': 'uniform'}

Valor acuracia TESTE :: 93.75
Matriz de confusão TESTE
 [[8 1]
 [0 7]]
Valor AUC TESTE :: 94.44

              precision    recall  f1-score   support

         0.0     1.0000    0.8889    0.9412         9
         1.0     0.8750    1.0000    0.9333         7

    accuracy                         0.9375        16
   macro avg     0.9375    0.9444    0.9373        16
weighted avg     0.9453    0.9375    0.9377        16

true negative 8
False Positive 1
false Negative 0
True Positive 7

Sensibilidade: 100.00
Especificidade: 88.89

Melhores parâmetros alcançados : 
Best acc. score: 0.8525862068965517
 Best parameters {'n_neighbors': 1, 'weights': 'uniform'}

Valor acuracia TESTE :: 93.75
Matriz de confusão TESTE
 [[6 1]
 [0 9]]
Valor AUC TESTE :: 92.86

              precision    recall  f1-score   support

         0.0     1.0000    0.8571    0.9231         7


Melhores parâmetros alcançados : 
Best acc. score: 0.8645320197044335
 Best parameters {'n_neighbors': 1, 'weights': 'uniform'}

Valor acuracia TESTE :: 93.75
Matriz de confusão TESTE
 [[7 1]
 [0 8]]
Valor AUC TESTE :: 93.75

              precision    recall  f1-score   support

         0.0     1.0000    0.8750    0.9333         8
         1.0     0.8889    1.0000    0.9412         8

    accuracy                         0.9375        16
   macro avg     0.9444    0.9375    0.9373        16
weighted avg     0.9444    0.9375    0.9373        16

true negative 7
False Positive 1
false Negative 0
True Positive 8

Sensibilidade: 100.00
Especificidade: 87.50

Melhores parâmetros alcançados : 
Best acc. score: 0.870665024630542
 Best parameters {'n_neighbors': 1, 'weights': 'uniform'}

Valor acuracia TESTE :: 87.50
Matriz de confusão TESTE
 [[ 3  2]
 [ 0 11]]
Valor AUC TESTE :: 80.00

              precision    recall  f1-score   support

         0.0     1.0000    0.6000    0.7500        