# Evaluation for the Core Analysis

In this notebook, we use the Euclidean Distance Metric to implement the Brute Force k-NN classifiers on case-bases created by different case-base editing algorithms.

In [1]:
#importing all the necessary packages
import import_ipynb
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set()
import dataset_loader as dl
import time
from scipy.spatial import distance
import sklearn
import time
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import timeit
from sklearn.preprocessing import StandardScaler
import Conservative_RR
from Conservative_RR import crr

importing Jupyter notebook from Conservative_RR.ipynb
importing Jupyter notebook from Dynamic_tw.ipynb


The following function takes the training and test sets and runs the Brute Force, Ball Tree and K-D Tree k-NN methods on the original dataset. It also runs Brute Force k-NN on the case-base edited by the CNN algorithm and on the case-base edited by the CRR algorithm, recording each model's speed and accuracy as it goes. The results are scaled w.r.t. the Brute Force k-NN method on the original dataset, so a value of 1 corresponds to the speed and accuracy of that baseline. The prediction is executed in a loop 100 times, and the mean prediction time is taken as the final result. This function applies only to the five smaller datasets.

In [2]:
def _mean_predict_time(classifier, X_test, y_test, n_runs):
    """Time ``classifier.predict(X_test)`` over ``n_runs`` calls.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    X_test : test features passed to ``predict``.
    y_test : true labels used to score the predictions.
    n_runs : int
        Number of timed ``predict`` calls (must be >= 1).

    Returns
    -------
    (accuracy, mean_time) : tuple
        ``accuracy`` is ``accuracy_score`` of the predictions from the last
        run; ``mean_time`` is the mean ``time.perf_counter`` delta per call.
    """
    elapsed = 0.0
    for _ in range(n_runs):
        start = time.perf_counter()
        y_pred = classifier.predict(X_test)
        end = time.perf_counter()
        elapsed += end - start
    return accuracy_score(y_test, y_pred), elapsed / n_runs


def eval_core(X_train, y_train, X_test, y_test, n_runs=100):
    """Benchmark k-NN on the original and on the CNN-/CRR-edited case-bases.

    The features are standardised with a ``StandardScaler`` fitted on the
    training set. On the original training set, Brute Force, Ball Tree and
    K-D Tree 5-NN classifiers (Euclidean metric) are fitted and evaluated.
    Brute Force 5-NN is then evaluated on the case-base condensed by
    imbalanced-learn's ``CondensedNearestNeighbour`` and on the case-base
    returned by ``crr``.

    Every prediction is repeated ``n_runs`` times and the mean time is
    reported, so that all reported timings are comparable.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    n_runs : int, default 100
        Number of timed prediction runs averaged per classifier.

    Returns
    -------
    result : pandas.DataFrame
        One row per editing algorithm ("None", "CNN", "CRR") holding the
        editing time, prediction times, accuracies and case-base size.
    result_chart : pandas.DataFrame
        Brute Force time, accuracy and case-base size of each row divided by
        the corresponding value of the unedited ("None") row.
    X_cnn.shape, X_crr.shape : tuple
        Shapes of the CNN- and CRR-edited feature matrices.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    # Kept as a local import: imbalanced-learn is only needed by this
    # evaluation and is not part of the notebook's import cell.
    from imblearn.under_sampling import CondensedNearestNeighbour

    if n_runs < 1:
        raise ValueError("n_runs must be a positive integer")

    result = pd.DataFrame(columns=['Algorithms', 'Algorithm time', 'Brute force time', 'Brute force accuracy', 'Ball tree time', 'Ball tree accuracy', 'KD tree time', 'KD tree accuracy'])
    result['Algorithms'] = ["None", "CNN", "CRR"]
    result_chart = pd.DataFrame()

    # Fit the scaler on the training data only, then apply it to both sets.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Brute Force, Ball Tree and K-D Tree k-NN on the original dataset
    for label, algorithm in (("Brute force", "brute"), ("Ball tree", "ball_tree"), ("KD tree", "kd_tree")):
        classifier = KNeighborsClassifier(n_neighbors=5, algorithm=algorithm, metric='euclidean')
        classifier.fit(X_train, y_train)
        accuracy, mean_time = _mean_predict_time(classifier, X_test, y_test, n_runs)
        result.loc[0, label + " accuracy"] = accuracy
        result.loc[0, label + " time"] = mean_time
    result.loc[0, "Algorithm time"] = 0  # no editing algorithm was run
    result.loc[0, "Dataset Size"] = X_train.shape[0]

    # Brute Force k-NN on the case-base edited by the CNN algorithm
    start = time.perf_counter()
    cnn = CondensedNearestNeighbour(random_state=0)
    X_cnn, y_cnn = cnn.fit_resample(X_train, y_train)
    end = time.perf_counter()
    result.loc[1, "Algorithm time"] = end - start
    classifier = KNeighborsClassifier(n_neighbors=5, algorithm="brute", metric='euclidean')
    classifier.fit(X_cnn, y_cnn)
    accuracy, mean_time = _mean_predict_time(classifier, X_test, y_test, n_runs)
    result.loc[1, "Brute force accuracy"] = accuracy
    result.loc[1, "Brute force time"] = mean_time
    result.loc[1, "Dataset Size"] = X_cnn.shape[0]

    # Brute Force k-NN on the case-base edited by the CRR algorithm.
    # crr returns the edited case-base and a value stored as its
    # "Algorithm time" (presumably the editing time -- confirm in Conservative_RR).
    eset, crr_time = crr(X_train, y_train)
    result.loc[2, "Algorithm time"] = crr_time
    # The edited set carries the labels in a "Class" column; every column
    # except the last one is used as a feature.
    X_crr = eset.iloc[:, :-1].values
    y_crr = eset["Class"].values
    classifier = KNeighborsClassifier(n_neighbors=5, algorithm="brute", metric='euclidean')
    classifier.fit(X_crr, y_crr)
    accuracy, mean_time = _mean_predict_time(classifier, X_test, y_test, n_runs)
    result.loc[2, "Brute force accuracy"] = accuracy
    result.loc[2, "Brute force time"] = mean_time
    result.loc[2, "Dataset Size"] = X_crr.shape[0]

    # Scaling the results w.r.t. the Brute Force k-NN results on the original dataset
    for chart_col, source_col in (('Time wrt BF', 'Brute force time'),
                                  ('Accuracy wrt BF', 'Brute force accuracy'),
                                  ('Dataset Size wrt BF', 'Dataset Size')):
        baseline = result.loc[0, source_col]
        result_chart[chart_col] = [result.loc[row, source_col] / baseline for row in range(3)]
    return result, result_chart, X_cnn.shape, X_crr.shape

The following function takes the training and test sets and runs the Brute Force, Ball Tree and K-D Tree k-NN methods on the original dataset. It also runs Brute Force k-NN on the case-base edited by the CNN algorithm and on the case-base edited by the CRR algorithm, recording each model's speed and accuracy as it goes. The results are scaled w.r.t. the Brute Force k-NN method on the original dataset, so a value of 1 corresponds to the speed and accuracy of that baseline. The prediction is executed only once for the large datasets, since running it multiple times isn't necessary. A single run gives accurate speed recordings because, for a large dataset like the Online News Popularity Dataset (39644 cases), the prediction takes 500 times longer than it does for a small dataset like the Audit Dataset (775 cases).

In [3]:
def _single_predict_time(classifier, X_test, y_test):
    """Run ``classifier.predict(X_test)`` once and time it.

    Returns
    -------
    (accuracy, elapsed) : tuple
        ``accuracy`` is ``accuracy_score`` of the predictions against
        ``y_test``; ``elapsed`` is the ``time.perf_counter`` delta of the
        single ``predict`` call.
    """
    start = time.perf_counter()
    y_pred = classifier.predict(X_test)
    end = time.perf_counter()
    return accuracy_score(y_test, y_pred), end - start


def eval_core_large(X_train, y_train, X_test, y_test):
    """Benchmark k-NN on the original and on the CNN-/CRR-edited case-bases.

    Large-dataset variant: every prediction is timed from a single run
    instead of being averaged over repeated runs.

    The features are standardised with a ``StandardScaler`` fitted on the
    training set. On the original training set, Brute Force, Ball Tree and
    K-D Tree 5-NN classifiers (Euclidean metric) are fitted and evaluated.
    Brute Force 5-NN is then evaluated on the case-base condensed by
    imbalanced-learn's ``CondensedNearestNeighbour`` and on the case-base
    returned by ``crr``.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.

    Returns
    -------
    result : pandas.DataFrame
        One row per editing algorithm ("None", "CNN", "CRR") holding the
        editing time, prediction times, accuracies and case-base size.
    result_chart : pandas.DataFrame
        Brute Force time, accuracy and case-base size of each row divided by
        the corresponding value of the unedited ("None") row.
    X_cnn.shape, X_crr.shape : tuple
        Shapes of the CNN- and CRR-edited feature matrices.
    """
    # Kept as a local import: imbalanced-learn is only needed by this
    # evaluation and is not part of the notebook's import cell.
    from imblearn.under_sampling import CondensedNearestNeighbour

    result = pd.DataFrame(columns=['Algorithms', 'Algorithm time', 'Brute force time', 'Brute force accuracy', 'Ball tree time', 'Ball tree accuracy', 'KD tree time', 'KD tree accuracy'])
    result['Algorithms'] = ["None", "CNN", "CRR"]
    result_chart = pd.DataFrame()

    # Fit the scaler on the training data only, then apply it to both sets.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Brute Force, Ball Tree and K-D Tree k-NN on the original dataset
    for label, algorithm in (("Brute force", "brute"), ("Ball tree", "ball_tree"), ("KD tree", "kd_tree")):
        classifier = KNeighborsClassifier(n_neighbors=5, algorithm=algorithm, metric='euclidean')
        classifier.fit(X_train, y_train)
        accuracy, elapsed = _single_predict_time(classifier, X_test, y_test)
        result.loc[0, label + " accuracy"] = accuracy
        result.loc[0, label + " time"] = elapsed
    result.loc[0, "Algorithm time"] = 0  # no editing algorithm was run
    result.loc[0, "Dataset Size"] = X_train.shape[0]

    # Brute Force k-NN on the case-base edited by the CNN algorithm
    start = time.perf_counter()
    cnn = CondensedNearestNeighbour(random_state=0)
    X_cnn, y_cnn = cnn.fit_resample(X_train, y_train)
    end = time.perf_counter()
    result.loc[1, "Algorithm time"] = end - start
    classifier = KNeighborsClassifier(n_neighbors=5, algorithm="brute", metric='euclidean')
    classifier.fit(X_cnn, y_cnn)
    accuracy, elapsed = _single_predict_time(classifier, X_test, y_test)
    result.loc[1, "Brute force accuracy"] = accuracy
    result.loc[1, "Brute force time"] = elapsed
    result.loc[1, "Dataset Size"] = X_cnn.shape[0]

    # Brute Force k-NN on the case-base edited by the CRR algorithm.
    # crr returns the edited case-base and a value stored as its
    # "Algorithm time" (presumably the editing time -- confirm in Conservative_RR).
    eset, crr_time = crr(X_train, y_train)
    result.loc[2, "Algorithm time"] = crr_time
    # The edited set carries the labels in a "Class" column; every column
    # except the last one is used as a feature.
    X_crr = eset.iloc[:, :-1].values
    y_crr = eset["Class"].values
    classifier = KNeighborsClassifier(n_neighbors=5, algorithm="brute", metric='euclidean')
    classifier.fit(X_crr, y_crr)
    accuracy, elapsed = _single_predict_time(classifier, X_test, y_test)
    result.loc[2, "Brute force accuracy"] = accuracy
    result.loc[2, "Brute force time"] = elapsed
    result.loc[2, "Dataset Size"] = X_crr.shape[0]

    # Scaling the results w.r.t. the Brute Force k-NN results on the original dataset
    for chart_col, source_col in (('Time wrt BF', 'Brute force time'),
                                  ('Accuracy wrt BF', 'Brute force accuracy'),
                                  ('Dataset Size wrt BF', 'Dataset Size')):
        baseline = result.loc[0, source_col]
        result_chart[chart_col] = [result.loc[row, source_col] / baseline for row in range(3)]
    return result, result_chart, X_cnn.shape, X_crr.shape