In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed, so edits to local modules are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.utils.estimator_checks import check_estimator
from time import perf_counter
from statistics import stdev, mean
from pandas.api.types import is_numeric_dtype
from interpretable_knn import interpretable_knn

In [3]:
# Compare sklearn's KNeighborsClassifier with interpretable_knn using cross-validated macro F1 scores
def test_classification(X, y):
    """
    Compare sklearn's KNeighborsClassifier with interpretable_knn on one dataset.

    Each model is scored with 5-fold cross validation using macro F1, first with
    fixed hyperparameters ('default') and then with n_neighbors tuned by an inner
    GridSearchCV ('grid_search'). The average train/test scores and the standard
    deviation of the test scores are printed for each of the four runs.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Must be fully numeric, as it is passed directly to the
        estimators.
    y : pandas.Series or array-like
        Class labels.

    Raises
    ------
    ValueError
        If a nested helper is given an unknown hyperparams_setting.
    """

    # Candidate n_neighbors values tried in 'grid_search' mode.
    # NOTE(review): assumes interpretable_knn exposes n_neighbors through
    # get_params/set_params like a standard sklearn estimator -- confirm.
    param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}

    def build_estimator(base_clf, hyperparams_setting):
        # Return the estimator as-is, or wrapped in a grid search over n_neighbors.
        if hyperparams_setting == 'default':
            return base_clf
        if hyperparams_setting == 'grid_search':
            # The inner search uses fewer folds than the outer evaluation to limit run time.
            return GridSearchCV(base_clf, param_grid, cv=3, scoring='f1_macro')
        raise ValueError(f"Unknown hyperparams_setting: {hyperparams_setting!r}")

    def cross_validated_scores(clf):
        # Outer 5-fold evaluation shared by both models so the results are comparable.
        return cross_validate(clf, X, y, cv=5, scoring='f1_macro', return_train_score=True)

    def test_sklearn_knn(hyperparams_setting='default'):
        clf = build_estimator(KNeighborsClassifier(n_neighbors=5), hyperparams_setting)
        return cross_validated_scores(clf)

    def test_2d_knn(hyperparams_setting='default', visualize_2d_spaces=False):
        clf = build_estimator(interpretable_knn(), hyperparams_setting)
        scores = cross_validated_scores(clf)

        if visualize_2d_spaces:
            # Fit a single interpretable_knn on a fixed train/test split to use for
            # visualization. Only n_neighbors is set explicitly: the other constructor
            # arguments previously passed here referred to names that are not defined
            # anywhere in this notebook.
            clf_final = interpretable_knn(n_neighbors=5)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
            y_test = np.asarray(y_test)  # positional access regardless of the original index
            clf_final.fit(X_train, y_train)
            pred_y_test = clf_final.predict(X_test)

            # Compare as strings so that labels of differing types still match.
            wrong_rows = [i for i in range(len(y_test)) if str(pred_y_test[i]) != str(y_test[i])]
            print("wrong_rows: ", wrong_rows)

            # Plot only the first few test rows (fewer if the test set is smaller).
            for i in range(min(5, len(X_test))):
                clf_final.graph_2d_spaces(X_test.iloc[i], i, y_test[i])

        return scores

    def print_scores(scores, label=None):
        train_scores = scores['train_score']
        test_scores = scores['test_score']
        avg_train_score = mean(train_scores)
        avg_test_score = mean(test_scores)
        scores_std_dev = stdev(test_scores)

        if label is not None:
            # Identify which model/setting the following scores belong to.
            print("\n" + label)
        print("\nAverage f1 score on training data: ", round(avg_train_score,3))
        print("Average f1 score on test data: ", round(avg_test_score,3))
        print("Std dev of f1 scores on test data: ", round(scores_std_dev,3))

    print_scores(test_sklearn_knn(hyperparams_setting='default'),
                 label="sklearn KNeighborsClassifier (default)")
    print_scores(test_2d_knn(hyperparams_setting='default', visualize_2d_spaces=False),
                 label="interpretable_knn (default)")
    print_scores(test_sklearn_knn(hyperparams_setting='grid_search'),
                 label="sklearn KNeighborsClassifier (grid_search)")
    print_scores(test_2d_knn(hyperparams_setting='grid_search', visualize_2d_spaces=False),
                 label="interpretable_knn (grid_search)")

# Given a feature frame X and target y, one-hot encode the non-numeric columns, replace missing and
# infinite values with 0, and run the classifier comparison on the cleaned data.
def test_dataset(X, y, file_name):
    """
    Prepare a dataset for the classifiers and run the comparison on it.

    Non-numeric columns are one-hot encoded (with an extra indicator column for
    missing values), numeric columns are kept unchanged, and the original column
    order is preserved. Remaining NaN and +/-inf values are replaced with 0.0
    before the data is handed to test_classification.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature columns; may mix numeric and non-numeric dtypes.
    y : pandas.Series or array-like
        Class labels, passed through unchanged.
    file_name : str
        Name printed in the banner to identify the dataset in the output.
    """
    print("\n\n*********************************************")
    print("Calling for " + file_name)
    print("*********************************************")

    # One-hot encode any non-numeric columns. The pieces are collected and
    # concatenated once, rather than growing a DataFrame column by column.
    encoded_parts = []
    for col_name in X.columns:
        column = X[col_name]
        if is_numeric_dtype(column):
            encoded_parts.append(column)
        else:
            encoded_parts.append(
                pd.get_dummies(column, prefix=col_name, dummy_na=True, drop_first=False)
            )
    # pd.concat raises on an empty list, so fall back to an empty frame when X has no columns.
    X = pd.concat(encoded_parts, axis=1) if encoded_parts else pd.DataFrame(index=X.index)

    X = X.fillna(0.0)
    X = X.replace([np.inf, -np.inf], 0.0)
    print("shape: ", X.shape)

    test_classification(X, y)
    

In [4]:
# Test datasets from public sources
# TODO: remove any datasets that don't work well. A full overnight run is still needed.

def test_public_dataset(filename, drop_cols, target_col, problem_type):
    """
    Load a CSV from the TestData folder and run the dataset test on it.

    The columns listed in drop_cols are discarded, target_col (cast to str) is
    used as the label and every remaining column is used as a feature.
    problem_type is accepted but not used by this function.
    """
    raw_df = pd.read_csv("TestData/" + filename)
    kept_df = raw_df.drop(columns=drop_cols)
    features = kept_df.drop(columns=[target_col])
    labels = kept_df[target_col].astype(str)
    test_dataset(features, labels, filename)

def test_schooling():
    """
    Run the test on schooling.csv.

    Source: https://vincentarelbundock.github.io/Rdatasets/doc/Ecdat/Schooling.html
    The target is low_wage, a column created from wage76 (1 when wage76 is < 1000),
    so wage76 is dropped along with lwage76, which is the log of wage76.
    """
    test_public_dataset(
        filename="schooling.csv",
        drop_cols=['lwage76', 'wage76'],
        target_col="low_wage",
        problem_type="classification",
    )

def test_spam7():
    """
    Run the test on spam7.csv, predicting the yesno column.

    Source: http://vincentarelbundock.github.io/Rdatasets/doc/DAAG/spam7.html
    """
    test_public_dataset(
        filename="spam7.csv",
        drop_cols=[],
        target_col="yesno",
        problem_type="classification",
    )

def test_creditcardcsvpresent():
    """
    Run the test on creditcardcsvpresent.csv, predicting the isFradulent column.

    Source: https://www.kaggle.com/shubhamjoshi2130of/abstract-data-set-for-credit-card-fraud-detection
    """
    test_public_dataset(
        filename="creditcardcsvpresent.csv",
        drop_cols=[],
        target_col="isFradulent",  # spelling matches the column name passed in the original call
        problem_type="classification",
    )

def test_wilt():
    """
    Run the test on wilt_training.csv, predicting the class column.

    Source: https://archive.ics.uci.edu/ml/datasets/Wilt
    """
    test_public_dataset(
        filename="wilt_training.csv",
        drop_cols=[],
        target_col="class",
        problem_type="classification",
    )

def test_QSAR_biodegredation():
    """
    Run the test on biodeg.csv, predicting the "experimental class" column.

    Source: https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation#
    """
    test_public_dataset(
        filename="biodeg.csv",
        drop_cols=[],
        target_col="experimental class",
        problem_type="classification",
    )

def test_statlog():
    """
    Run the test on statlog.csv, predicting the class column.

    Source: https://archive.ics.uci.edu/ml/datasets/Statlog+%28Image+Segmentation%29
    """
    test_public_dataset(
        filename="statlog.csv",
        drop_cols=[],
        target_col="class",
        problem_type="classification",
    )

def test_segmentation():
    """
    Run the test on Segmentation.csv, predicting the class column.

    Source: https://archive.ics.uci.edu/ml/datasets/Image+Segmentation
    """
    test_public_dataset(
        filename="Segmentation.csv",
        drop_cols=[],
        target_col="class",
        problem_type="classification",
    )

def test_frogs():
    """
    Run the test on Frogs_MFCCs.csv, predicting the Family column.

    Source: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29#
    It's possible to predict Family, Genus, or Species; Family is used here.
    """
    test_public_dataset(
        filename="Frogs_MFCCs.csv",
        drop_cols=[],
        target_col="Family",
        problem_type="classification",
    )

def test_blocks():
    """
    Run the test on page_block.csv, predicting the class column.

    Source: https://archive.ics.uci.edu/ml/datasets/Page+Blocks+Classification
    """
    test_public_dataset(
        filename="page_block.csv",
        drop_cols=[],
        target_col="class",
        problem_type="classification",
    )

def test_electrical_grid():
    """
    Run the test on electrical_grid.csv, predicting stabf with the stab column dropped.

    Source: https://archive.ics.uci.edu/ml/datasets/Electrical+Grid+Stability+Simulated+Data+
    """
    test_public_dataset(
        filename="electrical_grid.csv",
        drop_cols=['stab'],
        target_col="stabf",
        problem_type="classification",
    )

def test_bank8FM():
    """
    Run the test on bank8FM.csv, predicting the binaryClass column.

    Source: https://www.openml.org/d/725
    """
    test_public_dataset(
        filename="bank8FM.csv",
        drop_cols=[],
        target_col="binaryClass",
        problem_type="classification",
    )

def test_eeg_eye_state():
    """
    Run the test on eeg-eye-state.csv, predicting the Class column.

    Source: https://www.openml.org/d/1471
    """
    test_public_dataset(
        filename="eeg-eye-state.csv",
        drop_cols=[],
        target_col="Class",
        problem_type="classification",
    )

def test_vowel():
    """
    Run the test on vowel.csv, predicting the Class column.

    Source: https://www.openml.org/d/307
    """
    test_public_dataset(
        filename="vowel.csv",
        drop_cols=[],
        target_col="Class",
        problem_type="classification",
    )

def test_vehicle():
    """
    Run the test on vehicle.csv, predicting the binaryClass column.

    Source: https://www.openml.org/d/994
    """
    test_public_dataset(
        filename="vehicle.csv",
        drop_cols=[],
        target_col="binaryClass",
        problem_type="classification",
    )

def test_space_ga():
    """
    Run the test on space_ga.csv, predicting the binaryClass column.

    Source: https://www.openml.org/d/722
    """
    test_public_dataset(
        filename="space_ga.csv",
        drop_cols=[],
        target_col="binaryClass",
        problem_type="classification",
    )

def test_pol():
    """
    Run the test on pol.csv, predicting the binaryClass column.

    Source: https://www.openml.org/d/737
    """
    test_public_dataset(
        filename="pol.csv",
        drop_cols=[],
        target_col="binaryClass",
        problem_type="classification",
    )

def test_ringnorm():
    """
    Run the test on ringnorm.csv, predicting the Class column.

    Source: https://www.openml.org/d/1496
    """
    test_public_dataset(
        filename="ringnorm.csv",
        drop_cols=[],
        target_col="Class",
        problem_type="classification",
    )

def test_churn():
    """
    Run the test on churn.csv, predicting the class column.

    Source: https://www.openml.org/d/40701
    """
    test_public_dataset(
        filename="churn.csv",
        drop_cols=[],
        target_col="class",
        problem_type="classification",
    )

def test_biomed():
    """
    Run the test on biomed.csv, predicting the class column.

    Source: https://www.openml.org/d/481
    """
    test_public_dataset(
        filename="biomed.csv",
        drop_cols=[],
        target_col="class",
        problem_type="classification",
    )

def test_kdd_el_nino_small():
    """
    Run the test on kdd_el_nino-small.csv, predicting the binaryClass column.

    Source: https://www.openml.org/d/839
    """
    test_public_dataset(
        filename="kdd_el_nino-small.csv",
        drop_cols=[],
        target_col="binaryClass",
        problem_type="classification",
    )

def test_artificial_characters():
    """
    Run the test on artificial-characters.csv, predicting the Class column.

    Source: https://www.openml.org/d/1459
    """
    test_public_dataset(
        filename="artificial-characters.csv",
        drop_cols=[],
        target_col="Class",
        problem_type="classification",
    )

def test_fri_c3_500_25():
    """
    Run the test on fri_c3_500_25.csv, predicting the binaryClass column.

    Source: https://www.openml.org/d/896
    """
    test_public_dataset(
        filename="fri_c3_500_25.csv",
        drop_cols=[],
        target_col="binaryClass",
        problem_type="classification",
    )
     
def test_public_datasets():
    """
    Run every public-dataset test in turn, grouped by where the data came from.

    For faster execution time, comment out any of the entries below that are
    not necessary.
    """
    dataset_tests = (
        # Test files from http://vincentarelbundock.github.io/Rdatasets/
        test_schooling,
        test_spam7,

        # Test files from Kaggle
        test_creditcardcsvpresent,

        # Test files from UCI
        test_wilt,
        test_QSAR_biodegredation,
        test_statlog,
        test_segmentation,
        test_frogs,
        test_blocks,
        test_electrical_grid,

        # Test files from OpenML
        test_bank8FM,
        test_eeg_eye_state,
        test_vowel,
        test_vehicle,
        test_space_ga,
        test_pol,
        test_ringnorm,
        test_churn,
        test_biomed,
        test_kdd_el_nino_small,
        test_artificial_characters,
        test_fri_c3_500_25,
    )

    # Execute the tests one after another, in the order listed above.
    for run_dataset_test in dataset_tests:
        run_dataset_test()

In [5]:
# Run the test for every public dataset; each one prints a banner, the
# encoded data shape and the cross-validated f1 scores.
test_public_datasets()



*********************************************
Calling for schooling.csv
*********************************************
shape:  (3010, 65)

Average f1 score on training data:  0.634
Average f1 score on test data:  0.263
Std dev of f1 scores on test data:  0.14

Average f1 score on training data:  0.646
Average f1 score on test data:  0.58
Std dev of f1 scores on test data:  0.047


*********************************************
Calling for spam7.csv
*********************************************
shape:  (4601, 7)

Average f1 score on training data:  0.999
Average f1 score on test data:  0.895
Std dev of f1 scores on test data:  0.148

Average f1 score on training data:  1.0
Average f1 score on test data:  0.892
Std dev of f1 scores on test data:  0.15


*********************************************
Calling for creditcardcsvpresent.csv
*********************************************
shape:  (3075, 17)

Average f1 score on training data:  0.643
Average f1 score on test data:  0.345
Std dev o


Average f1 score on training data:  0.908
Average f1 score on test data:  0.793
Std dev of f1 scores on test data:  0.038


In [6]:
# Scratch check: np.unique returns the sorted distinct values of an array
n = np.array(list('ababcda'))
u = np.unique(n)
u

array(['a', 'b', 'c', 'd'], dtype='<U1')