# Testing bootstrapped ensemble test-set filter on MiniROCKET feature-set

In [1]:
import numpy as np
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))

sys.path.insert(1, module_path + '/src')
import utility

sys.path.insert(1, module_path + '/src/models')
import bootstrapped_ensemble_lr_filter as BE_LR_filter


from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from time import time
from sktime.utils.data_io import load_from_tsfile_to_dataframe
from sktime.utils.data_processing import from_nested_to_2d_array

# Load data, and select features

In [2]:
# Locations of the pre-extracted MiniROCKET feature files (train / test split).
train_path = module_path + '/features/extracted_features_ts_files/lungsound_MiniROCKET_TRAIN.ts'
test_path = module_path + '/features/extracted_features_ts_files/lungsound_MiniROCKET_TEST.ts'

# Each .ts file is loaded as a (features, labels) pair.
X_train, y_train = load_from_tsfile_to_dataframe(train_path)
X_test, y_test = load_from_tsfile_to_dataframe(test_path)


In [3]:
from sklearn.model_selection import train_test_split

# Convert the nested sktime frames into plain 2-D DataFrames.
X_train = from_nested_to_2d_array(X_train)
X_test = from_nested_to_2d_array(X_test)

# Replace the column labels with 0..n-1 so train and test share identical labels.
X_train.columns = np.arange(len(X_train.columns))
X_test.columns = np.arange(len(X_test.columns))

# Later cells select columns by label on both frames, so the widths must agree.
assert X_train.shape[1] == X_test.shape[1], "train/test feature counts differ"

# Cast the labels to int and wrap them in Series.
y_train = pd.Series(y_train.astype(int))
y_test = pd.Series(y_test.astype(int))

# Hold out 20% of the training data as a validation set (fixed seed for reproducibility).
# NOTE: this cell overwrites X_train / y_train, so it is not idempotent; re-run the
# loading cell before re-running this one.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

### The function below adds noise to the dataset (for testing only)

In [97]:
# NOTE(review): `add_noise_dataset` is not defined or imported anywhere in the visible
# notebook; it must be in scope before this cell runs — confirm where it comes from.
# NOTE(review): noise is added to the train and test sets only; X_val is left clean —
# confirm that this is intended.
X_train, n_train = add_noise_dataset(X_train)
X_test, n_test = add_noise_dataset(X_test)

# Fit the scaler on the training data only, then apply the same transform to every split
# (including the validation split, which is compared against X_train during the
# hyperparameter search).
scaler = MinMaxScaler()
scaler.fit(X_train)

# `scaler.transform` returns a bare array, so rebuilding the frames without an index
# would reset the rows to 0..n-1 while the label Series keep the shuffled index from
# train_test_split. Re-attach each label index so features and labels stay aligned.
X_train = pd.DataFrame(scaler.transform(X_train), index=y_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), index=y_val.index)
X_test = pd.DataFrame(scaler.transform(X_test), index=y_test.index)

### Feature selection based on SelectKBest

In [4]:
# Number of features to keep.
k = 200

# Score the features with chi2 on the training split only, so that no information
# from the validation or test data influences the selection.
select = SelectKBest(chi2, k=k)
select.fit(X_train, y_train)

# `get_support(indices=True)` returns column *positions*, so select positionally.
# A distinct name is used because a later cell assigns `indices` to something else.
selected_cols = select.get_support(indices=True)

# Apply the same column subset to every split.
X_train = X_train.iloc[:, selected_cols]
X_val = X_val.iloc[:, selected_cols]
X_test = X_test.iloc[:, selected_cols]

# Testing double classifier approach with filtering based on Logistic Regression

### Hyperparameter search for double classifier

There are two hyperparameters. The first is the number of **folds = f** into which the training dataset is divided in order to build **f** ensemble classifiers. The second is the **threshold**: the number of ensemble members that must classify an example as probably incorrect before the sample is removed from the test set.

In [5]:
# Search space for the grid-searched kNN variant.
k_range = [*range(1, 8)]
weight_options = ["uniform", "distance"]
grid = {"n_neighbors": k_range, "weights": weight_options}

# Default kNN instance; registered below under 'knnOptimal' together with its grid.
clf = KNeighborsClassifier()

# Classifiers to evaluate, keyed by a short name.
clf_dict = dict(
    knn=KNeighborsClassifier(leaf_size=1, n_neighbors=1, p=1),
    knnOptimal=clf,
    nb=GaussianNB(),
)

# Parameter grids, keyed by the name of the classifier they belong to.
grid_dict = dict(knnOptimal=grid)

# Candidate values for the two filter hyperparameters (threshold and number of folds).
thresh_list = [2, 4, 6]
folds_list = [5, 7, 10]

# Search over the candidates using the validation split.
param_dict = BE_LR_filter.hyperparam_search(
    X_train, y_train, X_val, y_val, clf_dict,
    grid_dict=grid_dict,
    thresh_list=thresh_list,
    folds_list=folds_list,
)

Testing fold = 5, and threshold = 2
Testing fold = 7, and threshold = 2
Testing fold = 10, and threshold = 2
Testing fold = 5, and threshold = 4
Testing fold = 7, and threshold = 4
Testing fold = 10, and threshold = 4
Testing fold = 7, and threshold = 6
Testing fold = 10, and threshold = 6


## Testing with the optimal hyperparameters

In [6]:
# Show the selected filter hyperparameters, then the classifier dictionary, one per line.
print(param_dict, clf_dict, sep='\n')

{'knn': {'threshold': 2, 'folds': 10}, 'knnOptimal': {'threshold': 2, 'folds': 10}, 'nb': {'threshold': 2, 'folds': 10}}
{'knn': KNeighborsClassifier(leaf_size=1, n_neighbors=1, p=1), 'knnOptimal': KNeighborsClassifier(n_neighbors=6), 'nb': GaussianNB()}


In [7]:
# Evaluate every classifier on the test set using the hyperparameters found above.
# Returns the per-classifier results and a dict of per-classifier index collections
# (presumably the test samples kept by the filter — confirm in BE_LR_filter).
dict_results, indices = BE_LR_filter.compare_classifiers(
    X_train, y_train,
    X_test, y_test,
    clf_dict,
    param_dict=param_dict,
)

In [9]:
# Per-classifier scores returned by compare_classifiers, keyed 'original' and 'filtered'
# (presumably the score on the full vs. the filtered test set — confirm in BE_LR_filter).
dict_results

{'knn': {'original': 0.6502699460107978, 'filtered': 0.7568058076225045},
 'knnOptimal': {'original': 0.6796640671865627, 'filtered': 0.793400286944046},
 'nb': {'original': 0.5716856628674265, 'filtered': 0.7595628415300546}}

In [8]:
# Report, per classifier, the share of the test set that the filter removed.
# The formula assumes each value in `indices` holds the samples that were kept, so the
# removed share is 1 - kept/total — TODO confirm against BE_LR_filter.
for name, idx in indices.items():
    deleted_share = 1 - len(idx) / len(X_test)
    # Prefix the classifier name so the lines can be told apart; `.2%` renders the
    # fraction as an actual percentage.
    print(f'{name}: percentage deleted of the total amount: {deleted_share:.2%}')

Percentage deleted of the total amount: 0.6694661067786443
Percentage deleted of the total amount: 0.5818836232753449
Percentage deleted of the total amount: 0.5608878224355129
