# Note to marker
The dimensionality reduction and hyperparameter search took over 24 hours for some of the datasets, even on Google Colab.<br>
Because Colab sessions sometimes disconnect, we save intermediate progress in the ***savefiles/*** folder.<br>
We also included a demo dataset on which you can run the whole notebook for demonstration purposes. The dimensionality reduction techniques we use are meant for datasets with over 100 dimensions, but the results are surprisingly good on this 27-attribute demo dataset.  <br>

At the last moment, we found an optimization for the Combined ReliefF-Linear SVM weights technique, which drastically reduces the time needed for the reduction, making it only slightly longer than Linear SVM weights alone. We only had time to get all updated results for the micro_mass dataset (1300 features). For the indian_pines dataset (220 features), we were able to get the reduction times, but due to the high sample size we could not obtain the final hyperparameter-search results; they should not be significantly different. For the olivetti_faces dataset (4096 features), we measured the reduction times for 10 different numbers of removed features and averaged them to get an estimate. The reduction times are not affected by the number of features removed, but they vary slightly because the algorithms use randomness. At the end of the notebook we display these new timings for the 2 datasets, which we use in the report, along with the old values.<br>

We have saved all the results from running the 3 main datasets, which can be plotted at the end of the notebook. If you have a day or two to spare, you can also run the notebook on one of the 3 main datasets: change the ***dataset_name*** variable below to the name of another dataset from the ***datasets*** folder. <br>


In [None]:
# Base name (without the .csv extension) of the file loaded from the datasets/ folder.
dataset_name = "forest_types"

In [None]:
import random
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.spatial as sp #KDTree
import scipy.stats  # reciprocal distribution
import sklearn.dummy #DummyClassifier
import sklearn.ensemble #RandomForestClassifier
import sklearn.model_selection #RandomSearchCV
import sklearn.neighbors #KNeighborsClassifier
import sklearn.preprocessing #standarization
import sklearn.svm #SVC


# Definition of all needed functions
## Dimensionality reduction techniques

In [None]:

def ReliefF(X, Y, m, k):
    """Estimate ReliefF feature weights from ``m`` randomly sampled instances.

    For every sampled instance the ``k`` nearest neighbours of the same class
    (hits) and of every other class (misses) are looked up in per-class
    KD-trees.  A feature's weight decreases with its range-normalised distance
    to the hits and increases with its prior-weighted distance to the misses,
    so larger weights mark more discriminative features.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like of shape (n_samples,)
        Class labels.  Any values accepted by ``np.unique`` work; they do not
        need to be the integers 0..C-1.
    m : int
        Number of instances drawn (with replacement) through
        ``np.random.choice``; seed ``np.random`` for reproducible weights.
    k : int
        Number of neighbours looked up per class.

    Returns
    -------
    ndarray of shape (n_features,)
        One weight per feature.  Features whose value range is (almost) zero
        get weight 0.
    """
    X = np.asarray(X, dtype=float)

    # Map arbitrary labels onto 0..C-1 so they can index the per-class lists.
    classes, labels, counts = np.unique(Y, return_inverse=True, return_counts=True)
    labels = labels.reshape(-1)
    priors = counts / len(labels)

    # A tree returns indices into the subset it was built from, so the subsets
    # are kept and neighbours are always read from them, never from the full X.
    members = [X[labels == c] for c in range(len(classes))]
    trees = [sp.KDTree(rows) for rows in members]  # O(c n log n)

    # choosing random points
    choice = np.random.choice(np.arange(len(X)), m)

    # Value range of every feature; constant features are masked out so that
    # the normalisation below never divides by zero.
    span = np.max(X, axis=0) - np.min(X, axis=0)
    informative = np.abs(span) > 0.000001
    safe_span = np.where(informative, span, 1.0)

    W = np.zeros(X.shape[1])
    for sample in choice:
        x, y = X[sample], labels[sample]

        # k + 1 because the sampled instance is its own nearest hit; its
        # distance is 0, so it adds nothing to the sum.
        hits = _relieff_neighbours(trees[y], members[y], x, k + 1)
        W -= np.sum(np.abs(x - hits), axis=0) / safe_span / (m * k)

        for c in range(len(classes)):
            if c == y:
                continue
            misses = _relieff_neighbours(trees[c], members[c], x, k)
            miss_distance = np.sum(np.abs(x - misses), axis=0) / safe_span
            W += priors[c] / (1 - priors[y]) * miss_distance / (k * m)

    W[~informative] = 0
    return W


def _relieff_neighbours(tree, rows, x, k):
    """Return the (at most ``k``) rows of ``rows`` that are closest to ``x``."""
    # Asking a KD-tree for more neighbours than it holds yields out-of-range
    # indices, so the request is capped at the subset size.
    k = min(k, len(rows))
    _, idx = tree.query(x, k=k)
    return rows[np.atleast_1d(idx)]

def diff(p1,p2,maxi,mini):
    """Absolute difference between p1 and p2, scaled by the value range (maxi - mini)."""
    value_range = maxi - mini
    distance = np.absolute(p1 - p2)
    return distance / value_range

def linearSVM(X,Y):
    """Score every feature with the weights of a cross-validated linear SVM.

    A ``LinearSVC`` is tuned over ``C`` with a 3-fold randomized search and
    the coefficients of the best estimator are turned into one score per
    feature.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    Y : array-like of shape (n_samples,)

    Returns
    -------
    ndarray of shape (n_features,)
        When the fitted model has a single coefficient row (binary problem):
        that row of signed weights.  Otherwise: for every class the features
        are ranked by absolute weight (0 = smallest) and the ranks are summed
        over the classes.  In both cases a larger magnitude means a more
        important feature.
    """
    # Solve the primal problem when there are more samples than features,
    # the dual otherwise.
    solve_dual = len(X) <= X.shape[1]
    svm = sklearn.svm.LinearSVC(dual=solve_dual, max_iter=1000)
    r_search = sklearn.model_selection.RandomizedSearchCV(
        svm, param_distributions={'C': np.logspace(-3, 2, 15)}, cv=3, random_state=0)
    r_search.fit(X, Y)

    coef = r_search.best_estimator_.coef_
    if len(coef) == 1:
        return coef[0]

    # argsort alone yields "which feature sits at position p"; applying it
    # twice yields "which position does feature j hold", i.e. the rank that
    # has to be accumulated per feature.  np.abs returns a new array, so the
    # estimator's coef_ is left untouched.
    ranks = np.argsort(np.argsort(np.abs(coef), axis=1), axis=1)
    return np.sum(ranks, axis=0).astype(float)

def reduce_X(X, W, remove, feature_list):
    """
    Drop the `remove` columns of X whose weights have the smallest magnitude.
    Returns the remaining columns of X together with the matching entries of
    feature_list, both ordered by increasing absolute weight.
    """
    importance_order = np.argsort(np.abs(W))
    kept = importance_order[remove:]
    reduced = X[:, kept]
    return reduced, feature_list[kept]

def ReliefFSelect(X, Y, m, k, remove, feature_list):
    """Weigh the features with ReliefF and drop the `remove` lowest-weighted ones."""
    relief_weights = ReliefF(X, Y, m, k)
    return reduce_X(X, relief_weights, remove, feature_list)

def linearSVMWeightSelect(X,Y, remove, feature_list):
    """Weigh the features with the linear SVM and drop the `remove` lowest-weighted ones."""
    svm_weights = linearSVM(X, Y)
    return reduce_X(X, svm_weights, remove, feature_list)

def combinedReliefFLinearSVM(X, Y, m, k, part, total, feature_list):
    """
    Main model, combination of 2 models
    Multilayered feature selection

    FUNCTION OF INTEREST
    First removes `part` features using the linear SVM weights, then removes
    the remaining `total - part` features with ReliefF on the reduced data.

    X, Y:         data and labels
    m, k:         number of sampled instances and of neighbours for ReliefF
    part:         number of features removed by the linear SVM layer
    total:        total number of features removed by both layers
    feature_list: identifiers of the columns of X

    Returns the reduced X and the matching entries of feature_list.
    """
    # The SVM layer must not remove more than the overall budget: a negative
    # remainder would reach reduce_X as a negative slice start and silently
    # keep the wrong number of features.
    part = min(part, total)
    # first layer. Linear SVM Weight Feature Selection
    X, feature_list = linearSVMWeightSelect(X, Y, part, feature_list)
    # second layer, ReliefF
    return ReliefFSelect(X, Y, m, k, total - part, feature_list)

# OLD VERSION OF combinedReliefFLinearSVM before optimization, kept for reference only.
# It is wrapped in a bare string literal, so it is never defined or executed.
# It reads a module-level RFE_step that is not one of its parameters.
"""
def combinedReliefFLinearSVM(X, Y, m, k, part, total, feature_list):
    # first layer
    #RFE Linear SVM Weight Feature Selection
    i=0
    while RFE_step*i < part:
        X, feature_list = linearSVMWeightSelect(X,Y, RFE_step, feature_list)
        i += 1
    # second layer
    return ReliefFSelect(X, Y, m, k, total-i*RFE_step, feature_list)
"""



def dim_reduction(X, y, RFsamples, partial_remove, total_remove, k=6):
    """
    Applies ReliefF, Linear-SVM-Weight and Combined ReliefF-Linear-SVM-Weight on X,y
    Outputs the column indexes that are kept after dimension reduction, as well as time it took to perform the reduction.
    Returns 3x2 list, representing the results (remaining features and time to reduce) for each of the 3 dimensional reduction techniques.
    Output: [[feature_list_RF,      reduction_time_RF],
             [feature_list_SVM,     reduction_time_SVM],
             [feature_list_RFSVM,   reduction_time_RFSVM]]
    Times are whole seconds (truncated).

    RFsamples:      number of samples used to estimate ReliefF, can be safely 100, for smaller datasets, at least 1% of samples.
    partial_remove: number of features removed by SVM, for combinedSVMRF only
    total_remove:   total number of features removed
    k:              number of neighbours used by ReliefF (defaults to 6)
    """

    print("\nReducing dimensions with: RFsamples={}, partial_remove={}, total_remove={}".format(RFsamples, partial_remove, total_remove))
    # Column indexes are derived from the X that was passed in, not from any
    # notebook-level variable, so the function works on any matrix.
    n_features = X.shape[1]

    # Apply ReliefF
    timer = time.time()
    f_l_RF = np.arange(n_features)
    X_ReliefF, f_l_RF = ReliefFSelect(X, y, RFsamples, k, total_remove, f_l_RF)
    time_RF = time.time() - timer
    print("ReliefF took {:.1f} minutes".format(time_RF/60))

    # Apply Linear SVM weight
    timer = time.time()
    f_l_SVM = np.arange(n_features)
    X_SVM, f_l_SVM = linearSVMWeightSelect(X, y, total_remove, f_l_SVM)
    time_SVM = time.time() - timer
    print("LiSVM took {:.1f} minutes".format(time_SVM/60))

    # Apply combined ReliefF-Linear SVM weight
    timer = time.time()
    f_l_RFSVM = np.arange(n_features)
    X_RFSVM, f_l_RFSVM = combinedReliefFLinearSVM(X, y, RFsamples, k, partial_remove, total_remove, f_l_RFSVM)
    time_RFSVM = time.time() - timer
    print("Combined ReliefF-LiSVM took {:.1f} minutes".format(time_RFSVM/60))

    print("\nTotal time taken for removing {} dims: {:.1f} minutes".format(total_remove, (time_RF+time_SVM+time_RFSVM)/60))
    result = [[f_l_RF, int(time_RF)],
              [f_l_SVM, int(time_SVM)],
              [f_l_RFSVM, int(time_RFSVM)]]
    return result


## Data loading and training

In [None]:
def prep_data(filename):
    """
    Reads a comma-separated file (first row skipped) and splits its rows 70/30
    into train and test with a fixed random_state.
    The last column is the target (cast to 32-bit int), all others are features.
    Returns X_train, X_test, y_train, y_test.
    """
    table = np.loadtxt(filename, delimiter=',', skiprows=1)
    train_rows, test_rows = sklearn.model_selection.train_test_split(
        table, test_size=0.3, random_state=0)

    def split_xy(rows):
        return rows[:, :-1], rows[:, -1].astype('i4')

    X_train, y_train = split_xy(train_rows)
    X_test, y_test = split_xy(test_rows)
    return X_train, X_test, y_train, y_test

def dim_reduction_results(X_train, X_test, y_train, y_test, model, features_indexes):
    '''
    For each of the 3 dimension reduction techniques, train model on reduced features of X_train according to features_indexes.
    features_indexes: sequence of 3 index arrays (ReliefF, Linear-SVM-Weights, Combined), each holding
                      the columns to keep from X_train / X_test
    model:            object exposing fit(X, y) and score(X, y); it is re-fitted for every technique
    Returns a 3x2 object array [[score, train_time], ...] with the score on the reduced test data
    and the time it took to fit, in whole seconds (truncated).
    '''
    rows = []
    for technique in range(3):
        kept = features_indexes[technique]
        X_train_reduced = X_train[:, kept]
        X_test_reduced = X_test[:, kept]

        # only the fit is timed, scoring is excluded
        timer = time.time()
        model.fit(X_train_reduced, y_train)
        train_time = time.time() - timer
        score = model.score(X_test_reduced, y_test)

        # builtin int: the np.int alias no longer exists in current NumPy
        rows.append([score, int(train_time)])
    return np.array(rows, dtype=object)

## Plotting

In [None]:
def plot_features_scores(num_features, reduced_scores, unreduced_scores, title=None):
    """
    Draws one accuracy curve per reduction technique (columns of reduced_scores)
    against num_features, marks the best score of each curve with a black cross,
    adds a horizontal line for the unreduced accuracy and writes the best
    values as text inside the figure.
    """
    best_rows = np.argmax(reduced_scores, axis=0)
    max_scores = np.max(reduced_scores, axis=0)
    best_num_features = num_features[best_rows]
    most_features = max(num_features)

    fig_width = max(len(num_features)/10, 7.5)
    plt.figure(figsize=(fig_width, 5))
    #plt.figure()
    for method in range(3):
        plt.scatter(best_num_features[method], max_scores[method], c='k', marker='x', s=75, linewidth=3)

    curves = (('b', 'ReliefF'), ('r', 'LiSVM'), ('g', 'ReliefF+LiSVM'))
    for method, (colour, name) in enumerate(curves):
        plt.plot(num_features, reduced_scores[:, method], c=colour, label=name)
    plt.axhline(unreduced_scores, c='brown', label='Unreduced')

    # summary lines, stacked from top to bottom in the lower part of the axes
    text_x = 0.4 * (most_features + 1)
    summaries = ((0.20, 'Best RF score:{:.3f} at {} features', 'b'),
                 (0.15, 'Best LiSVM score:{:.3f} at {} features', 'r'),
                 (0.10, 'Best RF-LiSVM score:{:.3f} at {} features', 'g'))
    for method, (text_y, template, colour) in enumerate(summaries):
        plt.text(text_x, text_y, template.format(max_scores[method], best_num_features[method]), c=colour, ha='left')
    plt.text(text_x, 0.05, 'Unreduced score:{:.3f}'.format(unreduced_scores), c='k', ha='left')

    plt.xticks(np.arange(0, most_features + 1, 5))
    plt.yticks(np.linspace(0, 1, 11))
    #plt.ylim(0,1)
    plt.xlabel("Number of features")
    plt.ylabel("Accuracy")
    plt.title(title)
    plt.legend(loc='best')

def plot_timings(num_features, timings, labels, unreduced=None, title=None):
    """
    general plotting for timings. Used for time taken to reduce dimensions and time to train a model

    num_features: x values, one per row of timings
    timings:      2-D array whose columns 0, 1, 2 hold the ReliefF, LiSVM and combined timings
    labels:       pair (x axis label, y axis label)
    unreduced:    optional timing without reduction, drawn as a horizontal line
    title:        optional figure title
    """
    plt.figure(figsize=(max(len(num_features)/10, 7.5), 5))
    #plt.figure()
    plt.plot(num_features, timings[:, 0], c='b', label='ReliefF')
    plt.plot(num_features, timings[:, 1], c='r', label='LiSVM')
    plt.plot(num_features, timings[:, 2], c='g', label='Multi-Layers(ReliefF+LiSVM)')
    # identity test: `!= None` would be evaluated element-wise for array arguments
    if unreduced is not None:
        plt.axhline(unreduced, c='brown', label='Unreduced')

    plt.xticks(np.arange(0, max(num_features)+1, 5))
    plt.xlabel(labels[0])
    plt.ylabel(labels[1])
    plt.title(title)
    plt.legend(loc='best')

# Perform dimensional reduction

In [None]:
# read the selected dataset and split it into train/test parts
X_train, X_test, y_train, y_test = prep_data('datasets/{}.csv'.format(dataset_name))
# standardize: the scaler is fitted on the train part only and applied to both parts
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
print("Shape of train data: {}".format(X_train.shape))
print("Shape of test data: {}".format(X_test.shape))

In [None]:
#sanity check by running dummy classifiers
# The strategy is passed by keyword: current scikit-learn releases reject it
# as a positional argument.
dummy_strategies = ('stratified', 'most_frequent', 'prior', 'uniform')
dummies = [sklearn.dummy.DummyClassifier(strategy=strategy).fit(X_train, y_train)
           for strategy in dummy_strategies]

for dummy in dummies:
    print('{} dummy train score: {:.2f}'.format(dummy.strategy, dummy.score(X_train, y_train)))
    print('{} dummy test score: {:.2f}\n'.format(dummy.strategy, dummy.score(X_test, y_test)))

## Setup dimension reduction parameters
We show, but comment out the parameters we used for reducing the datasets from the paper.

In [None]:
np.random.seed(0) #seeds NumPy's global generator (ReliefF samples with np.random.choice) for reproducing results
#TO MODIFY DEPENDING ON THE DATASET

#number of samples used to estimate ReliefF, can be safely 100. For smaller datasets, at least 1% of samples.
RFsamples = 30
#number of features removed by SVM, for combinedRFSVM only
partial_remove = 5
#maximum number of features removed
# NOTE(review): the reduction loop iterates range(n_features-1, n_features-max_total_remove, -1),
# so this value determines how many feature counts are evaluated (1 .. max_total_remove-1 kept
# features) rather than an upper bound on removals - confirm the intended meaning.
max_total_remove = 5 #or X_train.shape[1]-1


# Parameter sets used for the project datasets, kept commented out for reference.
#indian_pines
#RFsamples = 80
#partial_remove = 120

#micro_mass
#RFsamples = 40
#partial_remove = 700

#olivetti_faces (optimized)
#RFsamples = 40
#partial_remove = 3500

#olivetti_faces (non-optimized as displayed in results later):
#note the extra parameter RFE_step which was only present in old version of combinedReliefFLinearSVM (commented out above)
#RFE_step = 500
#RFsamples = 40
##partial_remove = 3500

# guard against asking for as many (or more) removals as there are features
assert max_total_remove < X_train.shape[1], "max_total_remove can't be >= total number of features!"

### Reduce data

In [None]:
#dict instead of list since for olivetti_faces dataset, I had to compute for every other number of feature and needed to know for which I computed
# key: number of features kept, value: the 3x2 list returned by dim_reduction
results = {}
#get results from reducing to 1 feature, up to reducing max_total_remove features
# total_remove counts down from n_features-1 (a single feature kept); the range stops
# before n_features-max_total_remove. Entries are added one at a time, so whatever
# was computed so far is already stored in `results`.
for total_remove in range(X_train.shape[1]-1, X_train.shape[1]-max_total_remove, -1):
  results[X_train.shape[1]-total_remove] = dim_reduction(X_train, y_train, RFsamples, partial_remove, total_remove)

In [None]:
#save results. Yes saving dictionary as .npy
# The file name encodes the dataset and the two reduction parameters; the loading
# cell rebuilds the same name and reads the dict back with allow_pickle=True and [()].
np.save("savefiles/reduction_{}_{}_{}.npy".format(dataset_name, RFsamples, partial_remove), results)

# Individual testing
Use cell below to test removal of specific number of dimensions. Mainly used for testing

In [None]:
# number of features removed by each technique in this manual check
total_remove = 10
assert total_remove < X_train.shape[1] and total_remove > 0, "total remove must be larger than 0 and lower than total number of features!"
print('This is the preprocessed data:\n{}\n\n'.format(X_train))

# Test ReliefF
# f_l_* start as the full list of column indexes and are replaced by the kept ones
f_l_RF = np.arange(X_train.shape[1])
timer = time.time()
# the literal 6 is the number of neighbours k handed to ReliefF
X_ReliefF, f_l_RF = ReliefFSelect(X_train, y_train, RFsamples, 6, total_remove, f_l_RF)
print("Relieff took {:.0f} seconds".format(time.time() - timer))
print('This is the data chosen by the ReliefF:\n{}'.format(X_ReliefF))
print('These are the features selected:\n{}\n\n'.format(f_l_RF))

# Test linearSVM
f_l_SVM = np.arange(X_train.shape[1])
timer = time.time()
X_SVM, f_l_SVM = linearSVMWeightSelect(X_train,y_train, total_remove, f_l_SVM)
print("SVM took {:.0f} seconds".format(time.time() - timer))
print('This is the data chosen by the Linear SVM weights:\n{}'.format(X_SVM))
print('These are the features selected:\n{}\n\n'.format(f_l_SVM))

# Test combined ReliefF-LinearSVM
# partial_remove features go in the SVM layer, the rest of total_remove in the ReliefF layer
f_l_RFSVM = np.arange(X_train.shape[1])
timer = time.time()
X_RFSVM, f_l_RFSVM = combinedReliefFLinearSVM(X_train, y_train, RFsamples, 6, partial_remove, total_remove, f_l_RFSVM)
print("RelieffSVM took {:.0f} seconds".format(time.time() - timer))
print('This is the data chosen by the combined ReliefF-Linear SVM weights:\n{}'.format(X_RFSVM))
print('These are the features selected:\n{}'.format(f_l_RFSVM))

# Hyperparameter search
### Load results from dimension reduction

In [None]:
'''
config contains the feature indexes we keep and how long it took to reduce to them. It's setup in the following way:
key: number of features we use after dimension reduction
value: [[[indexes_RF], reduced_time_ReliefF],
        [[indexes_LiSVM], reduced_time_LiSVM],
        [[indexes_RFSVM], reduced_time_RFSVM]]
'''
# NOTE: allow_pickle=True unpickles arbitrary objects; only load savefiles you created yourself.
config = np.load("savefiles/reduction_{}_{}_{}.npy".format(dataset_name, RFsamples, partial_remove), allow_pickle=True)[()]
#load usefull data from config
num_features = np.array(list(config.keys()))
# The index arrays have a different length for every key, so the values are
# ragged; current NumPy refuses to build an array from them implicitly.
# The two (n_entries, 3) object arrays are therefore filled element by element.
reduced_idx = np.empty((len(config), 3), dtype=object)
reduced_time = np.empty((len(config), 3), dtype=object)
for row, entry in enumerate(config.values()):
    for col, (kept_indexes, seconds) in enumerate(entry):
        reduced_idx[row, col] = kept_indexes
        reduced_time[row, col] = seconds
# print one randomly chosen entry as a quick sanity check
print(random.choice(list(config.items())))

## RandomForestClassifier

In [None]:
#setup hyperparameter search
model_forest = sklearn.ensemble.RandomForestClassifier(random_state=0)
# wider search space used on the project datasets (the second commented line is cut off in this file):
#param_distribution = {'max_depth':scipy.stats.randint(1,60), 'n_estimators':scipy.stats.randint(1,200)}
#forest_rsCV = sklearn.model_selection.RandomizedSearchCV(model_fore

#the above was used on the project datasets, but it's too much for the demo dataset, so we run the following instead
param_distribution = {'max_depth':scipy.stats.randint(1,10), 'n_estimators':scipy.stats.randint(1,25)}
# 5 sampled parameter settings, fixed random_state
forest_rsCV = sklearn.model_selection.RandomizedSearchCV(model_forest, param_distributions=param_distribution, n_iter=5, random_state=0)

In [None]:
# Run the forest search once per evaluated feature count; idx holds the three index arrays for that count.
# NOTE: `results` is rebound here to the 3x2 array of the latest iteration.
reduction_results_forest = []
for features, idx in zip(num_features, reduced_idx):
  print('Training and scoring on {} features...'.format(features))
  results = dim_reduction_results(X_train, X_test, y_train, y_test, forest_rsCV, idx)
  reduction_results_forest.append(results)
# stack the per-count 3x2 results along a new first axis
reduction_results_forest = np.array(reduction_results_forest, dtype=object)

In [None]:
'''
reduction_results_forest has the following structure, and this will also apply to SVC and KNN
[[RF_score, RF_train_time],
 [SVM_score, SVM_train_time],
 [RFSVM_score, RFSVM_train_time]]
'''
#separate scoring results from train_time results
# last axis: index 0 is the score, index 1 the training time
reduced_scores_forest = reduction_results_forest[:,:,0]
reduced_train_time_forest = reduction_results_forest[:,:,1]
# print one randomly chosen entry as a quick sanity check
print(random.choice(reduction_results_forest))

In [None]:
# hyperparameter search using original unreduced data
timer = time.time()
forest_rsCV.fit(X_train,y_train)
unreduced_score_forest = forest_rsCV.score(X_test,y_test)
# elapsed whole seconds (covers fit and scoring); builtin int because the
# np.int alias no longer exists in current NumPy
unreduced_train_time_forest = int(time.time() - timer)

In [None]:
# accuracy curves for the forest model; the figure is written to the savefigs/ folder
plot_features_scores(num_features, reduced_scores_forest, unreduced_score_forest)
plt.savefig('savefigs/{}_RandomForestClassifier'.format(dataset_name))

## SVC


In [None]:
#setup hyperparameter search
model = sklearn.svm.SVC(random_state=0)
# C is sampled from a reciprocal distribution; the commented variants are the
# wider range / larger iteration count kept from earlier runs
#reciprocal_distribution = scipy.stats.reciprocal(0.01, 250)
reciprocal_distribution = scipy.stats.reciprocal(0.1, 10)
param_distribution = {'C':reciprocal_distribution}
#svc_rsCV = sklearn.model_selection.RandomizedSearchCV(model, param_distributions=param_distribution, random_state=0, n_iter=50)
svc_rsCV = sklearn.model_selection.RandomizedSearchCV(model, param_distributions=param_distribution, random_state=0, n_iter=10)

In [None]:
# Run the SVC search once per evaluated feature count; idx holds the three index arrays for that count.
reduction_results_svc = []
for features, idx in zip(num_features, reduced_idx):
  print('Training and scoring on {} features...'.format(features))
  results = dim_reduction_results(X_train, X_test, y_train, y_test, svc_rsCV, idx)
  reduction_results_svc.append(results)
# stack the per-count 3x2 results along a new first axis
reduction_results_svc = np.array(reduction_results_svc, dtype=object)

In [None]:
# last axis: index 0 is the score, index 1 the training time
reduced_scores_svc = reduction_results_svc[:,:,0]
reduced_train_time_svc = reduction_results_svc[:,:,1]
# print one randomly chosen entry as a quick sanity check
print(random.choice(reduction_results_svc))

In [None]:
# hyperparameter search using original unreduced data
timer = time.time()
svc_rsCV = svc_rsCV.fit(X_train,y_train)
unreduced_score_svc = svc_rsCV.score(X_test,y_test)
# elapsed whole seconds (covers fit and scoring); builtin int because the
# np.int alias no longer exists in current NumPy
unreduced_train_time_svc = int(time.time() - timer)

In [None]:
# accuracy curves for the SVC model; the figure is written to the savefigs/ folder
plot_features_scores(num_features, reduced_scores_svc, unreduced_score_svc)
plt.savefig('savefigs/{}_SVC'.format(dataset_name))

## KNN

In [None]:
#setup hyperparameter search
model = sklearn.neighbors.KNeighborsClassifier()
# the commented range is the wider one kept from earlier runs
#param_distribution = {"n_neighbors":scipy.stats.randint(1,200)}
param_distribution = {"n_neighbors":scipy.stats.randint(1,25)}
# despite the "gs" in its name this is a RandomizedSearchCV, like the other two searches
knn_gsCV = sklearn.model_selection.RandomizedSearchCV(model, param_distributions=param_distribution, random_state=0, n_iter=10)

In [None]:
# Run the KNN search once per evaluated feature count; idx holds the three index arrays for that count.
reduction_results_knn = []
for features, idx in zip(num_features, reduced_idx):
  print('Training and scoring on {} features...'.format(features))
  results = dim_reduction_results(X_train, X_test, y_train, y_test, knn_gsCV, idx)
  reduction_results_knn.append(results)
# stack the per-count 3x2 results along a new first axis
reduction_results_knn = np.array(reduction_results_knn, dtype=object)

In [None]:
# last axis: index 0 is the score, index 1 the training time
reduced_scores_knn = reduction_results_knn[:,:,0]
reduced_train_time_knn = reduction_results_knn[:,:,1]
# print one randomly chosen entry as a quick sanity check
print(random.choice(reduction_results_knn))

In [None]:
# hyperparameter search using original unreduced data
timer = time.time()
knn_gsCV = knn_gsCV.fit(X_train,y_train)
unreduced_score_knn = knn_gsCV.score(X_test,y_test)
# elapsed whole seconds (covers fit and scoring); builtin int because the
# np.int alias no longer exists in current NumPy
unreduced_train_time_knn = int(time.time() - timer)

In [None]:
# accuracy curves for the KNN model; the figure is written to the savefigs/ folder
plot_features_scores(num_features, reduced_scores_knn, unreduced_score_knn)
plt.savefig('savefigs/{}_KNN'.format(dataset_name))

## Save results

In [None]:
# One row per evaluated feature count. The plotting cells read this file back
# by column POSITION, so the order of the entries below must not change.
# The scalar "unreduced_*" values are repeated on every row.
df = pd.DataFrame({
    'num_features': num_features,
    'RF_indexes': reduced_idx[:,0],
    'SVM_indexes': reduced_idx[:,1],
    'RFSM_indexes': reduced_idx[:,2],
    'RF_time': reduced_time[:,0],
    'SVM_time': reduced_time[:,1],
    'RFSM_time': reduced_time[:,2],

    'unreduced_score_forest': unreduced_score_forest,
    'RF_scores_forest': reduced_scores_forest[:,0],
    'SVM_scores_forest': reduced_scores_forest[:,1],
    'RFSVM_scores_forest': reduced_scores_forest[:,2],

    'unreduced_train_time_forest': unreduced_train_time_forest,
    'RF_train_time_forest': reduced_train_time_forest[:,0],
    'SVM_train_time_forest': reduced_train_time_forest[:,1],
    'RFSVM_train_time_forest': reduced_train_time_forest[:,2],

    'unreduced_score_svc': unreduced_score_svc,
    'RF_scores_svc': reduced_scores_svc[:,0],
    'SVM_scores_svc': reduced_scores_svc[:,1],
    'RFSVM_scores_svc': reduced_scores_svc[:,2],

    'unreduced_train_time_svc': unreduced_train_time_svc,
    'RF_train_time_svc': reduced_train_time_svc[:,0],
    'SVM_train_time_svc': reduced_train_time_svc[:,1],
    'RFSVM_train_time_svc': reduced_train_time_svc[:,2],

    'unreduced_score_knn': unreduced_score_knn,
    'RF_scores_knn': reduced_scores_knn[:,0],
    'SVM_scores_knn': reduced_scores_knn[:,1],
    'RFSVM_scores_knn': reduced_scores_knn[:,2],

    'unreduced_train_time_knn': unreduced_train_time_knn,
    'RF_train_time_knn': reduced_train_time_knn[:,0],
    'SVM_train_time_knn': reduced_train_time_knn[:,1],
    'RFSVM_train_time_knn': reduced_train_time_knn[:,2],
})
# index=False is the documented way to leave the row index out of the file
df.to_csv('savefiles/final_results_{}.csv'.format(dataset_name), index=False)

# Plot results
The results for all project datasets are already saved in savefiles folder<br>
Uncomment the line for which dataset you want to plot.<br><br>
As stated at the beginning of the notebook, the Combined ReliefF-Linear SVM weight technique had a last-minute optimization, but we only had time to apply it to the micro_mass and indian_pines datasets. Thus, the olivetti_faces results show a much higher reduction time for Combined ReliefF-Linear SVM weight. We estimate the corrected reduction time at the end of the notebook.

In [None]:
# Select which saved final_results file the plotting cells below read;
# this rebinds the dataset_name chosen at the top of the notebook.
#dataset_name = 'forest_types'
#dataset_name = 'indian_pines'
#dataset_name = 'micro_mass'
dataset_name = 'olivetti_faces'

In [None]:
"""
Load the final_results from csv file to numpy, then to appropriate variables.
I saved the feature indexes we keep as strings in the csv file, so I'll just convert them back to numpy arrays
"""
def strArray_to_npArray(str):
    """Parse the printed form of a 1-D integer array back into an ndarray.

    str: text such as "[ 3 17 42]"; the first and last non-blank characters
         are taken to be the brackets, elements may be separated by
         whitespace (including line breaks) and/or commas.
    Returns a 1-D integer ndarray (empty for "[]").
    """
    # The parameter keeps its original name for backward compatibility; it
    # shadows the builtin only inside this function.
    elements = str.strip()[1:-1].replace(',', ' ').split()
    # builtin int: the np.int alias no longer exists in current NumPy
    return np.array([int(element) for element in elements], dtype=int)

results = pd.read_csv('savefiles/final_results_{}.csv'.format(dataset_name)).to_numpy()
# Columns 1-3 hold the kept feature indexes as text. Every row keeps a
# different number of features, so the parsed arrays are ragged and are stored
# back one element at a time (current NumPy refuses to build an array from
# ragged sequences implicitly).
for column in (1, 2, 3):
    for row in range(len(results)):
        results[row, column] = strArray_to_npArray(results[row, column])

# score columns of the three reduction techniques, per model
reduced_scores_forest = results[:, 8:11].copy()
reduced_scores_svc = results[:, 16:19].copy()
reduced_scores_knn = results[:, 24:27].copy()

In [None]:
# column 0: number of features, columns 4-6: reduction times of the three techniques
# NOTE(review): the title hard-codes "4096 features set" although dataset_name is selectable above - confirm.
plot_timings(results[:,0], results[:,4:7], ['Number of features', 'Time(seconds)'], title="Time to reduce relative to final number of features (4096 features set)")
#plt.savefig('savefigs/reduction_time_{}'.format(dataset_name))
# columns 7, 15 and 23 hold the unreduced score of each model (same value on every row, so row 0 is used)
plot_features_scores(results[:,0], reduced_scores_forest, results[0,7], "{} using RandomForestClassifier".format(dataset_name))
plot_features_scores(results[:,0], reduced_scores_svc, results[0,15], "{} using SVC".format(dataset_name))
plot_features_scores(results[:,0], reduced_scores_knn, results[0,23], "{} using KNN".format(dataset_name))

# Get estimate for time to reduce data with improved Combined ReliefF-LiSVM
Here we estimate the reduction times for the olivetti_faces dataset (4096 attributes) with the improved Combined ReliefF-LiSVM <br>
Because the time taken to reduce is close to constant, we averaged the reduction times for 10 different feature reductions. We can observe that the RF and LiSVM haven't changed significantly, but we got very large improvements on the RF-LiSVM method

In [None]:
# Old timings: read them from the olivetti_faces file loaded right here, not
# from whatever dataset the plotting cells above happened to load.
config_old = pd.read_csv('savefiles/final_results_olivetti_faces.csv').to_numpy()
# columns 4-6 are the reduction times of the three techniques
reduced_time = config_old[:,4:7].astype(float)
avg_reduced_time = np.average(reduced_time, axis=0)
print("Average time to reduce olivetti_faces under non-optimal Combined ReliefF-LiSVM:")
print("RF:{}\nLiSVM:{}\nRF-LiSVM:{}\n\n".format(avg_reduced_time[0], avg_reduced_time[1], avg_reduced_time[2]))
print(avg_reduced_time)

# NOTE: allow_pickle=True unpickles arbitrary objects; only load savefiles you created yourself.
config_new = np.load('savefiles/some_reduction_olivetti_faces_40_3500.npy', allow_pickle=True)[()]
# Each value is [[indexes, seconds], ...] for the three techniques; only the
# seconds are needed. Picking them out explicitly avoids building an array
# from the ragged index lists.
reduced_time = np.array([[seconds for _, seconds in entry] for entry in config_new.values()], dtype=float)
avg_reduced_time = np.average(reduced_time, axis=0)
print("Average time to reduce olivetti_faces under optimized Combined ReliefF-LiSVM:")
print("RF:{}\nLiSVM:{}\nRF-LiSVM:{}\n\n".format(avg_reduced_time[0], avg_reduced_time[1], avg_reduced_time[2]))

