# Ensemble Feature Selection (EFS) Framework

In [None]:
import os

import pandas as pd
import numpy as np
import math

from scipy.signal import savgol_filter
from itertools import combinations

# mrmr
from mrmr import mrmr_classif
# all reliefF
import skrebate as skr
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif

import shap
shap.initjs()

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from boruta import BorutaPy

import matplotlib.pyplot as plt
from cycler import cycler
import matplotlib.transforms

import time
from joblib import dump


In [None]:
# Change to data directory
# Remember the directory the notebook was launched from; a later cell changes
# back to it before switching to the save location.

work_path = os.getcwd()
print(work_path)

# Placeholder - replace with the folder that contains the input CSV file.
os.chdir('YOUR DATA DIRECTORY')
# Keep the resolved path; the data-loading cell changes into it again.
data_path = os.getcwd()
print(data_path)
# The process is already in data_path at this point, so this call changes nothing.
os.chdir(data_path)

In [None]:
# Load data
# Always read from the data directory, whichever cell ran last.
os.chdir(data_path)

# Full dataset - df (spectral columns plus the class label 'target_y')
df = pd.read_csv('YOUR DATA FILE.csv')

# Per-class mean and standard deviation of every column
_raw_by_class = df.groupby(['target_y'])
avg_df_by_y = _raw_by_class.mean()
stdev_df_by_y = _raw_by_class.std()

# Wavelength axis: every column header except the label, converted to float
col_wavelengths = df.columns.drop('target_y').astype(np.float64)
print('Number of wavelengths: ', len(col_wavelengths))

# Legend labels: the distinct class labels, sorted in place
tissue_types = df['target_y'].unique()
tissue_types.sort()

In [None]:
# SG smoothing of the raw data - gentle smoothing with w/p = 2.5 to get rid of
# some baseline noise

# Label vector and feature matrix
y = df['target_y']
X_raw = df.drop(columns=['target_y'])

# Savitzky-Golay (SG) filter along axis 1 (across the columns of each row),
# zeroth derivative, i.e. pure smoothing
w = 5   # window length
p = 2   # polynomial order
X_smooth = savgol_filter(X_raw, window_length=w, polyorder=p, deriv=0, axis=1)

# Smoothed dataframe: wavelength columns (string headers) followed by the label
df_smooth = pd.concat(
    [pd.DataFrame(X_smooth, columns=col_wavelengths.astype("string")), y.rename('target_y')],
    axis=1,
)

# Per-class mean and standard deviation of the smoothed spectra
_smooth_by_class = df_smooth.groupby(['target_y'])
avg_df_by_y_smooth = _smooth_by_class.mean()
stdev_df_by_y_smooth = _smooth_by_class.std()

In [None]:
# Spectral Partition into VIS / NIR / SWIR (plus VIS and NIR combined)
#
# The *_idx variables below hold the position of the first / last wavelength
# that belongs to a band. A Python slice end is exclusive, so every slice that
# ends on a "last" index uses idx + 1; otherwise the final wavelength of the
# band would be dropped (and, for VIS, would end up in no band at all).
# The slicing assumes the wavelength columns are in ascending order.

_wavelength_values = col_wavelengths.values

#VIS (< 700 nm)
df_vis_idx = np.where(_wavelength_values < 700)[-1][-1]
df_smooth_vis = df_smooth.iloc[:, :df_vis_idx + 1]
df_smooth_vis = pd.concat([df_smooth_vis, y.rename('target_y')], axis=1)
col_wavelengths_vis = col_wavelengths[:df_vis_idx + 1]

#NIR (700 nm <= wavelength < 1000 nm)
_nir_positions = np.where((_wavelength_values >= 700) & (_wavelength_values < 1000))[-1]
df_nir_idx_start = _nir_positions[0]
df_nir_idx_end = _nir_positions[-1]
df_smooth_nir = df_smooth.iloc[:, df_nir_idx_start:df_nir_idx_end + 1]
df_smooth_nir = pd.concat([df_smooth_nir, y.rename('target_y')], axis=1)
col_wavelengths_nir = col_wavelengths[df_nir_idx_start:df_nir_idx_end + 1]

#SWIR (>= 1000 nm)
# The open-ended slice runs to the last column of df_smooth, so 'target_y'
# (appended last when df_smooth was built) is already included here.
df_swir_idx = np.where(_wavelength_values >= 1000)[-1][0]
df_smooth_swir = df_smooth.iloc[:, df_swir_idx:]
col_wavelengths_swir = col_wavelengths[df_swir_idx:]

#VIS/NIR together (< 1000 nm)
df_vis_nir_idx = np.where(_wavelength_values < 1000)[-1][-1]
df_smooth_vis_nir = df_smooth.iloc[:, :df_vis_nir_idx + 1]
df_smooth_vis_nir = pd.concat([df_smooth_vis_nir, y.rename('target_y')], axis=1)
col_wavelengths_vis_nir = col_wavelengths[:df_vis_nir_idx + 1]

In [None]:
# Change to save path
# Go back to the launch directory first, then into the save folder.
os.chdir(work_path)
# Placeholder - replace with the folder where results should be written.
os.chdir('YOUR SAVE PATH')
# Keep the resolved path; later cells change into it before entering a subfolder.
save_path = os.getcwd()
print(save_path)

#### Define functions

In [None]:
# Define a function to remove multicollinearity - can select top K least correlated features using Pearson correlation
# Return indices

def top_K_least_correlated(X,y, K_features, selection = f_classif):
    """Greedily pick up to ``K_features`` columns of ``X`` with low mutual correlation.

    The column with the highest univariate score is taken first. After that,
    each round adds the remaining column for which the largest absolute
    Pearson correlation inside the set {already selected + candidate} is
    smallest.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (rows are samples); columns are addressed by position
        through ``X.iloc``.
    y : array-like
        Class labels passed to the scoring function.
    K_features : int
        Number of columns requested. If it exceeds the number of columns in
        ``X``, all columns are returned instead of raising ``IndexError``.
        At least one index (the top-scoring column) is always returned.
    selection : callable
        Score function handed to ``SelectKBest`` (ANOVA F-value by default).

    Returns
    -------
    list
        Positional column indices of ``X`` in the order they were selected.
    """

    # Rank all columns by their univariate score, best first
    fs = SelectKBest(selection,k="all")
    fs.fit(X,y)
    anova_ind = np.argsort(-fs.scores_)

    # Seed the selection with the best-scoring column; the rest are candidates.
    # np.delete returns a new array, so the ranking itself is left untouched.
    selected_ind_update = [anova_ind[0]]
    anova_ind_update = np.delete(anova_ind, 0)

    # Stop when enough columns were picked OR when no candidate is left
    # (K_features larger than the number of columns).
    while len(selected_ind_update) < K_features and len(anova_ind_update) > 0:
        # The identity matrix only depends on the current selection size
        identity = np.identity(len(selected_ind_update) + 1)
        pearson_rank = []
        for candidate in anova_ind_update:
            corr = np.corrcoef(X.iloc[:, selected_ind_update + [candidate]], rowvar=False)
            # Subtracting the identity removes the self-correlations on the diagonal
            pearson_rank.append(np.max(np.abs(corr) - identity))
        selected_ind = np.argsort(pearson_rank)[0]
        selected_ind_update.append(anova_ind_update[selected_ind])
        anova_ind_update = np.delete(anova_ind_update, selected_ind)

    return selected_ind_update

In [None]:
# Define a function to plot ensemble results - selected wavelength
def plot_wv_ensemble(spectral_range, mrmr_wv, relieff_wv, mi_wv, union_wv, boruta_wv, final_wv, save_flag, tissue_1, tissue_2):
    """Scatter the wavelengths chosen by each filter method on a common axis.

    Parameters
    ----------
    spectral_range : sequence of float
        Wavelength axis of the dataset in use; only its first and last value
        are read, to choose the x-limits.
    mrmr_wv, relieff_wv, mi_wv, union_wv : sequence of float
        Wavelengths selected by mRMR, ReliefF, mutual information and their
        union. Each set is drawn on its own horizontal level.
    boruta_wv, final_wv : sequence of float or None
        Optional later-stage selections, drawn as vertical lines. When
        ``final_wv`` is given (together with ``boruta_wv``) it takes
        precedence over ``boruta_wv``.
    save_flag : bool
        If true, save the figure as a PNG in the current working directory
        (unless a file with that name already exists) and close it;
        otherwise show it.
    tissue_1, tissue_2 : object
        Class labels used in the title and in the file name.
    """

    # Set custom color cycle
    custom_cycler = (cycler('color', ['#d62728', '#1f77b4', '#2ca02c', '#ff7f0e',
                                      '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']) +
                     cycler(lw=[1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5]))

    # Set the figure size
    f, ax = plt.subplots(figsize=(10, 8))

    # Tight layout
    f.tight_layout()

    # Set colors
    ax.set_prop_cycle(custom_cycler)

    # Plot - one constant y-level per selection method
    x_fill_1 = np.zeros(len(mrmr_wv))+1
    x_fill_2 = np.zeros(len(relieff_wv))+1.1
    x_fill_3 = np.zeros(len(mi_wv))+1.2
    x_fill_4 = np.zeros(len(union_wv))+1.4

    plt.scatter(mrmr_wv, x_fill_1, label='mRMR Selected')
    plt.scatter(relieff_wv, x_fill_2, label='ReliefF Selected')
    plt.scatter(mi_wv, x_fill_3, label='MI selected')
    plt.scatter(union_wv, x_fill_4, label='Union')

    # The annotation is written once per figure, after the marker lines,
    # instead of once per wavelength (which stacked identical text artists).
    if np.any(boruta_wv):
        if np.any(final_wv):
            for wv in final_wv:
                plt.axvline(x=wv, color='m', linestyle='-', alpha=0.3)
            plt.text(0, -0.13, 'Selected Wavelengths - Final: ' + str(np.array(final_wv)), horizontalalignment='left',
                     verticalalignment = 'center_baseline', transform=ax.transAxes)
        else:
            for wv in boruta_wv:
                plt.axvline(x=wv, color='c', linestyle='-', alpha=0.08)
            plt.text(0, -0.1, 'Number of Selected Wavelengths - BORUTA: ' + str(len(boruta_wv)), horizontalalignment='left',
                     verticalalignment = 'center_baseline', transform=ax.transAxes)
    else:
        plt.text(0, -0.13, 'Number of Selected Wavelengths - Union: ' + str(len(union_wv)), horizontalalignment='left',
                 verticalalignment = 'center_baseline', transform=ax.transAxes)

    # Set figure object
    ax.set_title('Selected Wavelengths ' + str(tissue_1) + ' vs. ' + str(tissue_2))
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('(A.U.)')
    # x-limits follow the band covered by spectral_range
    if spectral_range[-1] < 700:
        ax.set_xlim([350, 700])
    elif (spectral_range[0] > 700) and (spectral_range[-1] < 1000):
        ax.set_xlim([700, 1000])
    elif (spectral_range[0] > 350) and (spectral_range[-1] < 1000):
        ax.set_xlim([350, 1000])
    elif (spectral_range[0] > 1000) and (spectral_range[-1] < 1850):
        ax.set_xlim([1000, 1850])
    else:
        ax.set_xlim([350, 1850])
    ax.set_ylim([0.9, 1.5])
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    # Fixed crop box (in inches) that leaves room for the legend and the annotation
    bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])

    # Save figure - condition
    if save_flag:
        # The two np.any(...) results are part of the name, so each stage gets its own file
        file_name = (str(tissue_1) + '_' + str(tissue_2) + '_' + str(np.any(boruta_wv)) + str(np.any(final_wv)) +
                     '_ensemble_fs.png')
        # Never overwrite an existing figure
        if not os.path.exists(file_name):
            f.savefig(os.getcwd() + '/' + file_name, dpi = 1080, bbox_inches =bbox)
        plt.close()
    else:
        plt.show()

In [None]:
# Define a function to plot ensemble results - selected wavelength
def plot_wv_ensemble_spectral(df_plot, union_wv, boruta_wv, final_wv, tissue_1, tissue_2):
    """Overlay the selected wavelengths on the mean spectra of two classes.

    Draws on the module-level ``ax`` (the caller creates the figure first) and
    uses the module-level ``col_wavelengths`` as x-axis, so ``df_plot`` must
    have one feature column per entry of ``col_wavelengths``.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Spectra with a 'target_y' label column.
    union_wv, boruta_wv, final_wv : sequence of float or None
        Wavelength selections drawn as vertical lines. ``final_wv`` is drawn
        when both it and ``boruta_wv`` are given, ``boruta_wv`` when only it
        is given, otherwise ``union_wv``.
    tissue_1, tissue_2 : object
        The two class labels to plot.
    """

    # Set custom color cycle
    custom_cycler = (cycler('color', ['#d62728', '#1f77b4', '#2ca02c', '#ff7f0e',
                                      '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']) +
                     cycler(lw=[1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5]))

    # Set colors
    ax.set_prop_cycle(custom_cycler)

    # Average spectra for each tissue type
    df_subset = df_plot[df_plot['target_y'].isin([tissue_1, tissue_2])]
    avg_df_select_tissues = df_subset.groupby(['target_y']).mean()
    stdev_df_select_tissues = df_subset.groupby(['target_y']).std()

    # groupby returns the classes in sorted label order, which need not be the
    # (tissue_1, tissue_2) argument order - take each legend label from the
    # group index so a curve is never labelled with the other class's name.
    for tissue, tissue_label in enumerate(avg_df_select_tissues.index):
        mean_spectrum = avg_df_select_tissues.iloc[tissue,:]
        std_spectrum = stdev_df_select_tissues.iloc[tissue,:]
        plt.plot(col_wavelengths, mean_spectrum, label = tissue_label)
        # Shaded band of +/- one standard deviation around the mean
        ax.fill_between(col_wavelengths, mean_spectrum - std_spectrum, mean_spectrum + std_spectrum, alpha = 0.08)

    # The annotation is written once per figure, after the marker lines,
    # instead of once per wavelength (which stacked identical text artists).
    if np.any(boruta_wv):
        if np.any(final_wv):
            for wv in final_wv:
                plt.axvline(x=wv, color='m', linestyle='-', alpha=0.3)
            plt.text(0, -0.13, 'Selected Wavelengths - Final: ' + str(np.array(final_wv)), horizontalalignment='left',
                     verticalalignment = 'center_baseline', transform=ax.transAxes)
        else:
            for wv in boruta_wv:
                plt.axvline(x=wv, color='c', linestyle='-', alpha=0.08)
            plt.text(0, -0.1, 'Number of Selected Wavelengths - BORUTA: ' + str(len(boruta_wv)), horizontalalignment='left',
                     verticalalignment = 'center_baseline', transform=ax.transAxes)
    else:
        for wv in union_wv:
            plt.axvline(x=wv, color='g', linestyle='-', alpha=0.05)
        # Written even for an empty union, as plot_wv_ensemble does
        plt.text(0, -0.13, 'Number of Selected Wavelengths - Union: ' + str(len(union_wv)), horizontalalignment='left',
                 verticalalignment = 'center_baseline', transform=ax.transAxes)

    # Set figure object
    ax.set_title('Selected Wavelengths ' + str(tissue_1) + ' vs. ' + str(tissue_2))
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('(A.U.)')
    ax.set_xlim([350,1850])
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))


In [None]:
def plot_wv_ensemble_spectral_ovr(df_plot, spectral_range, union_wv, boruta_wv, final_wv, pos_class, neg_class):
    """Overlay the selected wavelengths on the mean spectra for a one-vs-rest task.

    Draws on the module-level ``ax`` (the caller creates the figure first).
    One mean curve with a +/- one standard deviation band is drawn per class
    found in ``df_plot``; the curve of ``pos_class`` is red, all others grey.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Spectra with a 'target_y' label column and one feature column per
        entry of ``spectral_range``.
    spectral_range : sequence of float
        Wavelength axis used as x-values and to choose the x-limits.
    union_wv, boruta_wv, final_wv : sequence of float or None
        Wavelength selections drawn as vertical lines. ``final_wv`` is drawn
        when both it and ``boruta_wv`` are given, ``boruta_wv`` when only it
        is given, otherwise ``union_wv``.
    pos_class : object
        Label of the positive class (highlighted curve, used in the title).
    neg_class : object
        Name shown for the negative side in the title.
    """

    # Average spectra for each tissue type
    avg_df_select_tissues = df_plot.groupby(['target_y']).mean()
    stdev_df_select_tissues = df_plot.groupby(['target_y']).std()

    # Labels and colours are derived from the groups that are really present
    # (sorted label order), not from notebook globals or a hard-coded position,
    # so any positive class and any number of classes is handled.
    select_tissue_label = list(avg_df_select_tissues.index)
    highlight_color = '#d62728'
    background_color = '#808080'
    curve_colors = [highlight_color if label == pos_class else background_color
                    for label in select_tissue_label]
    custom_cycler = (cycler('color', curve_colors) + cycler(lw=[1.5] * len(curve_colors)))

    # Set colors
    ax.set_prop_cycle(custom_cycler)
    for tissue, tissue_label in enumerate(select_tissue_label):
        mean_spectrum = avg_df_select_tissues.iloc[tissue, :]
        std_spectrum = stdev_df_select_tissues.iloc[tissue, :]
        plt.plot(spectral_range, mean_spectrum, label=tissue_label)
        ax.fill_between(spectral_range, mean_spectrum - std_spectrum, mean_spectrum + std_spectrum, alpha=0.08)

    # The annotation is written once per figure, after the marker lines,
    # instead of once per wavelength (which stacked identical text artists).
    if np.any(boruta_wv):
        if np.any(final_wv):
            for wv in final_wv:
                plt.axvline(x=wv, color='m', linestyle='-', alpha=0.3)
            plt.text(0, -0.13, 'Selected Wavelengths - Final: ' + str(np.array(final_wv)), horizontalalignment='left',
                     verticalalignment = 'center_baseline', transform=ax.transAxes)
        else:
            for wv in boruta_wv:
                plt.axvline(x=wv, color='c', linestyle='-', alpha=0.15)
            plt.text(0, -0.1, 'Number of Selected Wavelengths - BORUTA: ' + str(len(boruta_wv)), horizontalalignment='left',
                     verticalalignment = 'center_baseline', transform=ax.transAxes)
    else:
        for wv in union_wv:
            plt.axvline(x=wv, color='g', linestyle='-', alpha=0.05)
        plt.text(0, -0.13, 'Number of Selected Wavelengths - Union: ' + str(len(union_wv)), horizontalalignment='left',
                 verticalalignment = 'center_baseline', transform=ax.transAxes)

    # Set figure object
    ax.set_title('Selected Wavelengths for ' + str(pos_class) + ' vs. ' + str(neg_class))
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('Normalized Intensity (A.U.)')
    # x-limits follow the band covered by spectral_range
    if spectral_range[-1] < 700:
        ax.set_xlim([350, 700])
    elif (spectral_range[0] > 700) and (spectral_range[-1] < 1000):
        ax.set_xlim([700, 1000])
    elif (spectral_range[0] > 350) and (spectral_range[-1] < 1000):
        ax.set_xlim([350, 1000])
    elif (spectral_range[0] > 1000) and (spectral_range[-1] < 1850):
        ax.set_xlim([1000, 1850])
    else:
        ax.set_xlim([350, 1850])
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

In [None]:
# Define a function to evaluate clf accuracy using lda - Number of Features is a variable

def clf_accuracy_eval(X, y, model, cv):
    """Cross-validate ``model`` and collect three scores per fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, indexed by position through ``.iloc``.
    y : pandas.Series
        Labels, indexed by position through ``.iloc``. The F1 score is
        computed with ``average='binary'``.
    model : estimator
        Object with ``fit`` and ``predict``. The same instance is fitted
        again on every fold, so it is left fitted on the last fold.
    cv : splitter
        Object whose ``split(X, y)`` yields (train, test) index arrays.

    Returns
    -------
    tuple of three lists
        Accuracy, balanced accuracy and F1 score, one entry per fold.
    """
    accuracy_per_fold = []
    balanced_per_fold = []
    f1_per_fold = []

    for fit_rows, holdout_rows in cv.split(X, y):
        # Fit on the training part of this fold
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        # Score the predictions on the held-out part
        truth = y.iloc[holdout_rows]
        guess = model.predict(X.iloc[holdout_rows])
        accuracy_per_fold.append(accuracy_score(truth, guess))
        balanced_per_fold.append(balanced_accuracy_score(truth, guess))
        f1_per_fold.append(f1_score(truth, guess, average='binary'))

    return accuracy_per_fold, balanced_per_fold, f1_per_fold

#### The OVO Approach

In [None]:
# Change to save path
# Start from the top-level save folder, then enter the subfolder for this run;
# the files written by the following cells use paths relative to it.
os.chdir(save_path)
# Placeholder - replace with the subfolder name.
os.chdir('YOUR SAVE PATH - SUBFOLDER')
print(os.getcwd())

In [None]:
# Ensemble Framework for the OVO (one-vs-one) approach starts here:

# Dataset and matching wavelength axis used by the following cells.
# Switch to one of the commented pairs to work on a spectral subset.
df_dataset, spectral_range = df_smooth, col_wavelengths

#df_dataset = df_smooth_vis
#spectral_range = col_wavelengths_vis

#df_dataset = df_smooth_vis_nir
#spectral_range = col_wavelengths_vis_nir

# Set global random state
random_state = 42

# Define cross validation: 10 folds, repeated 10 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)

# All unordered pairs of tissue types
list_tissue_comb = list(combinations(tissue_types, 2))

# Save the list of binary pairs once; an existing file is left untouched
_pair_order_file = 'order_of_tissue_comb.joblib'
if not os.path.exists(_pair_order_file):
    dump(list_tissue_comb, _pair_order_file)

In [None]:
# Run the for loop to calculate for all binary pairs

# for i,k in combinations(tissue_types,2): # can separate this to make an individual function
#
#     # Track timestamp - elapsed time
#     start_time = time.time()
#
#     count += 1

# if count == 2:
#     print('count = 2; break')
#     break

# Else for single pairs --->

# i is encoded as class 1, k as class 0
i, k = 'boneCement', 'cortBone'

# i = 'boneMarrow'
# k = 'cortBone'

# Rows of the dataset that belong to one of the two tissues
_is_pair_row = df_dataset['target_y'].isin([i, k])
binary_subset = df_dataset[_is_pair_row]

# Features
X_binary_init = binary_subset.drop(columns=['target_y'])
# Target, converted to 0 and 1 - i = 1, k = 0
y_binary_init = binary_subset['target_y'].eq(i).astype('uint8')


# Stratified 80/20 split: X_binary / y_binary are used for selection and
# training, X_val_all / y_val_all are held out
X_binary, X_val_all, y_binary, y_val_all = train_test_split(
    X_binary_init,
    y_binary_init,
    test_size=0.2,
    stratify=y_binary_init,
    random_state=random_state,
)


#### Univariate Filtering

In [None]:
# Univariate Filtering
# Three filter methods each pick K_features wavelengths; the union of their
# picks is passed on to the next stage.

# Define the number of features to be selected from each filter method.
# K is a fraction of the wavelength axis that is actually in use
# (spectral_range), so it stays correct when a spectral subset is selected.
fraction = 0.2
K_features = math.floor(len(spectral_range)*fraction)
print('The number of features to be selected: ', K_features)

# mRMR - returns the selected column names, converted here to float wavelengths
selected_wv_mrmr = mrmr_classif(X_binary, y_binary, K=K_features) #, show_progress=False
selected_wv_mrmr = np.float64(selected_wv_mrmr)

# reliefF
relief_F_model = skr.ReliefF(n_features_to_select=K_features, n_neighbors=20)
relief_F_scores = relief_F_model.fit(X_binary.values, y_binary.values)
# Find the selected wavelengths: the first K entries of the fitted feature ranking
idx_relieff = relief_F_scores.top_features_
selected_wv_relieff = spectral_range[idx_relieff[:K_features]]

# Mutual information
mi_scores = SelectKBest(mutual_info_classif, k=K_features).fit(X_binary, y_binary)
# Find the selected wavelengths
selected_wv_mi = spectral_range[mi_scores.get_support()]

# Find the union of the selected wavelengths from mRMR, reliefF, Mutual Information
all_union = list(set(selected_wv_mrmr) | set(selected_wv_relieff) | set(selected_wv_mi))
# Sorted and converted to strings so they can be used as column labels of X_binary
all_union_str = [str(x) for x in np.sort(all_union)]
print('The Selected Wavelengths from Union: ', len(all_union_str))

# Selected wavelengths
selected_X_binary = X_binary.loc[:, all_union_str].values

# Plot union of the three filter methods (figure is saved, not shown)
plot_wv_ensemble(spectral_range, selected_wv_mrmr, selected_wv_relieff, selected_wv_mi, all_union, None, None, True, i, k)

In [None]:
# Figure 1: union of the filter selections over the mean spectra of the pair.
# plot_wv_ensemble_spectral draws on the module-level `ax` created here.
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()

# Plot
plot_wv_ensemble_spectral(df_dataset, all_union, None, None, i, k)

# Save with a fixed crop box, unless the file already exists, then close
_union_png = str(i) + '_' + str(k) + '_1_UNION_fs.png'
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
if not os.path.exists(_union_png):
    f.savefig(os.getcwd() + '/' + _union_png, dpi = 1080, bbox_inches =bbox)
plt.close()

#### BORUTA - RF

In [None]:
# BORUTA - RF

# Boruta feature selection - ALL
# Runs Boruta with a random-forest estimator on the union of the
# filter-selected wavelengths (selected_X_binary).

# Track timestamp - elapsed time
start_time = time.time()

# Define boruta estimator
boruta_model = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                                      random_state=random_state, n_jobs=-1)

# Define boruta feature selector
fs_boruta = BorutaPy(estimator=boruta_model, n_estimators='auto', perc=100, max_iter=100,  verbose=1, random_state=random_state)

# Fit boruta feature selector
fs_boruta.fit(selected_X_binary, y_binary)

# End timestamp
end_time = time.time()
print('Time to run: ', end_time - start_time, ' seconds')

# Rank Boruta selected features.
# sort_values returns a new frame, so the result has to be assigned back.
# A stable sort keeps the wavelengths inside each rank in their original
# (ascending) order, so the rank-1 selection below is unchanged in content
# and order.
selected_boruta_features = pd.DataFrame({'Features': all_union_str,
                                         'Ranking': fs_boruta.ranking_})
selected_boruta_features = selected_boruta_features.sort_values(by='Ranking', kind='mergesort')

# Select TOP boruta selected features (ranking 1)
boruta_top_features = selected_boruta_features[selected_boruta_features["Ranking"] == 1]
print('The Number of Selected Wavelengths from BORUTA + RF: ', len(boruta_top_features))

# Find the selected wavelengths in STR and FLOAT
boruta_wavelengths = boruta_top_features["Features"].astype(np.float64)
boruta_wavelengths_str = [str(x) for x in boruta_top_features["Features"]]

# Convert to dataframe for BORUTA + RF
df_selected_wv_boruta = pd.concat([X_binary.loc[:, boruta_wavelengths_str], y_binary.rename('target_y')], axis=1)

In [None]:
# Figure 2: Boruta-confirmed wavelengths over the mean spectra of the pair.
# plot_wv_ensemble_spectral draws on the module-level `ax` created here.
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()

# Plot
plot_wv_ensemble_spectral(df_dataset, all_union, boruta_wavelengths, None, i, k)

# Save with a fixed crop box, unless the file already exists, then close
_boruta_png = str(i) + '_' + str(k) + '_2_BORUTARF_fs.png'
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
if not os.path.exists(_boruta_png):
    f.savefig(os.getcwd() + '/' + _boruta_png, dpi = 1080, bbox_inches =bbox)
plt.close()

In [None]:
# Top 10 Feature

# Select the top features by f_classif scores using SelectKBest.
# Boruta may confirm fewer than 10 wavelengths; asking SelectKBest for more
# features than exist is rejected (or warned about, depending on the
# scikit-learn version), so the request is capped at what is available.
n_top_features = min(10, len(boruta_wavelengths_str))
f_clf_scores = SelectKBest(f_classif, k=n_top_features).fit(X_binary.loc[:, boruta_wavelengths_str], y_binary)
# Boolean support mask applied to the Boruta wavelengths (same column order)
f_clf_wv = boruta_wavelengths[f_clf_scores.get_support()]
# To string
f_clf_wv_str = [str(x) for x in f_clf_wv]

# Save variables
dump(f_clf_wv, str(i) + '_' + str(k) + '_ensemble_Col.joblib')

# Set the figure size
f, ax = plt.subplots(figsize=(10, 8))

# Tight layout
f.tight_layout()

# Plot
plot_wv_ensemble_spectral(df_dataset, all_union, boruta_wavelengths, f_clf_wv, i, k)

# Save with a fixed crop box, unless the file already exists, then close
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
if not os.path.exists(str(i) + '_' + str(k) + '_3_Top_emsemble_fs.png'):
    f.savefig(os.getcwd() + '/' +  str(i) + '_' + str(k) + '_3_Top_emsemble_fs.png', dpi = 1080, bbox_inches =bbox)
plt.close()

In [None]:
# Select top 10 features by f_classif scores and RANK and REMOVE MULTICOLLINEARITY
# The helper returns column positions within the Boruta-selected table; they
# are mapped back to wavelength values here.
_boruta_table = X_binary.loc[:, boruta_wavelengths_str]
f_clf_wv_1_idx = top_K_least_correlated(_boruta_table, y_binary, K_features=10, selection = f_classif)
f_clf_wv_1 = boruta_wavelengths.values[f_clf_wv_1_idx]
# To string
f_clf_wv_1_str = [str(x) for x in f_clf_wv_1]

# Save variables
_pair_prefix = str(i) + '_' + str(k)
dump(f_clf_wv_1, _pair_prefix + '_ensemble_noCol.joblib')

# Figure 3b: final, decorrelated wavelengths over the mean spectra.
# plot_wv_ensemble_spectral draws on the module-level `ax` created here.
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()

# Plot
plot_wv_ensemble_spectral(df_dataset, all_union, boruta_wavelengths, f_clf_wv_1, i, k)

# Save with a fixed crop box, unless the file already exists, then close
_nocol_png = _pair_prefix + '_3_Top_emsemble_fs_1.png'
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
if not os.path.exists(_nocol_png):
    f.savefig(os.getcwd() + '/' + _nocol_png, dpi = 1080, bbox_inches =bbox)
plt.close()

#### SHAP Interaction

In [None]:
# SHAP

# Calculate SHAP values for individual contributions to prediction,
# using only the final (decorrelated) wavelengths
X_binary_shap_1 = X_binary.loc[:, f_clf_wv_1_str]

#Train model
model_shap_rf_1 = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_depth=5,
                                         random_state=random_state, n_jobs=-1)
model_shap_rf_1.fit(X_binary_shap_1, y_binary)

# SHAP values
shap_explainer_rf_1 = shap.TreeExplainer(model_shap_rf_1)
shap_values_rf_1 = shap_explainer_rf_1.shap_values(X_binary_shap_1)

# SHAP values of the positive class (label 1).
# Older shap releases return a list with one (n_samples, n_features) array per
# class; newer releases return a single (n_samples, n_features, n_classes)
# array, where a plain [1] would pick the second SAMPLE instead of the second
# class. Both layouts are handled here.
if isinstance(shap_values_rf_1, list):
    shap_values_pos = shap_values_rf_1[1]
else:
    shap_values_pos = np.asarray(shap_values_rf_1)
    if shap_values_pos.ndim == 3:
        shap_values_pos = shap_values_pos[:, :, 1]

# Sort shap features by mean absolute SHAP value
vals = np.abs(shap_values_pos).mean(0)
feature_importance = pd.DataFrame(list(zip(f_clf_wv_1_str, vals)),
                                  columns=['col_wavelengths', 'feature_importance_vals'])
feature_importance.sort_values(by=['feature_importance_vals'], ascending=False, inplace=True)
# Rank - column labels ordered from most to least important
shap_wv_rank = X_binary_shap_1.columns[np.argsort(-vals)].values
print('Shape Feature Rank: ', shap_wv_rank)
# Plot bar plot
# Set the figure size
f, ax = plt.subplots(figsize=(10, 8))
# Tight layout
f.tight_layout()
# Plot (the explainer output is passed through unchanged)
shap.summary_plot(shap_values_rf_1, X_binary_shap_1, plot_type='bar')
bbox = matplotlib.transforms.Bbox([[-0.2, -0.2], [8.5, 6]])

# Save figure, unless the file already exists
if not os.path.exists(str(i) + '_' + str(k) + '_SHAP_bar_noCol.png'):
    f.savefig(os.getcwd() + '/' + str(i) + '_' + str(k) + '_SHAP_bar_noCol.png', dpi=1080, bbox_inches=bbox)
# Plot beeswarm plot

# Set the figure size
f, ax = plt.subplots(figsize=(10, 8))
# Tight layout
f.tight_layout()
# Plot - positive class only
shap.summary_plot(shap_values_pos, X_binary_shap_1)
bbox = matplotlib.transforms.Bbox([[-0.2, -0.2], [8, 6]])

# Save figure, unless the file already exists
if not os.path.exists(str(i) + '_' + str(k) + '_SHAP_beeswarm_noCol.png'):
    f.savefig(os.getcwd() + '/' + str(i) + '_' + str(k) + '_SHAP_beeswarm_noCol.png', dpi=1080, bbox_inches=bbox)

#### LDA Classification Accuracy nFeatures

In [None]:
# Calculate clf accuracy

# Can calculate balanced and f1 scores at the same time

# Now only include classification accuracy

# The same evaluation is run twice: first on the training split, then on the
# held-out validation split. Each pass cross-validates an LDA model on the
# first 1, 2, ... wavelengths of the SHAP ranking, saves the mean and standard
# deviation of the accuracy, and plots accuracy against the wavelengths included.
for _split_name, _split_X, _split_y in (('train', X_binary, y_binary),
                                        ('val', X_val_all, y_val_all)):

    # Define cross validation
    random_state = 42
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)
    # Columns ordered from most to least important
    shap_select_X_binary = _split_X.loc[:, shap_wv_rank]

    all_lda_scores = []
    all_lda_stdev = []
    for wv in range(len(shap_wv_rank)):
        # Define LDA model
        model = LinearDiscriminantAnalysis()

        # Calculate clf accuracy cv on the top (wv + 1) wavelengths
        print('Number of Features: ', wv + 1)
        lda_cv_scores, _, _ = clf_accuracy_eval(shap_select_X_binary.iloc[:, 0:wv + 1], _split_y, model, cv)
        all_lda_scores.append(np.mean(lda_cv_scores))
        all_lda_stdev.append(np.std(lda_cv_scores))

    # Save
    _lda_prefix = str(i) + '_' + str(k)
    dump(all_lda_scores, _lda_prefix + '_LDA_accuracy_nFeatures_' + _split_name + '.joblib')
    dump(all_lda_stdev, _lda_prefix + '_LDA_stdev_nFeatures_' + _split_name + '.joblib')

    # plot Clf Accuracy vs. Number of Features

    # Set the figure size
    f, ax = plt.subplots(figsize=(8, 6))
    # Tight layout
    f.tight_layout()

    # Plot: dashed trend line plus error bars of one standard deviation
    _n_ranked = len(shap_wv_rank)
    _feature_counts = np.linspace(1, _n_ranked, _n_ranked)
    plt.plot(_feature_counts, all_lda_scores, '--', lw=2.5)
    plt.errorbar(_feature_counts, all_lda_scores, yerr=all_lda_stdev,
                 fmt='o', capsize=1.5)
    # Tick n is labelled with the wavelength added at step n; tick 0 is '0'
    xlabels = ['0'] + shap_wv_rank.tolist()
    plt.xticks(np.arange(0, _n_ranked + 1, 1), xlabels)
    # Set figure object
    ax.set_title('Classification Accuracy vs. Number of Wavelengths for ' + str(i) + ' vs. ' + str(k))
    ax.set_xlabel('Wavelengths Included (nm)')
    ax.set_ylabel('LDA Clf Accuracy')
    bbox = matplotlib.transforms.Bbox([[-0.4, -0.4], [8.5, 6.5]])

    # Save figure, unless the file already exists
    _lda_png = _lda_prefix + '_LDA_accuracy_nFeatures_' + _split_name + '.png'
    if not os.path.exists(_lda_png):
        f.savefig(os.getcwd() + '/' + _lda_png, dpi=1080,
                  bbox_inches=bbox)

#### The OVR Approach

In [None]:
# Change to save_path
# Start from the top-level save folder, then enter the subfolder for the OVR
# (one-vs-rest) run.
os.chdir(save_path)
# Placeholder - replace with the subfolder name.
os.chdir('YOUR SAVE PATH - SUBFOLDER')
print(os.getcwd())

In [None]:
# Define dataset

# Dataset and matching wavelength axis used by the following cells.
# Switch to one of the commented pairs to work on a spectral subset.
df_dataset, spectral_range = df_smooth, col_wavelengths

# df_dataset = df_smooth_vis
# spectral_range = col_wavelengths_vis

# df_dataset = df_smooth_vis_nir
# spectral_range = col_wavelengths_vis_nir

#df_dataset = df_smooth_swir
#df_dataset = df_snv_smooth

# Set global random state
random_state = 42

# Define cross validation: 10 folds, repeated 10 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)

# Bone Cement = 1 vs. [Cortical Bone, Trabecular Bone, Cartilage, Bone Marrow] = 0
# i = 'boneCement'
# k_1 = 'cortBone'
# k_2 = 'traBone'
# k_3 = 'cartilage'
# k_4 = 'boneMarrow'

# Cortical Bone = 1 vs. [Trabecular Bone, Muscle, Cartilage, Bone Marrow] = 0
i = 'cortBone'
k_1, k_2, k_3, k_4 = 'traBone', 'muscle', 'cartilage', 'boneMarrow'

In [None]:
print('First Label: ', i)
#print('Second Label: ',k)
print('Second Label: ' + k_1 + ', ' + k_2 + ', ' + k_3 + ', ' + k_4)

# Data selection for multiple tissue types
_rest_labels = [k_1, k_2, k_3, k_4]
# Rows of the positive class and of the four "rest" classes, original labels kept
df_subset_ovr = df_dataset[df_dataset['target_y'].isin([i] + _rest_labels)]
# Independent copy for relabelling: writing into a filtered frame without
# .copy() triggers pandas' SettingWithCopy warning and may not behave as
# intended. The mask is built from binary_subset itself, so it always matches
# the rows being written (no reliance on another frame sharing the same index).
binary_subset = df_subset_ovr.copy()
binary_subset.loc[binary_subset['target_y'].isin(_rest_labels), 'target_y'] = 'rest'
# Rows labelled i already carry the right label and need no rewrite
k = 'rest'

# Features
X_binary_init = binary_subset.drop(['target_y'], axis=1)
# Target
y_binary_init = binary_subset['target_y']
# Convert to 0 and 1 - i = 1, k = 0
y_binary_init = (y_binary_init == i).astype('uint8')

# Split Train Test Validation set (stratified 80/20)
X_binary, X_val_all, y_binary, y_val_all = train_test_split(X_binary_init, y_binary_init,
                                                            test_size=0.2, stratify=y_binary_init, random_state=random_state)

# Save train test sets
# dump(X_binary, str(i) + '_' + str(k) + '_X_binary.joblib')
# dump(X_val_all, str(i) + '_' + str(k) + '_X_val_all.joblib')
# dump(y_binary, str(i) + '_' + str(k) + '_y_binary.joblib')
# dump(y_val_all, str(i) + '_' + str(k) + '_y_val_all.joblib')

#### Univariate Filtering

In [None]:
# Univariate Filtering
#
# Three filter methods (mRMR, ReliefF, mutual information) each pick the same
# number of wavelengths; the union of their picks is carried forward.

# Every filter keeps this fraction of the spectral range (rounded down)
fraction = 0.2
K_features = math.floor(fraction * len(spectral_range))
print('The number of features to be selected: ', K_features)

# mRMR - the returned selection is cast to float wavelengths
selected_wv_mrmr = np.float64(mrmr_classif(X_binary, y_binary, K=K_features))  # , show_progress=False

# ReliefF - fitted on plain arrays; top_features_ is truncated to K_features
# and used to index the wavelength axis
relief_F_model = skr.ReliefF(n_neighbors=20, n_features_to_select=K_features)
relief_F_scores = relief_F_model.fit(X_binary.values, y_binary.values)
idx_relieff = relief_F_scores.top_features_
selected_wv_relieff = spectral_range[idx_relieff[:K_features]]

# Mutual information - the support mask indexes the wavelength axis
mi_selector = SelectKBest(mutual_info_classif, k=K_features)
mi_scores = mi_selector.fit(X_binary, y_binary)
selected_wv_mi = spectral_range[mi_scores.get_support()]

# Find the union of the selected wavelengths from mRMR, reliefF, Mutual Information
all_union = list(set(selected_wv_mrmr) | set(selected_wv_relieff) | set(selected_wv_mi))
# Sorted wavelengths as strings, the form used to address dataframe columns
all_union_str = list(map(str, np.sort(all_union)))
print('The Selected Wavelengths from Union: ', len(all_union_str))

# Training features restricted to the union, as a plain array
selected_X_binary = X_binary.loc[:, all_union_str].to_numpy()

# Plot union of the three filter methods
plot_wv_ensemble(spectral_range, selected_wv_mrmr, selected_wv_relieff, selected_wv_mi, all_union, None, None, True, i, k)

In [None]:
# Figure showing the union of filter-selected wavelengths
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()

# Only the union is passed; the two later selection stages are left as None
plot_wv_ensemble_spectral_ovr(df_subset_ovr, spectral_range, all_union, None, None, i, k)

# Write the PNG unless a file with that name already exists in the working directory
png_name = str(i) + '_' + str(k) + '_1_UNION_fs.png'
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
if not os.path.exists(png_name):
    f.savefig(os.getcwd() + '/' + png_name, dpi=1080, bbox_inches=bbox)
plt.close()

#### BORUTA - RF

In [None]:
# BORUTA - RF
#
# Runs Boruta with a random-forest estimator on the union of filter-selected
# wavelengths (selected_X_binary) and keeps the wavelengths whose Boruta
# ranking equals 1.

# Boruta feature selection - ALL

# Track timestamp - elapsed time
start_time = time.time()

# Define boruta estimator
boruta_model = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                                      random_state=random_state, n_jobs=-1)

# Define boruta feature selector
fs_boruta = BorutaPy(estimator=boruta_model, n_estimators='auto', perc=100, max_iter=200, verbose=0,
                     random_state=random_state)

# Fit boruta feature selector
fs_boruta.fit(selected_X_binary, y_binary)

# End timestamp
end_time = time.time()
print('Time to run: ', end_time - start_time, ' seconds')

# Rank Boruta selected features.
# sort_values returns a new frame, so the result has to be assigned for the
# ranking order to take effect. A stable sort keeps rows with equal ranking in
# their existing (ascending wavelength) order, so the rank-1 subset below is
# unaffected by the reordering.
selected_boruta_features = pd.DataFrame({'Features': all_union_str,
                                         'Ranking': fs_boruta.ranking_})
selected_boruta_features = selected_boruta_features.sort_values(by='Ranking', kind='mergesort')

# Select TOP boruta selected features (ranking 1)
boruta_top_features = selected_boruta_features[selected_boruta_features["Ranking"] == 1]
print('The Number of Selected Wavelengths from BORUTA + RF: ', len(boruta_top_features))

# Find the selected wavelengths in STR and FLOAT
boruta_wavelengths = boruta_top_features["Features"].astype(np.float64)
boruta_wavelengths_str = [str(x) for x in boruta_top_features["Features"]]

# Convert to dataframe for BORUTA + RF: selected columns plus the binary target
df_selected_wv_boruta = pd.concat([X_binary.loc[:, boruta_wavelengths_str], y_binary.rename('target_y')], axis=1)


In [None]:
# Figure showing the Boruta-selected wavelengths together with the filter union
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()

# Union and Boruta selections are passed; the final selection stage stays None
plot_wv_ensemble_spectral_ovr(df_subset_ovr, spectral_range, all_union, boruta_wavelengths, None, i, k)

# Write the PNG unless a file with that name already exists in the working directory
png_name = str(i) + '_' + str(k) + '_2_BORUTARF_fs_1.png'
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
if not os.path.exists(png_name):
    f.savefig(os.getcwd() + '/' + png_name, dpi=1080, bbox_inches=bbox)
plt.close()

#### Top 10 Features

In [None]:
# Top 10 Features
#
# Keeps the (up to) 10 Boruta-selected wavelengths with the best f_classif
# scores, without any collinearity filtering, then saves and plots them.

# SelectKBest cannot return more features than it is given, so the request is
# capped at the number of Boruta-selected wavelengths; with 10 or more
# available this is exactly the original k=10.
n_top_features = min(10, len(boruta_wavelengths_str))

# Select top features by f_classif scores using SelectKBest
f_clf_scores = SelectKBest(f_classif, k=n_top_features).fit(X_binary.loc[:, boruta_wavelengths_str], y_binary)
# The boolean support mask picks the matching entries of the float wavelengths
f_clf_wv = boruta_wavelengths[f_clf_scores.get_support()]
# To string
f_clf_wv_str = [str(x) for x in f_clf_wv]

# Save variables
dump(f_clf_wv, str(i) + '_' + str(k) + '_ensemble_Col.joblib')

# Plot these top features
# Set the figure size
f, ax = plt.subplots(figsize=(10, 8))

# Tight layout
f.tight_layout()

# Plot
plot_wv_ensemble_spectral_ovr(df_subset_ovr, spectral_range, all_union, boruta_wavelengths, f_clf_wv, i, k)

# Write the PNG unless a file with that name already exists in the working directory
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
if not os.path.exists(str(i) + '_' + str(k) + '_3_Top_emsemble_fs_Col.png'):
    f.savefig(os.getcwd() + '/' + str(i) + '_' + str(k) + '_3_Top_emsemble_fs_Col.png', dpi=1080, bbox_inches=bbox)
plt.close()

In [None]:
# Top 10 features by f_classif via top_K_least_correlated (RANK and REMOVE
# MULTICOLLINEARITY); the helper's return value is used as positional indices
# into the Boruta-selected wavelengths.
X_boruta_selected = X_binary.loc[:, boruta_wavelengths_str]
f_clf_wv_1_idx = top_K_least_correlated(X_boruta_selected, y_binary, K_features=10, selection=f_classif)
f_clf_wv_1 = boruta_wavelengths.values[f_clf_wv_1_idx]
# String form for addressing dataframe columns
f_clf_wv_1_str = list(map(str, f_clf_wv_1))

# Persist the selected wavelengths
dump(f_clf_wv_1, str(i) + '_' + str(k) + '_ensemble_noCol.joblib')

# Figure with union, Boruta and final selections
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()
plot_wv_ensemble_spectral_ovr(df_subset_ovr, spectral_range, all_union, boruta_wavelengths, f_clf_wv_1, i, k)

# Write the PNG unless a file with that name already exists in the working directory
png_name = str(i) + '_' + str(k) + '_3_Top_emsemble_fs_noCol.png'
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
if not os.path.exists(png_name):
    f.savefig(os.getcwd() + '/' + png_name, dpi=1080, bbox_inches=bbox)
plt.close()

#### SHAP Interaction

In [None]:
# SHAP
#
# SHAP analysis on the wavelengths selected with collinearity removal
# (f_clf_wv_1_str): fit a shallow random forest, compute SHAP values, rank the
# wavelengths by mean |SHAP| and save a bar plot and a beeswarm plot.

# Training features restricted to the selected wavelengths
X_binary_shap_1 = X_binary.loc[:, f_clf_wv_1_str]

# Random forest used as the model to explain
model_shap_rf_1 = RandomForestClassifier(n_estimators=100, max_depth=5, class_weight='balanced',
                                         n_jobs=-1, random_state=random_state)
model_shap_rf_1.fit(X_binary_shap_1, y_binary)

# SHAP values from the tree explainer
shap_explainer_rf_1 = shap.TreeExplainer(model_shap_rf_1)
shap_values_rf_1 = shap_explainer_rf_1.shap_values(X_binary_shap_1)

# Mean absolute SHAP value per feature, taken from element [1] of the result
# NOTE(review): [1] presumably addresses the positive class (list-per-class output) - confirm for the installed shap version
vals = np.abs(shap_values_rf_1[1]).mean(0)
feature_importance = pd.DataFrame(list(zip(f_clf_wv_1_str, vals)),
                                  columns=['col_wavelengths', 'feature_importance_vals'])
feature_importance = feature_importance.sort_values(by=['feature_importance_vals'], ascending=False)

# Column labels ordered from largest to smallest mean |SHAP|
shap_wv_rank_1 = X_binary_shap_1.columns[np.argsort(-vals)].values
print('Shape Feature Rank: ', shap_wv_rank_1)

# Bar plot
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()
shap.summary_plot(shap_values_rf_1, X_binary_shap_1, plot_type='bar')
bbox = matplotlib.transforms.Bbox([[-0.2, -0.2], [8.5, 6]])

# Save unless a file with that name already exists
png_name = str(i) + '_' + str(k) + '_SHAP_bar_noCol.png'
if not os.path.exists(png_name):
    f.savefig(os.getcwd() + '/' + png_name, dpi=1080, bbox_inches=bbox)

# Beeswarm plot
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()
shap.summary_plot(shap_values_rf_1[1], X_binary_shap_1)
bbox = matplotlib.transforms.Bbox([[-0.2, -0.2], [8, 6]])

# Save unless a file with that name already exists
png_name = str(i) + '_' + str(k) + '_SHAP_beeswarm_noCol.png'
if not os.path.exists(png_name):
    f.savefig(os.getcwd() + '/' + png_name, dpi=1080, bbox_inches=bbox)

In [None]:
# SHAP analysis on the wavelengths selected WITHOUT collinearity removal
# (f_clf_wv_str): fit a shallow random forest, compute SHAP values, rank the
# wavelengths by mean |SHAP| and save a bar plot and a beeswarm plot.

# Training features restricted to the selected wavelengths
X_binary_shap = X_binary.loc[:, f_clf_wv_str]

# Random forest used as the model to explain
model_shap_rf = RandomForestClassifier(n_estimators=100, max_depth=5, class_weight='balanced',
                                       n_jobs=-1, random_state=random_state)
model_shap_rf.fit(X_binary_shap, y_binary)

# SHAP values from the tree explainer
shap_explainer_rf = shap.TreeExplainer(model_shap_rf)
shap_values_rf = shap_explainer_rf.shap_values(X_binary_shap)

# Mean absolute SHAP value per feature, taken from element [1] of the result
# NOTE(review): [1] presumably addresses the positive class (list-per-class output) - confirm for the installed shap version
vals = np.abs(shap_values_rf[1]).mean(0)
feature_importance = pd.DataFrame(list(zip(f_clf_wv_str, vals)),
                                  columns=['col_wavelengths', 'feature_importance_vals'])
feature_importance = feature_importance.sort_values(by=['feature_importance_vals'], ascending=False)

# Column labels ordered from largest to smallest mean |SHAP|
shap_wv_rank = X_binary_shap.columns[np.argsort(-vals)].values
print('Shape Feature Rank: ', shap_wv_rank)

# Bar plot
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()
shap.summary_plot(shap_values_rf, X_binary_shap, plot_type='bar')
bbox = matplotlib.transforms.Bbox([[-0.2, -0.2], [8.5, 6]])

# Save unless a file with that name already exists
png_name = str(i) + '_' + str(k) + '_SHAP_bar_Col.png'
if not os.path.exists(png_name):
    f.savefig(os.getcwd() + '/' + png_name, dpi=1080, bbox_inches=bbox)

# Beeswarm plot
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()
shap.summary_plot(shap_values_rf[1], X_binary_shap)
bbox = matplotlib.transforms.Bbox([[-0.2, -0.2], [8, 6]])

# Save unless a file with that name already exists
png_name = str(i) + '_' + str(k) + '_SHAP_beeswarm_Col.png'
if not os.path.exists(png_name):
    f.savefig(os.getcwd() + '/' + png_name, dpi=1080, bbox_inches=bbox)

#### LDA Classification Accuracy nFeatures

In [None]:
# Calculate clf accuracy - collinearity removed
#
# For the SHAP-ranked wavelengths (shap_wv_rank_1), LDA is cross-validated on
# the first 1, 2, ..., N ranked wavelengths. This is done twice: on the
# training split ('train') and on the held-out split ('test'). For each split
# the mean / standard deviation of the balanced-accuracy and F1 scores are
# dumped to joblib files and plotted against the number of wavelengths.
#
# The train/test and balanced/F1 sections were four copies of the same code;
# they are driven by the two loops below. File names, titles and the values
# left in the module-level variables after the cell are unchanged.

shap_rank_fea = shap_wv_rank_1

# Suffix identifying this variant in every output file name
variant_tag = 'noCol'
file_prefix = str(i) + '_' + str(k)

random_state = 42
n_ranked = len(shap_rank_fea)
# x positions 1..N, one per "number of wavelengths included"
x_positions = np.linspace(1, n_ranked, n_ranked)

for split_tag, X_split, y_split in (('train', X_binary, y_binary),
                                    ('test', X_val_all, y_val_all)):
    # Define cross validation (recreated per split, as in the original flow)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)
    # Columns ordered by SHAP rank so that iloc[:, :n] is the top-n set
    shap_select_X_binary = X_split.loc[:, shap_rank_fea]

    # Initiate all balanced accuracy scores
    all_balanced_accuracy_scores = []
    all_balanced_accuracy_stdev = []
    # Initiate all f1 scores
    all_f1_scores = []
    all_f1_stdev = []

    for wv in range(n_ranked):
        # Fresh LDA model for every feature count
        model = LinearDiscriminantAnalysis()

        print('Number of Features: ', wv + 1)
        # The second and third returned values are used as the balanced
        # accuracy and F1 score collections; the first is not used here
        _, lda_balanced_accuracy_scores, lda_f1_scores = clf_accuracy_eval(
            shap_select_X_binary.iloc[:, 0:wv + 1], y_split, model, cv)
        all_balanced_accuracy_scores.append(np.mean(lda_balanced_accuracy_scores))
        all_balanced_accuracy_stdev.append(np.std(lda_balanced_accuracy_scores))
        all_f1_scores.append(np.mean(lda_f1_scores))
        all_f1_stdev.append(np.std(lda_f1_scores))

    # Save
    dump(all_balanced_accuracy_scores, file_prefix + '_LDA_balanced_' + split_tag + '_' + variant_tag + '.joblib')
    dump(all_balanced_accuracy_stdev, file_prefix + '_LDA_balanced_stdev_' + split_tag + '_' + variant_tag + '.joblib')
    dump(all_f1_scores, file_prefix + '_LDA_f1_' + split_tag + '_' + variant_tag + '.joblib')
    dump(all_f1_stdev, file_prefix + '_LDA_f1_stdev_' + split_tag + '_' + variant_tag + '.joblib')

    # One figure per metric: (file tag, title prefix, y label, means, stdevs)
    for metric_tag, metric_title, metric_ylabel, metric_means, metric_stdevs in (
            ('balanced', 'Balanced Accuracy', 'LDA Clf Balanced Accuracy',
             all_balanced_accuracy_scores, all_balanced_accuracy_stdev),
            ('f1', 'F1 Scores', 'LDA Clf F1 Score', all_f1_scores, all_f1_stdev)):
        # Set the figure size
        f, ax = plt.subplots(figsize=(8, 6))
        # Tight layout
        f.tight_layout()

        # Dashed trend line plus markers with +/- one standard deviation
        plt.plot(x_positions, metric_means, '--', lw=2.5)
        plt.errorbar(x_positions, metric_means, yerr=metric_stdevs, fmt='o', capsize=1.5)
        # Tick 0 is labelled '0'; ticks 1..N carry the wavelength added at that step
        xlabels = ['0'] + shap_rank_fea.tolist()
        plt.xticks(np.arange(0, n_ranked + 1, 1), xlabels)
        # Set figure object
        ax.set_title(metric_title + ' vs. Number of Wavelengths for ' + str(i) + ' vs. ' + str(k))
        ax.set_xlabel('Wavelengths Included (nm)')
        ax.set_ylabel(metric_ylabel)
        bbox = matplotlib.transforms.Bbox([[-0.4, -0.4], [8.5, 6.5]])

        # Save figure unless a file with that name already exists
        png_name = file_prefix + '_LDA_' + metric_tag + '_' + split_tag + '_' + variant_tag + '.png'
        if not os.path.exists(png_name):
            f.savefig(os.getcwd() + '/' + png_name, dpi=1080,
                      bbox_inches=bbox)
        plt.close()


In [None]:
# Calculate clf accuracy - with Colinearity
#
# For the SHAP-ranked wavelengths (shap_wv_rank), LDA is cross-validated on
# the first 1, 2, ..., N ranked wavelengths. This is done twice: on the
# training split ('train') and on the held-out split ('test'). For each split
# the mean / standard deviation of the balanced-accuracy and F1 scores are
# dumped to joblib files and plotted against the number of wavelengths.
#
# The train/test and balanced/F1 sections were four copies of the same code;
# they are driven by the two loops below. File names, titles and the values
# left in the module-level variables after the cell are unchanged.

# Ranked wavelengths as an array of strings (dataframe column labels)
shap_rank_fea = np.array([str(x) for x in shap_wv_rank])

# Suffix identifying this variant in every output file name
variant_tag = 'Col'
file_prefix = str(i) + '_' + str(k)

random_state = 42
n_ranked = len(shap_rank_fea)
# x positions 1..N, one per "number of wavelengths included"
x_positions = np.linspace(1, n_ranked, n_ranked)

for split_tag, X_split, y_split in (('train', X_binary, y_binary),
                                    ('test', X_val_all, y_val_all)):
    # Define cross validation (recreated per split, as in the original flow)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)
    # Columns ordered by SHAP rank so that iloc[:, :n] is the top-n set
    shap_select_X_binary = X_split.loc[:, shap_rank_fea]

    # Initiate all balanced accuracy scores
    all_balanced_accuracy_scores = []
    all_balanced_accuracy_stdev = []
    # Initiate all f1 scores
    all_f1_scores = []
    all_f1_stdev = []

    for wv in range(n_ranked):
        # Fresh LDA model for every feature count
        model = LinearDiscriminantAnalysis()

        print('Number of Features: ', wv + 1)
        # The second and third returned values are used as the balanced
        # accuracy and F1 score collections; the first is not used here
        _, lda_balanced_accuracy_scores, lda_f1_scores = clf_accuracy_eval(
            shap_select_X_binary.iloc[:, 0:wv + 1], y_split, model, cv)
        all_balanced_accuracy_scores.append(np.mean(lda_balanced_accuracy_scores))
        all_balanced_accuracy_stdev.append(np.std(lda_balanced_accuracy_scores))
        all_f1_scores.append(np.mean(lda_f1_scores))
        all_f1_stdev.append(np.std(lda_f1_scores))

    # Save
    dump(all_balanced_accuracy_scores, file_prefix + '_LDA_balanced_' + split_tag + '_' + variant_tag + '.joblib')
    dump(all_balanced_accuracy_stdev, file_prefix + '_LDA_balanced_stdev_' + split_tag + '_' + variant_tag + '.joblib')
    dump(all_f1_scores, file_prefix + '_LDA_f1_' + split_tag + '_' + variant_tag + '.joblib')
    dump(all_f1_stdev, file_prefix + '_LDA_f1_stdev_' + split_tag + '_' + variant_tag + '.joblib')

    # One figure per metric: (file tag, title prefix, y label, means, stdevs)
    for metric_tag, metric_title, metric_ylabel, metric_means, metric_stdevs in (
            ('balanced', 'Balanced Accuracy', 'LDA Clf Balanced Accuracy',
             all_balanced_accuracy_scores, all_balanced_accuracy_stdev),
            ('f1', 'F1 Scores', 'LDA Clf F1 Score', all_f1_scores, all_f1_stdev)):
        # Set the figure size
        f, ax = plt.subplots(figsize=(8, 6))
        # Tight layout
        f.tight_layout()

        # Dashed trend line plus markers with +/- one standard deviation
        plt.plot(x_positions, metric_means, '--', lw=2.5)
        plt.errorbar(x_positions, metric_means, yerr=metric_stdevs, fmt='o', capsize=1.5)
        # Tick 0 is labelled '0'; ticks 1..N carry the wavelength added at that step
        xlabels = ['0'] + shap_rank_fea.tolist()
        plt.xticks(np.arange(0, n_ranked + 1, 1), xlabels)
        # Set figure object
        ax.set_title(metric_title + ' vs. Number of Wavelengths for ' + str(i) + ' vs. ' + str(k))
        ax.set_xlabel('Wavelengths Included (nm)')
        ax.set_ylabel(metric_ylabel)
        bbox = matplotlib.transforms.Bbox([[-0.4, -0.4], [8.5, 6.5]])

        # Save figure unless a file with that name already exists
        png_name = file_prefix + '_LDA_' + metric_tag + '_' + split_tag + '_' + variant_tag + '.png'
        if not os.path.exists(png_name):
            f.savefig(os.getcwd() + '/' + png_name, dpi=1080,
                      bbox_inches=bbox)
        plt.close()