# Linear Discriminant Analysis (LDA) Feature Selection (FS) Framework

In [None]:
import os
import sys

import pandas as pd
import numpy as np
import math

from scipy.signal import savgol_filter
from itertools import combinations

import matplotlib.pyplot as plt
from cycler import cycler
import matplotlib.transforms
import matplotlib

import time
from joblib import dump
from joblib import load

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold

# Disable Print
def blockPrint():
    """Silence ``print`` by redirecting ``sys.stdout`` to the null device.

    The stream that is active at call time is remembered in
    ``sys.__stdout__`` so that ``enablePrint`` can put it back.

    Calling this while output is already blocked is a no-op. Without that
    guard a second call would overwrite the remembered stream with the
    null-device handle (so output could never be restored) and leak the
    first handle.
    """
    if getattr(sys.stdout, 'name', None) == os.devnull:
        # Already blocked - keep the remembered stream intact
        return
    sys.__stdout__ = sys.stdout
    sys.stdout = open(os.devnull, 'w')

# Restore Print
def enablePrint():
    """Restore the stream remembered by ``blockPrint``.

    The null-device handle opened by ``blockPrint`` is closed so it is not
    leaked. Calling this when output is not blocked only re-assigns
    ``sys.stdout`` from ``sys.__stdout__``.
    """
    blocked_stream = sys.stdout
    sys.stdout = sys.__stdout__
    # Only close the handle that blockPrint opened, never a real output stream
    if blocked_stream is not sys.stdout and getattr(blocked_stream, 'name', None) == os.devnull:
        blocked_stream.close()

In [None]:
# Change to data directory
# Remembers the directory the notebook was started from (work_path) and then
# moves into the folder holding the dataset (data_path).

work_path = os.getcwd()  # starting directory; later cells chdir back to it
print(work_path)

os.chdir('YOUR DATA DIRECTORY')  # placeholder - replace with the real data folder
data_path = os.getcwd()  # absolute path of the data folder
print(data_path)
os.chdir(data_path)  # already in data_path at this point, so this changes nothing

In [None]:
# Load data
# Reads the dataset CSV into df; every column other than 'target_y' is
# treated as a wavelength whose name can be parsed as a float.

os.chdir(data_path)

# Load dataset - df
df = pd.read_csv('YOUR DATA FILE.csv')  # placeholder file name

# Average spectra for each tissue type
avg_df_by_y = df.groupby(['target_y']).mean()
stdev_df_by_y = df.groupby(['target_y']).std()

# Column names to numerical
col_wavelengths = df.columns.drop('target_y')
col_wavelengths = col_wavelengths.astype(np.float64)  # numeric wavelengths, used as x-axis and for spacing checks
print('Number of wavelengths: ', len(col_wavelengths))

# Legend labels
tissue_types = df['target_y'].unique()
tissue_types.sort()  # in-place sort of the array of unique labels

In [None]:
# SG Smoothing of the raw data - Gentle smoothing with w/p = 2.5 to get rid of some baseline noise
# Produces df_smooth: the smoothed features plus the original 'target_y' column.

# Features
X_raw = df.drop(['target_y'], axis=1)

# Target
y = df['target_y']

# Spectral Smoothing - Savitzky–Golay (SG) method
w = 5  # window length passed to savgol_filter
p = 2  # polynomial order passed to savgol_filter
# axis=1 filters along the columns of each row; deriv=0 means no derivative is taken
X_smooth = savgol_filter(X_raw, w, polyorder=p, axis=1, deriv=0)

# Smoothed dataframe
# Column labels are the wavelengths converted to strings, so later cells
# select columns with string labels.
df_smooth = pd.DataFrame(X_smooth, columns = col_wavelengths.astype("string"))
# concat aligns on the row index - assumes df has the default integer index from read_csv
df_smooth = pd.concat([df_smooth, y.rename('target_y')], axis=1)

# Average spectra for each tissue type
avg_df_by_y_smooth = df_smooth.groupby(['target_y']).mean()
stdev_df_by_y_smooth = df_smooth.groupby(['target_y']).std()

In [None]:
# Change to save path
# Goes back to the starting directory first, so the placeholder below may be
# given relative to work_path.
os.chdir(work_path)
os.chdir('YOUR SAVE PATH')  # placeholder - replace with the real output folder
save_path = os.getcwd()  # absolute path; later cells chdir here before entering a subfolder
print(save_path)

#### Define functions

In [None]:
# Define a function to set intervals by indexing

def interval_indexing(ind_list, interval_len):
    """Split ``ind_list`` into consecutive slices of ``interval_len`` items.

    The slices are returned in order in a list; the last one is shorter when
    ``len(ind_list)`` is not a multiple of ``interval_len``.
    """
    chunks = []
    for start in range(0, len(ind_list), interval_len):
        stop = start + interval_len
        chunks.append(ind_list[start:stop])
    return chunks

In [None]:
# Define a function to evaluate clf accuracy using lda - Number of Features is a variable
# F1 score calculation can be added here

def clf_accuracy_eval(X, y, model, cv):
    """Score a classifier on every train/test split produced by ``cv``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels; rows are selected positionally with ``.iloc``.
    model : scikit-learn estimator
        Template estimator. An unfitted clone is trained for each split, so
        the instance passed in is never fitted or otherwise modified.
    cv : object
        Splitter exposing ``split(X, y)`` that yields
        ``(train_index, test_index)`` pairs.

    Returns
    -------
    tuple of (list, list)
        ``(clf_accuracy_scores, clf_balanced_scores)`` - accuracy and
        balanced accuracy on the test part of each split, in split order.
    """
    # Local import keeps the block self-contained
    from sklearn.base import clone

    clf_accuracy_scores = []  # accuracy per split
    clf_balanced_scores = []  # balanced accuracy per split

    for train_index, test_index in cv.split(X, y):
        # Fresh estimator for every split: no fitted state is carried from one
        # split to the next and the caller's model is left untouched
        clf_model = clone(model)

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit on the training part and predict the held-out part
        clf_model.fit(X_train, y_train)
        y_predict = clf_model.predict(X_test)

        # Store the scores of this split
        clf_accuracy_scores.append(accuracy_score(y_test, y_predict))
        clf_balanced_scores.append(balanced_accuracy_score(y_test, y_predict))

    return clf_accuracy_scores, clf_balanced_scores

In [None]:
# Define temperature in Simulated Annealing

def temperature_sa(accuracy_max):
    """Return the simulated-annealing temperature for the current accuracy.

    The temperature is proportional to the error ``1 - accuracy_max``, plus a
    small constant so that dividing by it is safe when the accuracy is 1.

    TODO (from the original author): revisit so that the chance of accepting
    a worse solution decreases as the accuracy increases.
    """
    error_rate = 1-accuracy_max
    # regularization term to avoid dividing by 0
    return 0.05 * error_rate + 0.00000001

In [None]:
# Define a function to optimize the final feature selection using Simulated Annealing

def wv_sa_opt(X_select, y, cv, nwv_total, nwv_select, n_iter):
    """Search for a good subset of columns of ``X_select`` with simulated annealing.

    A random subset of ``nwv_select`` column indices (out of ``nwv_total``) is
    scored with the mean cross-validated accuracy of an LDA classifier. Each
    iteration swaps one selected index with one excluded index and re-scores
    the subset. A better subset is always accepted; an equal or worse one is
    accepted with probability ``exp((new - old) / temperature)``, where the
    temperature comes from ``temperature_sa`` evaluated at the current score.

    Parameters
    ----------
    X_select : array
        Feature matrix; columns are selected with ``X_select[:, indices]``.
    y : array-like
        Target labels passed to ``cross_val_score``.
    cv : int or splitter
        Cross-validation setting passed to ``cross_val_score``.
    nwv_total : int
        Number of candidate columns (indices ``0 .. nwv_total - 1``).
    nwv_select : int
        Number of columns in the subset.
    n_iter : int
        Number of swap proposals to evaluate.

    Returns
    -------
    tuple
        ``(wv_select_update, lda_score_update, all_clf_accuracy,
        all_wv_select, all_wv_exclude, update_count)``: the subset and score
        held when the search stops, the per-step history of score, subset and
        excluded indices (first entry is the initial state), and the number
        of accepted swaps.

        NOTE: the returned subset is the *last* accepted state, not
        necessarily the best one seen, because worse subsets can be accepted
        on the way. The histories can be inspected for the best state.

        If the initial score is exactly 1 the search is skipped and the three
        history lists are returned empty. If there is no excluded index to
        swap in (``nwv_select >= nwv_total``) the search is skipped and the
        initial state is returned; without this guard the swap sampling
        failed with ``ValueError`` from ``np.random.randint``.
    """
    # Set selected wv's - Initial condition is random shuffle
    idx_wv_arr = np.arange(nwv_total)
    np.random.shuffle(idx_wv_arr)
    wv_select = idx_wv_arr[:nwv_select]
    wv_exclude = idx_wv_arr[nwv_select:]

    # Choose the selected wv's from the original X dataset
    X_select_init = X_select[:, wv_select]

    # Histories of score / selected / excluded indices
    all_clf_accuracy = []
    all_wv_select = []
    all_wv_exclude = []

    # Initial clf accuracy of the randomly selected wv's - LDA, simple cv score
    lda = LinearDiscriminantAnalysis()
    lda_scores = cross_val_score(lda, X_select_init, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    lda_score_init = np.mean(lda_scores)
    # Store initial conditions
    all_clf_accuracy.append(lda_score_init)
    all_wv_select.append(wv_select)
    all_wv_exclude.append(wv_exclude)
    print('The initial clf accuracy using LDA: %.5f ' % lda_score_init)

    # Current state of the search
    wv_select_update = np.copy(wv_select)
    print('Initial Selected wv', wv_select_update)
    wv_exclude_update = np.copy(wv_exclude)
    print('Initial Excluded wv', wv_exclude_update)
    lda_score_update = lda_score_init
    update_count = 0

    if lda_score_init == 1:
        # Nothing to improve; the histories are returned empty in this case
        print('-> LDA clf accuracy is already 100%')
        return wv_select_update, lda_score_update, [], [], [], update_count

    if len(wv_exclude_update) == 0:
        # No index left to swap in, so there is nothing to search over
        print('-> No excluded wv available for exchange; keeping the initial selection')
        return wv_select_update, lda_score_update, all_clf_accuracy, all_wv_select, all_wv_exclude, update_count

    # Simulated Annealing loop
    for i in range(n_iter):

        print('\n')
        print('--- Iteration: ', i, ' ---')
        # Temperature depends on the score of the current state
        temp_reg = temperature_sa(lda_score_update)
        print('Temperature Reg for SA: ', temp_reg)
        # Work on copies so a rejected proposal leaves the current state intact
        wv_select_inner = np.copy(wv_select_update)
        wv_exclude_inner = np.copy(wv_exclude_update)

        # Change one element during each iteration
        idx_select = np.random.randint(nwv_select)
        idx_exclude = np.random.randint(nwv_total-nwv_select)
        item_select = wv_select_inner[idx_select]
        print('Selected wv for Exchange: ', item_select)
        item_exclude = wv_exclude_inner[idx_exclude]
        print('Excluded PC for Exchange: ', item_exclude)
        wv_select_inner[idx_select] = item_exclude
        print('New Selected PCs: ', wv_select_inner)
        wv_exclude_inner[idx_exclude] = item_select
        print('New Excluded PCs: ', wv_exclude_inner)

        # Score the proposed subset - LDA, simple cv score
        X_select_update = X_select[:, wv_select_inner]
        lda_inner = LinearDiscriminantAnalysis()
        lda_scores_inner = cross_val_score(lda_inner, X_select_update, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
        lda_score_inner = np.mean(lda_scores_inner)
        print('Update clf accuracy: %.5f ' % lda_score_inner)
        print('Old clf accuracy: %.5f ' % lda_score_update)

        # Make decision
        if lda_score_inner > lda_score_update:
            print('-> Yes higher clf accuracy')
            accepted = True
        else:
            # Equal or worse: accept with the SA probability
            sa_prob = np.exp((lda_score_inner-lda_score_update)/temp_reg)
            print('SA Probability: ', sa_prob)
            accepted = np.random.random() < sa_prob
            if accepted:
                print('-> Accepted; Lower than SA probability')

        if accepted:
            # Update the current state
            wv_select_update = wv_select_inner
            wv_exclude_update = wv_exclude_inner
            lda_score_update = lda_score_inner
            update_count += 1
            print('-> Number of updates: ', update_count)
            print('-> New wv: ', wv_select_update)
            print('-> New clf accuracy: %.5f ' % lda_score_update)
        else:
            print('-> Rejected')
            print('-> Keep Old PCs: ', wv_select_update)
            print('-> Keep Old Accuracy: %.5f ' % lda_score_update)

        # Store the state held after this iteration (changed or not)
        all_clf_accuracy.append(lda_score_update)
        all_wv_select.append(wv_select_update)
        all_wv_exclude.append(wv_exclude_update)

    return wv_select_update, lda_score_update, all_clf_accuracy, all_wv_select, all_wv_exclude, update_count

#### The OVO Approach

In [None]:
# Change to save path
# Output files of the OVO cells below are written to the current directory,
# so move into the OVO subfolder first.
os.chdir(save_path)
os.chdir('YOUR SAVE PATH - SUBFOLDER')  # placeholder - replace with the real subfolder
print(os.getcwd())

In [None]:
# LDA Framework for the OVO approach starts here:

# Define parameters
df_dataset = df_smooth  # the smoothed spectra are used as the working dataset
tissue_types = tissue_types  # no-op re-assignment; keeps the name visible in this cell

random_state = 42
# 10 splits repeated 10 times -> 100 train/test splits per evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)

# Define binary combinations
list_tissue_comb = list(combinations(tissue_types,2))
# Save the list of binary pairs
# Written only once: an existing file in the current directory is not overwritten
if not os.path.exists('order_of_tissue_comb.joblib'):
    dump(list_tissue_comb, 'order_of_tissue_comb.joblib')

In [None]:
# Run the for loop to calculate for all binary pairs
# (template kept for reference; the cell currently processes one pair)

# for i,k in combinations(tissue_types,2): # can separate this to make an individual function
#
#     # Track timestamp - elapsed time
#     start_time = time.time()
#
#     count += 1

    # if count == 2:
    #     print('count = 2; break')
    #     break

# Else for single pairs --->

i = 'boneCement'
k = 'cortBone'

# i = 'boneMarrow'
# k = 'cortBone'

#print('Iteration: ', count)
print('First Label: ', i)
print('Second Label: ',k)

# Data selection
# The row mask is built from df_dataset itself (not from the raw df), so the
# selection stays correct if df_dataset is ever replaced by a frame with
# different rows or index.
binary_subset = df_dataset.loc[df_dataset['target_y'].isin([i, k])]

# Features
X_binary_init = binary_subset.drop(['target_y'], axis=1)
# Target
y_binary_init = binary_subset['target_y']
# Convert to 0 and 1 - i = 1, k = 0
y_binary_init = (y_binary_init == i).astype('uint8')

# Split Train Test Validation set
# 80 % goes to X_binary / y_binary (used for feature selection and CV),
# 20 % is held out as X_val_all / y_val_all; class ratios are preserved
X_binary, X_val_all, y_binary, y_val_all = train_test_split(X_binary_init, y_binary_init,
                                                            test_size=0.2, stratify=y_binary_init, random_state=random_state)
# Save train test sets (written to the current directory)
dump(X_binary, str(i) + '_' + str(k) + '_X_binary.joblib')
dump(X_val_all, str(i) + '_' + str(k) + '_X_val_all.joblib')
dump(y_binary, str(i) + '_' + str(k) + '_y_binary.joblib')
dump(y_val_all, str(i) + '_' + str(k) + '_y_val_all.joblib')

In [None]:
# Moving-window approach to select features using LDA coeff ranking
# For every window size the spectrum is cut into consecutive intervals and
# one wavelength per interval is kept via SelectFromModel(max_features=1).
win_size = [25, 50, 75, 100, 150, 200, 300]
wv_idx = np.arange(0, len(col_wavelengths),1)
all_selected_wv = []
for win in win_size:
    print('Window Size: ', win)

    # Set interval indices
    wv_idx_interval = interval_indexing(wv_idx, interval_len=win)
    print('Number of Intervals: ', len(wv_idx_interval))

    win_selected_wv = []
    for interval_idx in wv_idx_interval:

        # Define feature selection model - LDA
        lda_fs_model = LinearDiscriminantAnalysis()
        # Selector fitted on the columns of this interval only
        fs_selector = SelectFromModel(lda_fs_model, max_features=1).fit(X_binary.iloc[:, interval_idx], y_binary)
        # Wavelength interval
        wv_interval = col_wavelengths[interval_idx]
        selected_wv = wv_interval[fs_selector.get_support()]
        win_selected_wv.append(selected_wv[0])
    all_selected_wv.append(win_selected_wv)

# Remove duplicates and wavelengths separated by <= 20nm
all_selected_wv = [wv for sublist in all_selected_wv for wv in sublist]
all_selected_wv = list(dict.fromkeys(all_selected_wv))  # drop exact duplicates, keep first-seen order
# For every pair closer than the limit, mark the later one for removal.
# Positions come from enumerate, and the loop names do not reuse `y`, so the
# notebook-level target variable `y` is no longer overwritten by this cell.
pt_dup = set()
for (idx_a, wv_a), (idx_b, wv_b) in combinations(enumerate(all_selected_wv), 2):
    if abs(wv_a - wv_b) <= 20:
        pt_dup.add(idx_b)
pt_dup = sorted(pt_dup)
# Remove the peaks
if pt_dup:
    all_selected_wv_final = np.delete(all_selected_wv, pt_dup)
else:
    all_selected_wv_final = all_selected_wv

# Convert to str (column labels of the smoothed dataframe are strings)
all_selected_wv_final_str = [str(x) for x in all_selected_wv_final]

In [None]:
# Define the number of features to be selected for SA
fraction = 0.75
K_features = math.floor(len(all_selected_wv_final) * fraction)
print('The number of features to be selected: ', K_features)

# Candidate features: only the wavelengths kept by the moving-window step.
# All positional indices below (LDA coefficient ranks, selected_ind_update)
# refer to the columns of this frame.
X_candidates = X_binary.loc[:, all_selected_wv_final_str]

# Define estimator
fs_model = LinearDiscriminantAnalysis()
# Define feature selection model - LDA
fs = SelectFromModel(fs_model).fit(X_candidates, y_binary)

# Find the index from high to low (by absolute LDA coefficient)
fs_coeff_ind = np.argsort(-abs(fs.estimator_.coef_)).T

# Initiate the search with the feature that has the largest coefficient
selected_ind_update = [fs_coeff_ind[0][0]]
fs_coeff_ind_update = np.copy(fs_coeff_ind)
fs_coeff_ind_update = np.delete(fs_coeff_ind_update, 0)  # flattened list of remaining candidates

# Greedy search: repeatedly add the candidate whose addition gives the
# smallest maximum absolute pairwise Pearson correlation in the selected set
while len(selected_ind_update) < K_features:
    pearson_rank = []
    for candidate in fs_coeff_ind_update:
        # The correlation must be taken on X_candidates: the indices are
        # positions within the candidate set, not within the full spectrum
        # (indexing X_binary.iloc with them compared unrelated columns)
        pearson_corr = np.max(
            np.abs(np.corrcoef(X_candidates.iloc[:, selected_ind_update + [candidate]], rowvar=False))
            - np.identity(len(selected_ind_update) + 1))  # subtracting identity ignores the diagonal
        pearson_rank.append(pearson_corr)
    selected_ind = np.argsort(pearson_rank)[0]
    selected_fs_coeff_ind = fs_coeff_ind_update[selected_ind]
    selected_ind_update.append(selected_fs_coeff_ind)
    fs_coeff_ind_update = np.delete(fs_coeff_ind_update, selected_ind)

# Find the final selected wavelengths
lda_select_wv_final_str = pd.Series(all_selected_wv_final_str)[selected_ind_update]
lda_select_wv_final = pd.Series(all_selected_wv_final)[selected_ind_update]
dump(lda_select_wv_final, str(i) + '_' + str(k) + '_LDA_select_wv.joblib')

# SA to select the 10 features that give the highest LDA clf accuracy
# NOTE: cv=10 is passed here, not the repeated `cv` splitter defined earlier
lda_select_wv_1 = X_binary.loc[:, lda_select_wv_final_str].values
lda_sa_fs_wv,aa,bb,cc,dd,ee = wv_sa_opt(lda_select_wv_1, y_binary, cv=10, nwv_total=len(lda_select_wv_final), nwv_select=10, n_iter = 2000)

# The final 10 features
# lda_sa_fs_wv holds column positions of lda_select_wv_1, hence .iloc
lda_sa_fs_wv_10 = lda_select_wv_final.iloc[lda_sa_fs_wv]
lda_sa_fs_wv_10_str = [str(x) for x in lda_sa_fs_wv_10]

In [None]:
# Plot and save selected wavelengths on spectra
# Draws the mean spectrum (+/- one standard deviation band) of the two tissue
# types of the current pair and marks the final selected wavelengths.
# Set custom color cycle
custom_cycler = (cycler('color', ['#d62728', '#1f77b4', '#2ca02c', '#ff7f0e',
                                  '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']) +
                 cycler(lw=[1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5]))

# Average spectra for each tissue type
avg_df_select_tissues = binary_subset.groupby(['target_y']).mean()
stdev_df_select_tissues = binary_subset.groupby(['target_y']).std()

# Set the figure size
f, ax = plt.subplots(figsize=(10, 8))
# Tight layout
f.tight_layout()
# Set colors
ax.set_prop_cycle(custom_cycler)
# Plot
# NOTE: rows of the grouped frames are ordered by group label, while the
# legend labels are taken in the order [i, k] - this assumes i sorts before k
select_tissue_label = [i,k]
for tissue in range(avg_df_select_tissues.shape[0]):
    plt.plot(col_wavelengths, avg_df_select_tissues.iloc[tissue,:], label = select_tissue_label[tissue])
    pos_std = avg_df_select_tissues.iloc[tissue,:] + stdev_df_select_tissues.iloc[tissue,:]
    neg_std = avg_df_select_tissues.iloc[tissue,:] - stdev_df_select_tissues.iloc[tissue,:]
    ax.fill_between(col_wavelengths, neg_std, pos_std, alpha = 0.08)
# One vertical line per selected wavelength
for wv in lda_sa_fs_wv_10:
    plt.axvline(x=wv, color='#2ca02c', linestyle='-', alpha=0.6, lw=1.75)
# Add text - selected wavelengths
# Positioned in axes coordinates (transAxes), below the x-axis
plt.text(0, -0.12, 'Final Selected Wavelength: '+ str(np.array(lda_sa_fs_wv_10)),
         horizontalalignment='left', verticalalignment = 'center_baseline', transform=ax.transAxes)
# Set figure object
ax.set_title('Selected Peaks for ' + str(i) + ' vs. ' + str(k))
ax.set_xlabel('Wavelength (nm)')
ax.set_ylabel('Normalized Intensity (A.U.)')
ax.set_xlim([350, 1850])
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
# Explicit save region, larger than the 10 x 8 figure - presumably so the
# legend and the text below the axes are included in the saved image
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
# Save figure
# An existing file with the same name in the current directory is not overwritten
if not os.path.exists(str(i) + '_' + str(k) + '_LDA_select_wv.png'):
    f.savefig(os.getcwd() + '/' +  str(i) + '_' + str(k) + '_LDA_select_wv.png', dpi = 1080, bbox_inches =bbox)

#### The OVR Approach

In [None]:
# Change to save_path
# Output files of the OVR cells below are written to the current directory,
# so move into the OVR subfolder first.
os.chdir(save_path)
os.chdir('YOUR SAVE PATH - SUBFOLDER')  # placeholder - replace with the real subfolder
print(os.getcwd())

In [None]:
# Define dataset
df_dataset = df_smooth  # the smoothed spectra are used as the working dataset

# Tissue labels
tissue_types = tissue_types  # no-op re-assignment; keeps the name visible in this cell

# Set global random state
random_state = 42

# Define cross validation
# 10 splits repeated 10 times -> 100 train/test splits per evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)

In [None]:
# One vs Rest
# i is the positive class; k_1 .. k_4 are the tissue types merged into the
# negative ("rest") class by the next cell.

# Bone Cement = 1 vs. [Cortical Bone, Trabecular Bone, Cartilage, Bone Marrow] = 0
i = 'boneCement'
k_1 = 'cortBone'
k_2 = 'traBone'
k_3 = 'cartilage'
k_4 = 'boneMarrow'

# Cortical Bone = 1 vs. [Trabecular Bone, Cartilage, Bone Marrow, Muscle] = 0
# i = 'cortBone'
# k_1 = 'traBone'
# k_2 = 'muscle'
# k_3 = 'cartilage'
# k_4 = 'boneMarrow'

In [None]:
print('First Label: ', i)
#print('Second Label: ',k)
print('Second Label: ' + k_1 + ', ' + k_2 + ', ' + k_3 + ', ' + k_4)

# Data selection for multiple tissue types
# df_subset keeps the original tissue labels (used later for plotting)
df_subset = df_dataset[df_dataset['target_y'].isin([i,k_1,k_2,k_3,k_4])]
# Independent copy: relabelling must not write into a slice of df_dataset
# (that triggers pandas' SettingWithCopyWarning). The mask is built from
# binary_subset itself so it always matches the rows being modified.
binary_subset = df_subset.copy()
binary_subset.loc[binary_subset['target_y'].isin([k_1, k_2, k_3, k_4]), 'target_y'] = 'rest'
# Rows labelled i keep their label; k names the merged negative class
k = 'rest'

# Features
X_binary_init = binary_subset.drop(['target_y'], axis=1)
# Target
y_binary_init = binary_subset['target_y']
# Convert to 0 and 1 - i = 1, k = 0
y_binary_init = (y_binary_init == i).astype('uint8')

# Split Train Test Validation set
# 80 % goes to X_binary / y_binary (used for feature selection and CV),
# 20 % is held out as X_val_all / y_val_all; class ratios are preserved
X_binary, X_val_all, y_binary, y_val_all = train_test_split(X_binary_init, y_binary_init,
                                                            test_size=0.2, stratify=y_binary_init, random_state=random_state)

# Save train test sets (written to the current directory)
dump(X_binary, str(i) + '_' + str(k) + '_X_binary.joblib')
dump(X_val_all, str(i) + '_' + str(k) + '_X_val_all.joblib')
dump(y_binary, str(i) + '_' + str(k) + '_y_binary.joblib')
dump(y_val_all, str(i) + '_' + str(k) + '_y_val_all.joblib')

In [None]:
print('First Label: ', i)
#print('Second Label: ',k)
print('Second Label: ' + k_1 + ', ' + k_2 + ', ' + k_3 + ', ' + k_4)

start_time = time.time()

# Moving-window approach to select features using LDA coeff ranking
# For every window size the spectrum is cut into consecutive intervals and
# one wavelength per interval is kept via SelectFromModel(max_features=1).
win_size = [25, 50, 75, 100, 150, 200, 300]
wv_idx = np.arange(0, len(col_wavelengths),1)
all_selected_wv = []
for win in win_size:
    print('Window Size: ', win)

    # Set interval indices
    wv_idx_interval = interval_indexing(wv_idx, interval_len=win)
    print('Number of Intervals: ', len(wv_idx_interval))

    win_selected_wv = []
    for interval_idx in wv_idx_interval:

        # Define feature selection model - LDA
        lda_fs_model = LinearDiscriminantAnalysis()
        # Selector fitted on the columns of this interval only
        fs_selector = SelectFromModel(lda_fs_model, max_features=1).fit(X_binary.iloc[:, interval_idx], y_binary)
        # Wavelength interval
        wv_interval = col_wavelengths[interval_idx]
        selected_wv = wv_interval[fs_selector.get_support()]
        win_selected_wv.append(selected_wv[0])
    all_selected_wv.append(win_selected_wv)

# Remove duplicates and wavelengths separated by <= 20nm
all_selected_wv = [wv for sublist in all_selected_wv for wv in sublist]
all_selected_wv = list(dict.fromkeys(all_selected_wv))  # drop exact duplicates, keep first-seen order
# For every pair closer than the limit, mark the later one for removal.
# Positions come from enumerate, and the loop names do not reuse `y`, so the
# notebook-level target variable `y` is no longer overwritten by this cell.
pt_dup = set()
for (idx_a, wv_a), (idx_b, wv_b) in combinations(enumerate(all_selected_wv), 2):
    if abs(wv_a - wv_b) <= 20:
        pt_dup.add(idx_b)
pt_dup = sorted(pt_dup)
# Remove the peaks
if pt_dup:
    all_selected_wv_final = np.delete(all_selected_wv, pt_dup)
else:
    all_selected_wv_final = all_selected_wv

# Convert to str (column labels of the smoothed dataframe are strings)
all_selected_wv_final_str = [str(x) for x in all_selected_wv_final]

# Define the number of features to be selected for SA
fraction = 0.75
K_features = math.floor(len(all_selected_wv_final) * fraction)
print('The number of features to be selected: ', K_features)

# Candidate features: only the wavelengths kept by the moving-window step.
# All positional indices below (LDA coefficient ranks, selected_ind_update)
# refer to the columns of this frame.
X_candidates = X_binary.loc[:, all_selected_wv_final_str]

# Define estimator
fs_model = LinearDiscriminantAnalysis()
# Define feature selection model - LDA
fs = SelectFromModel(fs_model).fit(X_candidates, y_binary)

# Find the index from high to low (by absolute LDA coefficient)
fs_coeff_ind = np.argsort(-abs(fs.estimator_.coef_)).T

# Initiate the search with the feature that has the largest coefficient
selected_ind_update = [fs_coeff_ind[0][0]]
fs_coeff_ind_update = np.copy(fs_coeff_ind)
fs_coeff_ind_update = np.delete(fs_coeff_ind_update, 0)  # flattened list of remaining candidates

# Greedy search: repeatedly add the candidate whose addition gives the
# smallest maximum absolute pairwise Pearson correlation in the selected set
while len(selected_ind_update) < K_features:
    pearson_rank = []
    for candidate in fs_coeff_ind_update:
        # The correlation must be taken on X_candidates: the indices are
        # positions within the candidate set, not within the full spectrum
        # (indexing X_binary.iloc with them compared unrelated columns)
        pearson_corr = np.max(
            np.abs(np.corrcoef(X_candidates.iloc[:, selected_ind_update + [candidate]], rowvar=False))
            - np.identity(len(selected_ind_update) + 1))  # subtracting identity ignores the diagonal
        pearson_rank.append(pearson_corr)
    selected_ind = np.argsort(pearson_rank)[0]
    selected_fs_coeff_ind = fs_coeff_ind_update[selected_ind]
    selected_ind_update.append(selected_fs_coeff_ind)
    fs_coeff_ind_update = np.delete(fs_coeff_ind_update, selected_ind)

# Find the final selected wavelengths
lda_select_wv_final_str = pd.Series(all_selected_wv_final_str)[selected_ind_update]
lda_select_wv_final = pd.Series(all_selected_wv_final)[selected_ind_update]
#dump(lda_select_wv_final, str(i) + '_' + str(k) + '_LDA_select_wv.joblib')
# SA to select the 10 features that give the highest LDA clf accuracy
# NOTE: cv=10 is passed here, not the repeated `cv` splitter defined earlier
lda_select_wv_1 = X_binary.loc[:, lda_select_wv_final_str].values
lda_sa_fs_wv, aa, bb, cc, dd, ee = wv_sa_opt(lda_select_wv_1, y_binary, cv=10, nwv_total=len(lda_select_wv_final),
                                             nwv_select=10, n_iter=2000)

# The final 10 features
# lda_sa_fs_wv holds column positions of lda_select_wv_1, hence .iloc
lda_sa_fs_wv_10 = lda_select_wv_final.iloc[lda_sa_fs_wv]
lda_sa_fs_wv_10_str = [str(x) for x in lda_sa_fs_wv_10]
#dump(lda_sa_fs_wv_10, str(i) + '_' + str(k) + '_LDA_SA_select_wv_final.joblib')

end_time = time.time()
print('Time to run: ', (end_time - start_time)/60, ' minutes')

In [None]:
# Define the number of features to be selected with SA
fraction = 0.75
K_features = math.floor(len(all_selected_wv_final) * fraction)
print('The number of features to be selected: ', K_features)

# Candidate features: only the wavelengths kept by the moving-window step.
# All positional indices below (LDA coefficient ranks, selected_ind_update)
# refer to the columns of this frame.
X_candidates = X_binary.loc[:, all_selected_wv_final_str]

# Define estimator
fs_model = LinearDiscriminantAnalysis()
# Define feature selection model - LDA
fs = SelectFromModel(fs_model).fit(X_candidates, y_binary)

# Find the index from high to low (by absolute LDA coefficient)
fs_coeff_ind = np.argsort(-abs(fs.estimator_.coef_)).T

# Initiate the search with the feature that has the largest coefficient
selected_ind_update = [fs_coeff_ind[0][0]]
fs_coeff_ind_update = np.copy(fs_coeff_ind)
fs_coeff_ind_update = np.delete(fs_coeff_ind_update, 0)  # flattened list of remaining candidates

# Greedy search: repeatedly add the candidate whose addition gives the
# smallest maximum absolute pairwise Pearson correlation in the selected set
while len(selected_ind_update) < K_features:
    pearson_rank = []
    for candidate in fs_coeff_ind_update:
        # The correlation must be taken on X_candidates: the indices are
        # positions within the candidate set, not within the full spectrum
        # (indexing X_binary.iloc with them compared unrelated columns)
        pearson_corr = np.max(
            np.abs(np.corrcoef(X_candidates.iloc[:, selected_ind_update + [candidate]], rowvar=False))
            - np.identity(len(selected_ind_update) + 1))  # subtracting identity ignores the diagonal
        pearson_rank.append(pearson_corr)
    selected_ind = np.argsort(pearson_rank)[0]
    selected_fs_coeff_ind = fs_coeff_ind_update[selected_ind]
    selected_ind_update.append(selected_fs_coeff_ind)
    fs_coeff_ind_update = np.delete(fs_coeff_ind_update, selected_ind)

# Find the final selected wavelengths
lda_select_wv_final_str = pd.Series(all_selected_wv_final_str)[selected_ind_update]
lda_select_wv_final = pd.Series(all_selected_wv_final)[selected_ind_update]

# Save Variables
dump(lda_select_wv_final, str(i) + '_' + str(k) + '_LDA_select_wv.joblib')

In [None]:
# SA to select the 10 features that give the highest LDA clf accuracy
# The candidate columns are passed as a plain array, so the indices returned
# by wv_sa_opt are column positions within lda_select_wv_1.
lda_select_wv_1 = X_binary.loc[:, lda_select_wv_final_str].values
# NOTE: cv=10 is passed here, not the repeated `cv` splitter defined earlier;
# only the first returned value (the selected positions) is used below
lda_sa_fs_wv, aa, bb, cc, dd, ee = wv_sa_opt(lda_select_wv_1, y_binary, cv=10, nwv_total=len(lda_select_wv_final),
                                             nwv_select=10, n_iter=2000)

# The final 10 features
# Positional lookup (.iloc) maps the selected positions back to wavelengths
lda_sa_fs_wv_10 = lda_select_wv_final.iloc[lda_sa_fs_wv]
lda_sa_fs_wv_10_str = [str(x) for x in lda_sa_fs_wv_10]
# Save
dump(lda_sa_fs_wv_10, str(i) + '_' + str(k) + '_LDA_SA_select_wv_final.joblib')

In [None]:
# Plot and save selected wavelengths on spectra
# Draws the mean spectrum (+/- one standard deviation band) of every tissue
# type in the OVR subset and marks the final selected wavelengths.
# Set custom color cycle
# NOTE: custom_cycler is defined here but not applied in this cell; the
# axes use custom_cycler_boneCement set further below
custom_cycler = (cycler('color', ['#d62728', '#1f77b4', '#2ca02c', '#ff7f0e',
                                  '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']) +
                 cycler(lw=[1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5]))

# Average spectra for each tissue type
# df_subset still carries the original tissue labels (not 'rest')
avg_df_select_tissues = df_subset.groupby(['target_y']).mean()
stdev_df_select_tissues = df_subset.groupby(['target_y']).std()

# Set the figure size
f, ax = plt.subplots(figsize=(10, 8))
# Tight layout
f.tight_layout()
# Plot
# Labels are sorted so that they line up with the rows of the grouped frames,
# which are ordered by group label
select_tissue_label = [i,k_1,k_2,k_3,k_4]
select_tissue_label.sort()
# Set custom color cycle
# First entry is red, the rest grey: with the sorted labels above the first
# plotted row is 'boneCement' for the label set chosen in this notebook
custom_cycler_boneCement = (cycler('color', ['#d62728', '#808080', '#808080', '#808080', '#808080']) +
                            cycler(lw=[1.5, 1.5, 1.5, 1.5, 1.5]))
# custom_cycler_cortBone = (cycler('color', ['#808080', '#808080', '#d62728', '#808080', '#808080']) +
#                           cycler(lw=[1.5, 1.5, 1.5, 1.5, 1.5]))
# Set colors
ax.set_prop_cycle(custom_cycler_boneCement)
#ax.set_prop_cycle(custom_cycler_cortBone)
for tissue in range(avg_df_select_tissues.shape[0]):
    plt.plot(col_wavelengths, avg_df_select_tissues.iloc[tissue, :], label=select_tissue_label[tissue])
    pos_std = avg_df_select_tissues.iloc[tissue, :] + stdev_df_select_tissues.iloc[tissue, :]
    neg_std = avg_df_select_tissues.iloc[tissue, :] - stdev_df_select_tissues.iloc[tissue, :]
    ax.fill_between(col_wavelengths, neg_std, pos_std, alpha=0.08)
# One vertical line per selected wavelength
for wv in lda_sa_fs_wv_10:
    plt.axvline(x=wv, color='#2ca02c', linestyle='-', alpha=0.6, lw=1.75)
# Add text - selected wavelengths
# Positioned in axes coordinates (transAxes), below the x-axis
plt.text(0, -0.12, 'Final Selected Wavelength: ' + str(np.array(lda_sa_fs_wv_10)),
         horizontalalignment='left', verticalalignment='center_baseline', transform=ax.transAxes)
# Set figure object
ax.set_title('Selected Peaks for ' + str(i) + ' vs. ' + str(k))
ax.set_xlabel('Wavelength (nm)')
ax.set_ylabel('Normalized Intensity (A.U.)')
ax.set_xlim([350, 1850])
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
# Explicit save region, larger than the 10 x 8 figure - presumably so the
# legend and the text below the axes are included in the saved image
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
# Save figure
# An existing file with the same name in the current directory is not overwritten
if not os.path.exists(str(i) + '_' + str(k) + '_LDA_select_wv.png'):
    f.savefig(os.getcwd() + '/' + str(i) + '_' + str(k) + '_LDA_select_wv.png', dpi=1080, bbox_inches=bbox)

#### Plot Accuracy vs. Number of Features, in the order given by the algorithm
- Alternatively, the features can be ranked by their f_classif scores.

In [None]:
# Load the final selected wavelengths
# Replaces any lda_sa_fs_wv_10 still in memory with the saved selection.
lda_sa_fs_wv_10 = load('YOUR SELECTED WAVELENGTHS.joblib') # in float; or use the variable directly from above
# String version of the wavelengths, used below for axis tick labels
lda_sa_fs_wv_10_str = [str(x) for x in lda_sa_fs_wv_10] # convert to string if float
print('Final wv:, ', lda_sa_fs_wv_10_str)

In [None]:
# Define dataset
df_dataset = df_smooth

# Tissue types
i = 'boneCement'
k_1 = 'cortBone'
k_2 = 'traBone'
k_3 = 'cartilage'
k_4 = 'boneMarrow'

# i = 'cortBone'
# k_1 = 'traBone'
# k_2 = 'muscle'
# k_3 = 'cartilage'
# k_4 = 'boneMarrow'

k = 'rest'

# Select from the original dataset
#df_subset = df_dataset[df_dataset['target_y'].isin([i,k_1,k_2,k_3,k_4])]

# Features - Bone Cement
X_binary = load('boneCement_rest_X_binary.joblib') # train/test
X_val = load('boneCement_rest_X_val_all.joblib') # validation
# Target
y_binary = load('boneCement_rest_y_binary.joblib') # train/test
y_val = load('boneCement_rest_y_val_all.joblib') # validation

# Features - Cortical Bone
# X_binary = load('cortBone_rest_X_binary.joblib') # train/test
# X_val = load('cortBone_rest_X_val_all.joblib') # validation
# # Target
# y_binary = load('cortBone_rest_y_binary.joblib') # train/test
# y_val = load('cortBone_rest_y_val_all.joblib') # validation

# Select the features
# Column labels of the saved feature tables are strings, so the string form
# of the wavelengths must be used for the lookup (float labels do not match).
# The validation features come from the X_val loaded above rather than from
# an X_val_all variable left over from an earlier cell.
lda_select_X_binary = X_binary.loc[:, lda_sa_fs_wv_10_str]
lda_select_X_val = X_val.loc[:, lda_sa_fs_wv_10_str]

In [None]:
# Calculate clf accuracy
# For n = 1 .. number of selected wavelengths, evaluate LDA with the first n
# wavelengths (in the stored order) by cross-validation, separately on the
# train/test split and on the validation split.

all_lda_scores = []
all_lda_stdev = []
all_lda_scores_val = []
all_lda_stdev_val = []

all_lda_balanced_scores = []
all_lda_balanced_stdev = []
all_lda_balanced_scores_val = []
all_lda_balanced_stdev_val = []
for wv in range(len(lda_sa_fs_wv_10)):
    # Define LDA model
    model = LinearDiscriminantAnalysis()

    # Calculate clf accuracy cv
    print('Number of Features: ', wv + 1)
    lda_cv_scores, lda_balanced_scores = clf_accuracy_eval(lda_select_X_binary.iloc[:, 0:wv + 1], y_binary, model, cv)
    # The validation targets are the y_val loaded together with the
    # validation features, not a y_val_all left over from an earlier cell
    lda_cv_scores_val, lda_balanced_scores_val = clf_accuracy_eval(lda_select_X_val.iloc[:, 0:wv + 1], y_val, model, cv)

    # Mean and spread over the CV splits - train/test split
    all_lda_scores.append(np.mean(lda_cv_scores))
    all_lda_stdev.append(np.std(lda_cv_scores))
    all_lda_balanced_scores.append(np.mean(lda_balanced_scores))
    all_lda_balanced_stdev.append(np.std(lda_balanced_scores))

    # Mean and spread over the CV splits - validation split
    all_lda_scores_val.append(np.mean(lda_cv_scores_val))
    all_lda_stdev_val.append(np.std(lda_cv_scores_val))
    all_lda_balanced_scores_val.append(np.mean(lda_balanced_scores_val))
    all_lda_balanced_stdev_val.append(np.std(lda_balanced_scores_val))

# Save (written to the current directory)
dump(all_lda_scores, str(i) + '_' + str(k) + '_LDA_accuracy_nFeatures_train_new.joblib')
dump(all_lda_stdev, str(i) + '_' + str(k) + '_LDA_stdev_nFeatures_train_new.joblib')
dump(all_lda_balanced_scores, str(i) + '_' + str(k) + '_LDA_balanced_nFeatures_train_new.joblib')
dump(all_lda_balanced_stdev, str(i) + '_' + str(k) + '_LDA_balanced_stdev_nFeatures_train_new.joblib')

dump(all_lda_scores_val, str(i) + '_' + str(k) + '_LDA_accuracy_nFeatures_val_new.joblib')
dump(all_lda_stdev_val, str(i) + '_' + str(k) + '_LDA_stdev_nFeatures_val_new.joblib')
dump(all_lda_balanced_scores_val, str(i) + '_' + str(k) + '_LDA_balanced_nFeatures_val_new.joblib')
dump(all_lda_balanced_stdev_val, str(i) + '_' + str(k) + '_LDA_balanced_stdev_nFeatures_val_new.joblib')

In [None]:
# Figure: LDA classification accuracy (train) vs. number of wavelengths included

# New 8 x 6 inch figure; tight_layout is applied up front, before anything is drawn
f, ax = plt.subplots(figsize=(8, 6))
f.tight_layout()

# One x position per selected wavelength: 1, 2, ..., n
_n_wavelengths = len(lda_sa_fs_wv_10)
_x_positions = np.linspace(1, _n_wavelengths, _n_wavelengths)

# Dashed trend line, with markers and error bars (yerr = stored standard deviations) on top
ax.plot(_x_positions, all_lda_scores, '--', lw=2.5)
ax.errorbar(_x_positions, all_lda_scores, yerr=all_lda_stdev, fmt='o', capsize=1.5)

# Tick 0 is labelled '0'; ticks 1..n carry the wavelength strings
xlabels = ['0'] + lda_sa_fs_wv_10_str
ax.set_xticks(np.arange(0, _n_wavelengths + 1, 1))
ax.set_xticklabels(xlabels)

# Title and axis labels
ax.set_title('Classification Accuracy vs. Number of Wavelengths for ' + str(i) + ' vs. ' + str(k))
ax.set_xlabel('Wavelengths Included (nm)')
ax.set_ylabel('LDA Clf Accuracy')

# Save region in inches; it extends slightly beyond the 8 x 6 canvas on every side
bbox = matplotlib.transforms.Bbox([[-0.4, -0.4], [8.5, 6.5]])

# Save figure - skipped when a file with this name already exists in the working directory
_fig_name = str(i) + '_' + str(k) + '_LDA_accuracy_nFeatures_train.png'
if not os.path.exists(_fig_name):
    f.savefig(os.getcwd() + '/' + _fig_name, dpi=1080, bbox_inches=bbox)


# plot Balanced Clf Accuracy vs. Number of Features (train)

# Set the figure size
f, ax = plt.subplots(figsize=(8, 6))
# Tight layout
f.tight_layout()

# One x position per selected wavelength: 1, 2, ..., n
_n_wavelengths = len(lda_sa_fs_wv_10)
_x_positions = np.linspace(1, _n_wavelengths, _n_wavelengths)

# Plot - Balanced accuracy on train: dashed trend line plus error bars (yerr = stored stdev)
plt.plot(_x_positions, all_lda_balanced_scores, '--', lw=2.5)
plt.errorbar(_x_positions, all_lda_balanced_scores,
             yerr=all_lda_balanced_stdev,
             fmt='o', capsize=1.5)

# Tick 0 is labelled '0'; ticks 1..n carry the wavelength strings
xlabels = ['0'] + lda_sa_fs_wv_10_str
plt.xticks(np.arange(0, _n_wavelengths + 1, 1), xlabels)

# Set figure object
# The title and y-label name the metric actually plotted (balanced accuracy), so the saved
# '*_LDA_balanced_*' image cannot be mistaken for the plain-accuracy figure.
ax.set_title('Balanced Classification Accuracy vs. Number of Wavelengths for ' + str(i) + ' vs. ' + str(k))
ax.set_xlabel('Wavelengths Included (nm)')
ax.set_ylabel('LDA Clf Balanced Accuracy')

# Save region in inches; it extends slightly beyond the 8 x 6 canvas on every side
bbox = matplotlib.transforms.Bbox([[-0.4, -0.4], [8.5, 6.5]])

# Save figure - skipped when a file with this name already exists in the working directory
_fig_name = str(i) + '_' + str(k) + '_LDA_balanced_nFeatures_train.png'
if not os.path.exists(_fig_name):
    f.savefig(os.getcwd() + '/' + _fig_name, dpi=1080,
              bbox_inches=bbox)


# Figure: LDA classification accuracy (validation) vs. number of wavelengths included

# New 8 x 6 inch figure; tight_layout is applied up front, before anything is drawn
f, ax = plt.subplots(figsize=(8, 6))
f.tight_layout()

# One x position per selected wavelength: 1, 2, ..., n
_n_wavelengths = len(lda_sa_fs_wv_10)
_x_positions = np.linspace(1, _n_wavelengths, _n_wavelengths)

# Dashed trend line, with markers and error bars (yerr = stored standard deviations) on top
ax.plot(_x_positions, all_lda_scores_val, '--', lw=2.5)
ax.errorbar(_x_positions, all_lda_scores_val, yerr=all_lda_stdev_val, fmt='o', capsize=1.5)

# Tick 0 is labelled '0'; ticks 1..n carry the wavelength strings
xlabels = ['0'] + lda_sa_fs_wv_10_str
ax.set_xticks(np.arange(0, _n_wavelengths + 1, 1))
ax.set_xticklabels(xlabels)

# Title and axis labels
ax.set_title('Classification Accuracy vs. Number of Wavelengths for ' + str(i) + ' vs. ' + str(k))
ax.set_xlabel('Wavelengths Included (nm)')
ax.set_ylabel('LDA Clf Accuracy')

# Save region in inches; it extends slightly beyond the 8 x 6 canvas on every side
bbox = matplotlib.transforms.Bbox([[-0.4, -0.4], [8.5, 6.5]])

# Save figure - skipped when a file with this name already exists in the working directory
_fig_name = str(i) + '_' + str(k) + '_LDA_accuracy_nFeatures_val.png'
if not os.path.exists(_fig_name):
    f.savefig(os.getcwd() + '/' + _fig_name, dpi=1080, bbox_inches=bbox)


# plot Balanced Clf Accuracy vs. Number of Features (validation)

# Set the figure size
f, ax = plt.subplots(figsize=(8, 6))
# Tight layout
f.tight_layout()

# One x position per selected wavelength: 1, 2, ..., n
_n_wavelengths = len(lda_sa_fs_wv_10)
_x_positions = np.linspace(1, _n_wavelengths, _n_wavelengths)

# Plot - Balanced accuracy on validation: dashed trend line plus error bars (yerr = stored stdev)
plt.plot(_x_positions, all_lda_balanced_scores_val, '--', lw=2.5)
plt.errorbar(_x_positions, all_lda_balanced_scores_val,
             yerr=all_lda_balanced_stdev_val,
             fmt='o', capsize=1.5)

# Tick 0 is labelled '0'; ticks 1..n carry the wavelength strings
xlabels = ['0'] + lda_sa_fs_wv_10_str
plt.xticks(np.arange(0, _n_wavelengths + 1, 1), xlabels)

# Set figure object
# The title and y-label name the metric actually plotted (balanced accuracy), so the saved
# '*_LDA_balanced_*' image cannot be mistaken for the plain-accuracy figure.
ax.set_title('Balanced Classification Accuracy vs. Number of Wavelengths for ' + str(i) + ' vs. ' + str(k))
ax.set_xlabel('Wavelengths Included (nm)')
ax.set_ylabel('LDA Clf Balanced Accuracy')

# Save region in inches; it extends slightly beyond the 8 x 6 canvas on every side
bbox = matplotlib.transforms.Bbox([[-0.4, -0.4], [8.5, 6.5]])

# Save figure - skipped when a file with this name already exists in the working directory
_fig_name = str(i) + '_' + str(k) + '_LDA_balanced_nFeatures_val.png'
if not os.path.exists(_fig_name):
    f.savefig(os.getcwd() + '/' + _fig_name, dpi=1080,
              bbox_inches=bbox)