# Principal Component Analysis (PCA) Feature Selection (FS) Framework

In [None]:
import os
import sys
import gc

import pandas as pd
import numpy as np

from scipy.signal import savgol_filter
from itertools import combinations
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from scipy.signal import find_peaks, peak_prominences

import plotly.express as px
import matplotlib.pyplot as plt
from cycler import cycler
import matplotlib.transforms
import matplotlib

import time
from joblib import dump
from joblib import load

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold

# State shared by blockPrint()/enablePrint(): the stream that was active before
# output was silenced and the os.devnull handle that replaced it.
# Both are None while printing is enabled.
_saved_stdout = None
_devnull_stream = None


# Disable Print
def blockPrint():
    """Silence print() by pointing sys.stdout at os.devnull.

    The stream that was active at the time of the first call is remembered so
    that enablePrint() can restore exactly that stream. Calling blockPrint()
    again while output is already blocked is a no-op, so the remembered stream
    is never overwritten by the devnull handle. sys.__stdout__ is left alone.
    """
    global _saved_stdout, _devnull_stream
    if _saved_stdout is not None:
        # Already blocked - keep the stream saved by the first call
        return
    _saved_stdout = sys.stdout
    _devnull_stream = open(os.devnull, 'w')
    sys.stdout = _devnull_stream


# Restore Print
def enablePrint():
    """Undo blockPrint(): restore the saved stream and close the devnull handle.

    Does nothing when printing is not currently blocked.
    """
    global _saved_stdout, _devnull_stream
    if _saved_stdout is None:
        return
    sys.stdout = _saved_stdout
    # Close the sink so repeated block/enable cycles do not leak file handles
    _devnull_stream.close()
    _saved_stdout = None
    _devnull_stream = None

In [None]:
# Change to data directory

# Remember the directory the notebook was started from; later cells chdir
# back to it before resolving the save path.
work_path = os.getcwd()
print(work_path)

# Placeholder path - replace with the folder that holds the CSV dataset
os.chdir('YOUR DATA DIRECTORY')
# Store the resolved absolute path so later cells can return here
data_path = os.getcwd()
print(data_path)
# The process is already in data_path at this point, so this call changes nothing
os.chdir(data_path)

In [None]:
# Load data

os.chdir(data_path)

# Load dataset - df
# Placeholder file name. Every column except 'target_y' is treated as a
# feature whose header can be parsed as a float (see the conversion below).
df = pd.read_csv('YOUR DATA FILE.csv')

# Average spectra for each tissue type
# Per-class mean and standard deviation of every feature column
avg_df_by_y = df.groupby(['target_y']).mean()
stdev_df_by_y = df.groupby(['target_y']).std()

# Column names to numerical
# col_wavelengths is used as the x axis of the plots in later cells
col_wavelengths = df.columns.drop('target_y')
col_wavelengths = col_wavelengths.astype(np.float64)
print('Number of wavelengths: ', len(col_wavelengths))

# Legend labels
# Sorted in place; the pairwise (OVO) loop iterates over combinations of these labels
tissue_types = df['target_y'].unique()
tissue_types.sort()

In [None]:
# SG Smoothing of the raw data - Gentle smoothing with w/p = 2.5 to get rid of some baseline noise

# Features
X_raw = df.drop(['target_y'], axis=1)

# Target
y = df['target_y']

# Spectral Smoothing - Savitzky–Golay (SG) method
w = 5  # window length (number of samples)
p = 2  # polynomial order
# axis=1 filters along the columns of each row; deriv=0 keeps the signal itself (no derivative)
X_smooth = savgol_filter(X_raw, w, polyorder=p, axis=1, deriv=0)

# Smoothed dataframe
# Column headers are the float wavelengths converted back to strings
df_smooth = pd.DataFrame(X_smooth, columns = col_wavelengths.astype("string"))
# NOTE(review): concat aligns on the index; df_smooth gets a fresh default index,
# so this assumes df (and therefore y) also carries the default 0..n-1 index - confirm
df_smooth = pd.concat([df_smooth, y.rename('target_y')], axis=1)

# Average spectra for each tissue type
avg_df_by_y_smooth = df_smooth.groupby(['target_y']).mean()
stdev_df_by_y_smooth = df_smooth.groupby(['target_y']).std()

In [None]:
# Change to save path
# Go back to the start directory first so the placeholder below can be a
# path relative to it
os.chdir(work_path)
os.chdir('YOUR SAVE PATH')
# Resolved absolute path, reused by the OVO and OVR sections
save_path = os.getcwd()
print(save_path)

#### Define functions

In [None]:
# Define a function to search for the optimal number of PCA components - evaluated by LDA in this case

def pca_ncomp(X_pca, y, n_comp, cv): # Change this to use GridSearchCV
    """Find how many leading columns of X_pca give the best LDA accuracy.

    For n = 1 .. n_comp the first n columns of X_pca are scored with a fresh
    LinearDiscriminantAnalysis through cross_val_score (scoring='accuracy',
    splitter cv, all cores, errors re-raised).

    Returns a 4-tuple:
        best_ncomp          - 1-based n with the highest mean accuracy
                              (np.argmax keeps the first one on ties)
        lda_score_max       - that highest mean accuracy
        pca_lda_scores_avg  - list of mean accuracies, one entry per n
        pca_lda_scores_std  - list of accuracy standard deviations, one per n
    """
    mean_scores, std_scores = [], []

    for n_keep in range(1, n_comp + 1):
        fold_scores = cross_val_score(
            LinearDiscriminantAnalysis(),
            X_pca[:, :n_keep],
            y,
            scoring='accuracy',
            cv=cv,
            n_jobs=-1,
            error_score='raise',
        )
        mean_scores.append(np.mean(fold_scores))
        std_scores.append(np.std(fold_scores))

    # Position of the best mean score; the component count is one larger
    best_pos = np.argmax(mean_scores)

    return best_pos + 1, mean_scores[best_pos], mean_scores, std_scores

In [None]:
# Define temperature in Simulated Annealing

def temperature_sa(accuracy_max): #-> need to fix this so that as accuracy increases, change of acceptance decreases
    """Return the simulated-annealing temperature for the given accuracy.

    The value shrinks linearly as accuracy_max approaches 1. A tiny constant
    is added so the result is still non-zero at accuracy_max == 1, because
    the caller divides by it.
    """
    scale = 0.05
    floor = 0.00000001
    return scale * (1-accuracy_max) + floor

In [None]:
# Define a function to optimize PC selection using Simulated Annealing

def pc_sa_opt(X_pca, y, cv, npc_total, npc_select, n_iter):
    """Optimise which principal components are selected, by simulated annealing.

    The search starts from the first npc_select of npc_total components. Each
    iteration swaps one randomly chosen selected component with one randomly
    chosen excluded component and scores the new set with pca_ncomp (LDA
    cross-validated accuracy). A swap that raises the score is always kept;
    otherwise it is kept with probability exp((new - old) / temperature),
    where the temperature comes from temperature_sa(old).

    Parameters
    ----------
    X_pca : 2-D array whose columns are principal-component scores
        (indexed as X_pca[:, idx]).
    y : class labels, forwarded to pca_ncomp.
    cv : cross-validation splitter, forwarded to pca_ncomp.
    npc_total : number of leading columns of X_pca that take part in the search.
    npc_select : size of the selected subset.
    n_iter : number of annealing iterations.

    Returns
    -------
    pc_select_update : indices of the subset held after the LAST iteration
        (the final accepted state, which is not necessarily the best one seen).
    lda_score_update : score of that subset.
    all_clf_accuracy, all_opt_ncomp, all_pc_select, all_pc_exclude :
        per-step histories with n_iter + 1 entries (initial state plus one
        entry per iteration). All four are EMPTY lists when the initial score
        is already exactly 1 and no search was run.
    update_count : number of accepted swaps.

    Raises
    ------
    ValueError
        If iterations are requested but npc_total <= npc_select, i.e. there is
        no excluded component to swap in.
    """
    # Initial condition: the first npc_select PCs are selected, the rest excluded
    idx_pc_arr = np.arange(npc_total)
    pc_select = idx_pc_arr[:npc_select]
    pc_exclude = idx_pc_arr[npc_select:]

    # Baseline accuracy and optimal number of PCs for the initial subset
    opt_ncomp_init, lda_score_init, _, _ = pca_ncomp(X_pca[:, pc_select], y, npc_select, cv=cv)
    print('The initial optimal number of PCA components: ', opt_ncomp_init)
    print('The initial clf accuracy using LDA: %.5f ' % lda_score_init)

    # Histories start with the initial state
    all_clf_accuracy = [lda_score_init]
    all_opt_ncomp = [opt_ncomp_init]
    all_pc_select = [pc_select]
    all_pc_exclude = [pc_exclude]

    # Current state of the search
    pc_select_update = np.copy(pc_select)
    print('Initial Selected PCs', pc_select_update)
    pc_exclude_update = np.copy(pc_exclude)
    print('Initial Excluded PCs', pc_exclude_update)
    lda_score_update = lda_score_init
    opt_ncomp_update = opt_ncomp_init
    update_count = 0

    if lda_score_init == 1:
        print('-> LDA clf accuracy is already 100%')
        # Empty histories tell the caller that no optimisation was performed
        return pc_select_update, lda_score_update, [], [], [], [], update_count

    if n_iter > 0 and npc_total <= npc_select:
        # np.random.randint would fail below with a far less helpful message
        raise ValueError('npc_total must be larger than npc_select to exchange PCs '
                         '(got npc_total=%s, npc_select=%s)' % (npc_total, npc_select))

    # Simulated Annealing loop
    for i in range(n_iter):

        print('\n')
        print('--- Iteration: ', i, ' ---')
        # Define temperature reg
        temp_reg = temperature_sa(lda_score_update)
        print('Temperature Reg for SA: ', temp_reg)

        # Work on copies so a rejected proposal leaves the current state intact
        pc_select_inner = np.copy(pc_select_update)
        pc_exclude_inner = np.copy(pc_exclude_update)

        # Change one element during each iteration
        idx_select = np.random.randint(npc_select)
        idx_exclude = np.random.randint(npc_total-npc_select)
        item_select = pc_select_inner[idx_select]
        print('Selected PC for Exchange: ', item_select)
        item_exclude = pc_exclude_inner[idx_exclude]
        print('Excluded PC for Exchange: ', item_exclude)
        pc_select_inner[idx_select] = item_exclude
        print('New Selected PCs: ', pc_select_inner)
        pc_exclude_inner[idx_exclude] = item_select
        print('New Excluded PCs: ', pc_exclude_inner)

        # Select the new PCs and evaluate
        opt_ncomp_inner, lda_score_inner, _, _ = pca_ncomp(X_pca[:, pc_select_inner], y, npc_select, cv=cv)
        print('Update npc: ', opt_ncomp_inner)
        print('Update clf accuracy: %.5f ' % lda_score_inner)
        print('Old clf accuracy: %.5f ' % lda_score_update)

        # Make decision
        if lda_score_inner > lda_score_update:
            print('-> Yes higher clf accuracy')
            accepted = True
        else:
            # Not better: accept with the annealing probability. A random
            # number is drawn only in this branch.
            sa_prob = np.exp((lda_score_inner-lda_score_update)/temp_reg)
            print('SA Probability: ', sa_prob)
            accepted = np.random.random() < sa_prob
            if accepted:
                print('-> Accepted; Lower than SA probability')

        if accepted:
            # Update
            pc_select_update = pc_select_inner
            pc_exclude_update = pc_exclude_inner
            opt_ncomp_update = opt_ncomp_inner
            lda_score_update = lda_score_inner
            update_count += 1
            print('-> Number of updates: ', update_count)
            print('-> New PCs: ', pc_select_update)
            print('-> New clf accuracy: %.5f ' % lda_score_update)
            print('-> New npc: ', opt_ncomp_update)
        else:
            print('-> Rejected')
            print('-> Keep Old PCs: ', pc_select_update)
            print('-> Keep Old Accuracy: %.5f ' % lda_score_update)

        # Store the (possibly unchanged) current state - exactly one entry per iteration
        all_clf_accuracy.append(lda_score_update)
        all_opt_ncomp.append(opt_ncomp_update)
        all_pc_select.append(pc_select_update)
        all_pc_exclude.append(pc_exclude_update)

    return pc_select_update, lda_score_update, all_clf_accuracy, all_opt_ncomp, all_pc_select, all_pc_exclude, update_count


In [None]:
# Define a function to rank and remove multicollinearity - can select top K least correlated features using Pearson correlation
# Return indices

def top_K_least_correlated(X,y, K_features, selection = f_classif):
    """Greedily pick up to K_features columns of X that are weakly correlated.

    Columns are first ranked by the score of `selection` (ANOVA F-value by
    default) using SelectKBest. The top-ranked column seeds the selection.
    In every following round each remaining column is tentatively added, and
    the column whose addition gives the smallest "largest absolute
    off-diagonal Pearson correlation" of the resulting set is kept.

    Parameters
    ----------
    X : pandas DataFrame of features (columns are addressed with X.iloc).
    y : target values passed to SelectKBest.fit.
    K_features : requested number of features. Values larger than the number
        of columns are capped at the number of columns. The seed column is
        always returned, so the result has at least one entry.
    selection : score function handed to SelectKBest.

    Returns
    -------
    list of positional column indices into X, in order of selection.
    """
    # Select the top feature by ANOVA F-value
    fs = SelectKBest(selection,k="all")
    fs.fit(X,y)
    # Find the index from high to low
    anova_ind = np.argsort(-fs.scores_)

    # Cap the request: once every column has been chosen there is nothing left
    # to rank, and indexing the empty ranking would raise IndexError
    K_features = min(K_features, len(anova_ind))

    # Initiate the search with the best-scoring column
    selected_ind_update = [anova_ind[0]]
    anova_ind_update = np.delete(anova_ind, 0)

    while len(selected_ind_update) < K_features:
        pearson_rank = []
        for candidate in anova_ind_update:
            corr = np.corrcoef(X.iloc[:, selected_ind_update + [candidate]], rowvar=False)
            # Subtracting the identity zeroes the diagonal, so the maximum is
            # taken over the pairwise correlations only
            pearson_rank.append(np.max(np.abs(corr) - np.identity(len(selected_ind_update)+1)))
        selected_ind = np.argsort(pearson_rank)[0]
        selected_ind_update.append(anova_ind_update[selected_ind])
        anova_ind_update = np.delete(anova_ind_update, selected_ind)

    return selected_ind_update

In [None]:
# Define a function to evaluate clf accuracy using lda - Number of Features is a variable
# F1 score calculation can be added here

def clf_accuracy_eval(X, y, model, cv):
    """Cross-validate `model` and collect per-fold accuracy scores.

    X and y are split with cv.split(X, y) and sliced positionally (.iloc).
    On every fold the model is fitted on the training rows and used to
    predict the test rows.

    Returns two lists with one entry per fold: the plain accuracy scores and
    the balanced accuracy scores. (An F1 score could be collected the same way.)
    """
    plain_scores = []
    balanced_scores = []

    for fit_rows, eval_rows in cv.split(X, y):
        X_fit, X_eval = X.iloc[fit_rows], X.iloc[eval_rows]
        y_fit, y_eval = y.iloc[fit_rows], y.iloc[eval_rows]

        # The very same estimator object passed in by the caller is refit on each fold
        model.fit(X_fit, y_fit)
        fold_pred = model.predict(X_eval)

        plain_scores.append(accuracy_score(y_eval, fold_pred))
        balanced_scores.append(balanced_accuracy_score(y_eval, fold_pred))

    return plain_scores, balanced_scores

#### The OVO Approach

In [None]:
# Change to save path
# Start from the common save folder, then enter the sub-folder (placeholder)
# that receives the files written by the OVO section
os.chdir(save_path)
os.chdir('YOUR SAVE PATH - SUBFOLDER')
print(os.getcwd())

In [None]:
# PCA Framework for the OVO approach starts here:

# Separate the for loop below into batches to solve memory error
# Or else clear all figure objects and run the python garbage collector

# Define parameters
df_dataset = df_smooth
# Self-assignment: has no effect, tissue_types keeps the value set when the data was loaded
tissue_types = tissue_types

# Number of components kept by PCA
pca_components = 30
# Minimum height (on the 0-1 normalised |loading| curve) for a peak to be kept
peak_height = 0.2

random_state = 42
# 10 folds x 10 repeats = 100 train/test splits per evaluated component subset
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)

# Simulated annealing settings: pool of PCs, size of the selected subset, iterations
npc_total = 30
npc_select = 10
n_iter = 600

# Define binary combinations
list_tissue_comb = list(combinations(tissue_types,2))
# Save the list of binary pairs
# NOTE(review): an existing file is never overwritten, so a list saved by an
# earlier run (possibly with different labels) is kept as is
if not os.path.exists('order_of_tissue_comb.joblib'):
    dump(list_tissue_comb, 'order_of_tissue_comb.joblib')

# Initiate lists for storing variables
# Each list receives one entry per tissue pair, in loop order
all_X_binary_pca = []
all_pca_loadings = []
all_total_var = []
all_n_components = []
all_pc_var = []
all_sa_select_pc = []
# Selected wavelength by find local maxima
all_select_ind = []
all_select_wv = []
all_select_wv_final = []
# Total time of execution
total_time = []

# Set custom color cycle
# Ten colours, all drawn with the same line width
custom_cycler = (cycler('color', ['#d62728', '#1f77b4', '#2ca02c', '#ff7f0e',
                                  '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']) +
                 cycler(lw=[1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5]))

# Set matplotlib backend to non-interactive to release memory
matplotlib.use('agg')

def _release_figure(fig):
    """Clear and close a finished matplotlib figure, then run the garbage collector."""
    fig.clear()
    plt.cla()
    plt.clf()
    plt.close('all')
    plt.close()
    gc.collect()


# One pass per pair of tissue types (one-vs-one). Each pass:
#   1. builds the binary dataset and a stratified 80/20 split (saved to disk)
#   2. fits PCA on the training part
#   3. optimises the selected PCs with simulated annealing (pc_sa_opt)
#   4. plots the optimisation trace, the PC scatter matrix and the loadings
#   5. picks wavelengths at the loading peaks and reduces them to at most 10
#      weakly correlated ones (top_K_least_correlated)
count = 0
for i,k in combinations(tissue_types,2): # can separate this to make an individual function

    # Track timestamp - elapsed time
    start_time = time.time()

    count += 1

    print('Iteration: ', count)
    print('First Label: ', i)
    print('Second Label: ',k)

    # Data selection - the mask is built from the same frame that is filtered
    binary_subset = df_dataset.loc[df_dataset['target_y'].isin([i, k])]

    # Features
    X_binary_init = binary_subset.drop(['target_y'], axis=1)
    # Target
    y_binary_init = binary_subset['target_y']
    # Convert to 0 and 1 - i = 1, k = 0
    y_binary_init = (y_binary_init == i).astype('uint8')

    # Split Train Test Validation set
    X_binary, X_val_all, y_binary, y_val_all = train_test_split(X_binary_init, y_binary_init,
                                                                test_size=0.2, stratify=y_binary_init, random_state=random_state)
    # Save train test sets
    dump(X_binary, str(i) + '_' + str(k) + '_X_binary.joblib')
    dump(X_val_all, str(i) + '_' + str(k) + '_X_val_all.joblib')
    dump(y_binary, str(i) + '_' + str(k) + '_y_binary.joblib')
    dump(y_val_all, str(i) + '_' + str(k) + '_y_val_all.joblib')

    # PCA - can separate this to make an individual function
    pca_model = PCA(n_components=pca_components, svd_solver = 'full')
    X_binary_pca = pca_model.fit_transform(X_binary)
    pca_loadings = pca_model.components_
    total_var = pca_model.explained_variance_ratio_.sum() * 100
    num_components = pca_model.n_components_
    pc_var = pca_model.explained_variance_ratio_
    # Store variables in lists
    all_X_binary_pca.append(X_binary_pca)
    all_pca_loadings.append(pca_loadings)
    all_total_var.append(total_var)
    all_n_components.append(num_components)
    all_pc_var.append(pc_var)

    # SA on PCA results - optimization
    # A = pc_select_update
    # B = lda_score_update
    # C = all_clf_accuracy
    # D = all_opt_ncomp
    # E = all_pc_select
    # F = all_pc_exclude
    # G = update_count
    blockPrint() # Block printing
    try:
        A, B, C, D, E, F, G = pc_sa_opt(X_binary_pca, y_binary, cv=cv, npc_total=npc_total, npc_select=npc_select, n_iter=n_iter)
    finally:
        # Restore printing even when the optimisation raises, otherwise every
        # later print in the session would stay silenced
        enablePrint()
    print('Selected PCs: ', A)
    all_sa_select_pc.append(A)

    # Plot Optimization
    # pc_sa_opt returns an empty history when the initial accuracy was already 1
    if len(C) > 0:
        print('Clf accuracy using Selected PCs: ', C[-1])
        # Set the figure size
        f, ax = plt.subplots(figsize=(10, 8))
        # Tight layout
        f.tight_layout()
        # Plot - one point per stored step (initial state + every iteration)
        plt.plot(np.arange(len(C)), C, label='Clf Accuracy')
        # Set figure object
        ax.set_title('LDA Clf Accuracy Progression for ' + str(i) + ' vs. ' + str(k))
        ax.set_xlabel('Iterations')
        ax.set_ylabel('LDA Clf Accuracy (A.U.)')
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
        bbox = matplotlib.transforms.Bbox([[-0.5, -0.36], [11.45, 8.56]])
        # Save figure (an existing file is kept)
        if not os.path.exists(str(i) + '_' + str(k) + '_LDA_clf_accuracy_opt.png'):
            f.savefig(os.getcwd() + '/' + str(i) + '_' + str(k) + '_LDA_clf_accuracy_opt.png', dpi = 1080, bbox_inches =bbox)
        _release_figure(f)
    else:
        print('Clf accuracy using Selected PCs: 100%')

    # PC plot labels - can separate this to make an individual function
    labels = {str(c): f"PC {pc+1} ({var:.1f}%)"
              for c, pc, var in zip(range(len(A)), A, pca_model.explained_variance_ratio_[A] * 100)}
    labels['color'] = 'Tissue Types'
    # PCA Scatter plot
    selected_pca_var = pca_model.explained_variance_ratio_[A].sum() * 100
    fig = px.scatter_matrix(X_binary_pca[:, A],
                            color=y_binary,
                            dimensions=range(len(A)),
                            labels=labels,
                            title=f'PCA Total Explained Variance: {selected_pca_var:.3f}%',
                            width = 1100,
                            height = 1100,
                            template='ggplot2')
    fig.update_layout(font=dict(size=10))
    fig.update_traces(diagonal_visible=False)
    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)
    # Save figure
    if not os.path.exists(str(i) + '_' + str(k) + '_pca.png'):
        fig.write_image(os.getcwd() + '/' + str(i) + '_' + str(k) + '_pca.png', scale = 2)
    # Close figure
    fig.data = []
    fig.layout = {}

    # PCA Loadings Plot - can separate this to make an individual function
    # Set the figure size
    f, ax = plt.subplots(figsize=(10, 8))
    # Tight layout
    f.tight_layout()
    # Set colors
    ax.set_prop_cycle(custom_cycler)
    # Labels
    labels = ['PC ' + str(pc + 1) for pc in A]
    # Plot
    plt.plot(col_wavelengths, abs(pca_loadings[A,:]).T, label=labels)
    # Set figure object
    ax.set_title('PCA Loadings for ' + str(i) + ' vs. ' + str(k))
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('PCA Loadings (A.U.)')
    ax.set_xlim([350, 1850])
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    bbox = matplotlib.transforms.Bbox([[-0.2, -0.36], [11.45, 8.56]])
    # Save figure
    if not os.path.exists(str(i) + '_' + str(k) + '_PCA_loadings.png'):
        f.savefig(os.getcwd() + '/' + str(i) + '_' + str(k) + '_PCA_loadings.png', dpi = 1080, bbox_inches =bbox)
    _release_figure(f)

    # Find PCA loadings peak values - can separate this to make an individual function
    # Scale each abs PCA loading curve between 0 and 1
    max_pts = np.max(abs(pca_loadings[A,:]), axis = 1)
    norm_abs_pca_loadings = (abs(pca_loadings[A,:]).T/max_pts).T
    # Find indices of local peaks by user defined criteria
    all_peaks_pca = np.array([]).astype(int)
    for loading_curve in norm_abs_pca_loadings:
        # calculate peak prominence for setting threshold
        peaks_pca_prominence, _ = find_peaks(loading_curve, distance=20) # 20 to indicate a typical LED FWHM
        avg_prominence = np.mean(peak_prominences(loading_curve, peaks_pca_prominence)[0])
        # refine peak selection; height = 0.2 to further remove baseline noises
        peaks_pca, _ = find_peaks(loading_curve, height=peak_height, distance=20, prominence=avg_prominence)
        all_peaks_pca = np.concatenate([all_peaks_pca, peaks_pca], axis=None)
    # Find duplicate peaks among all PCs and remove nearby neighbors
    # dict.fromkeys drops exact duplicates while keeping first-seen order
    all_peaks_pca=list(dict.fromkeys(all_peaks_pca))
    peak_dup = []
    # For every pair closer than 20 (in wavelength units) the later peak is
    # marked for removal. Dedicated loop names keep the module-level `y`
    # (the target column) from being overwritten.
    for (_, peak_a), (pos_b, peak_b) in combinations(enumerate(all_peaks_pca), 2):
        if abs(col_wavelengths[peak_a]-col_wavelengths[peak_b])<=20:
            peak_dup.append(pos_b)

    # Remove duplicated indices
    peak_dup = np.unique(peak_dup)
    # Remove the peaks
    if peak_dup.size > 0:
        all_peaks_pca_final = np.delete(all_peaks_pca, peak_dup)
    else:
        all_peaks_pca_final = all_peaks_pca
    # Store
    all_select_ind.append(all_peaks_pca_final)
    all_select_wv.append(col_wavelengths[all_peaks_pca_final])
    print('Number of Selected Wavelength: ', len(col_wavelengths[all_peaks_pca_final]))
    print('Selected Wavelength: ', col_wavelengths[all_peaks_pca_final])

    # Plot selected wavelengths on PC 1 - can separate this to make an individual function
    # Set the figure size
    f, ax = plt.subplots(figsize=(10, 8))
    # Tight layout
    f.tight_layout()
    # Set colors
    ax.set_prop_cycle(custom_cycler)
    # Plot - the markers are placed on the curve of the first selected PC
    plt.plot(col_wavelengths, abs(pca_loadings[A,:]).T, label=labels)
    plt.plot(col_wavelengths[all_peaks_pca_final], abs(pca_loadings[A,:]).T[:,0][all_peaks_pca_final], "ko")
    # Set figure object
    ax.set_title('Selected Peaks for ' + str(i) + ' vs. ' + str(k))
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('PCA Loadings (A.U.)')
    ax.set_xlim([350, 1850])
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
    # Add text - selected wavelengths
    plt.text(0, -0.13, 'Selected Wavelength: ' + str(np.sort(np.array(col_wavelengths[all_peaks_pca_final]))), horizontalalignment='left',
             verticalalignment = 'center_baseline', transform=ax.transAxes)
    # Save figure
    if not os.path.exists(str(i) + '_' + str(k) + '_PCA_peaks.png'):
        f.savefig(os.getcwd() + '/' + str(i) + '_' + str(k) + '_PCA_peaks.png', dpi = 1080, bbox_inches =bbox)
    _release_figure(f)

    # Select a fixed number of wavelengths - K = 10 (or all peaks when fewer were found)
    final_select_wv = top_K_least_correlated(X_binary.iloc[:,all_peaks_pca_final], y_binary,
                                             K_features=min(10, len(all_peaks_pca_final)), selection = f_classif)
    all_select_wv_final.append(col_wavelengths[all_peaks_pca_final][final_select_wv])
    print('Final Selected Wavelength: ', col_wavelengths[all_peaks_pca_final][final_select_wv])

    # Plot selected wavelengths on DRS measurements - can separate this to make an individual function
    # Average spectra for each tissue type
    avg_df_select_tissues = binary_subset.groupby(['target_y']).mean()
    stdev_df_select_tissues = binary_subset.groupby(['target_y']).std()
    # Set the figure size
    f, ax = plt.subplots(figsize=(10, 8))
    # Tight layout
    f.tight_layout()
    # Set colors
    ax.set_prop_cycle(custom_cycler)
    # Plot - groupby sorts its groups; tissue_types is sorted too, so [i, k]
    # lines up with the row order of the grouped frames
    select_tissue_label = [i,k]
    for tissue in range(avg_df_select_tissues.shape[0]):
        plt.plot(col_wavelengths, avg_df_select_tissues.iloc[tissue,:], label = select_tissue_label[tissue])
        pos_std = avg_df_select_tissues.iloc[tissue,:] + stdev_df_select_tissues.iloc[tissue,:]
        neg_std = avg_df_select_tissues.iloc[tissue,:] - stdev_df_select_tissues.iloc[tissue,:]
        ax.fill_between(col_wavelengths, neg_std, pos_std, alpha = 0.08)
    for wv in col_wavelengths[all_peaks_pca_final][final_select_wv]:
        plt.axvline(x=wv, color='#2ca02c', linestyle='-', alpha=0.6, lw=1.75)
    # Add text - selected wavelengths
    plt.text(0, -0.12, 'Final Selected Wavelength: '+
             str(np.array(col_wavelengths[all_peaks_pca_final][final_select_wv])),
             horizontalalignment='left', verticalalignment = 'center_baseline', transform=ax.transAxes)
    # Set figure object
    ax.set_title('Selected Peaks for ' + str(i) + ' vs. ' + str(k))
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('Normalized Intensity (A.U.)')
    ax.set_xlim([350, 1850])
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])
    # Save figure
    if not os.path.exists(str(i) + '_' + str(k) + '_PCA_select_wv.png'):
        f.savefig(os.getcwd() + '/' + str(i) + '_' + str(k) + '_PCA_select_wv.png', dpi = 1080, bbox_inches =bbox)
    _release_figure(f)

    # End timestamp
    end_time = time.time()
    print('Time to run: ', (end_time - start_time)/60, ' minutes')
    total_time.append(end_time - start_time)
    print('\n')

print('Total time to run: ', sum(total_time)/3600, ' hours')
print('END')

In [None]:
# Save variables

# dump(all_X_binary_pca, 'all_X_binary_pca.joblib')
# dump(all_pca_loadings, 'all_pca_loadings.joblib')
# dump(all_total_var, 'all_total_var.joblib')
# dump(all_n_components, 'all_n_components.joblib')
# dump(all_pc_var, 'all_pc_var.joblib')
# dump(all_sa_select_pc, 'all_sa_select_pc.joblib')
#
# dump(all_select_ind, 'all_select_ind.joblib')
# dump(all_select_wv, 'all_select_wv.joblib')
# dump(all_select_wv_final, 'all_select_wv_final.joblib')
#
# dump(total_time, 'total_time.joblib')

#### The OVR Approach

In [None]:
# Change to save_path
# Start from the common save folder, then enter the sub-folder (placeholder)
# that receives the files written by the OVR section
os.chdir(save_path)
os.chdir('YOUR SAVE PATH - SUBFOLDER')
print(os.getcwd())

In [None]:
# Define dataset
df_dataset = df_smooth

# Tissue labels
# Self-assignment: has no effect, kept as a reminder of which labels are in use
tissue_types = tissue_types

# Set global random state
random_state = 42

# Define cross validation
# 10 folds x 10 repeats = 100 train/test splits per evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)

In [None]:
# One vs Rest
# i is the class encoded as 1; k_1..k_4 are the classes merged into the "rest"
# group (encoded as 0). Swap in the commented block below to run another split.

# Bone Cement = 1 vs. [Cortical Bone, Trabecular Bone, Cartilage, Bone Marrow] = 0
i = 'boneCement'
k_1 = 'cortBone'
k_2 = 'traBone'
k_3 = 'cartilage'
k_4 = 'boneMarrow'

# Cortical Bone = 1 vs. [Trabecular Bone, Cartilage, Bone Marrow, Muscle] = 0
# i = 'cortBone'
# k_1 = 'traBone'
# k_2 = 'muscle'
# k_3 = 'cartilage'
# k_4 = 'boneMarrow'

In [None]:
print('First Label: ', i)
#print('Second Label: ',k)
print('Second Label: ' + k_1 + ', ' + k_2 + ', ' + k_3 + ', ' + k_4)

# Data selection for multiple tissue types
# df_subset keeps the original labels of the five classes involved
df_subset = df_dataset[df_dataset['target_y'].isin([i,k_1,k_2,k_3,k_4])]
# Work on an explicit copy so the relabelling below neither touches
# df_dataset nor triggers pandas' chained-assignment warning
binary_subset = df_subset.copy()
# Merge the four "rest" classes into one label; rows of class i keep their label.
# The mask comes from binary_subset itself, so it always matches its rows.
binary_subset.loc[binary_subset['target_y'].isin([k_1, k_2, k_3, k_4]), 'target_y'] = 'rest'
k = 'rest'

# Features
X_binary_init = binary_subset.drop(['target_y'], axis=1)
# Target
y_binary_init = binary_subset['target_y']
# Convert to 0 and 1 - i = 1, k = 0
y_binary_init = (y_binary_init == i).astype('uint8')

# Split Train Test Validation set
# Stratified 80/20 split; the 20 % part is only saved here, not used in this cell
X_binary, X_val_all, y_binary, y_val_all = train_test_split(X_binary_init, y_binary_init,
                                                            test_size=0.2, stratify=y_binary_init, random_state=random_state)

# Save train test sets
dump(X_binary, str(i) + '_' + str(k) + '_X_binary.joblib')
dump(X_val_all, str(i) + '_' + str(k) + '_X_val_all.joblib')
dump(y_binary, str(i) + '_' + str(k) + '_y_binary.joblib')
dump(y_val_all, str(i) + '_' + str(k) + '_y_val_all.joblib')

In [None]:
# Define parameters
# Number of components kept by PCA
pca_components = 30
# Minimum height (on the 0-1 normalised |loading| curve) for a peak to be kept
peak_height = 0.2

# Simulated annealing settings: pool of PCs, size of the selected subset, iterations
npc_total = 30
npc_select = 10
n_iter = 600

# Set custom color cycle
# Ten colours, all drawn with the same line width
custom_cycler = (cycler('color', ['#d62728', '#1f77b4', '#2ca02c', '#ff7f0e',
                                  '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']) +
                 cycler(lw=[1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5]))

# # Set matplotlib backend to non-interactive
# matplotlib.use('agg')

# Track timestamp - elapsed time
start_time = time.time()

print('First Label: ', i)
print('Second Label: ' + k_1 + ', ' + k_2 + ', ' + k_3 + ', ' + k_4)

# PCA - can separate this to make an individual function
# Fitted on the training part only (X_binary); the held-out part is not used here
pca_model = PCA(n_components=pca_components, svd_solver = 'full')
X_binary_pca = pca_model.fit_transform(X_binary)
pca_loadings = pca_model.components_
# Explained variance of all kept components together, in percent
total_var = pca_model.explained_variance_ratio_.sum() * 100
num_components = pca_model.n_components_
pc_var = pca_model.explained_variance_ratio_

# Store variables in lists
# (in this section each result is written to its own joblib file, named after the label pair)
dump(X_binary_pca, str(i) + '_' + str(k) + '_X_binary_pca.joblib')
dump(pca_loadings, str(i) + '_' + str(k) + '_pca_loadings.joblib')
dump(total_var, str(i) + '_' + str(k) + '_total_var.joblib')
dump(num_components, str(i) + '_' + str(k) + '_num_components.joblib')
dump(pc_var, str(i) + '_' + str(k) + '_pc_var.joblib')

# SA on PCA results - optimization
# A = pc_select_update
# B = lda_score_update
# C = all_clf_accuracy
# D = all_opt_ncomp
# E = all_pc_select
# F = all_pc_exclude
# G = update_count
# pc_sa_opt prints several lines per iteration, so its output is silenced
blockPrint()
try:
    A, B, C, D, E, F, G = pc_sa_opt(X_binary_pca, y_binary, cv=cv, npc_total=npc_total, npc_select=npc_select, n_iter=n_iter)
finally:
    # Restore printing even when the optimisation raises, otherwise every
    # later print in the session would stay silenced
    enablePrint()
print('Selected PCs: ', A)
dump(A, str(i) + '_' + str(k) + '_sa_select_pc.joblib')

# Plot SA Optimization
# C is an empty list when pc_sa_opt found an initial accuracy of exactly 1 and
# skipped the search; np.any(C) is used here as the "history is not empty" test
# NOTE(review): np.any(C) would also be False for a history of all-zero scores
if np.any(C):
    print('Clf accuracy using Selected PCs: ', C[-1])
    # Set the figure size
    f, ax = plt.subplots(figsize=(10, 8))
    # Tight layout
    f.tight_layout()
    # Plot
    # The history holds the initial score plus one entry per iteration, hence n_iter+1 points
    plt.plot(np.arange(n_iter+1), C, label='Clf Accuracy')
    # Set figure object
    ax.set_title('LDA Clf Accuracy Progression for ' + str(i) + ' vs. ' + str(k))
    ax.set_xlabel('Iterations')
    ax.set_ylabel('LDA Clf Accuracy (A.U.)')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    # Fixed bounding box passed to savefig so the legend outside the axes is included
    bbox = matplotlib.transforms.Bbox([[-0.5, -0.36], [11.45, 8.56]])
    # Save figure
    # An existing file with the same name is kept, not overwritten
    if not os.path.exists(str(i) + '_' + str(k) + '_LDA_clf_accuracy_opt.png'):
        f.savefig(os.getcwd() + '/' +  str(i) + '_' + str(k) + '_LDA_clf_accuracy_opt.png', dpi = 1080, bbox_inches =bbox)
    # Clear and close the figure, then collect garbage to free memory
    f.clear()
    plt.cla()
    plt.clf()
    plt.close('all')
    plt.close()
    gc.collect()

else:
    print('Clf accuracy using Selected PCs: 100%')


# PC plot labels - can separate this to make an individual function
# Maps each scatter-matrix dimension ("0", "1", ...) to "PC <n> (<explained variance>%)".
# The comprehension's own `i` is local to it, so the class label `i` is untouched.
labels = {str(c): f"PC {i+1} ({var:.1f}%)"
          for c, i, var in zip(range(len(A)), A, pca_model.explained_variance_ratio_[A] * 100)}
labels['color'] = 'Tissue Types'
# PCA Scatter plot
# Explained variance of the selected PCs only, in percent
selected_pca_var = pca_model.explained_variance_ratio_[A].sum() * 100
fig = px.scatter_matrix(X_binary_pca[:, A],
                        color=y_binary,
                        dimensions=range(len(A)),
                        labels=labels,
                        title=f'PCA Total Explained Variance: {selected_pca_var:.3f}%',
                        width = 1100,
                        height = 1100,
                        template='ggplot2')
fig.update_layout(font=dict(size=10))
fig.update_traces(diagonal_visible=False)
fig.update_xaxes(automargin=True)
fig.update_yaxes(automargin=True)
# Save figure
if not os.path.exists(str(i) + '_' + str(k) + '_pca.png'):
    fig.write_image(os.getcwd() + '/' +  str(i) + '_' + str(k) + '_pca.png', scale = 2)
# Close figure
# Emptying data and layout drops the plotted content held by the figure object
fig.data = []
fig.layout = {}

# PCA Loadings Plot - can separate this to make an individual function
# Set the figure size
f, ax = plt.subplots(figsize=(10, 8))
# Tight layout
f.tight_layout()
# Set colors
ax.set_prop_cycle(custom_cycler)
#plt.style.library['tableau-colorblind10']
# Labels
# `labels` is rebound here from the dict used above to a list of legend entries
labels = ['PC ' + str(i + 1) for i in A]
# Plot
# One curve per selected PC: absolute loading against wavelength
plt.plot(col_wavelengths, abs(pca_loadings[A,:]).T, label=labels)
# Set figure object
ax.set_title('PCA Loadings for ' + str(i) + ' vs. ' + str(k))
ax.set_xlabel('Wavelength (nm)')
ax.set_ylabel('PCA Loadings (A.U.)')
ax.set_xlim([350, 1850])
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
bbox = matplotlib.transforms.Bbox([[-0.2, -0.36], [11.45, 8.56]])
# Save figure
if not os.path.exists(str(i) + '_' + str(k) + '_PCA_loadings.png'):
    f.savefig(os.getcwd() + '/' +  str(i) + '_' + str(k) + '_PCA_loadings.png', dpi = 1080, bbox_inches =bbox)
# Clear and close the figure, then collect garbage to free memory
f.clear()
plt.cla()
plt.clf()
plt.close('all')
plt.close()
gc.collect()

# Find PCA loadings peak values - can separate this to make an individual function
# Scale each abs PCA loading curve between 0 and 1: every selected component
# (one row per entry of A) is divided by its own maximum absolute loading.
max_pts = np.max(abs(pca_loadings[A, :]), axis=1)
norm_abs_pca_loadings = abs(pca_loadings[A, :]) / max_pts[:, np.newaxis]

# Find indices of local peaks by user defined criteria
all_peaks_pca = np.array([]).astype(int)
for n in range(len(norm_abs_pca_loadings)):
    loading_curve = norm_abs_pca_loadings[n]
    # First pass: peaks at least 20 samples apart (20 to indicate a typical LED
    # FWHM), used only to compute the average prominence threshold.
    # NOTE(review): `distance` is in samples, not nm - this matches the 20 nm
    # rule below only if the wavelength grid has a 1 nm step; confirm.
    peaks_pca_prominence, _ = find_peaks(loading_curve, distance=20)
    avg_prominence = np.mean(peak_prominences(loading_curve, peaks_pca_prominence)[0])
    # Second pass: refine the selection; `height=peak_height` further removes
    # baseline noise and `prominence` keeps only above-average peaks.
    peaks_pca, _ = find_peaks(loading_curve, height=peak_height, distance=20,
                              prominence=avg_prominence)
    all_peaks_pca = np.concatenate([all_peaks_pca, peaks_pca], axis=None)

# Find duplicate peaks among all PCs and remove nearby neighbors
# dict.fromkeys drops exact duplicates while keeping first-seen order.
all_peaks_pca = list(dict.fromkeys(all_peaks_pca))
peak_dup = []
# For every pair of peaks within 20 (wavelength units) of each other, mark the
# later one for removal. The position comes straight from enumerate instead of
# searching the list again with a list-vs-scalar comparison.
for (_pos_x, x), (pos_y, y) in combinations(enumerate(all_peaks_pca), 2):
    if abs(col_wavelengths[x] - col_wavelengths[y]) <= 20:
        peak_dup.append(pos_y)

# Remove duplicated indices
peak_dup = np.unique(peak_dup)
# Remove the peaks. Test for "any positions recorded" by size, not by the
# truthiness of the stored position values.
if peak_dup.size > 0:
    all_peaks_pca_final = np.delete(all_peaks_pca, peak_dup)
else:
    all_peaks_pca_final = all_peaks_pca

# Store the selected column indices and the corresponding wavelengths
selected_peak_wavelengths = col_wavelengths[all_peaks_pca_final]
dump(all_peaks_pca_final, str(i) + '_' + str(k) + '_select_ind.joblib')
dump(selected_peak_wavelengths, str(i) + '_' + str(k) + '_select_wv.joblib')
print('Number of Selected Wavelength: ', len(selected_peak_wavelengths))
print('Selected Wavelength: ', selected_peak_wavelengths)

# Plot selected wavelengths on PC 1 - can separate this to make an individual function
# 10 x 8 inch figure with a single axes, using the custom color cycle
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()
ax.set_prop_cycle(custom_cycler)

# Absolute loadings of the selected components (one row per entry of A)
abs_selected_loadings = np.abs(pca_loadings[A, :])
# Loading curves, with the legend labels defined for the loadings plot
ax.plot(col_wavelengths, abs_selected_loadings.T, label=labels)
# Black markers at the selected wavelengths, placed on the first selected component
ax.plot(col_wavelengths[all_peaks_pca_final],
        abs_selected_loadings[0, all_peaks_pca_final], "ko")

# Titles, axis labels and the displayed wavelength range
ax.set_title(f"Selected Peaks for {i} vs. {k}")
ax.set_xlabel('Wavelength (nm)')
ax.set_ylabel('PCA Loadings (A.U.)')
ax.set_xlim(350, 1850)
# Legend sits outside the axes, to the right
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

# Bounding box (in inches) extended downwards to include the text below the axes
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])

# Add text - selected wavelengths, sorted, in axes coordinates below the plot
sorted_peak_wavelengths = np.sort(np.array(col_wavelengths[all_peaks_pca_final]))
ax.text(0, -0.13, 'Selected Wavelength: ' + str(sorted_peak_wavelengths),
        horizontalalignment='left', verticalalignment='center_baseline',
        transform=ax.transAxes)

# Save figure - an existing image with the same name is never overwritten
peaks_png_name = f"{i}_{k}_PCA_peaks.png"
if not os.path.exists(peaks_png_name):
    f.savefig(f"{os.getcwd()}/{peaks_png_name}", dpi=1080, bbox_inches=bbox)

# Release the figure and everything pyplot still holds, then collect garbage
f.clear()
plt.cla()
plt.clf()
plt.close('all')
plt.close()
gc.collect()

# Select a fixed number of wavelengths - K = 10, or every peak when fewer than 10 were found
n_final_features = min(10, len(all_peaks_pca_final))
final_select_wv = top_K_least_correlated(X_binary.iloc[:, all_peaks_pca_final], y_binary,
                                         K_features=n_final_features, selection=f_classif)
# final_select_wv indexes into the peak subset, so map it back to wavelengths
final_selected_wavelengths = col_wavelengths[all_peaks_pca_final][final_select_wv]
dump(final_selected_wavelengths, f"{i}_{k}_select_wv_final.joblib")
print('Final Selected Wavelength: ', final_selected_wavelengths)
# Plot selected wavelengths on DRS measurements - can separate this to make an individual function
# Mean and standard deviation spectra per tissue type (one row per 'target_y' group)
tissue_groups = df_subset.groupby(['target_y'])
avg_df_select_tissues = tissue_groups.mean()
stdev_df_select_tissues = tissue_groups.std()

# 10 x 8 inch figure with a single axes
f, ax = plt.subplots(figsize=(10, 8))
f.tight_layout()

# Legend labels, sorted alphabetically and assigned to the grouped rows by position
# NOTE(review): assumes the groupby row order equals this sorted order - confirm.
select_tissue_label = sorted([i, k_1, k_2, k_3, k_4])

# Custom color cycle: gray everywhere except red in the third slot
# NOTE(review): the red slot is fixed at position 3 regardless of which tissue
# is `i`; verify it lands on the intended tissue for the current selection.
custom_cycler_1 = (cycler('color', ['#808080', '#808080', '#d62728', '#808080', '#808080']) +
                   cycler(lw=[1.5] * 5))
ax.set_prop_cycle(custom_cycler_1)

# Mean spectrum per tissue with a shaded +/- one standard deviation band
for tissue in range(avg_df_select_tissues.shape[0]):
    mean_spectrum = avg_df_select_tissues.iloc[tissue, :]
    std_spectrum = stdev_df_select_tissues.iloc[tissue, :]
    ax.plot(col_wavelengths, mean_spectrum, label=select_tissue_label[tissue])
    pos_std = mean_spectrum + std_spectrum
    neg_std = mean_spectrum - std_spectrum
    ax.fill_between(col_wavelengths, neg_std, pos_std, alpha=0.08)

# Green vertical line at every finally selected wavelength
final_wavelengths_to_mark = col_wavelengths[all_peaks_pca_final][final_select_wv]
for wv in final_wavelengths_to_mark:
    ax.axvline(x=wv, color='#2ca02c', linestyle='-', alpha=0.6, lw=1.75)

# Add text - selected wavelengths, in axes coordinates below the plot
ax.text(0, -0.12,
        'Final Selected Wavelength: ' + str(np.array(final_wavelengths_to_mark)),
        horizontalalignment='left', verticalalignment='center_baseline',
        transform=ax.transAxes)

# Titles, axis labels and the displayed wavelength range
ax.set_title(f"Selected Peaks for {i} vs. {k}")
ax.set_xlabel('Wavelength (nm)')
ax.set_ylabel('Normalized Intensity (A.U.)')
ax.set_xlim(350, 1850)
# Legend sits outside the axes, to the right
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

# Bounding box (in inches) extended downwards to include the text below the axes
bbox = matplotlib.transforms.Bbox([[-0.45, -1.3], [11.45, 8.56]])

# Save figure - an existing image with the same name is never overwritten
select_wv_png_name = f"{i}_{k}_PCA_select_wv.png"
if not os.path.exists(select_wv_png_name):
    f.savefig(f"{os.getcwd()}/{select_wv_png_name}", dpi=1080, bbox_inches=bbox)

# Release the figure and everything pyplot still holds, then collect garbage
f.clear()
plt.cla()
plt.clf()
plt.close('all')
plt.close()
gc.collect()


# End timestamp
end_time = time.time()
# Elapsed wall-clock time in seconds since start_time was taken
elapsed_seconds = end_time - start_time
print('Time to run: ', elapsed_seconds/60, ' minutes')
# The stored value is in seconds; only the printed value is in minutes
dump(elapsed_seconds, f"{i}_{k}_total_time.joblib")
print('\n')

#### Plot Accuracy vs. Number of Features in the order given by the algorithm
- Alternatively, the features can be ranked by their f_classif scores.

In [None]:
# Load the final selected wavelengths
# (stored as floats; the variable computed above can be used directly instead)
select_wv_final = load('YOUR SELECTED WAVELENGTHS.joblib')
# String form of every wavelength (converted with str() if float)
select_wv_final_str = list(map(str, select_wv_final))
print('Final wv:, ', select_wv_final_str)

In [None]:
# Define dataset
df_dataset = df_smooth

# Tissue types
# `i` is the target tissue; `i` and `k` together form the prefix of every
# file loaded or saved below.
# NOTE(review): presumably k_1..k_4 are the tissues grouped under `k` - confirm.
i = 'boneCement'
k_1 = 'cortBone'
k_2 = 'traBone'
k_3 = 'cartilage'
k_4 = 'boneMarrow'

# Alternative selection (cortical bone as target); the file names below follow
# automatically because they are derived from `i` and `k`.
# i = 'cortBone'
# k_1 = 'traBone'
# k_2 = 'muscle'
# k_3 = 'cartilage'
# k_4 = 'boneMarrow'

k = 'rest'

# Select from the original dataset
#df_subset = df_dataset[df_dataset['target_y'].isin([i,k_1,k_2,k_3,k_4])]

# Common prefix of the stored splits, e.g. 'boneCement_rest'
split_prefix = f"{i}_{k}"

# Features
X_binary = load(f"{split_prefix}_X_binary.joblib") # train/test
X_val = load(f"{split_prefix}_X_val_all.joblib") # validation
# Target
y_binary = load(f"{split_prefix}_y_binary.joblib") # train/test
y_val = load(f"{split_prefix}_y_val_all.joblib") # validation

# Select the features
# NOTE(review): .loc needs the column labels to equal these strings exactly -
# assumes the stored frames use string column names; confirm.
selected_X_binary = X_binary.loc[:, select_wv_final_str]
selected_X_val = X_val.loc[:, select_wv_final_str]


In [None]:
# Calculate clf accuracy
# Can add F1 score calculation here

# Define cross validation - 10 folds, repeated 10 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)

# Mean / standard deviation of the plain accuracy, per number of features
all_lda_scores, all_lda_stdev = [], []
all_lda_scores_val, all_lda_stdev_val = [], []

# Mean / standard deviation of the balanced accuracy, per number of features
all_lda_balanced_scores, all_lda_balanced_stdev = [], []
all_lda_balanced_scores_val, all_lda_balanced_stdev_val = [], []

# Evaluate with the first 1, 2, ..., N selected wavelengths (in stored order)
for wv in range(len(select_wv_final_str)):
    # Define LDA model (fresh instance for every feature count)
    model = LinearDiscriminantAnalysis()

    # Calculate clf accuracy cv
    n_features = wv + 1
    print('Number of Features: ', n_features)
    lda_cv_scores, lda_balanced_scores = clf_accuracy_eval(
        selected_X_binary.iloc[:, :n_features], y_binary, model, cv)
    lda_cv_scores_val, lda_balanced_scores_val = clf_accuracy_eval(
        selected_X_val.iloc[:, :n_features], y_val, model, cv)

    # Append the mean and standard deviation of each score set to its lists
    for scores, mean_store, std_store in (
        (lda_cv_scores, all_lda_scores, all_lda_stdev),
        (lda_balanced_scores, all_lda_balanced_scores, all_lda_balanced_stdev),
        (lda_cv_scores_val, all_lda_scores_val, all_lda_stdev_val),
        (lda_balanced_scores_val, all_lda_balanced_scores_val, all_lda_balanced_stdev_val),
    ):
        mean_store.append(np.mean(scores))
        std_store.append(np.std(scores))

# Save variables - one joblib file per list, prefixed with "<i>_<k>_"
for values, file_tag in (
    (all_lda_scores, 'lda_score_train'),
    (all_lda_stdev, 'lda_std_train'),
    (all_lda_balanced_scores, 'lda_balanced_train'),
    (all_lda_balanced_stdev, 'lda_balanced_std_train'),
    (all_lda_scores_val, 'lda_score_val'),
    (all_lda_stdev_val, 'lda_std_val'),
    (all_lda_balanced_scores_val, 'lda_balanced_val'),
    (all_lda_balanced_stdev_val, 'lda_balanced_std_val'),
):
    dump(values, f"{i}_{k}_{file_tag}.joblib")

In [None]:
# Plot Clf Accuracy vs. Number of Features


def _plot_score_vs_n_features(scores, stdevs, xlabels, bbox, target, rest,
                              metric_title, metric_ylabel, file_tag):
    """Plot a mean score with error bars against the number of wavelengths used.

    Point ``n`` (x = n) is the score obtained with the first ``n`` wavelengths;
    its x tick is labelled with the wavelength added at that step. The figure
    is saved once and is left open so the notebook can still display it.

    Parameters
    ----------
    scores, stdevs : sequence of float
        Mean score and its standard deviation for 1..N wavelengths.
    xlabels : list of str
        Tick labels for x = 0..N ('0' followed by the N wavelength strings).
    bbox : matplotlib.transforms.Bbox
        Region (in inches) written to the image file.
    target, rest
        The two class names; used in the title and in the file name.
    metric_title, metric_ylabel : str
        Metric name used in the title and on the y axis.
    file_tag : str
        Suffix of the image file: ``<target>_<rest>_<file_tag>.png``.

    Returns
    -------
    (Figure, Axes)
        The created figure and axes.
    """
    n_wavelengths = len(xlabels) - 1
    x_positions = np.arange(1, n_wavelengths + 1)

    # Set the figure size
    f, ax = plt.subplots(figsize=(8, 6))
    # Tight layout
    f.tight_layout()

    # Dashed line through the means, plus markers with +/- one stdev error bars
    ax.plot(x_positions, scores, '--', lw=2.5)
    ax.errorbar(x_positions, scores, yerr=stdevs, fmt='o', capsize=1.5)
    plt.xticks(np.arange(0, n_wavelengths + 1, 1), xlabels)

    # Set figure object
    ax.set_title(metric_title + ' vs. Number of Wavelengths for ' + str(target) + ' vs. ' + str(rest))
    ax.set_xlabel('Wavelengths Included (nm)')
    ax.set_ylabel(metric_ylabel)

    # Save figure - an existing image with the same name is never overwritten
    png_name = str(target) + '_' + str(rest) + '_' + file_tag + '.png'
    if not os.path.exists(png_name):
        f.savefig(os.getcwd() + '/' + png_name, dpi=1080, bbox_inches=bbox)
    return f, ax


# Shared by all four figures: tick labels for x = 0..N and the saved region
xlabels = ['0'] + select_wv_final_str
bbox = matplotlib.transforms.Bbox([[-0.4, -0.4], [8.5, 6.5]])

# Plot - Accuracy on train
f, ax = _plot_score_vs_n_features(
    all_lda_scores, all_lda_stdev, xlabels, bbox, i, k,
    'Classification Accuracy', 'LDA Clf Accuracy', 'accuracy_nFeatures_train')

# Plot - Accuracy on validation
f, ax = _plot_score_vs_n_features(
    all_lda_scores_val, all_lda_stdev_val, xlabels, bbox, i, k,
    'Classification Accuracy', 'LDA Clf Accuracy', 'accuracy_nFeatures_val')

# Plot - Balanced accuracy on train
# (labelled as balanced accuracy so it cannot be mistaken for the plain
# accuracy figures above)
f, ax = _plot_score_vs_n_features(
    all_lda_balanced_scores, all_lda_balanced_stdev, xlabels, bbox, i, k,
    'Balanced Classification Accuracy', 'LDA Balanced Clf Accuracy',
    'balanced_nFeatures_train')

# Plot - Balanced accuracy on validation
f, ax = _plot_score_vs_n_features(
    all_lda_balanced_scores_val, all_lda_balanced_stdev_val, xlabels, bbox, i, k,
    'Balanced Classification Accuracy', 'LDA Balanced Clf Accuracy',
    'balanced_nFeatures_val')