# Decision Tree Classifier using One Feature

In [None]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import matplotlib.transforms

from scipy.signal import savgol_filter

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from itertools import combinations
import time
from joblib import dump

In [None]:
# Remember where the notebook was launched, then move into the data directory

# Launch directory; kept so later cells can navigate relative to it
work_path = os.getcwd()
print(work_path)

# Placeholder: replace with the folder that holds the input CSV
os.chdir('YOUR DATA DIRECTORY')

# Read the directory back so data_path holds the path as the OS reports it
data_path = os.getcwd()
print(data_path)
os.chdir(data_path)

In [None]:
# Load the dataset and derive per-class summaries

# Relative file names below are resolved inside the data directory
os.chdir(data_path)

# Full dataset - one label column ('target_y') plus one column per wavelength
df = pd.read_csv('YOUR DATA FILE.csv')

# Mean and standard deviation of every column, grouped by tissue label
_grouped_by_label = df.groupby(['target_y'])
avg_df_by_y = _grouped_by_label.mean()
stdev_df_by_y = _grouped_by_label.std()

# Every non-label column header is a wavelength; convert the headers to floats
col_wavelengths = df.columns.drop('target_y').astype(np.float64)
print('Number of wavelengths: ', len(col_wavelengths))

# Unique tissue labels, sorted in place (used for legends and pairings)
tissue_types = df['target_y'].unique()
tissue_types.sort()

In [None]:
# Savitzky–Golay (SG) smoothing of the raw data
# A short window relative to the polynomial order (w / p = 5 / 2 = 2.5) keeps
# the smoothing gentle while removing some baseline noise

# Target column and feature columns
y = df['target_y']
X_raw = df.drop(columns=['target_y'])

# SG settings: window length w and polynomial order p
w, p = 5, 2

# deriv=0 -> smoothing only (no derivative); axis=1 -> filter along the columns of each row
X_smooth = savgol_filter(X_raw, window_length=w, polyorder=p, deriv=0, axis=1)

# Smoothed dataframe: smoothed values under string wavelength headers, plus the label column
df_smooth = pd.concat(
    [
        pd.DataFrame(X_smooth, columns=col_wavelengths.astype("string")),
        y.rename('target_y'),
    ],
    axis=1,
)

# Per-class mean and standard deviation of the smoothed data
_smooth_by_label = df_smooth.groupby(['target_y'])
avg_df_by_y_smooth = _smooth_by_label.mean()
stdev_df_by_y_smooth = _smooth_by_label.std()

In [None]:
# Move to the output directory and remember it

# Start from the launch directory so a relative save path resolves against it
os.chdir(work_path)

# Placeholder: replace with the folder that should receive figures and results
os.chdir('YOUR SAVE PATH')

# Path of the output directory as reported by the OS
save_path = os.getcwd()
print(save_path)

#### Define functions

In [None]:
# Define a function to evaluate the model using classification metrics
def model_eval(X, y, model, cross_val):
    """Cross-validate ``model`` on ``(X, y)`` and collect per-fold metrics.

    The same estimator object is re-fitted on the training part of every
    split produced by ``cross_val`` and scored on the matching test part.
    No averaging is done here; each returned list has one entry per split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Rows are selected by position with ``.iloc``.
    y : pandas.Series
        Class labels, row-aligned with ``X`` (also indexed with ``.iloc``).
    model : estimator
        Object providing ``fit``, ``predict`` and, once fitted,
        ``feature_importances_``.
    cross_val : splitter
        Object whose ``split(X, y)`` yields ``(train_index, test_index)``
        pairs of positional indices.

    Returns
    -------
    tuple of list
        ``(balanced_accuracy, accuracy, importances, reports,
        confusion_matrices, kappa)``, in that order.
    """
    # Local import: the file-level imports do not bring this metric into scope
    from sklearn.metrics import cohen_kappa_score

    # Per-fold accumulators
    clf_accuracy_scores = []    # plain accuracy
    clf_balanced_scores = []    # balanced accuracy
    clf_importance_scores = []  # absolute feature importances of the fitted model
    clf_reports = []            # classification reports (dict form)
    conf_matrix_lists = []      # confusion matrices
    clf_kappa_scores = []       # Cohen's kappa coefficients

    # Evaluate model and calculate metrics for every train/test split
    for train_index, test_index in cross_val.split(X, y):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit on the training rows, predict the held-out rows
        model.fit(X_train, y_train)
        y_predict = model.predict(X_test)

        # Store each iteration
        clf_accuracy_scores.append(accuracy_score(y_test, y_predict))
        clf_balanced_scores.append(balanced_accuracy_score(y_test, y_predict))
        clf_importance_scores.append(np.abs(model.feature_importances_))
        # zero_division=0: report 0 instead of warning when a class is never predicted
        clf_reports.append(
            classification_report(y_test, y_predict, digits=4, output_dict=True, zero_division=0)
        )
        conf_matrix_lists.append(confusion_matrix(y_test, y_predict))
        # Previously this list was returned empty; the metric is now computed per split
        clf_kappa_scores.append(cohen_kappa_score(y_test, y_predict))

    return (clf_balanced_scores, clf_accuracy_scores, clf_importance_scores,
            clf_reports, conf_matrix_lists, clf_kappa_scores)

#### The One-vs-One (OVO) Approach
No separate validation dataset (cross-validation only)

In [None]:

# One-vs-One (OVO) evaluation: for every pair of tissue types, cross-validate
# a depth-2 decision tree on each single wavelength, plot accuracy against
# wavelength, and store all per-fold metrics.
# NOTE(review): this cell reads the raw `df`, whereas the OVR cells use
# `df_smooth` - confirm which dataset is intended here.

# Define random state
random_state = 42

# Define classification model - Decision Tree (seeded from random_state above)
dt_model_binary = DecisionTreeClassifier(max_depth=2, random_state=random_state)

# Define model evaluation method
cv_binary = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)

# Define binary combinations; the order is saved once so results can be mapped back to pairs
list_tissue_comb = list(combinations(tissue_types, 2))
if not os.path.exists('order_of_tissue_comb.joblib'):
    dump(list_tissue_comb, 'order_of_tissue_comb.joblib')

# Initiate lists for storing
# All averaged accuracy scores and standard deviations for all combinations
all_avg_accuracy_binary = []
all_avg_std_binary = []
# All classification metrics for all combinations
all_accuracy_binary = []
all_clf_reports_binary = []
all_conf_matrix_lists_binary = []
all_importance_binary = []
all_kappa_scores_binary = []
# Time of execution per combination (seconds)
total_time = []

print('The total number of combinations: ', len(list_tissue_comb))
print('\n')
print('Feature Selection: Only ONE feature\n')
print('Classification Model: Decision tree with max_depth = 2')
print('Cross Validation: Repeated stratified K fold with n_splits = 10 and n_repeats = 10')
print('\n')

for count, (i, k) in enumerate(list_tissue_comb, start=1):

    # Track timestamp - elapsed time
    start_time = time.time()

    print('Iteration: ', count)
    print('First label: ', i)
    print('Second label: ', k)

    # Data selection: keep only the rows of the two tissue types in this pair
    binary_subset = df.loc[df['target_y'].isin([i, k])]

    # Features
    X_binary = binary_subset.drop(['target_y'], axis=1)
    # Target
    y_binary = binary_subset['target_y']

    # Evaluate model accuracy - binary clf
    # Averaged accuracy scores and standard deviations for each wavelength
    avg_accuracy_binary = []
    avg_std_binary = []
    # Classification metrics for each wavelength
    accuracy_binary = []
    clf_reports_binary = []
    conf_matrix_lists_binary = []
    importance_binary = []
    kappa_scores_binary = []
    print('Model Evaluation Starts:')
    for n, wavelength in enumerate(col_wavelengths):
        print('Wavelength: ', wavelength)
        # Single-column feature table for the n-th wavelength
        binary_1Feature = pd.DataFrame(X_binary.iloc[:, n].values.reshape(-1, 1))
        # model_eval returns six lists, balanced accuracy first; it is not stored in this cell
        (_balanced_scores, accuracy_scores, importance_scores,
         clf_reports, conf_matrix_lists, kappa_scores) = model_eval(binary_1Feature, y_binary,
                                                                    dt_model_binary, cv_binary)
        mean_accuracy = np.mean(accuracy_scores) * 100
        std_accuracy = np.std(accuracy_scores) * 100
        print('Clf Accuracy: %.3f +- %.3f %%' % (mean_accuracy, std_accuracy))
        avg_accuracy_binary.append(mean_accuracy)
        avg_std_binary.append(std_accuracy)
        accuracy_binary.append(accuracy_scores)
        importance_binary.append(importance_scores)
        clf_reports_binary.append(clf_reports)
        conf_matrix_lists_binary.append(conf_matrix_lists)
        kappa_scores_binary.append(kappa_scores)

    # Store calculations from each combination in lists
    all_avg_accuracy_binary.append(avg_accuracy_binary)
    all_avg_std_binary.append(avg_std_binary)
    all_accuracy_binary.append(accuracy_binary)
    all_importance_binary.append(importance_binary)
    all_clf_reports_binary.append(clf_reports_binary)
    all_conf_matrix_lists_binary.append(conf_matrix_lists_binary)
    all_kappa_scores_binary.append(kappa_scores_binary)

    # Plot and Save Accuracy vs. Wavelengths
    # Set the figure size
    f, ax = plt.subplots(figsize=(10, 8))
    # Tight layout
    f.tight_layout()
    # Set colors
    ax.set_prop_cycle('color', cm.Spectral(np.linspace(0, 1, 11)))
    # Plot the mean curve with a +- one standard deviation band
    ax.plot(col_wavelengths, avg_accuracy_binary)
    pos_std = np.array(avg_accuracy_binary) + np.array(avg_std_binary)
    neg_std = np.array(avg_accuracy_binary) - np.array(avg_std_binary)
    ax.fill_between(col_wavelengths, neg_std, pos_std, alpha=0.2)
    # Set figure object
    ax.set_title('Accuracy of ' + str(i) + ' vs. ' + str(k))
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('Classification Accuracy (%)')
    ax.xaxis.set_ticks(np.arange(400, 1850, 100))
    ax.set_xlim([350, 1850])
    # Fixed bounding box (in inches) used to crop the saved image
    bbox = matplotlib.transforms.Bbox([[-0.2, -0.36], [10.3, 8.56]])
    # Save figure - the existence check and the save use the same path, and
    # an existing file is never overwritten
    figure_path = os.path.join(save_path, str(i) + '_' + str(k) + '_clf_accuracy.png')
    if not os.path.exists(figure_path):
        f.savefig(figure_path, dpi=1080, bbox_inches=bbox)
    # Release the figure so figures do not pile up across combinations
    plt.close(f)

    # End timestamp
    end_time = time.time()
    print('Time to run: ', end_time - start_time, ' seconds')
    total_time.append(end_time - start_time)
    print('\n')

print('Total time to run: ', sum(total_time)/3600, ' hours')
print('END')

# Save result variables (written to the current working directory)
dump(all_avg_accuracy_binary, 'all_avg_accuracy_binary.joblib')
dump(all_avg_std_binary, 'all_avg_std_binary.joblib')
dump(all_accuracy_binary, 'all_accuracy_binary.joblib')
dump(all_importance_binary, 'all_importance_binary.joblib')
dump(all_clf_reports_binary, 'all_clf_reports_binary.joblib')
dump(all_conf_matrix_lists_binary, 'all_conf_matrix_lists_binary.joblib')
dump(all_kappa_scores_binary, 'all_kappa_scores_binary.joblib')


#### The One-vs-Rest (OVR) Approach
No separate validation dataset (cross-validation only)

In [None]:
# Move into the OVR output sub-folder

# Start from the main output directory so a relative sub-folder resolves against it
os.chdir(save_path)

# Placeholder: replace with the sub-folder that should receive the OVR results
os.chdir('YOUR SAVE PATH - SUBFOLDER')
print(os.getcwd())

In [None]:
# OVR configuration: dataset, cross-validation scheme and class grouping

# Set global random state
random_state = 42

# Dataset used for the OVR runs - the smoothed data
df_dataset = df_smooth
# Self-assignment has no effect as written; kept as the spot to substitute another label set
tissue_types = tissue_types

# Define cross validation - 10 folds repeated 10 times, stratified by label
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=random_state)

# Alternative grouping (disabled):
# Bone Cement = 1 vs. [Cortical Bone, Trabecular Bone, Cartilage, Bone Marrow] = 0
# i = 'boneCement'
# k_1 = 'cortBone'
# k_2 = 'traBone'
# k_3 = 'cartilage'
# k_4 = 'boneMarrow'

# Active grouping:
# Cortical Bone = 1 vs. [Trabecular Bone, Muscle, Cartilage, Bone Marrow] = 0
i = 'cortBone'
k_1, k_2, k_3, k_4 = 'traBone', 'muscle', 'cartilage', 'boneMarrow'


In [None]:
# OVR data selection: class `i` against the pooled classes k_1..k_4
# boneCement vs. Rest
# cortBone vs. Rest

# Labels that are pooled into the collective 'rest' class
rest_labels = [k_1, k_2, k_3, k_4]

# Data selection for multiple tissue types - original labels kept
df_subset_ovr = df_dataset[df_dataset['target_y'].isin([i] + rest_labels)]

# Work on an explicit copy so the relabelling below is an ordinary assignment
# (no SettingWithCopyWarning) and leaves df_dataset / df_subset_ovr untouched
binary_subset = df_subset_ovr.copy()
# The mask comes from the frame being modified, not from another dataframe.
# Rows labelled `i` already carry the right label and need no assignment.
binary_subset.loc[binary_subset['target_y'].isin(rest_labels), 'target_y'] = 'rest'
k = 'rest'

# Features
X_binary = binary_subset.drop(['target_y'], axis=1)
# Target
y_binary = binary_subset['target_y']
# Convert to 0 and 1 - i = 1, k = 0
y_binary = (y_binary == i).astype('uint8')

# Save the feature table and binary target used for this OVR run
dump(X_binary, str(i) + '_' + str(k) + '_X_binary.joblib')
dump(y_binary, str(i) + '_' + str(k) + '_y_binary.joblib')

In [None]:
# OVR evaluation: cross-validate the decision tree on each single wavelength
# for class `i` vs. 'rest', plot accuracy and balanced accuracy against
# wavelength, and store the results in the current working directory.


def _save_score_figure(mean_scores, std_scores, title, y_label, file_name):
    """Plot mean +- std scores against wavelength and save the figure as a PNG.

    The figure is written to the current working directory under
    ``file_name`` unless a file of that name already exists there. All open
    figures are closed afterwards.
    """
    # Set the figure size
    f, ax = plt.subplots(figsize=(10, 8))
    # Tight layout
    f.tight_layout()
    # Set colors
    ax.set_prop_cycle('color', cm.Spectral(np.linspace(0, 1, 11)))
    # Mean curve with a +- one standard deviation band
    ax.plot(col_wavelengths, mean_scores)
    pos_std = np.array(mean_scores) + np.array(std_scores)
    neg_std = np.array(mean_scores) - np.array(std_scores)
    ax.fill_between(col_wavelengths, neg_std, pos_std, alpha=0.2)
    # Set figure object
    ax.set_title(title)
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel(y_label)
    ax.xaxis.set_ticks(np.arange(400, 1850, 100))
    ax.set_xlim([350, 1850])
    # Fixed bounding box (in inches) used to crop the saved image
    bbox = matplotlib.transforms.Bbox([[-0.2, -0.36], [10.3, 8.56]])
    # Save figure - an existing file is never overwritten
    if not os.path.exists(file_name):
        f.savefig(os.path.join(os.getcwd(), file_name), dpi=1080, bbox_inches=bbox)
    # One call closes this figure and any other figure still open
    plt.close('all')


print('First Label: ', i)
#print('Second Label: ',k)
print('Second Label: ' + k_1 + ', ' + k_2 + ', ' + k_3 + ', ' + k_4)
print('Collectively: ', k)
# Evaluate model accuracy - OVR clf
# Averaged scores and standard deviations for each wavelength
avg_accuracy_ovr = []
avg_std_ovr = []
avg_balanced_ovr = []
avg_balanced_std_ovr = []
# Per-fold scores for each wavelength
accuracy_ovr = []
balanced_ovr = []

# Track timestamp - elapsed time
start_time = time.time()

for n in range(X_binary.shape[1]):
    # One Feature Classification using DecisionTree Classifier
    binary_1Feature = pd.DataFrame(X_binary.iloc[:, n].values.reshape(-1, 1))
    # Uses `cv`, the splitter defined in the OVR configuration cell
    # (same settings as the OVO splitter, so the splits are unchanged)
    balanced_scores, accuracy_scores, _, _, _, _ = model_eval(binary_1Feature, y_binary,
                                                              dt_model_binary, cv)
    # Store clf metrics
    avg_accuracy_ovr.append(np.mean(accuracy_scores) * 100)
    avg_std_ovr.append(np.std(accuracy_scores) * 100)
    avg_balanced_ovr.append(np.mean(balanced_scores) * 100)
    avg_balanced_std_ovr.append(np.std(balanced_scores) * 100)
    # Store more clf metrics
    accuracy_ovr.append(accuracy_scores)
    balanced_ovr.append(balanced_scores)

# Plot and Save Accuracy vs. Wavelengths
_save_score_figure(avg_accuracy_ovr, avg_std_ovr,
                   'Accuracy of ' + str(i) + ' vs. ' + str(k),
                   'Classification Accuracy (%)',
                   str(i) + '_' + str(k) + '_clf_accuracy.png')

# Plot and Save Balanced Accuracy vs. Wavelengths
_save_score_figure(avg_balanced_ovr, avg_balanced_std_ovr,
                   'Balanced Accuracy of ' + str(i) + ' vs. ' + str(k),
                   'Balanced Classification Accuracy (%)',
                   str(i) + '_' + str(k) + '_balanced_clf_accuracy.png')

# End timestamp
end_time = time.time()
print('Time to run: ', end_time - start_time, ' seconds')
# NOTE(review): total_time is the list created in the OVO cell, so the file
# saved below also contains the OVO timings - confirm this is intended.
total_time.append(end_time - start_time)
print('\n')

# Save result variables
dump(avg_accuracy_ovr, str(i) + '_' + str(k) + '_avg_accuracy_ovr.joblib')
dump(avg_std_ovr, str(i) + '_' + str(k) + '_avg_std_ovr.joblib')
dump(avg_balanced_ovr, str(i) + '_' + str(k) + '_avg_balanced_ovr.joblib')
dump(avg_balanced_std_ovr, str(i) + '_' + str(k) + '_avg_balanced_std_ovr.joblib')
dump(accuracy_ovr, str(i) + '_' + str(k) + '_accuracy_ovr.joblib')
dump(balanced_ovr, str(i) + '_' + str(k) + '_balanced_ovr.joblib')
dump(total_time, str(i) + '_' + str(k) + '_total_time_ovr.joblib')