<div>
<img src='../../img/WSP_red.png' style='height: 95px; float: left' alt='WSP Logo'/>
<img src='../../img/austroads.png' style='height: 115px; float: right' alt='Client Logo'/>
</div>
<center><h2>AAM6201 Development of Machine-Learning Decision-Support tools for Pavement Asset Management<br>Case Study 1: Project Identification</h2></center>


## Importing modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import shap

from src.visualization import plot_metric
from src import util
from pathlib import Path

## Loading models and data

In [None]:
from data import DATA_DIR

# Report and model folders are resolved relative to the parent of the data directory.
report_dir = DATA_DIR.parent.joinpath('reports', 'raw_results')
model_dir = DATA_DIR.parent.joinpath('models', 'trained')

In [None]:
# Jurisdictions handled by this notebook and the run tag embedded in artefact names.
juris = ['WA', 'NSW', 'NZ']
suffix = 'final_even_split'

# Jurisdiction code -> lower-case prefix used in model directory and file names.
juri_to_save_name = {
    'WA': 'mrwa', 'NZ': 'nzta', 'NSW': 'nsw'
}

# One empty slot per jurisdiction; filled in by the loading loop below.
model_dict = {}
for juri in juris:
    model_dict[juri] = {
        'models': {'XGB': None},
        'prediction_columns': None,
    }

for juri in juris:
    agency = juri_to_save_name[juri]
    juri_model_dir = model_dir / agency.upper()
    # Both pickles of a jurisdiction live in the same "<prefix>_<suffix>_dir" folder.
    run_dir = juri_model_dir / f'{agency}_{suffix}_dir'

    # NOTE: pickle.load can execute arbitrary code; only load trusted artefacts.
    model_path = run_dir / f'train_XGB_timehorizon_{agency}_{suffix}.pkl'
    with open(model_path, 'rb') as f:
        model_dict[juri]['models']['XGB'] = pickle.load(f)

    columns_path = run_dir / f'train_labels_columns_{agency}_{suffix}.pkl'
    with open(columns_path, 'rb') as f:
        model_dict[juri]['prediction_columns'] = pickle.load(f)

In [None]:
from src import util

# Per-jurisdiction folders under DATA_DIR / 'processed' that hold the training tables.
processed_dir = DATA_DIR / 'processed'
mrwa_dir = processed_dir / 'MRWA' / 'mrwa_final'
nzta_dir = processed_dir / 'NZTA' / 'nzta_final'
nsw_dir = processed_dir / 'NSW' / 'final'

# Label tables; header=[0, 1] is handed to util.load_data
# (presumably producing two-level column names -- confirm against util.load_data).
train_flattened_mrwa_labels = util.load_data(mrwa_dir / 'train_flattened_labels_mrwa_final_no_offset.csv', header=[0, 1])
train_flattened_nzta_labels = util.load_data(nzta_dir / 'train_flattened_labels_nzta_final_no_offset.csv', header=[0, 1])
train_flattened_nsw_labels = util.load_data(nsw_dir / 'labels_all.csv', header=[0, 1])

# Feature tables.
train_flattened_mrwa = util.load_data(mrwa_dir / 'train_flattened_data_mrwa_final_no_offset.csv')
train_flattened_nzta = util.load_data(nzta_dir / 'train_flattened_data_nzta_final_no_offset.csv')
train_flattened_nsw = util.load_data(nsw_dir / 'train_all.csv')

# Jurisdiction code -> feature table / label table.
data_dict = dict(
    WA=train_flattened_mrwa,
    NZ=train_flattened_nzta,
    NSW=train_flattened_nsw,
)

label_dict = dict(
    WA=train_flattened_mrwa_labels,
    NZ=train_flattened_nzta_labels,
    NSW=train_flattened_nsw_labels,
)

In [None]:
# Directory and file name of the cached raw SHAP results.
save_fig_dir = report_dir.parent / 'figures' / 'feature_inspection' / 'even_split'

save_name = 'raw_shap_results_pos_background.pkl'

raw_shap_path = save_fig_dir / save_name
if not raw_shap_path.exists():
    # Fail here with an actionable message; otherwise `raw_shap_result` stays
    # undefined and the next cell stops with an unrelated-looking NameError.
    raise FileNotFoundError(
        f'Cached SHAP results not found at {raw_shap_path}; '
        'generate them before running the rest of this notebook.'
    )

# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(raw_shap_path, 'rb') as f:
    raw_shap_result = pickle.load(f)

## Average SHAP Values

In [None]:
# Average the SHAP arrays over the models stored for each prediction column.
juris = ['WA', 'NZ', 'NSW']

avg_shap_results = {}
for juri in juris:
    avg_shap_results[juri] = {}
    for pred_col in raw_shap_result[juri]:
        # Placeholder that survives only when there is no idx entry to average.
        avg_shap_results[juri].setdefault(pred_col, {})
        per_idx = raw_shap_result[juri][pred_col]
        for idx in per_idx:
            per_model = per_idx[idx]
            # Element-wise running sum; starts from the scalar 0 so the first add broadcasts.
            shap_total = 0
            for mod in per_model:
                shap_total = np.add(shap_total, per_model[mod])
            # NOTE(review): the mean is stored per pred_col (not per idx), so if several
            # idx entries exist only the last one is kept -- confirm this is intended.
            avg_shap_results[juri][pred_col] = shap_total / len(per_model)

In [None]:
juris = ['WA', 'NZ', 'NSW']
train_datasets = [train_flattened_mrwa, train_flattened_nzta, train_flattened_nsw]

# For each jurisdiction / prediction column / feature: positional row index at which
# the averaged SHAP value has the smallest absolute value.
zero_shap_locs_dict = {juri: {} for juri in juris}

for juri, train_data in zip(juris, train_datasets):
    for pred_col, shap_matrix in avg_shap_results[juri].items():
        locs = zero_shap_locs_dict[juri].setdefault(pred_col, {})

        # Column j of the SHAP matrix is paired with the j-th column of the training table.
        for j, feature in enumerate(train_data.columns):
            abs_shap_list = np.abs(shap_matrix[:, j])
            locs[feature] = np.argmin(abs_shap_list)

## Feature Values for SHAP = 0 (Normalised)

In [None]:
juris = ['WA', 'NZ', 'NSW']
train_datasets = [train_flattened_mrwa, train_flattened_nzta, train_flattened_nsw]

# For each jurisdiction / prediction column / feature: the training-table value found
# at the row location recorded in zero_shap_locs_dict.
feat_vals_dict = {juri: {} for juri in juris}

for juri, train_data in zip(juris, train_datasets):
    for pred_col, locs in zero_shap_locs_dict[juri].items():
        values = feat_vals_dict[juri].setdefault(pred_col, {})
        for each_feat, inst in locs.items():
            # `inst` is a positional index, hence .iloc.
            values[each_feat] = train_data[each_feat].iloc[inst]

### Spearman Rank Correlation Coefficient

In [None]:
from typing import List, Tuple
from scipy.stats import spearmanr

def get_magnitude_direction(shap_values_lst: List[float], feature_values_lst: List[float]) -> Tuple[int, float, object, float]:
    """Summarise how a feature's SHAP values relate to the feature's own values.

    The relationship is measured with the Spearman rank correlation between
    ``shap_values_lst`` and ``feature_values_lst``.

    Parameters
    ----------
    shap_values_lst
        SHAP values of one feature, one entry per sample.
    feature_values_lst
        Values of the same feature, aligned with ``shap_values_lst``.

    Returns
    -------
    tuple
        ``(direction, corr_magnitude, low_or_high, signed_mean_abs_shap)`` where

        * ``direction`` is 1 if the correlation coefficient is > 0, otherwise -1;
        * ``corr_magnitude`` is ``|coefficient|`` when the p-value is < 0.05,
          otherwise 0;
        * ``low_or_high`` is ``'low'`` when ``corr_magnitude`` < 0.5, otherwise
          ``'high'``;
        * ``signed_mean_abs_shap`` is the mean absolute SHAP value multiplied by
          ``direction``.

        When the largest absolute SHAP value is below 1e-5 the feature is treated
        as non-contributing and ``(0, 0, 0, 0)`` is returned (note that the third
        element is then the integer 0, not a string).

    Raises
    ------
    Exception
        Any error raised while computing the correlation (including floating-point
        errors, which NumPy is told to raise here) is re-raised unchanged after
        diagnostics have been printed.
    """
    # if the feature has 0 contribution
    if np.max(np.abs(shap_values_lst)) < 1e-5:
        return 0, 0, 0, 0

    try:
        with np.errstate(all='raise'):
            corr_coeff, pval = spearmanr(shap_values_lst, feature_values_lst)
            return_dir = 1 if corr_coeff > 0 else -1
            return_val = np.mean(np.abs(shap_values_lst)) * return_dir
            # A correlation that is not significant at the 5% level counts as zero.
            corr_coeff_magnitude = np.abs(corr_coeff) if pval < 0.05 else 0
            low_or_high = 'low' if corr_coeff_magnitude < 0.5 else 'high'

    except Exception:
        # `feature_values_lst` may be a plain list/array without a `.name`; do not let
        # the diagnostics themselves hide the original error.
        print('Feature name: ', getattr(feature_values_lst, 'name', '<unnamed>'))
        print('Sample of feature values: ', np.random.choice(feature_values_lst, size=10))
        print('Shap values max abs: ', np.max(np.abs(shap_values_lst)))
        print('Sample of shap values: ', np.random.choice(shap_values_lst, size=10))
        raise

    return return_dir, corr_coeff_magnitude, low_or_high, return_val

In [None]:
juris = ['WA', 'NZ', 'NSW']
train_datasets = [train_flattened_mrwa, train_flattened_nzta, train_flattened_nsw]

# For each jurisdiction / prediction column / feature: the tuple returned by
# get_magnitude_direction for that feature's SHAP values and data values.
spearman_corr_dict = {juri: {} for juri in juris}

for juri, train_data in zip(juris, train_datasets):
    # Features with at most two distinct values are excluded. The distinct-value
    # count depends only on the data, so it is computed once per jurisdiction
    # rather than once per prediction column.
    continuous_cols = [
        (col_pos, feature)
        for col_pos, feature in enumerate(train_data.columns)
        if train_data.iloc[:, col_pos].nunique() > 2
    ]

    for pred_col, shap_matrix in avg_shap_results[juri].items():
        col_results = spearman_corr_dict[juri].setdefault(pred_col, {})
        # A dedicated index name is used so the outer loop variables are not shadowed.
        for col_pos, feature in continuous_cols:
            col_results[feature] = get_magnitude_direction(
                shap_matrix[:, col_pos], train_data.iloc[:, col_pos]
            )

### Loading the PREPROCESSED files

In [None]:
preprocess_dir = model_dir.parent / 'preprocessing_state'

# Jurisdiction code -> (sub-directory, file name) of its pickled preprocessing state.
preproc_files = {
    'WA': ('mrwa', 'preprocessing_state_dict_mrwa_final.sav'),
    'NZ': ('nzta', 'preprocessing_state_dict_nzta_final.sav'),
    'NSW': ('nsw', 'preprocessing_state_dict.sav'),
}

preproc_dict = {}
for juri_key, (subdir, filename) in preproc_files.items():
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(preprocess_dir / subdir / filename, 'rb') as f:
        preproc_dict[juri_key] = pickle.load(f)

## Visualisations

In [None]:
# Matplotlib colour used for each jurisdiction in the figures.
palette = dict(WA='tab:blue', NZ='tab:red', NSW='tab:orange')

#### Spearman Rank Correlation coefficient bar plots

In [None]:
from matplotlib.ticker import FixedLocator

def from_dict_to_df(subdict):
    """Build a DataFrame from ``subdict`` and return its transpose.

    For a dict of equal-length sequences this yields one row per key.
    """
    frame = pd.DataFrame(subdict)
    return frame.T


# Time-horizon label -> short "Year ..." text used in subplot titles.
year_map_dict = dict([
    ('Treatment within 1 year', 'Year 1'),
    ('Treatment between 1 to 3 years', 'Year 2 - 3'),
    ('Treatment between 3 to 5 years', 'Year 4 - 5'),
    ('Treatment between 5 to 10 years', 'Year 6 - 10'),
])

In [None]:
# Bar plots of the signed Spearman correlation: one figure per jurisdiction with a
# 4 x 3 grid of subplots (row = time horizon, column = treatment type).
row_dict = {t: i for i, t in enumerate(['Treatment within 1 year', 'Treatment between 1 to 3 years', 'Treatment between 3 to 5 years', 'Treatment between 5 to 10 years'])}
col_dict = {t: i for i, t in enumerate(['Resurfacing_SS', 'Resurfacing_AC', 'Rehabilitation'])}

# All figures share one output directory; create it together with any missing parents.
save_fig_dir = report_dir.parent / 'figures' / 'inflection_point'
save_fig_dir.mkdir(parents=True, exist_ok=True)

for juri in ['WA', 'NZ', 'NSW']:

    spear_corr = spearman_corr_dict[juri]
    file_save_name = f'spearman_corr_{juri.lower()}_balanced.jpg'
    fig = plt.figure(figsize=(27,36))

    for col in spear_corr:
        # Labels without a slot in the grid are not drawn.
        if (col[1] not in col_dict) or (col[0] not in row_dict):
            continue

        ax = fig.add_subplot(4, 3, row_dict[col[0]] * 3 + col_dict[col[1]] + 1)

        # One row per feature; column 0 holds the direction, column 1 the magnitude.
        col_df = from_dict_to_df(spear_corr[col])
        bar_positions = np.arange(len(spear_corr[col]))

        # plot bar plot
        ax.bar(
            x=bar_positions,
            height=col_df.iloc[:, 1] * col_df.iloc[:, 0], # magnitude * direction
            color=[palette[juri] if c > 0.5 else 'gray' for c in col_df.iloc[:, 1]], # threshold at magnitude = 0.5
            alpha=0.6
        )

        ax.set_ylabel('Correlation Coefficient')
        ax.set_title(f'{year_map_dict[col[0]]} - {col[1]}')
        # Fix the tick positions before assigning one label per bar.
        ax.xaxis.set_major_locator(FixedLocator(bar_positions))
        ax.xaxis.set_tick_params(direction='out')
        ax.xaxis.set_ticks_position('bottom')
        ax.set_xticklabels(
            [feature.replace("_df0", "").replace("|idx=0", "") for feature in col_df.index],
            rotation=45, ha='right'
        )
        # Both threshold lines are drawn; only one goes in the legend to avoid a duplicate entry.
        threshold_line = ax.axhline(y=0.5, label='Threshold', c='r', linestyle='--')
        ax.axhline(y=-0.5, label='Threshold', c='r', linestyle='--')
        ax.legend(handles=[threshold_line])
        # The grid is switched off explicitly. The previous grid() / grid(b=None) pair
        # toggled it on and off again, and the `b` keyword is no longer accepted by
        # current Matplotlib releases.
        ax.grid(False)

    fig.suptitle(f'Spearman Correlation Plot - {juri}\nCorrelation between features and their corresponding SHAP values\nBinary features are excluded\nSampling performed to correct class imbalance', fontsize=25)
    fig.tight_layout(rect=[0, 0, 1, 0.98])

    fig.savefig(save_fig_dir / file_save_name)
    # Close the figure so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Example bar plot: a single WA label drawn on its own 9 x 9 figure.
juri = 'WA'
spear_corr = spearman_corr_dict[juri]
file_save_name = 'spearman_corr_balanced_example.png'
col = ('Treatment between 1 to 3 years', 'Rehabilitation')

fig, ax = plt.subplots(1, 1, figsize=(9, 9))

# One row per feature; column 0 holds the direction, column 1 the magnitude.
col_df = from_dict_to_df(spear_corr[col])
bar_positions = np.arange(len(spear_corr[col]))

# plot bar plot
ax.bar(
    x=bar_positions,
    height=col_df.iloc[:, 1] * col_df.iloc[:, 0], # magnitude * direction
    color=[palette[juri] if c > 0.5 else 'gray' for c in col_df.iloc[:, 1]], # threshold at magnitude = 0.5
    alpha=0.6
)

ax.set_ylabel('Correlation Coefficient')
ax.set_title(f'{year_map_dict[col[0]]} - {col[1]}')
# Fix the tick positions before assigning one label per bar.
ax.xaxis.set_major_locator(FixedLocator(bar_positions))
ax.xaxis.set_tick_params(direction='out')
ax.xaxis.set_ticks_position('bottom')
ax.set_xticklabels(
    [feature.replace("_df0", "").replace("|idx=0", "") for feature in col_df.index],
    rotation=45, ha='right'
)
# Both threshold lines are drawn; only one goes in the legend to avoid a duplicate entry.
threshold_line = ax.axhline(y=0.5, label='Threshold', c='r', linestyle='--')
ax.axhline(y=-0.5, label='Threshold', c='r', linestyle='--')
ax.legend(handles=[threshold_line])
# The grid is switched off explicitly. The previous grid() / grid(b=None) pair toggled
# it on and off again, and the `b` keyword is no longer accepted by current Matplotlib.
ax.grid(False)

fig.suptitle(f'Spearman Correlation Plot - {juri}\nCorrelation between features and their corresponding SHAP values\nBinary features are excluded\nSampling performed to correct class imbalance', fontsize=16)
fig.tight_layout(rect=[0, 0, 1, 0.98])

# Create the output directory together with any missing parents.
save_fig_dir = report_dir.parent / 'figures' / 'inflection_point'
save_fig_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(save_fig_dir / file_save_name, dpi=200)
plt.close(fig)

## Split Violin Plots

Violin plots of each feature's distribution, split by whether or not the treatment was given, and annotated with the feature's inflection point (the feature value at which its average SHAP value is closest to 0).

In [None]:
from seaborn import violinplot

# Upper y-axis limit used for each jurisdiction's violin plots.
ylim_dict = dict(WA=6, NZ=4, NSW=4)

In [None]:
# Split violin plots: one figure per jurisdiction with a 4 x 3 grid of subplots
# (row = time horizon, column = treatment type).
row_dict = {t: i for i, t in enumerate(['Treatment within 1 year', 'Treatment between 1 to 3 years', 'Treatment between 3 to 5 years', 'Treatment between 5 to 10 years'])}
col_dict = {t: i for i, t in enumerate(['Resurfacing_SS', 'Resurfacing_AC', 'Rehabilitation'])}

# All figures share one output directory; create it together with any missing parents.
save_fig_dir = report_dir.parent / 'figures' / 'inflection_point' / 'split_violin'
save_fig_dir.mkdir(parents=True, exist_ok=True)

for juri in ['WA', 'NZ', 'NSW']:
    spear_corr = spearman_corr_dict[juri]
    train_data = data_dict[juri]
    label_data = label_dict[juri]
    file_save_name = f'violin_{juri.lower()}_balanced_split.jpg'
    colour = palette[juri]
    preproc = preproc_dict[juri]
    ylimit = ylim_dict[juri]
    fig = plt.figure(figsize=(27,36))

    for col in spear_corr:
        # Labels without a slot in the grid are not drawn.
        if (col[1] not in col_dict) or (col[0] not in row_dict):
            continue
        ax = fig.add_subplot(4, 3, row_dict[col[0]] * 3 + col_dict[col[1]] + 1)
        label_colno = label_data.columns.get_loc(col)

        # only take non-binary features whose correlation magnitude is > 0.5
        col_df = from_dict_to_df(spear_corr[col])
        features = col_df[col_df.iloc[:, 1] > 0.5]
        ft_list = [ft for ft in features.index.values if train_data[ft].nunique() > 2]

        if not ft_list:
            # Nothing to draw: without this guard the violin/legend code below fails
            # because no hue legend is created for empty data.
            ax.set_title(f'{year_map_dict[col[0]]} - {col[1]}', fontsize=18)
            ax.text(
                0.5, 0.5, 'No continuous feature with |Correlation| > 0.5',
                ha='center', va='center', transform=ax.transAxes, fontsize=16
            )
            ax.set_xticks([])
            ax.set_yticks([])
            continue

        # add line per column - inflection point
        # Each violin occupies an equal share of the axis width; xmin/xmax of axhline
        # are given as fractions of that width.
        slot_width = 1 / len(ft_list)
        for feat_idx, feat in enumerate(ft_list):
            x_start = 0.05 + feat_idx * slot_width
            x_end = slot_width - 0.05 + feat_idx * slot_width
            y_val = feat_vals_dict[juri][col][feat]
            ax.axhline(y=y_val, xmin=x_start, xmax=x_end, color='red', linewidth=3)

            # NSW feature names are used as-is; for the other jurisdictions the
            # "_df0|idx=0" suffix is stripped before looking up the scaler.
            if juri == 'NSW':
                feat_name = feat
            else:
                feat_name = feat.replace("_df0|idx=0", '')
            # Map the plotted value back through the feature's scaler for the annotation.
            feat_val = preproc['scaler'][feat_name].inverse_transform([y_val])[0]
            ax.annotate(str(round(feat_val,1)), xy=(feat_idx+0.1, y_val+0.25), xycoords='data', size=18)

        # Work on an explicit copy so that adding the label column never writes into
        # (or warns about) a slice of the training table.
        cut_data = train_data.loc[:, ft_list].copy()
        feature_names = list(cut_data.columns)
        cut_data['label'] = label_data.iloc[:, label_colno]

        df = pd.melt(cut_data, value_vars=feature_names, id_vars='label')

        violinplot(x='variable', y='value', hue='label', data=df, split=True, color=colour, ax=ax)
        plt.setp(ax.collections, alpha=0.6)
        ax.set_ylim(top=ylimit)
        ax.set_ylabel('Normalised feature values', fontsize=16)
        ax.set_title(f'{year_map_dict[col[0]]} - {col[1]}', fontsize=18)
        ax.set_xticklabels(
                [name.replace("_df0", "").replace("|idx=0", "") for name in feature_names],
                rotation=45, ha='right', fontsize=16)
        ax.xaxis.label.set_visible(False)

        # Set legend #
        # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles (the old name
        # was removed later); try the new attribute first and fall back to the old one.
        hue_handles = getattr(ax.legend_, 'legend_handles', None)
        if hue_handles is None:
            hue_handles = ax.legend_.legendHandles
        # Keep only the second hue entry and relabel it.
        leg = ax.legend(handles=[hue_handles[1]], labels=['Treatment given'])
        leg_handles = getattr(leg, 'legend_handles', None)
        if leg_handles is None:
            leg_handles = leg.legendHandles
        for lh in leg_handles:
            lh.set_alpha(0.6)
        ax.grid(False)

    fig.suptitle(f'Sample Distribution across Features - {juri}\nOnly continuous features with |Correlation| > 0.5 are shown\nInflection points are indicated with red lines\nAnnotations show each feature\'s inflection point in its original data space\nSampling performed to correct for class imbalance', fontsize=25)
    fig.tight_layout(rect=[0, 0, 1, 0.98])
    fig.savefig(save_fig_dir / file_save_name)
    # Close the figure so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Example violin plot: a single WA label drawn on its own 9 x 9 figure.
juri = 'WA'
spear_corr = spearman_corr_dict[juri]
train_data = data_dict[juri]
label_data = label_dict[juri]
file_save_name = 'violin_balanced_split_example.jpg'
colour = palette[juri]
preproc = preproc_dict[juri]
ylimit = ylim_dict[juri]

col = ('Treatment between 1 to 3 years', 'Rehabilitation')
# Only this figure is created; an extra unused 27 x 36 figure is no longer opened
# (it was never closed and stayed alive after the cell finished).
fig, ax = plt.subplots(1, 1, figsize=(9, 9))
label_colno = label_data.columns.get_loc(col)

# only take non-binary features whose correlation magnitude is > 0.5
col_df = from_dict_to_df(spear_corr[col])
features = col_df[col_df.iloc[:, 1] > 0.5]
ft_list = [ft for ft in features.index.values if train_data[ft].nunique() > 2]

# add line per column - inflection point
for feat_idx, feat in enumerate(ft_list):
    # Each violin occupies an equal share of the axis width; xmin/xmax of axhline
    # are given as fractions of that width.
    slot_width = 1 / len(ft_list)
    x_start = 0.05 + feat_idx * slot_width
    x_end = slot_width - 0.05 + feat_idx * slot_width
    y_val = feat_vals_dict[juri][col][feat]
    ax.axhline(y=y_val, xmin=x_start, xmax=x_end, color='red', linewidth=3)

    # NSW feature names are used as-is; for the other jurisdictions the
    # "_df0|idx=0" suffix is stripped before looking up the scaler.
    if juri == 'NSW':
        feat_name = feat
    else:
        feat_name = feat.replace("_df0|idx=0", '')
    # Map the plotted value back through the feature's scaler for the annotation.
    feat_val = preproc['scaler'][feat_name].inverse_transform([y_val])[0]
    ax.annotate(str(round(feat_val,1)), xy=(feat_idx+0.1, y_val+0.25), xycoords='data', size=18)

# Work on an explicit copy so that adding the label column never writes into
# (or warns about) a slice of the training table.
cut_data = train_data.loc[:, ft_list].copy()
feature_names = list(cut_data.columns)
cut_data['label'] = label_data.iloc[:, label_colno]

df = pd.melt(cut_data, value_vars=feature_names, id_vars='label')

violinplot(x='variable', y='value', hue='label', data=df, split=True, color=colour, ax=ax)
plt.setp(ax.collections, alpha=0.6)
ax.set_ylim(top=ylimit)
ax.set_ylabel('Normalised feature values', fontsize=16)
ax.set_title(f'{year_map_dict[col[0]]} - {col[1]}', fontsize=18)
ax.set_xticklabels(
        [name.replace("_df0", "").replace("|idx=0", "") for name in feature_names],
        rotation=45, ha='right', fontsize=16)
ax.xaxis.label.set_visible(False)

# Set legend #
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles (the old name was
# removed later); try the new attribute first and fall back to the old one.
hue_handles = getattr(ax.legend_, 'legend_handles', None)
if hue_handles is None:
    hue_handles = ax.legend_.legendHandles
# Keep only the second hue entry and relabel it.
leg = ax.legend(handles=[hue_handles[1]], labels=['Treatment given'])
leg_handles = getattr(leg, 'legend_handles', None)
if leg_handles is None:
    leg_handles = leg.legendHandles
for lh in leg_handles:
    lh.set_alpha(0.6)
ax.grid(False)

fig.suptitle(f'Sample Distribution across Features - {juri}\nOnly continuous features with |Correlation| > 0.5 are shown\nInflection points are indicated with red lines\nAnnotations show each feature\'s inflection point in its original data space\nSampling performed to correct for class imbalance', fontsize=16)
fig.tight_layout(rect=[0, 0, 1, 0.98])

# Create the output directory together with any missing parents.
save_fig_dir = report_dir.parent / 'figures' / 'inflection_point' / 'split_violin'
save_fig_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(save_fig_dir / file_save_name)
plt.close(fig)