<div>
<img src='../../img/WSP_red.png' style='height: 95px; float: left' alt='WSP Logo'/>
<img src='../../img/austroads.png' style='height: 115px; float: right' alt='Client Logo'/>
</div>
<center><h2>AAM6201 Development of Machine-Learning Decision-Support tools for Pavement Asset Management<br>Case Study 1: Project Identification</h2></center>


In [None]:
# magic command to autoreload changes in src
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import src.util as util
import matplotlib.pyplot as plt
import matplotlib.cm as mpl_cm
import seaborn as sns
import pickle

# Load data and models 

In [None]:
from data import DATA_DIR

# Per-jurisdiction directories that hold the processed train/validation tables.
mrwa_final_dir = DATA_DIR / 'processed' / 'MRWA' / 'mrwa_final'
nzta_final_dir = DATA_DIR / 'processed' / 'NZTA' / 'nzta_final'
vic_final_dir = DATA_DIR / 'processed' / 'VIC' / 'final'
nsw_final_dir = DATA_DIR / 'processed' / 'NSW' / 'final'

# MRWA / NZTA: index tables of the flattened train and validation sets.
train_flattened_mrwa_indices = util.load_data(mrwa_final_dir / 'train_flattened_index_mrwa_final.csv')
train_flattened_nzta_indices = util.load_data(nzta_final_dir / 'train_flattened_index_nzta_final.csv')
valid_flattened_mrwa_indices = util.load_data(mrwa_final_dir / 'valid_flattened_index_mrwa_final.csv')
valid_flattened_nzta_indices = util.load_data(nzta_final_dir / 'valid_flattened_index_nzta_final.csv')

# MRWA / NZTA: label tables. header=[0, 1] is forwarded to util.load_data; later
# cells address these columns with get_level_values(0) / get_level_values(1).
train_flattened_mrwa_labels = util.load_data(mrwa_final_dir / 'train_flattened_labels_mrwa_final.csv', header=[0, 1])
train_flattened_nzta_labels = util.load_data(nzta_final_dir / 'train_flattened_labels_nzta_final.csv', header=[0, 1])
valid_flattened_mrwa_labels = util.load_data(mrwa_final_dir / 'valid_flattened_labels_mrwa_final.csv', header=[0, 1])
valid_flattened_nzta_labels = util.load_data(nzta_final_dir / 'valid_flattened_labels_nzta_final.csv', header=[0, 1])

# MRWA / NZTA: feature tables (the "no_offset" variants).
train_flattened_mrwa_truncated = util.load_data(mrwa_final_dir / 'train_flattened_data_mrwa_final_no_offset.csv')
train_flattened_nzta_truncated = util.load_data(nzta_final_dir / 'train_flattened_data_nzta_final_no_offset.csv')
valid_flattened_mrwa_truncated = util.load_data(mrwa_final_dir / 'valid_flattened_data_mrwa_final_no_offset.csv')
valid_flattened_nzta_truncated = util.load_data(nzta_final_dir / 'valid_flattened_data_nzta_final_no_offset.csv')

# VIC / NSW: feature tables.
train_flattened_vic_truncated = util.load_data(vic_final_dir / 'train_all.csv')
train_flattened_nsw_truncated = util.load_data(nsw_final_dir / 'train_all.csv')
valid_flattened_vic_truncated = util.load_data(vic_final_dir / 'valid_all.csv')
valid_flattened_nsw_truncated = util.load_data(nsw_final_dir / 'valid_all.csv')

# VIC / NSW: label tables.
# NOTE(review): the train labels are read from 'labels_all.csv' while the validation
# labels come from 'valid_labels_all.csv' -- confirm 'labels_all.csv' really is the
# train split and not the full label set.
train_flattened_vic_labels = util.load_data(vic_final_dir / 'labels_all.csv', header=[0, 1])
train_flattened_nsw_labels = util.load_data(nsw_final_dir / 'labels_all.csv', header=[0, 1])
valid_flattened_vic_labels = util.load_data(vic_final_dir / 'valid_labels_all.csv', header=[0, 1])
valid_flattened_nsw_labels = util.load_data(nsw_final_dir / 'valid_labels_all.csv', header=[0, 1])

# Folder that receives every figure saved by this notebook. exist_ok=True makes the
# call safe to re-run and avoids the separate exists() check.
save_fig_dir = DATA_DIR.parent / 'reports' / 'figures' / 'final_transfer'
save_fig_dir.mkdir(parents=True, exist_ok=True)

In [None]:
from data import DATA_DIR

# Output locations, both resolved from the parent folder of DATA_DIR.
project_root = DATA_DIR.parent
report_dir = project_root / 'reports' / 'raw_results'
model_dir = project_root / 'models' / 'trained'

# Distribution of train labels vs predicted labels on transfer vs predicted labels on valid
from data import DATA_DIR
import pickle

juris = ['NSW', 'MRWA', 'NZTA']
prefixes = ['final'] * 3
suffixes = ['even_split'] * 3

# One entry per jurisdiction, keyed by the dataset name with 'TA' and 'MR'
# stripped out ('MRWA' -> 'WA', 'NZTA' -> 'NZ', 'NSW' stays 'NSW').
short_names = [name.replace('TA', '').replace('MR', '') for name in juris]
model_dict = {
    short_name: {'models': {'XGB': None}, 'prediction_columns': None}
    for short_name in short_names
}

# Fill every entry from the pickles stored in that jurisdiction's training folder.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artefacts.
for train_name, prefix, suffix in zip(juris, prefixes, suffixes):
    juri_name = train_name.replace('TA', '').replace('MR', '')
    run_tag = f'{train_name.lower()}_{prefix}_{suffix}'
    # NOTE: this rebinds the notebook-level `model_dir` to the per-jurisdiction folder.
    model_dir = DATA_DIR.parent / 'models' / 'trained' / train_name / f'{run_tag}_dir'

    for model_type in ['XGB']:
        model_path = model_dir / f'train_{model_type}_timehorizon_{run_tag}.pkl'
        with open(model_path, 'rb') as f:
            model_dict[juri_name]['models'][model_type] = pickle.load(f)

    columns_path = model_dir / f'train_labels_columns_{run_tag}.pkl'
    with open(columns_path, 'rb') as f:
        model_dict[juri_name]['prediction_columns'] = pickle.load(f)

In [None]:
# Colour assigned to each jurisdiction; passed as `palette` / `color` to the plots below.
palette = dict(
    WA='#004259',
    NZ='#A72326',
    NSW='#E8A602',
    VIC='#356130',
)

# Distributions of raw features

In [None]:
# Load the preprocessing states used to turn normalised features back into raw data space.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted state files.
prepro_state_dir = DATA_DIR.parent / 'models' / 'preprocessing_state'


def _load_prepro_state(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


mrwa_prepro_dict = _load_prepro_state(prepro_state_dir / 'mrwa' / 'preprocessing_state_dict_mrwa_final.sav')
nzta_prepro_dict = _load_prepro_state(prepro_state_dir / 'nzta' / 'preprocessing_state_dict_nzta_final.sav')
vic_prepro_dict = _load_prepro_state(prepro_state_dir / 'vic' / 'preprocessing_state_dict.sav')
nsw_prepro_dict = _load_prepro_state(prepro_state_dir / 'nsw' / 'preprocessing_state_dict.sav')
    
from data import DATA_DIR

def _with_decoded_features(source_df, scalers, column_suffix=''):
    """Return a copy of ``source_df`` with one inverse-scaled column per scaler.

    For every ``(col, scaler)`` pair in ``scalers`` the column named
    ``col + column_suffix`` is passed through ``scaler.inverse_transform`` and the
    result is stored under the plain name ``col``.
    """
    decoded = source_df.copy()
    for col, scaler in scalers.items():
        scaled_values = decoded[col + column_suffix].values.reshape(-1, 1)
        decoded.loc[:, col] = scaler.inverse_transform(scaled_values).flatten()
    return decoded


def _without_encoded_columns(frame):
    """Keep only the columns of ``frame`` whose name does not end with 'idx=0'."""
    return frame[[name for name in frame if not name.endswith('idx=0')]]


# The scaled source columns carry a different name suffix in each dataset.
encoded_train_mrwa = _with_decoded_features(train_flattened_mrwa_truncated, mrwa_prepro_dict['scaler'], '_df0|idx=0')
encoded_train_nzta = _with_decoded_features(train_flattened_nzta_truncated, nzta_prepro_dict['scaler'], '_df0|idx=0')
encoded_train_nsw = _with_decoded_features(train_flattened_nsw_truncated, nsw_prepro_dict['scaler'])
encoded_train_vic = _with_decoded_features(train_flattened_vic_truncated, vic_prepro_dict['scaler'], '|idx=0')

encoded_train_nsw = encoded_train_nsw.rename(columns={'iri_iwp': 'IRI'})

# The MRWA and NZTA datasets differ in their Pavement Type features; rename and
# derive columns so that both datasets expose the same feature names.
mrwa_renames = {
    'Pavement Type_Flexible_df0|idx=0': 'Pavement Type_Flexible',
    'Surface Material_SS_df0|idx=0': 'Surface Material_SS',
}
encoded_train_mrwa = encoded_train_mrwa.rename(columns=mrwa_renames)
# NOTE(review): 'Pavement Type_Other' is read without the '_df0|idx=0' suffix --
# confirm the MRWA table really has a column under that exact name.
encoded_train_mrwa.loc[:, 'Pavement Type_Rigid'] = (
    1 - encoded_train_mrwa['Pavement Type_Flexible'] - encoded_train_mrwa['Pavement Type_Other']
)
encoded_train_mrwa = _without_encoded_columns(encoded_train_mrwa)

nzta_renames = {
    'Pavement Type_Flexible_df0|idx=0': 'Pavement Type_Flexible',
    'Pavement Type_Rigid_df0|idx=0': 'Pavement Type_Rigid',
    'Surface Material_SS_df0|idx=0': 'Surface Material_SS',
}
encoded_train_nzta = encoded_train_nzta.rename(columns=nzta_renames)
encoded_train_nzta = _without_encoded_columns(encoded_train_nzta)

In [None]:
# One panel per feature shared by the WA, NZ and NSW training tables.
jurisdiction_frames = [
    (encoded_train_mrwa, 'WA'),
    (encoded_train_nzta, 'NZ'),
    (encoded_train_nsw, 'NSW'),
]
common_features = sorted(
    set(encoded_train_mrwa.columns)
    & set(encoded_train_nzta.columns)
    & set(encoded_train_nsw.columns)
)

fig, axs = plt.subplots(4, 3, figsize=(12, 12))
axs = axs.ravel()

for i, feature in enumerate(common_features):
    ax = axs[i]
    ax.set_title(feature)

    # Long-format table: the feature value plus the jurisdiction it came from.
    # Built with a single pd.concat because DataFrame.append was removed in pandas 2.0;
    # ignore_index avoids duplicate row labels across the stacked tables.
    feature_df = pd.concat(
        [
            pd.DataFrame({feature: df[feature], 'Jurisdiction': title})
            for df, title in jurisdiction_frames
            if feature in df.columns
        ],
        ignore_index=True,
    )

    if feature in {'D0', 'D200'}:  # exception: standardise these per jurisdiction before plotting
        feature_df[feature] = feature_df.groupby(['Jurisdiction'])[feature].transform(lambda x: (x - np.mean(x)) / np.std(x))
        # fill= replaces the deprecated shade= keyword of seaborn.kdeplot
        sns.kdeplot(data=feature_df, x=feature, hue='Jurisdiction', ax=ax, palette=palette, legend=True, multiple='layer', fill=True, common_norm=False)
        ax.set_title(feature + ' - Normalized')
    elif feature_df[feature].nunique() > 2:  # otherwise, plot unnormalised data
        sns.kdeplot(data=feature_df, x=feature, hue='Jurisdiction', ax=ax, palette=palette, legend=True, multiple='layer', fill=True, common_norm=False)
    else:
        # Binary feature: bar plot of the share of each value within a jurisdiction.
        # value_counts(normalize=True) divides by the per-jurisdiction total, which
        # replaces the removed Series.sum(level=0) idiom.
        share_df = (
            feature_df.groupby(['Jurisdiction'])[feature]
            .value_counts(normalize=True)
            .rename('Percentage')
            .reset_index()
        )
        # Relabel only the feature column (smallest value -> 'No', largest -> 'Yes');
        # the 'Percentage' values must stay numeric.
        low_value, high_value = share_df[feature].min(), share_df[feature].max()
        share_df[feature] = share_df[feature].map({low_value: 'No', high_value: 'Yes'})
        sns.barplot(data=share_df, x=feature, y='Percentage', hue='Jurisdiction', ax=ax, palette=palette, alpha=0.5)

    ax.set_xlabel(None)

fig.suptitle('Distribution of features by jurisdictions')
plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.savefig(save_fig_dir / 'raw_data_distributions_all_jurisdictions.png', dpi=300)
plt.show()

In [None]:
# concatenate all datasets for ease of plotting

# Raw-space features that are compared across jurisdictions.
value_cols = [
    'Surface age', 'Pavement age', 'AADT', 'HeavyIndex',
    'IRI', 'Rutting mm', 'Crack%', 'D0', 'D200',
]

# Feature tables, each tagged with the jurisdiction it comes from.
wa = encoded_train_mrwa[value_cols].assign(Jurisdiction='WA')
nz = encoded_train_nzta[value_cols].assign(Jurisdiction='NZ')
nsw = encoded_train_nsw[value_cols].assign(Jurisdiction='NSW')

# Label tables, tagged the same way.
wa_labels = train_flattened_mrwa_labels.assign(Jurisdiction='WA')
nz_labels = train_flattened_nzta_labels.assign(Jurisdiction='NZ')
nsw_labels = train_flattened_nsw_labels.assign(Jurisdiction='NSW')

# Both stacks use the same WA, NZ, NSW order and a fresh 0..n-1 index, which is
# what lets a row mask computed on `labels` select rows of `all`.
labels = pd.concat([wa_labels, nz_labels, nsw_labels], ignore_index=True)

# NOTE: `all` shadows the Python builtin; the name is kept because the following
# cell reads it.
all = pd.concat([wa, nz, nsw], ignore_index=True)

In [None]:
# Standardise D0 / D200 per jurisdiction once, on a copy, so that `all` keeps its
# raw values and re-running this cell always starts from the same data.
plot_df = all.copy()
for feature in ['D0', 'D200']:  # normalise these two only
    plot_df[feature] = plot_df.groupby(['Jurisdiction'])[feature].transform(lambda x: (x - np.mean(x)) / np.std(x))

# A list (not a set) so the figures are always produced in the same order.
for column in ['Resurfacing_SS', 'Resurfacing_AC', 'Rehabilitation']:
    fig, axs = plt.subplots(3, 3, figsize=(10, 10))
    axs = axs.ravel()

    # Rows with at least one positive label whose second column level equals `column`.
    # This does not depend on the feature, so it is computed once per figure.
    treated_rows = labels.loc[:, labels.columns.get_level_values(1) == column].any(axis=1)
    pos_df = plot_df.loc[treated_rows, :]

    for i, feature in enumerate(plot_df.columns):
        if feature in ['Jurisdiction', 'embed_x', 'embed_y']:
            continue

        ax = axs[i]
        ax.set_title(feature)

        # fill= replaces the deprecated shade= keyword of seaborn.kdeplot
        sns.kdeplot(data=pos_df, x=feature, hue='Jurisdiction', ax=ax, fill=True, palette=palette, common_norm=False)

        if feature in ['D0', 'D200']:
            ax.set_title(feature + ' - Normalised')
        else:
            ax.set_xlabel('')

    fig.suptitle(f'Distribution of raw feature values of sections treated with {column}')
    fig.tight_layout()
    plt.savefig(save_fig_dir / f'flattened_data_distributions_conditioned_{column}.png', dpi=300)
    plt.show()
    plt.close(fig)  # release the figure before building the next one

# Distribution of predictions

## Count

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

treatments = ['Resurfacing_SS', 'Resurfacing_AC', 'Rehabilitation']
treatments_2 = ['Major Patching', 'Regulation', 'Retexturing']  # rare classes, drawn on a second axis with its own scale
xticks = np.arange(3)
width = 0.1

jurisdiction_labels = [
    ('WA', train_flattened_mrwa_labels),
    ('NZ', train_flattened_nzta_labels),
    ('NSW', train_flattened_nsw_labels),
    ('VIC', train_flattened_vic_labels),
]

# The loop variable is deliberately not called `labels`, so the concatenated
# `labels` frame built in an earlier cell is not overwritten.
for i, (juri, juri_labels) in enumerate(jurisdiction_labels):
    # filter out 10 yrs plus
    horizon = juri_labels.columns.get_level_values(0)
    counts = juri_labels[juri_labels.columns[~horizon.isin({'Treatment between 10 to 30 years'})]]
    counts = counts[[col for col in counts.columns if 'Unnamed' not in col[1]]]  # drop no project
    # Count treatments by category: sum the columns that share the same second
    # level. Transposing and grouping replaces the removed sum(axis=1, level=...).
    counts = counts.T.groupby(level=1).sum().T

    for ax, names in ((ax1, treatments), (ax2, treatments_2)):
        present = [(pos, t) for pos, t in enumerate(names) if t in counts.columns]
        ax.bar(
            height=[counts[t].mean() for _, t in present],  # average number of treatments over 10 year period
            x=[pos + i * width for pos, _ in present],
            width=width,
            label=juri,
            color=palette[juri]
        )

ax1.set_xticks(xticks + 3 / 2 * width)
ax1.set_xticklabels(treatments)
ax1.set_ylabel('Count')
ax1.yaxis.grid(True)
ax1.legend()
ax2.set_xticks(xticks + 3 / 2 * width)
ax2.set_xticklabels(treatments_2)
ax2.yaxis.tick_right()
ax2.yaxis.set_label_position('right')
ax2.set_ylabel('Count')
ax2.yaxis.grid(True)
ax2.legend(loc='upper left')
plt.suptitle(
    'Average number of treatments planned on one short road section over 10 years\n'+\
    'Differences in frequency may imply differences in selecting treatments given the same conditions',
)
plt.tight_layout()
plt.savefig(save_fig_dir / 'freq_true_labels.jpg')
plt.show()

In [None]:
# code from https://linuxtut.com/en/92c21048bacadce811ec/
def set_hierarchical_xlabels(index, ax=None,
                             bar_xmargin=0.1,
                             bar_yinterval=0.1,
                            ):
    """Label an x axis with the levels of a ``pd.MultiIndex``.

    The innermost level becomes the tick labels (rotation 0). For every outer
    level, each run of consecutive entries sharing the same outer codes gets a
    horizontal line and a centred text label, both positioned with the axis'
    x-axis transform.

    Parameters
    ----------
    index : pd.MultiIndex
        Index whose entries correspond to the ticks, in order.
    ax : optional
        Axes to annotate; ``plt.gca()`` is used when a falsy value is given.
    bar_xmargin : float
        Amount by which each group line is shortened at both ends (x units).
    bar_yinterval : float
        Vertical step between successive levels, used as the y coordinate
        multiplier for the lines and texts.
    """
    from itertools import groupby
    from matplotlib.lines import Line2D

    if not ax:
        ax = plt.gca()

    assert isinstance(index, pd.MultiIndex)
    tick_texts = ax.set_xticklabels([entry[-1] for entry in index])
    for tick_text in tick_texts:
        tick_text.set_rotation(0)

    axis_transform = ax.get_xaxis_transform()

    for depth in range(1, len(index.codes)):
        left_edge = -0.5  # left boundary of the group currently being drawn
        outer_codes = zip(*index.codes[:-depth])
        for group_key, members in groupby(outer_codes):
            # right boundary: one x unit per entry in the group
            right_edge = left_edge + sum(1 for _ in members)
            level_code = group_key[-1]
            ax.text((left_edge + right_edge) / 2, (bar_yinterval * (-depth - 0.1)),
                    index.levels[-depth - 1][level_code],
                    transform=axis_transform,
                    ha="center", va="top")
            group_line = Line2D([left_edge + bar_xmargin, right_edge - bar_xmargin],
                                [bar_yinterval * -depth] * 2,
                                transform=axis_transform,
                                color="k", clip_on=False)
            ax.add_line(group_line)
            left_edge = right_edge

## Relative frequency

In [None]:
import matplotlib.cm as cm
from data import DATA_DIR
import pickle

df = []

# Training label table of each jurisdiction, in the order the columns are created.
label_frames = {
    'WA': train_flattened_mrwa_labels,
    'NZ': train_flattened_nzta_labels,
    'NSW': train_flattened_nsw_labels,
    'VIC': train_flattened_vic_labels,
}
# First-level column names left out of the chart: the 10 yrs plus horizon and the no-project flag.
excluded_level_0 = {'Treatment between 10 to 30 years', 'no_project_flag'}

# Percentage of rows with a positive value for each label, one column per jurisdiction.
# NOTE(review): assigning column by column aligns every Series to the rows created
# by the first assignment (WA), so labels that WA lacks are not shown -- confirm
# this is intended.
non_zero_freq = pd.DataFrame()
for juri, labels in label_frames.items():
    kept = ~labels.columns.get_level_values(0).isin(excluded_level_0)
    labels = labels[labels.columns[kept]]
    positives = labels.sum(axis=0)
    non_zero_freq[juri] = positives / len(labels) * 100

def bar_frequency_label(drop=None, ax=None, color_shift=0):
    """Draw the grouped bar chart of positive-label percentages held in ``non_zero_freq``.

    Parameters
    ----------
    drop : column label or list of labels, optional
        Jurisdiction columns to leave out of the chart. ``None`` (the default)
        and an empty list both keep every column.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 28 x 12 figure is created when omitted.
    color_shift : int
        Accepted for backward compatibility; not used by the function.

    Returns
    -------
    The Axes holding the chart.
    """
    x = non_zero_freq.copy().reset_index()
    # DataFrame.drop(columns=None) raises, so only drop when something was requested.
    if drop is not None:
        x = x.drop(columns=drop)
    x.loc[:, 'level_0'] = x['level_0'].str.replace(r'^Treatment (between|within) ', '', regex=True)
    # Order rows by the digits of the time-horizon text, then (stable sort) by the
    # second label level so each level_1 group keeps its horizons in that order.
    x = x\
        .sort_values(by='level_0', key=lambda series: series.str.replace(r'^(\d+)( to )?(\d+)? years?', r'\1\3', regex=True).astype(int))\
        .sort_values(by='level_1', kind='stable')\
        .set_index(['level_1', 'level_0'])
    x.index.names = (None, None)

    if ax is None:
        ax = plt.subplots(figsize=(28, 12))[1]
    fig = ax.get_figure()
    x.plot.bar(
        color=palette,
        alpha=0.5,
        ax=ax
    )
    # Pass the axes explicitly: relying on plt.gca() labels the wrong axes when
    # the caller supplies an `ax` that is not the current one.
    set_hierarchical_xlabels(x.index, ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
    for bars in ax.containers:
        ax.bar_label(bars, padding=3, fmt='%.1f')
    ax.set_ylabel('Percentage')
    ax.set_title('% of Positive Label')
    # NOTE(review): the bars are one per jurisdiction column, yet the legend title
    # reads 'Train / Eval datasets' -- confirm the wording.
    ax.legend(bbox_to_anchor=(1, 0), loc="lower left", title='Train / Eval datasets')
    ax.grid(True)
    fig.tight_layout(rect=[0, 0.05, 1, 1])
    return ax

# Draw the chart with no jurisdiction dropped, retitle it and write it to disk.
ax = bar_frequency_label(drop=[])
ax.set_title('% of Positive Labels')
frequency_fig = ax.get_figure()
frequency_fig.savefig(save_fig_dir / 'distribution_of_predictions.jpg')