<div>
<img src='../../img/WSP_red.png' style='height: 95px; float: left' alt='WSP Logo'/>
<img src='../../img/austroads.png' style='height: 115px; float: right' alt='Client Logo'/>
</div>
<center><h2>AAM6201 Development of Machine-Learning Decision-Support tools for Pavement Asset Management<br>Case Study 1: Project Identification</h2></center>


In [None]:
# magic command to autoreload changes in src
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
import numpy as np
import src.util as util
import pickle
import warnings

from src.visualization.visualize import plot_baseline_metric_by_treatment_type, plot_confusion_matrix_by_treatment_type, plot_metric_by_treatment_type
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix

warnings.filterwarnings('ignore')

# Load data and models 

In [None]:
# load data
from src import DATA_DIR

# Name of the dataset; used as a sub-folder for figures, models and raw results.
DATASET_NAME = 'NZTA'
REPORT_DIR = DATA_DIR.parent / 'reports' / 'figures' / DATASET_NAME
# exist_ok=True keeps the cell idempotent on re-runs and avoids a check-then-create gap.
REPORT_DIR.mkdir(parents=True, exist_ok=True)

# Identifies the experiment; every file/folder suffix below is derived from it.
experiment_suffix = 'nzta_final_even_split'

save_suffix = experiment_suffix
# Saved models are looked up without any 'on_nzta_' marker in the suffix.
load_suffix = experiment_suffix.replace('on_nzta_', '')
model_suffix = load_suffix
experiment_prefix = 'valid'

# Folder the pickled models and label columns are read from.
SAVE_MODEL_DIR = DATA_DIR.parent / 'models' / 'trained' / DATASET_NAME / (load_suffix + '_dir')
# Folder passed to the plotting helpers via save_path_meta_dict.
EXPERIMENT_FOLDER = REPORT_DIR / save_suffix
EXPERIMENT_FOLDER.mkdir(parents=True, exist_ok=True)
# Folder the raw confusion matrices are pickled to.
SAVE_RESULT_DIR = REPORT_DIR.parent.parent / 'raw_results' / DATASET_NAME / (save_suffix + '_dir')
SAVE_RESULT_DIR.mkdir(parents=True, exist_ok=True)

# Keyword arguments forwarded to every plotting helper in this notebook.
save_path_meta_dict = {
    'experiment_prefix': experiment_prefix,
    'experiment_suffix': save_suffix,
    'experiment_folder': EXPERIMENT_FOLDER,
    'dataset_name': DATASET_NAME
}

In [None]:
# Select the feature/label tables to evaluate on, based on `experiment_suffix`:
#   * 'on_mrwa' in the suffix -> MRWA train split, re-expressed on the NZTA feature scale
#   * 'even_split' in the suffix -> NZTA 'nzta_final_even_split' validation split
#   * otherwise                  -> NZTA 'nzta_final_no_offset' validation split
# Each branch defines `feature_data` and `label_data` (labels have a two-level column header).
data_suffix = 'final_no_offset'
if 'on_mrwa' in experiment_suffix:
    data_suffix = 'mrwa_' + data_suffix
    from src.nzta_configs.final_config import CONFIG as NZTA_CONFIG
    from src.mrwa_configs.final_config import CONFIG as MRWA_CONFIG

    # Preprocessing state saved for each dataset; the 'scaler' entry is indexed by feature name below.
    # NOTE: pickle.load can execute arbitrary code -- only load state files from a trusted source.
    with open(MRWA_CONFIG['preprocessing']['state_save_path'] / (f'preprocessing_state_dict_{data_suffix.replace("_no_offset", "")}.sav'), 'rb') as f:
        mrwa_saved_state_dict = pickle.load(f)
    with open(NZTA_CONFIG['preprocessing']['state_save_path'] / (f'preprocessing_state_dict_{data_suffix.replace("mrwa", "nzta").replace("_no_offset", "")}.sav'), 'rb') as f:
        nzta_saved_state_dict = pickle.load(f)

    # NOTE(review): this branch reads the *train* split files, while the other branches read the
    # *valid* split -- confirm this is intended for the transfer experiment.
    mrwa_train_feature_data = util.load_data(source=DATA_DIR / 'processed' / 'MRWA' / data_suffix.replace('_no_offset', '') / f'train_flattened_data{"_" + data_suffix if data_suffix else ""}.csv') 
    mrwa_train_label_data = util.load_data(source=DATA_DIR / 'processed' / 'MRWA' / data_suffix.replace('_no_offset', '') / f'train_flattened_labels{"_" + data_suffix if data_suffix else ""}.csv', header=[0, 1]) 
    # NZTA features are loaded only for their column set and per-column min/max values (used below).
    nzta_train_feature_data = util.load_data(source=DATA_DIR / 'processed' / 'NZTA' / data_suffix.replace("mrwa", "nzta").replace('_no_offset', '') / f'train_flattened_data{"_" + data_suffix.replace("mrwa", "nzta") if data_suffix else ""}.csv') 

    # rescale all features
    # Each column is inverse-transformed with the MRWA scaler and then transformed with the NZTA
    # scaler of the same name (column name with the '_df0|idx=0' suffix stripped).
    # A KeyError (no scaler under that name in one of the state dicts) skips the column. If only the
    # NZTA lookup fails, the column has already been overwritten with its inverse-transformed values.
    for feature in mrwa_train_feature_data.columns:
        try:
            mrwa_train_feature_data.loc[:, feature] = mrwa_saved_state_dict['scaler']\
                                                    [feature.replace('_df0|idx=0', '')]\
                                                    .inverse_transform(mrwa_train_feature_data.loc[:, feature].values.reshape(-1, 1))\
                                                    .flatten()
            mrwa_train_feature_data.loc[:, feature] = nzta_saved_state_dict['scaler']\
                                                    [feature.replace('_df0|idx=0', '')]\
                                                    .transform(mrwa_train_feature_data.loc[:, feature].values.reshape(-1, 1))\
                                                    .flatten()
        except KeyError:
            continue

    # These are aliases, not copies: the assignments below also modify the mrwa_* frames.
    feature_data = mrwa_train_feature_data
    label_data = mrwa_train_label_data

    # Build the 'Pavement Type_Rigid' column as 1 - Flexible - Other, after mapping the Flexible
    # and Other columns back to 0/1 through their min/max values.
    # NOTE(review): Flexible is mapped with the NZTA column's min/max, whereas Other is mapped with
    # feature_data's own min/max -- confirm the asymmetry is intentional.
    feature_data['Pavement Type_Rigid_df0|idx=0'] = 1 \
        - feature_data['Pavement Type_Flexible_df0|idx=0'].replace({
            nzta_train_feature_data['Pavement Type_Flexible_df0|idx=0'].min(): 0.0,
            nzta_train_feature_data['Pavement Type_Flexible_df0|idx=0'].max(): 1.0,
        })\
        - feature_data['Pavement Type_Other_df0|idx=0'].replace({
            feature_data['Pavement Type_Other_df0|idx=0'].min(): 0.0,
            feature_data['Pavement Type_Other_df0|idx=0'].max(): 1.0,
        })

    # Map the 0/1 Rigid indicator onto the min/max values of the NZTA Rigid column.
    feature_data.loc[:, 'Pavement Type_Rigid_df0|idx=0'] = feature_data['Pavement Type_Rigid_df0|idx=0'].replace({
        0: nzta_train_feature_data['Pavement Type_Rigid_df0|idx=0'].min(),
        1: nzta_train_feature_data['Pavement Type_Rigid_df0|idx=0'].max(),
    })
    # Keep exactly the NZTA feature columns, in the NZTA column order.
    feature_data = feature_data[nzta_train_feature_data.columns]
elif 'even_split' in experiment_suffix:
    # The default data_suffix above is replaced outright in this branch.
    data_suffix = 'nzta_final_even_split'
    feature_data = util.load_data(source=DATA_DIR / 'processed' / 'NZTA' / data_suffix / f'valid_flattened_data{"_" + data_suffix if data_suffix else ""}.csv') 
    label_data = util.load_data(source=DATA_DIR / 'processed' / 'NZTA' / data_suffix / f'valid_flattened_labels{"_" + data_suffix if data_suffix else ""}.csv', header=[0, 1]) 
else:
    # Folder name drops '_no_offset'; file names keep the full data_suffix.
    data_suffix = 'nzta_' + data_suffix
    feature_data = util.load_data(source=DATA_DIR / 'processed' / 'NZTA' / data_suffix.replace('_no_offset', '') / f'valid_flattened_data{"_" + data_suffix if data_suffix else ""}.csv') 
    label_data = util.load_data(source=DATA_DIR / 'processed' / 'NZTA' / data_suffix.replace('_no_offset', '') / f'valid_flattened_labels{"_" + data_suffix if data_suffix else ""}.csv', header=[0, 1]) 

# NOTE(review): re-assigns the same value already set in the configuration cell; appears redundant.
DATASET_NAME = 'NZTA'

# Validation code

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

def update_summary_dict(current: dict, y_pred: np.ndarray, y_true: pd.Series, probs: np.ndarray, multiclass_roc: str='raise', labels: list=None) -> None:
    """
    Append the metrics of one set of predictions to a running dict of summary statistics.

    Args:
        current: dict of lists with the keys 'f1-score', 'precision', 'recall', 'accuracy'
            and 'auc'; one value is appended to each list (mutated in place)
        y_pred: predicted labels
        y_true: true labels
        probs: scores handed to `roc_auc_score` as `y_score`
        multiclass_roc: forwarded to `roc_auc_score` as `multi_class`
        labels: optional labels forwarded to `classification_report`; for a binary target,
            `labels[1]` selects the class whose precision/recall/f1-score are recorded

    Raises:
        ValueError: if `y_true` holds fewer than two distinct values
    """
    n_classes = y_true.nunique()
    if n_classes < 2:
        # Previously this case fell through both branches and failed on an unbound local.
        raise ValueError(f'y_true must contain at least two distinct classes, found {n_classes}')

    summary_dict = classification_report(y_true, y_pred, output_dict=True, zero_division=0, labels=labels) # classification statistics for each label and each type of average weighting
    auc = roc_auc_score(y_true, probs, average=None, multi_class=multiclass_roc)

    if n_classes > 2:
        weighted_dict = summary_dict['weighted avg'] # classification reports for weighted average
    else:
        # if target is binary, we care only about precision, recall, and f1-score for one class:
        # labels[1] when labels are given, otherwise the least frequent class in y_true
        if labels is not None:
            reported_class = labels[1]
        else:
            reported_class = y_true.value_counts().sort_values(ascending=True).index[0]
        # the report is keyed by the string form of each label, so non-string labels need str()
        weighted_dict = summary_dict[str(reported_class)]

    # Gather every value before appending so a missing key cannot leave the lists with unequal lengths.
    new_values = {
        'f1-score': weighted_dict['f1-score'],
        'precision': weighted_dict['precision'],
        'recall': weighted_dict['recall'],
        'accuracy': summary_dict['accuracy'],
        'auc': auc,
    }
    for metric_name, metric_value in new_values.items():
        current[metric_name].append(metric_value)


In [None]:
def load_model(estimator_type: str):
    """
    Unpickle the saved time-horizon models of the given estimator type.

    The file is read from SAVE_MODEL_DIR and is named
    'train_<estimator_type>_timehorizon[_<model_suffix>].pkl'; the suffix part is
    omitted when `model_suffix` is empty.

    Args:
        estimator_type: estimator tag used in the file name (e.g. 'LR', 'XGB', 'dummy')

    Returns:
        Whatever object was pickled into that file.
    """
    suffix_part = f'_{model_suffix}' if model_suffix else ''
    model_path = SAVE_MODEL_DIR / f'train_{estimator_type}_timehorizon{suffix_part}.pkl'
    # NOTE: pickle.load can execute arbitrary code -- only load model files from a trusted source.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)

In [None]:
def evaluate(estimator_type: str, feature_data: pd.DataFrame, project_label: pd.DataFrame, dropped_columns: pd.Index, prediction_columns: pd.Index, pad: bool=True):
    """
    Run every saved model of one estimator type on `feature_data` and collect confusion matrices.

    The list of confusion matrices is also pickled to
    SAVE_RESULT_DIR / 'valid_<estimator_type>_rawconfmat_<save_suffix>.pkl'.

    Args:
        estimator_type: tag handed to `load_model` and used in the result file name
        feature_data: features passed to each model's `predict` (and `predict_proba`)
        project_label: ground-truth labels, one column per label
        dropped_columns: label columns the models do not predict; only used when padding
        prediction_columns: columns of the models' output, in output order
        pad: if True, the dropped columns are appended to the predictions as all-zero columns;
            if False, evaluation is restricted to the predicted columns, all of which must be
            present in `project_label`

    Returns:
        (running_conf_matrix, summary_dict): one confusion matrix per model, and a dict of
        metric lists that is filled only when `project_label` has at most one column
        (it stays empty in the multi-label case).

    Raises:
        AssertionError: if the label columns are inconsistent with
            `prediction_columns` / `dropped_columns`
    """
    running_conf_matrix = []
    summary_dict = {key: [] for key in ['f1-score', 'accuracy', 'auc', 'precision', 'recall']}
    models = load_model(estimator_type)

    if pad is False:
        # the model can predict more than what we want to evaluate on
        # so we locate only the desired columns in project label
        padded_columns = None
        inner = prediction_columns.intersection(project_label.columns, sort=False)
        # labels passed in must all be available
        assert len(project_label.columns) == len(inner), 'some label columns are not predicted by the model'
        project_label = project_label[inner] # ensure order is correct
    else:
        # built once here instead of once per model inside the loop
        padded_columns = prediction_columns.append(dropped_columns)
        # check all labels to be evaluated is in prediction columns and padded dropped columns
        assert set(project_label.columns) - set(padded_columns) == set(), \
            'some label columns are in neither prediction_columns nor dropped_columns'
        assert len(prediction_columns.intersection(dropped_columns)) == 0, \
            'prediction_columns and dropped_columns overlap'
        inner = project_label.columns

    multi_label = project_label.shape[1] > 1

    start_t = time.time()
    for model in models:
        preds = model.predict(feature_data)
        if pad:
            # pad preds with columns we dropped
            # assumes predict() returns a 2-D array with one column per prediction column -- TODO confirm
            preds = np.hstack((preds, np.zeros((preds.shape[0], len(dropped_columns)))))
            output_columns = padded_columns
        else:
            output_columns = prediction_columns
        preds = pd.DataFrame(preds, columns=output_columns)[inner]
        # confusion matrix
        if multi_label:
            conf_matrix = multilabel_confusion_matrix(project_label, preds)
        else:
            # single label: also record summary metrics, using column 1 of predict_proba as the score
            probs = model.predict_proba(feature_data)
            update_summary_dict(summary_dict, preds, project_label.iloc[:, 0], probs[:, 1])
            conf_matrix = confusion_matrix(project_label, preds)
        running_conf_matrix.append(conf_matrix)

    with open(SAVE_RESULT_DIR / f'valid_{estimator_type}_rawconfmat_{save_suffix}.pkl', 'wb') as f:
        pickle.dump(running_conf_matrix, f)

    print("Evaluation completed. Time taken: ", time.time() - start_t)
    return running_conf_matrix, summary_dict

def evaluate_baseline(feature_data: pd.DataFrame, project_label: pd.DataFrame, dropped_columns: pd.Index, prediction_columns: pd.Index, pad: bool=True):
    """
    Run every saved baseline ('dummy') model on `feature_data` and collect confusion matrices.

    `load_model('dummy')` is expected to return a mapping from strategy name to an iterable of
    models. The resulting dict is also pickled to
    SAVE_RESULT_DIR / 'valid_dummy_rawconfmat_<save_suffix>.pkl'.

    Args:
        feature_data: features passed to each model's `predict`
        project_label: ground-truth labels, one column per label
        dropped_columns: label columns the models do not predict; only used when padding
        prediction_columns: columns of the models' output, in output order
        pad: if True, the dropped columns are appended to the predictions as all-zero columns;
            if False, evaluation is restricted to the predicted columns, all of which must be
            present in `project_label`

    Returns:
        dict mapping each strategy name to a list with one multilabel confusion matrix per model.

    Raises:
        AssertionError: if the label columns are inconsistent with
            `prediction_columns` / `dropped_columns`
    """
    models_by_strat = load_model('dummy')
    running_conf_matrix = {strat: [] for strat in models_by_strat.keys()}

    if pad is False:
        # the model can predict more than what we want to evaluate on
        # so we locate only the desired columns in project label
        padded_columns = None
        inner = prediction_columns.intersection(project_label.columns, sort=False)
        # labels passed in must all be available
        assert len(project_label.columns) == len(inner), 'some label columns are not predicted by the model'
        project_label = project_label[inner] # ensure order is correct
    else:
        # built once here instead of once per model inside the loop
        padded_columns = prediction_columns.append(dropped_columns)
        # check all labels to be evaluated is in prediction columns and padded dropped columns
        assert set(project_label.columns) - set(padded_columns) == set(), \
            'some label columns are in neither prediction_columns nor dropped_columns'
        assert len(prediction_columns.intersection(dropped_columns)) == 0, \
            'prediction_columns and dropped_columns overlap'
        inner = project_label.columns

    start_t = time.time()
    for strat, models in models_by_strat.items():
        for model in models:
            preds = model.predict(feature_data)
            if pad:
                # pad preds with columns we dropped
                # assumes predict() returns a 2-D array with one column per prediction column -- TODO confirm
                preds = np.hstack((preds, np.zeros((preds.shape[0], len(dropped_columns)))))
                output_columns = padded_columns
            else:
                output_columns = prediction_columns
            preds = pd.DataFrame(preds, columns=output_columns)[inner]
            # confusion matrix
            conf_matrix = multilabel_confusion_matrix(project_label, preds)
            running_conf_matrix[strat].append(conf_matrix)

    with open(SAVE_RESULT_DIR / f'valid_dummy_rawconfmat_{save_suffix}.pkl', 'wb') as f:
        pickle.dump(running_conf_matrix, f)

    print("Evaluation completed. Time taken: ", time.time() - start_t)
    return running_conf_matrix

# Prediction on time horizon

Given the result above, we can train the model chiefly on projects with at least one treatment, since learning to predict the absence of any treatment is trivial.

In [None]:
def as_str(df):
    """Return a Series holding, for each row of `df`, its values cast to str and concatenated."""
    stringified = df.astype(str)
    return stringified.agg(''.join, axis=1)

# Labels to evaluate against: all label columns except the top-level 'no_project_flag' group.
project_label_valid = label_data.drop(columns=['no_project_flag'], level=0)
# Alternative kept for reference (presumably evaluates the no-project flag alone -- confirm before enabling):
# project_label_valid = pd.DataFrame(label_data.iloc[:, 0].rename('no_project_flag'))
# Column index pickled next to the trained models; presumably the label columns the models output.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(SAVE_MODEL_DIR / f'train_labels_columns{"_" + load_suffix if load_suffix else ""}.pkl', 'rb') as f:
    prediction_columns = pickle.load(f)

# Label columns present in this dataset that are not among the pickled prediction columns.
dropped_columns = project_label_valid.columns.difference(prediction_columns)

In [None]:
# if PAD is true, evaluate label not seen in training with most common class
PAD = False
if not PAD:
    # for transfer: the models may predict columns that have no ground truth in this dataset,
    # so keep only the columns shared by both. Indexing with all of prediction_columns would
    # raise a KeyError for any predicted column missing from the labels.
    inner = prediction_columns.intersection(project_label_valid.columns, sort=False)
    project_label_valid = project_label_valid[inner]

## Logistic Regression

In [None]:
# Score the saved logistic-regression models against the selected labels.
lr_running_conf_matrix, lr_summary_dict = evaluate(
    estimator_type='LR',
    feature_data=feature_data,
    project_label=project_label_valid,
    dropped_columns=dropped_columns,
    prediction_columns=prediction_columns,
    pad=PAD,
)

Results

In [None]:
# Per-label metrics for the logistic-regression models.
plot_metric_by_treatment_type(
    project_label_valid,
    lr_running_conf_matrix,
    estimator_type='LR',
    **save_path_meta_dict
)

Confusion matrix

In [None]:
# Per-label confusion matrices for the logistic-regression models.
plot_confusion_matrix_by_treatment_type(
    project_label_valid,
    lr_running_conf_matrix,
    estimator_type='LR',
    per_row=5,
    figsize=(20, 15),
    **save_path_meta_dict
)

## XGBoost

In [None]:
# Score the saved XGBoost models against the selected labels.
xgb_running_conf_matrix, xgb_summary_dict = evaluate(
    estimator_type='XGB',
    feature_data=feature_data,
    project_label=project_label_valid,
    dropped_columns=dropped_columns,
    prediction_columns=prediction_columns,
    pad=PAD,
)

Results

In [None]:
# Per-label metrics for the XGBoost models.
plot_metric_by_treatment_type(
    project_label_valid,
    xgb_running_conf_matrix,
    estimator_type='XGB',
    **save_path_meta_dict
)

Confusion matrix

In [None]:
# Per-label confusion matrices for the XGBoost models.
plot_confusion_matrix_by_treatment_type(
    project_label_valid,
    xgb_running_conf_matrix,
    estimator_type='XGB',
    per_row=5,
    figsize=(20, 15),
    **save_path_meta_dict
)

## Baseline

In [None]:
# Score the saved baseline (dummy) models, grouped by strategy, against the selected labels.
baseline_running_conf_matrix = evaluate_baseline(
    feature_data=feature_data,
    project_label=project_label_valid,
    dropped_columns=dropped_columns,
    prediction_columns=prediction_columns,
    pad=PAD,
)

Results

In [None]:
# Per-label metrics for the baseline (dummy) models.
plot_baseline_metric_by_treatment_type(
    project_label_valid,
    baseline_running_conf_matrix,
    estimator_type='Dummy',
    **save_path_meta_dict
)