<div>
<img src='../../img/WSP_red.png' style='height: 95px; float: left' alt='WSP Logo'/>
<img src='../../img/austroads.png' style='height: 115px; float: right' alt='Client Logo'/>
</div>
<center><h2>AAM6201 Development of Machine-Learning Decision-Support tools for Pavement Asset Management<br>Case Study 1: Project Identification</h2></center>


In [None]:
# magic command to autoreload changes in src
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import src.data.resampling as resampling
import src.util as util
import pickle
import warnings

from src.nzta_configs.final_config import CONFIG 
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from xgboost import XGBClassifier

from src.visualization.visualize import plot_baseline_metric_by_treatment_type, plot_metric_by_treatment_type, plot_confusion_matrix_by_treatment_type

warnings.filterwarnings('ignore')

# MODELLING 

In [None]:
# Load the pre-processed train/validation tables and prepare the output folders.
from data import DATA_DIR

DATASET_NAME = 'NZTA'
REPORT_DIR = DATA_DIR.parent / 'reports' / 'figures' / DATASET_NAME
# exist_ok keeps the cell re-runnable and avoids the exists()/mkdir() race
REPORT_DIR.mkdir(parents=True, exist_ok=True)

data_suffix = 'nzta_final_no_offset'

# The processed folder and the index files are named WITHOUT the '_no_offset' marker,
# whereas the feature and label files carry the full suffix.
base_suffix = data_suffix.replace('_no_offset', '')
processed_dir = DATA_DIR / 'processed' / DATASET_NAME / base_suffix
data_tag = f'_{data_suffix}' if data_suffix else ''
index_tag = f'_{base_suffix}' if data_suffix else ''

# Label files are read with a two-row header (header=[0, 1]).
train_feature_data = util.load_data(source=processed_dir / f'train_flattened_data{data_tag}.csv')
train_label_data = util.load_data(source=processed_dir / f'train_flattened_labels{data_tag}.csv', header=[0, 1])
train_index_data = util.load_data(source=processed_dir / f'train_flattened_index{index_tag}.csv')
valid_feature_data = util.load_data(source=processed_dir / f'valid_flattened_data{data_tag}.csv')
valid_label_data = util.load_data(source=processed_dir / f'valid_flattened_labels{data_tag}.csv', header=[0, 1])
valid_index_data = util.load_data(source=processed_dir / f'valid_flattened_index{index_tag}.csv')

experiment_suffix = 'nzta_final_even_split'
experiment_prefix = 'train'

EXPERIMENT_FOLDER = REPORT_DIR / experiment_suffix
SAVE_MODEL_DIR = DATA_DIR.parent / 'models' / 'trained' / DATASET_NAME / (experiment_suffix + '_dir')
SAVE_RESULT_DIR = REPORT_DIR.parent.parent / 'raw_results' / DATASET_NAME / (experiment_suffix + '_dir')

# Create every output folder up front so later cells can write without checking.
for output_dir in (SAVE_RESULT_DIR, SAVE_MODEL_DIR, EXPERIMENT_FOLDER):
    output_dir.mkdir(parents=True, exist_ok=True)

# Keyword arguments forwarded to the plotting helpers; 'experiment_prefix' is
# overwritten by later cells before each plot.
save_path_meta_dict = {
    'experiment_prefix': experiment_prefix,
    'experiment_suffix': experiment_suffix,
    'experiment_folder': EXPERIMENT_FOLDER,
    'dataset_name': DATASET_NAME
}

# Resplit train-valid according to roadid

In [None]:
if 'even_split' in experiment_suffix:
    # Merge the loaded train and validation tables back into a single pool.
    feature_data = pd.concat([train_feature_data, valid_feature_data], axis=0)
    label_data = pd.concat([train_label_data, valid_label_data], axis=0)
    index_data = pd.concat([train_index_data, valid_index_data], axis=0)
    assert np.all(feature_data.index == label_data.index)
    # The positional split below is only meaningful if the index table has one row per sample.
    assert len(index_data) == len(feature_data), 'index data must have one row per feature row'
    feature_data = feature_data.reset_index(drop=True)
    label_data = label_data.reset_index(drop=True)
    index_data = index_data.reset_index(drop=True)

    # Order the sections by RoadID then Start and alternate them: even positions go to
    # train, odd positions to validation, so each road is shared roughly half/half and
    # every train section is immediately followed by a validation section.
    sorted_index = index_data.sort_values(by=['RoadID', 'Start']).index
    train_rows = sorted_index[::2]
    valid_rows = sorted_index[1::2]

    train_feature_data = feature_data.loc[train_rows]
    train_label_data = label_data.loc[train_rows]
    train_index_data = index_data.loc[train_rows]

    valid_feature_data = feature_data.loc[valid_rows]
    valid_label_data = label_data.loc[valid_rows]
    valid_index_data = index_data.loc[valid_rows]

    # Save the new split so it can be reloaded later.
    save_data_dir = DATA_DIR / 'processed' / DATASET_NAME / experiment_suffix.replace('_no_offset', '')
    save_data_dir.mkdir(parents=True, exist_ok=True)

    train_feature_data.to_csv(save_data_dir / f'train_flattened_data_{experiment_suffix}.csv', index=False)
    train_label_data.to_csv(save_data_dir / f'train_flattened_labels_{experiment_suffix}.csv', index=False)
    train_index_data.to_csv(save_data_dir / f'train_flattened_index_{experiment_suffix}.csv', index=False)

    valid_feature_data.to_csv(save_data_dir / f'valid_flattened_data_{experiment_suffix}.csv', index=False)
    valid_label_data.to_csv(save_data_dir / f'valid_flattened_labels_{experiment_suffix}.csv', index=False)
    valid_index_data.to_csv(save_data_dir / f'valid_flattened_index_{experiment_suffix}.csv', index=False)

    # Set test_size to 0 so the resampling config does not request a further train/test split.
    CONFIG['sampling']['test_size'] = 0

# From here on, feature_data / label_data refer to the training set only.
feature_data = train_feature_data
label_data = train_label_data

# Train code

In [None]:
from src.models.models_util import make_constraint
from typing import List

def get_model(estimator_type: str, use_constraint: bool=False, treatment: str=None, feature_names: List[str]=None, **kwargs):
    """Build an unfitted binary classifier of the requested type.

    Args:
        estimator_type: 'LR' for logistic regression or 'XGB' for XGBoost.
        use_constraint: XGB only. When True, monotone constraints are built with
            ``make_constraint(treatment, feature_names)`` and passed to the model.
        treatment: Required when ``use_constraint`` is True.
        feature_names: Required when ``use_constraint`` is True.
        **kwargs: Extra keyword arguments forwarded to ``XGBClassifier``.
            They are ignored for 'LR'.

    Returns:
        A new ``LogisticRegression`` or ``XGBClassifier`` instance.

    Raises:
        ValueError: If constraints are requested without ``treatment`` or ``feature_names``.
        NotImplementedError: If ``estimator_type`` is neither 'LR' nor 'XGB'.
    """
    if estimator_type == 'LR':
        return LogisticRegression(max_iter=1000)

    if estimator_type == 'XGB':
        model_constraints = None
        if use_constraint:
            if treatment is None or feature_names is None:
                raise ValueError("Treatment and feature names must be provided if constraint is applied!")
            model_constraints = make_constraint(treatment, feature_names)
        return XGBClassifier(
            objective='binary:logistic',
            use_label_encoder=False,
            eval_metric='logloss',
            monotone_constraints=model_constraints,
            **kwargs,
        )

    # Name the rejected value so a typo in the notebook is easy to spot.
    raise NotImplementedError(f"Unsupported estimator_type {estimator_type!r}; expected 'LR' or 'XGB'.")

def get_coeff(estimator):
    """Extract the feature-importance information from a fitted estimator.

    Returns:
        * ``LogisticRegression``: the first row of ``coef_``.
        * ``XGBClassifier``: a list of five dicts from ``Booster.get_score``, one per
          importance type, in the order 'weight', 'gain', 'cover', 'total_gain',
          'total_cover'.
        * anything else: ``None``.
    """
    if isinstance(estimator, LogisticRegression):
        return estimator.coef_[0]

    if isinstance(estimator, XGBClassifier):
        booster = estimator.get_booster()
        importance_types = ('weight', 'gain', 'cover', 'total_gain', 'total_cover')
        return [booster.get_score(importance_type=kind) for kind in importance_types]

    # Unsupported estimator types yield no coefficients.
    return None

In [None]:
def train_multioutput_classifier_with_resampling(estimator_type: str, feature_data, project_label, config, model_params: dict={}):
    """Train one multi-output classifier per resampled split and collect its metrics.

    For every ``(x_train, x_test, y_train, y_test)`` tuple produced by
    ``resampling.resample_with_split(feature_data, project_label, config)`` this fits a
    ``MultiOutputClassifier`` wrapping ``get_model(estimator_type, **model_params)``.
    It also fits one ``DummyClassifier`` baseline per strategy ('stratified',
    'most_frequent', 'uniform'), and records multilabel confusion matrices.

    Side effects: pickles a random sample of at most 10 trained models and 10 dummy
    models per strategy into ``SAVE_MODEL_DIR``, and the raw test/train confusion
    matrices into ``SAVE_RESULT_DIR``. File names include ``experiment_suffix``.

    Args:
        estimator_type: Estimator key understood by ``get_model`` ('LR' or 'XGB').
        feature_data: Feature table passed to the resampler.
        project_label: Label table; one binary classifier is trained per column.
        config: Resampling configuration passed to ``resample_with_split``.
        model_params: Extra keyword arguments for ``get_model``. The default dict is
            only read, never mutated.

    Returns:
        Tuple ``(running_conf_matrix, running_coeffs, train_running_conf_matrix,
        dummy_running_conf_matrix)``:
        * ``running_conf_matrix``: array of test-set confusion matrices, one entry per split.
        * ``running_coeffs``: one list per label column, holding ``get_coeff`` of that
          label's fitted estimator for every split.
        * ``train_running_conf_matrix``: same as the first item, on the training data.
        * ``dummy_running_conf_matrix``: dict mapping strategy to its array of test-set
          confusion matrices.
    """
    dummy_strategies = ['stratified', 'most_frequent', 'uniform']
    n_labels = len(project_label.columns)

    running_coeffs = [[] for _ in range(n_labels)]
    running_conf_matrix = []
    train_running_conf_matrix = []
    dummy_running_conf_matrix = {strat: [] for strat in dummy_strategies}
    models = []
    dummy_models = {strat: [] for strat in dummy_strategies}

    # training
    start_t = time.time()
    for x_train, x_test, y_train, y_test in resampling.resample_with_split(feature_data, project_label, config):

        # the resampler must keep the label columns (and their order) intact
        assert y_test.columns.equals(project_label.columns)
        assert y_train.columns.equals(project_label.columns)

        # one independent binary classifier is fitted per label column
        model = MultiOutputClassifier(get_model(estimator_type, **model_params), n_jobs=6)
        model.fit(x_train, y_train)
        models.append(model)

        preds = model.predict(x_test)
        train_preds = model.predict(x_train)

        # Importance coefficients come from the fitted per-label estimators.
        # get_coeff() does not recognise the MultiOutputClassifier wrapper and would return None.
        for label_idx, fitted_estimator in enumerate(model.estimators_):
            running_coeffs[label_idx].append(get_coeff(fitted_estimator))

        # confusion matrices on the held-out and on the training data
        running_conf_matrix.append(multilabel_confusion_matrix(y_test, preds))
        train_running_conf_matrix.append(multilabel_confusion_matrix(y_train, train_preds))

        # train and test the naive baselines on the same split
        for strategy in dummy_strategies:
            dummy = MultiOutputClassifier(DummyClassifier(strategy=strategy))
            dummy.fit(x_train, y_train)
            dummy_conf_mat = multilabel_confusion_matrix(y_test, dummy.predict(x_test))
            dummy_running_conf_matrix[strategy].append(dummy_conf_mat)
            dummy_models[strategy].append(dummy)

    # turn the lists of per-split ndarrays into single numpy arrays
    running_conf_matrix = np.array(running_conf_matrix)
    train_running_conf_matrix = np.array(train_running_conf_matrix)
    for strat in dummy_strategies:
        dummy_running_conf_matrix[strat] = np.array(dummy_running_conf_matrix[strat])

    print("Training completed. Time taken: ", time.time() - start_t)
    print("Saving a sample of trained models...")
    saved_models = np.random.choice(models, size=min(10, len(models)), replace=False)
    with open(SAVE_MODEL_DIR / f'train_{estimator_type}_timehorizon_{experiment_suffix}.pkl', 'wb') as f:
        pickle.dump(saved_models, f)
    with open(SAVE_RESULT_DIR / f'train_{estimator_type}_rawconfmat_{experiment_suffix}.pkl', 'wb') as f:
        pickle.dump(running_conf_matrix, f)
    with open(SAVE_RESULT_DIR / f'train_{estimator_type}_rawconfmat_trainset_{experiment_suffix}.pkl', 'wb') as f:
        pickle.dump(train_running_conf_matrix, f)
    save_dummys = {
        strat: np.random.choice(strat_models, size=min(10, len(strat_models)), replace=False)
        for strat, strat_models in dummy_models.items()
    }
    with open(SAVE_MODEL_DIR / f'train_dummy_timehorizon_{experiment_suffix}.pkl', 'wb') as f:
        pickle.dump(save_dummys, f)

    return running_conf_matrix, running_coeffs, train_running_conf_matrix, dummy_running_conf_matrix

# Prediction on time horizon

Given the results above, we can train the model chiefly on projects with at least one treatment, since the no-treatment case is trivial to learn.

In [None]:
def as_str(df):
    """Return one string per row, made by concatenating the row's values as text."""
    as_text = df.astype(str)
    row_keys = as_text.agg(''.join, axis=1)
    return row_keys

# 'no_project_flag' can be inferred from the remaining columns, so it is not a target.
project_label = label_data.drop(columns=['no_project_flag'], level=0)
project_label = project_label.sort_index(level=1, axis=1)

has_project_train = feature_data

# A label combination must occur more than this many times for its samples to be kept.
MIN_SAMPLES_PER_COMBINATION = 100

# Dropping rare rows can empty a column, and dropping that column changes the row
# combinations, so repeat until every remaining column has at least one positive.
while True:
    # drop treatment/time pairs that never occur (all-zero columns)
    key_count = project_label.sum(axis=0)
    project_label = project_label.drop(columns=key_count[key_count == 0].index)

    # key of each sample: the string of its flags across all treatment/time pairs
    target = as_str(project_label)
    target_count = target.value_counts()

    # keep only samples whose flag combination is frequent enough;
    # the same mask is applied to the labels and the features to keep them aligned
    frequent_mask = target.isin(target_count[target_count > MIN_SAMPLES_PER_COMBINATION].index)
    project_label = project_label[frequent_mask]
    has_project_train = has_project_train[frequent_mask]

    if (project_label.sum(axis=0) != 0).all():
        break

# Save the surviving label columns next to the trained models.
with open(SAVE_MODEL_DIR / f'train_labels_columns{"_" + experiment_suffix if experiment_suffix else ""}.pkl', 'wb') as f:
    pickle.dump(project_label.columns, f)
CONFIG['sampling']['method_params']['index_row'] = as_str # set method for identifying type of each row

## Logistic Regression

In [None]:
# Resampling settings for the logistic-regression run.
CONFIG.sampling.kfold = 5
CONFIG.sampling.n_sample_per_fold = 1
CONFIG.random_seed = 200
# balanced resampling only when the experiment name asks for it
if 'balanced' in experiment_suffix:
    CONFIG.sampling.method = 'balanced'
else:
    CONFIG.sampling.method = 'none'
if 'even_split' in experiment_suffix:
    # NOTE(review): overrides the test_size of 0 set in the even-split cell with a tiny
    # positive value, presumably because the splitter rejects 0; confirm.
    CONFIG.sampling.test_size = 1e-7

In [None]:
# Train the logistic-regression models on the filtered training data.
(
    lr_running_conf_matrix,
    lr_running_coeffs,
    lr_train_running_conf_matrix,
    lr_dummy_running_conf_matrix,
) = train_multioutput_classifier_with_resampling(
    estimator_type='LR',
    feature_data=has_project_train,
    project_label=project_label,
    config=CONFIG,
)

Results

In [None]:
# Plot the metric for each treatment/time pair: first on the held-out data, then on the training data.
save_path_meta_dict.update(experiment_prefix='Testset')
plot_metric_by_treatment_type(
    project_label, lr_running_conf_matrix, estimator_type='LR', **save_path_meta_dict
)
save_path_meta_dict.update(experiment_prefix='Trainset')
plot_metric_by_treatment_type(
    project_label, lr_train_running_conf_matrix, estimator_type='LR', **save_path_meta_dict
)

Confusion matrix

In [None]:
# Logistic-regression confusion matrices per treatment/time pair: held-out data, then training data.
save_path_meta_dict.update(experiment_prefix='Testset')
plot_confusion_matrix_by_treatment_type(
    project_label, lr_running_conf_matrix,
    estimator_type='LR', per_row=3, figsize=(18, 16), **save_path_meta_dict
)
save_path_meta_dict.update(experiment_prefix='Trainset')
plot_confusion_matrix_by_treatment_type(
    project_label, lr_train_running_conf_matrix,
    estimator_type='LR', per_row=3, figsize=(18, 16), **save_path_meta_dict
)

## XGBoost

In [None]:
# Resampling settings for the XGBoost run (different seed from the logistic-regression run).
CONFIG.sampling.n_sample_per_fold = 1
CONFIG.sampling.kfold = 5
CONFIG.random_seed = 100
# balanced resampling only when the experiment name asks for it
if 'balanced' in experiment_suffix:
    CONFIG.sampling.method = 'balanced'
else:
    CONFIG.sampling.method = 'none'
if 'even_split' in experiment_suffix:
    # NOTE(review): tiny positive test_size rather than 0, presumably because the
    # splitter rejects 0; confirm.
    CONFIG.sampling.test_size = 1e-7

In [None]:
# Train the XGBoost models on the filtered training data.
(
    xgb_running_conf_matrix,
    xgb_running_coeffs,
    xgb_train_running_conf_matrix,
    xgb_dummy_running_conf_matrix,
) = train_multioutput_classifier_with_resampling(
    estimator_type='XGB',
    feature_data=has_project_train,
    project_label=project_label,
    config=CONFIG,
)

Results

In [None]:
# XGBoost metric per treatment/time pair: held-out data, then training data.
save_path_meta_dict.update(experiment_prefix='Testset')
plot_metric_by_treatment_type(
    project_label, xgb_running_conf_matrix, estimator_type='XGB', **save_path_meta_dict
)
save_path_meta_dict.update(experiment_prefix='Trainset')
plot_metric_by_treatment_type(
    project_label, xgb_train_running_conf_matrix, estimator_type='XGB', **save_path_meta_dict
)

Naive methods results

In [None]:
# Metrics of the dummy baselines that were fitted alongside the XGBoost run, on the held-out data.
save_path_meta_dict.update(experiment_prefix='Testset')
plot_baseline_metric_by_treatment_type(
    project_label, xgb_dummy_running_conf_matrix, estimator_type='XGB', **save_path_meta_dict
)

Confusion matrices

In [None]:
# XGBoost confusion matrices per treatment/time pair: held-out data, then training data.
save_path_meta_dict.update(experiment_prefix='Testset')
plot_confusion_matrix_by_treatment_type(
    project_label, xgb_running_conf_matrix,
    estimator_type='XGB', per_row=4, figsize=(16, 10), **save_path_meta_dict
)
save_path_meta_dict.update(experiment_prefix='Trainset')
plot_confusion_matrix_by_treatment_type(
    project_label, xgb_train_running_conf_matrix,
    estimator_type='XGB', per_row=4, figsize=(16, 10), **save_path_meta_dict
)