<div>
<img src='../../img/WSP_red.png' style='height: 95px; float: left' alt='WSP Logo'/>
<img src='../../img/austroads.png' style='height: 115px; float: right' alt='Client Logo'/>
</div>
<center><h2>AAM6201 Development of Machine-Learning Decision-Support tools for Pavement Asset Management<br>Case Study 1: Project Identification</h2></center>


In [None]:
# magic command to autoreload changes in src
%load_ext autoreload
%autoreload 2

import time
import pickle
import pandas as pd
import numpy as np
import src.data.resampling as resampling
import src.util as util

from src.nsw_configs.final_config import CONFIG, DATA_DIR
from tqdm.notebook import tqdm
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import multilabel_confusion_matrix
from src.visualization.visualize import plot_baseline_metric_by_treatment_type, plot_metric_by_treatment_type, plot_confusion_matrix_by_treatment_type
from xgboost import XGBClassifier

# MODELLING 

This notebook runs Logistic Regression and XGBoost Classifier models on a classification problem according to the settings given in the configuration file.


In [None]:
# load data
from data import DATA_DIR

# Dataset / experiment identifiers. Every path below is derived from these,
# so they are assigned exactly once.
DATASET_NAME = 'NSW'
experiment_suffix = 'nsw_final_even_split'
experiment_prefix = 'train'

# Output locations for figures, pickled models and raw confusion matrices.
REPORT_DIR = DATA_DIR.parent / 'reports' / 'figures' / DATASET_NAME
EXPERIMENT_FOLDER = REPORT_DIR / experiment_suffix
SAVE_MODEL_DIR = DATA_DIR.parent / 'models' / 'trained' / DATASET_NAME / (experiment_suffix + '_dir')
SAVE_RESULT_DIR = REPORT_DIR.parent.parent / 'raw_results' / DATASET_NAME / (experiment_suffix + '_dir')

# parents/exist_ok make this cell safe to re-run and remove the
# check-then-create pattern; missing intermediate folders are created too.
for output_dir in (REPORT_DIR, EXPERIMENT_FOLDER, SAVE_MODEL_DIR, SAVE_RESULT_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

# Processed training features and labels (labels use a two-row column header).
feature_data = util.load_data(source=DATA_DIR / 'processed' / DATASET_NAME / "final" / 'train_all.csv')
label_data = util.load_data(source=DATA_DIR / 'processed' / DATASET_NAME / "final" / 'labels_all.csv', header=[0, 1])

# Keyword arguments forwarded to the plotting helpers; 'experiment_prefix'
# is overwritten by later cells before each plot.
save_path_meta_dict = {
    'experiment_prefix': experiment_prefix,
    'experiment_suffix': experiment_suffix,
    'experiment_folder': EXPERIMENT_FOLDER,
    'dataset_name': DATASET_NAME
}

# Train code

In [None]:
def get_model(estimator_type: str, **kwargs):
    """Build an unfitted binary classifier of the requested type.

    Args:
        estimator_type: 'LR' for a LogisticRegression (max_iter=1000) or
            'XGB' for an XGBClassifier (binary:logistic objective,
            logloss evaluation metric).
        **kwargs: Extra keyword arguments forwarded to the estimator constructor.

    Returns:
        The unfitted estimator instance.

    Raises:
        NotImplementedError: If ``estimator_type`` is not 'LR' or 'XGB'.
    """
    if estimator_type == 'LR':
        return LogisticRegression(max_iter=1000, **kwargs)
    if estimator_type == 'XGB':
        return XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', **kwargs)
    # Name the offending value so a typo in the caller is obvious from the traceback.
    raise NotImplementedError(f"Unsupported estimator_type {estimator_type!r}; expected 'LR' or 'XGB'.")

def get_coeff(estimator):
    """Extract feature-importance information from a fitted estimator.

    Returns:
        * LogisticRegression: the first row of ``coef_``.
        * XGBClassifier: a list of five booster score dicts, one per
          importance type, in the order weight, gain, cover, total_gain,
          total_cover.
        * any other object: ``None``.
    """
    if isinstance(estimator, LogisticRegression):
        return estimator.coef_[0]

    if isinstance(estimator, XGBClassifier):
        booster = estimator.get_booster()
        importance_types = ('weight', 'gain', 'cover', 'total_gain', 'total_cover')
        return [booster.get_score(importance_type=kind) for kind in importance_types]

    # No extractor for other estimator types.
    return None

In [None]:
def train_multioutput_classifier_with_resampling(estimator_type: str, feature_data, project_label, config, model_params: dict=None):
    """Train one multi-output classifier per resampled split and collect results.

    For every ``(x_train, x_test, y_train, y_test)`` split yielded by
    ``resampling.resample_with_split`` a ``MultiOutputClassifier`` wrapping
    ``get_model(estimator_type, **model_params)`` is fitted, together with three
    ``DummyClassifier`` baselines (stratified, most_frequent, uniform).

    Side effects: pickles a random sample of at most 10 trained models (and at
    most 10 dummy models per strategy) into ``SAVE_MODEL_DIR`` and the raw
    test/train confusion matrices into ``SAVE_RESULT_DIR``.

    Args:
        estimator_type: Estimator key understood by ``get_model`` ('LR' or 'XGB').
        feature_data: Feature table passed to the resampler.
        project_label: Label table; one column per output to predict.
        config: Configuration object passed to the resampler.
        model_params: Optional keyword arguments for the base estimator.

    Returns:
        Tuple ``(running_conf_matrix, running_coeffs, train_running_conf_matrix,
        dummy_running_conf_matrix)``: stacked test confusion matrices, a list
        (one entry per label column) of per-split ``get_coeff`` results, stacked
        train confusion matrices, and a dict of stacked test confusion matrices
        per dummy strategy.
    """
    # Avoid a shared mutable default argument.
    model_params = {} if model_params is None else model_params
    dummy_strategies = ['stratified', 'most_frequent', 'uniform']

    running_coeffs = [[] for _ in range(len(project_label.columns))]
    running_conf_matrix = []
    train_running_conf_matrix = []
    dummy_running_conf_matrix = {strat: [] for strat in dummy_strategies}
    models = []
    dummy_models = {strat: [] for strat in dummy_strategies}

    # training
    start_t = time.time()
    for x_train, x_test, y_train, y_test in resampling.resample_with_split(feature_data, project_label, config):

        assert y_test.columns.equals(project_label.columns)
        assert y_train.columns.equals(project_label.columns)
        model = MultiOutputClassifier(get_model(estimator_type, **model_params), n_jobs=6) # ovr for binary data, multinomial for multi-class problem
        model.fit(x_train, y_train)
        models.append(model)

        preds = model.predict(x_test)
        train_preds = model.predict(x_train)

        # running importance coefficients: MultiOutputClassifier fits one
        # estimator per label column (model.estimators_), so read each of those.
        # Passing the wrapper itself to get_coeff matches neither of its
        # branches and only ever records None.
        for label_idx, label_estimator in enumerate(model.estimators_):
            running_coeffs[label_idx].append(get_coeff(label_estimator)) # in order of the inputed features: feature_data.columns

        # confusion matrix
        running_conf_matrix.append(multilabel_confusion_matrix(y_test, preds))
        train_running_conf_matrix.append(multilabel_confusion_matrix(y_train, train_preds))

        # train and test dummy model
        for strategy in dummy_strategies:
            dummy = MultiOutputClassifier(DummyClassifier(strategy=strategy))
            dummy.fit(x_train, y_train)
            dummy_conf_mat = multilabel_confusion_matrix(y_test, dummy.predict(x_test))
            dummy_running_conf_matrix[strategy].append(dummy_conf_mat)
            dummy_models[strategy].append(dummy)

    # turn list of ndarrays into a numpy array
    running_conf_matrix = np.array(running_conf_matrix)
    train_running_conf_matrix = np.array(train_running_conf_matrix)
    for strat in dummy_strategies:
        dummy_running_conf_matrix[strat] = np.array(dummy_running_conf_matrix[strat])

    print("Training completed. Time taken: ", time.time() - start_t)
    print("Saving a sample of trained models...")
    saved_models = np.random.choice(models, size=min(10, len(models)), replace=False)
    with open(SAVE_MODEL_DIR / f'train_{estimator_type}_timehorizon_{experiment_suffix}.pkl', 'wb') as f:
        pickle.dump(saved_models, f)
    with open(SAVE_RESULT_DIR / f'train_{estimator_type}_rawconfmat_{experiment_suffix}.pkl', 'wb') as f:
        pickle.dump(running_conf_matrix, f)
    with open(SAVE_RESULT_DIR / f'train_{estimator_type}_rawconfmat_trainset_{experiment_suffix}.pkl', 'wb') as f:
        pickle.dump(train_running_conf_matrix, f)
    save_dummys = {
        strat: np.random.choice(strat_models, size=min(10, len(strat_models)), replace=False)
        for strat, strat_models in dummy_models.items()
    }
    with open(SAVE_MODEL_DIR / f'train_dummy_timehorizon_{experiment_suffix}.pkl', 'wb') as f:
        pickle.dump(save_dummys, f)

    return running_conf_matrix, running_coeffs, train_running_conf_matrix, dummy_running_conf_matrix

In [None]:
def as_str(df):
    """Return one string per row: the row's values cast to str and concatenated."""
    stringified = df.astype(str)
    return stringified.agg(''.join, axis=1)

# Labels to predict: every column except the top-level 'no_project_flag' group.
project_label = label_data.drop(columns=['no_project_flag'], level=0)
has_project_train = feature_data

# A label combination ("class") is kept only if it occurs in MORE than this many rows.
MIN_CLASS_COUNT = 100

while True:
    # drop treatment/time columns that have no positive sample
    key_count = project_label.sum(axis=0)
    project_label = project_label.drop(columns=key_count[key_count == 0].index)

    # get hash of each sample as string of the flags for each treatment/time pair
    target = as_str(project_label)
    target_count = target.value_counts()

    # drop samples whose class (i.e. combination of treatment/time flags) is too rare
    keep_row = target.isin(target_count[target_count > MIN_CLASS_COUNT].index)
    project_label = project_label[keep_row]
    has_project_train = has_project_train[keep_row]
    # Keep `target` aligned with the filtered frames: later cells sample row
    # indices from it and look them up in project_label / has_project_train.
    target = target[keep_row]

    # Removing rows can empty a column; loop again until every column has a positive.
    if (project_label.sum(axis=0) != 0).all():
        break

In [None]:
# Store as_str (row -> concatenated label string) in the sampling parameters.
# NOTE(review): presumably read by resampling.resample_with_split to group rows by class - confirm.
CONFIG['sampling']['method_params']['index_row'] = as_str # set method for identifying type of each row

In [None]:
# Persist the label columns the models are trained on, so that evaluation
# can restore the same columns in the same order.
label_columns_suffix = "_" + experiment_suffix if experiment_suffix else ""
label_columns_path = SAVE_MODEL_DIR / f'train_labels_columns{label_columns_suffix}.pkl'
with open(label_columns_path, 'wb') as columns_file:
    pickle.dump(project_label.columns, columns_file)

Remove classes with 100 or fewer data points (only label combinations that occur more than 100 times are kept)

Split 20% for validation, with even split between classes

In [None]:
# Fraction of every class that is held out as the validation set.
validation_split = 0.2

In [None]:
# Class string of every row that survived filtering. It is recomputed from
# project_label so that only indices present in project_label and
# has_project_train can be drawn (a stale `target` may still list removed rows,
# which would make the .loc lookups below raise KeyError).
val_target = as_str(project_label)

# Dedicated, seeded generator: the split is written to disk, so it must be
# reproducible; a local generator leaves the global NumPy random state untouched.
VALIDATION_SEED = 0
val_rng = np.random.default_rng(VALIDATION_SEED)

# Draw the same fraction (rounded down) of rows from every class, without replacement.
val_idx = np.concatenate([
    val_rng.choice(class_rows.index, size=int(validation_split * len(class_rows)), replace=False)
    for _, class_rows in val_target.groupby(val_target, sort=False)
], axis=0)
val_labels = project_label.loc[val_idx]
val_features = has_project_train.loc[val_idx]

In [None]:
# Everything that was not drawn for validation is used for train/test resampling.
is_validation_row = project_label.index.isin(val_idx)
train_test_idx = project_label.index[~is_validation_row]
train_test_labels = project_label.loc[train_test_idx]
train_test_features = has_project_train.loc[train_test_idx]

In [None]:
# Sanity check: no row may be in both the train/test pool and the validation set.
assert set(train_test_idx).isdisjoint(val_idx)

In [None]:
# Write the held-out validation features and labels next to the training data
# (the row index is not written).
final_data_dir = DATA_DIR / 'processed' / DATASET_NAME / "final"
val_features.to_csv(final_data_dir / 'valid_all.csv', index=False)
val_labels.to_csv(final_data_dir / 'valid_labels_all.csv', index=False)

### LR model

In [None]:
# Resampling settings for the Logistic Regression run.
# NOTE(review): the exact meaning of each field is defined by src.data.resampling - confirm there.
CONFIG.sampling.n_sample_per_fold = 1 
CONFIG.sampling.kfold = 5
CONFIG.random_seed = 19
# The sampling method follows the experiment name: "balanced" only when the suffix contains that word.
CONFIG.sampling.method = "balanced" if "balanced" in experiment_suffix else "none"

In [None]:
# Train the Logistic Regression models on the train/test pool and unpack the four results.
(
    lr_running_conf_matrix,
    lr_running_coeffs,
    lr_train_running_conf_matrix,
    lr_dummy_running_conf_matrix,
) = train_multioutput_classifier_with_resampling('LR', train_test_features, train_test_labels, CONFIG)

Results

In [None]:
# Metric plot per treatment type: first for the held-out splits, then for the training splits.
# The prefix is written into save_path_meta_dict before each call.
for prefix, conf_matrices in (('Testset', lr_running_conf_matrix), ('Trainset', lr_train_running_conf_matrix)):
    save_path_meta_dict['experiment_prefix'] = prefix
    plot_metric_by_treatment_type(project_label, conf_matrices, estimator_type='LR', **save_path_meta_dict)

Confusion matrices

In [None]:
# Confusion-matrix grids (3 per row) for the held-out splits and the training splits.
for prefix, conf_matrices in (('Testset', lr_running_conf_matrix), ('Trainset', lr_train_running_conf_matrix)):
    save_path_meta_dict['experiment_prefix'] = prefix
    plot_confusion_matrix_by_treatment_type(
        project_label,
        conf_matrices,
        estimator_type='LR',
        per_row=3,
        figsize=(18, 16),
        **save_path_meta_dict,
    )

### XGB model

In [None]:
# Resampling settings for the XGBoost run (same as the LR run except for the seed).
# NOTE(review): the exact meaning of each field is defined by src.data.resampling - confirm there.
CONFIG.sampling.n_sample_per_fold = 1
CONFIG.sampling.kfold = 5
CONFIG.random_seed = 100
# The sampling method follows the experiment name: "balanced" only when the suffix contains that word.
CONFIG.sampling.method = "balanced" if "balanced" in experiment_suffix else "none" 

In [None]:
# Train the XGBoost models on the train/test pool and unpack the four results.
(
    xgb_running_conf_matrix,
    xgb_running_coeffs,
    xgb_train_running_conf_matrix,
    xgb_dummy_running_conf_matrix,
) = train_multioutput_classifier_with_resampling('XGB', train_test_features, train_test_labels, CONFIG)

In [None]:
# Confusion-matrix grids (5 per row) for the XGBoost models.
save_path_meta_dict['experiment_prefix'] = 'Testset'
plot_confusion_matrix_by_treatment_type(project_label, xgb_running_conf_matrix, estimator_type='XGB', per_row=5, **save_path_meta_dict)
# The 'Trainset' figure must be drawn from the training-split matrices;
# reusing the test-split matrices just reproduces the figure above under another name.
save_path_meta_dict['experiment_prefix'] = 'Trainset'
plot_confusion_matrix_by_treatment_type(project_label, xgb_train_running_conf_matrix, estimator_type='XGB', per_row=5, **save_path_meta_dict)

In [None]:
# Metric plot per treatment type for the XGBoost models: held-out splits, then training splits.
for prefix, conf_matrices in (('Testset', xgb_running_conf_matrix), ('Trainset', xgb_train_running_conf_matrix)):
    save_path_meta_dict['experiment_prefix'] = prefix
    plot_metric_by_treatment_type(project_label, conf_matrices, estimator_type='XGB', **save_path_meta_dict)

In [None]:
# Baseline (dummy classifier) metrics on the held-out splits of the XGBoost run.
save_path_meta_dict.update(experiment_prefix='Testset')
plot_baseline_metric_by_treatment_type(
    project_label,
    xgb_dummy_running_conf_matrix,
    estimator_type='XGB',
    **save_path_meta_dict,
)

In [None]:
def load_model(estimator_type: str):
    """Load the models pickled for ``estimator_type`` during training.

    Reads ``train_{estimator_type}_timehorizon_{experiment_suffix}.pkl`` from
    ``SAVE_MODEL_DIR`` and returns the unpickled object unchanged.
    """
    model_path = SAVE_MODEL_DIR / f'train_{estimator_type}_timehorizon_{experiment_suffix}.pkl'
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)

# Reload the validation split from disk (labels carry a two-row column header).
final_data_dir = DATA_DIR / 'processed' / DATASET_NAME / "final"
val_features = pd.read_csv(final_data_dir / 'valid_all.csv')
val_labels = pd.read_csv(final_data_dir / 'valid_labels_all.csv', header=[0, 1])

def as_str(df):
    """Return one string per row: the row's values cast to str and concatenated."""
    stringified = df.astype(str)
    return stringified.agg(''.join, axis=1)

# PAD=True keeps validation label columns the models were not trained on
# (evaluate_baseline then pads their predictions with zeros);
# PAD=False restricts evaluation to the columns the models predict.
PAD = False

# Columns (and their order) the trained models predict.
columns_suffix = "_" + experiment_suffix if experiment_suffix else ""
with open(SAVE_MODEL_DIR / f'train_labels_columns{columns_suffix}.pkl', 'rb') as columns_file:
    prediction_columns = pickle.load(columns_file)

# Validation label columns that the models do not predict.
dropped_columns = val_labels.columns.difference(prediction_columns)

if PAD:
    project_label_valid = val_labels.copy()
else:
    # for transfer: keep only the shared columns, in prediction order
    inner = prediction_columns.intersection(val_labels.columns, sort=False)
    project_label_valid = val_labels.copy()[inner]

In [None]:
# Keyword arguments for the plotting helpers when reporting validation results.
save_path_meta_dict_val = dict(
    experiment_prefix="valid",
    experiment_suffix=experiment_suffix,
    experiment_folder=EXPERIMENT_FOLDER,
    dataset_name=DATASET_NAME,
)

In [None]:
# Score every saved XGBoost model on the validation split.
xgb_models = load_model("XGB")
val_conf_matrix = []

for model in tqdm(xgb_models):
    preds = model.predict(val_features)
    # Compare against project_label_valid: it is val_labels restricted to (and,
    # when PAD is False, ordered like) the columns the models predict, so truth
    # and prediction columns line up. It equals val_labels when nothing was dropped.
    conf_matrix = multilabel_confusion_matrix(project_label_valid, preds)
    val_conf_matrix.append(conf_matrix)

# Stack into one ndarray, matching the format of the train/test confusion matrices.
val_conf_matrix = np.array(val_conf_matrix)

with open(SAVE_RESULT_DIR / f'{save_path_meta_dict_val["experiment_prefix"]}_XGB_rawconfmat_{save_path_meta_dict_val["experiment_suffix"]}.pkl', 'wb') as f:
    pickle.dump(val_conf_matrix, f)

In [None]:
def evaluate_baseline(feature_data: pd.DataFrame, project_label: pd.DataFrame, dropped_columns: pd.Index, prediction_columns: pd.Index, pad: bool=True):
    """Evaluate the saved dummy (baseline) models on a labelled data set.

    Loads the dummy models via ``load_model('dummy')`` (a dict mapping strategy
    name to models), predicts ``feature_data`` with each and computes a
    multilabel confusion matrix against ``project_label``. The result is also
    pickled to ``SAVE_RESULT_DIR`` as ``valid_dummy_rawconfmat_<suffix>.pkl``.

    Args:
        feature_data: Features to predict on.
        project_label: True labels, one column per output.
        dropped_columns: Label columns the models do not predict.
        prediction_columns: Label columns the models predict, in prediction order.
        pad: If truthy, predictions are padded with all-zero columns for
            ``dropped_columns`` so those labels can be evaluated too. If falsy,
            ``project_label`` must only contain predicted columns.

    Returns:
        Dict mapping strategy name to a list of confusion matrices, one per model.

    Raises:
        AssertionError: If the label columns are inconsistent with
            ``prediction_columns`` / ``dropped_columns`` for the chosen mode.
    """
    models_by_strat = load_model('dummy')
    running_conf_matrix = {strat: [] for strat in models_by_strat.keys()}

    # Branch on truthiness in both places so that setup and prediction handling
    # always agree (e.g. pad=0 or pad=None is treated as "no padding" throughout).
    if pad:
        # Columns of the padded prediction matrix; built once, not once per model.
        pred_columns = prediction_columns.append(dropped_columns)
        # check all labels to be evaluated is in prediction columns and padded dropped columns
        assert set(project_label.columns) - set(pred_columns) == set()
        assert len(prediction_columns.intersection(dropped_columns)) == 0
        inner = project_label.columns
    else:
        pred_columns = prediction_columns
        # the model can predict more than what we want to evaluate on
        # so we locate only the desired columns in project label
        inner = prediction_columns.intersection(project_label.columns, sort=False)
        assert len(project_label.columns) == len(inner) # labels passed in must all be available
        project_label = project_label[inner] # ensure order is correct

    start_t = time.time()
    for strat, models in models_by_strat.items():
        for model in models:
            preds = model.predict(feature_data)
            if pad:
                # pad preds with all-zero columns for the labels we dropped
                preds = np.hstack((preds, np.zeros((preds.shape[0], len(dropped_columns)))))
            # Name the columns, then select/reorder them to match project_label.
            preds = pd.DataFrame(preds, columns=pred_columns)[inner]
            # confusion matrix
            conf_matrix = multilabel_confusion_matrix(project_label, preds)
            running_conf_matrix[strat].append(conf_matrix)

    with open(SAVE_RESULT_DIR / f'valid_dummy_rawconfmat_{save_path_meta_dict_val["experiment_suffix"]}.pkl', 'wb') as f:
        pickle.dump(running_conf_matrix, f)

    print("Evaluation completed. Time taken: ", time.time() - start_t)
    return running_conf_matrix

In [None]:
# Validation plots for the XGBoost models.
# project_label_valid is val_labels restricted to the columns the models predict
# (identical to val_labels when no column was dropped).
# np.array(...) gives the plotting helpers the same ndarray format they receive
# for the train/test matrices; it is a no-op if the matrices are already stacked.
val_conf_matrix_stacked = np.array(val_conf_matrix)
plot_confusion_matrix_by_treatment_type(project_label_valid, val_conf_matrix_stacked, estimator_type='XGB', per_row=5, **save_path_meta_dict_val)
plot_metric_by_treatment_type(project_label_valid, val_conf_matrix_stacked, estimator_type='XGB', **save_path_meta_dict_val)

In [None]:
# Baseline (dummy classifier) results on the validation split.
xgb_dummy_running_conf_matrix = evaluate_baseline(val_features, val_labels, dropped_columns=dropped_columns, prediction_columns=prediction_columns, pad=PAD)
# Plot with the validation labels and the validation save settings.
# save_path_meta_dict still carries the 'Testset' prefix from the earlier cells,
# so using it here would file the validation baseline under the test-set prefix.
plot_baseline_metric_by_treatment_type(project_label_valid, xgb_dummy_running_conf_matrix, estimator_type='XGB', **save_path_meta_dict_val)