<div>
<img src='../../img/WSP_red.png' style='height: 95px; float: left' alt='WSP Logo'/>
<img src='../../img/austroads.png' style='height: 115px; float: right' alt='Client Logo'/>
</div>
<center><h2>AAM6201 Development of Machine-Learning Decision-Support tools for Pavement Asset Management<br>Case Study 1: Project Identification</h2></center>


In [None]:
# magic command to autoreload changes in src
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import src.util as util
import pickle
import warnings
from tqdm.auto import tqdm

import seaborn as sns
warnings.filterwarnings('ignore')

# Load everything

In [None]:
from src import DATA_DIR

# Experiment variant to evaluate; the load cells below accept
# 'even_split' or 'no_bootstrap'.
SUFFIX = 'even_split'

# Output folder for this notebook's figures, created on first use.
REPORT_DIR = DATA_DIR.parent / 'reports' / 'figures' / 'cluster'
if not REPORT_DIR.exists():
    REPORT_DIR.mkdir(parents=True)

## Load WA

In [None]:
# load data
from src import DATA_DIR


# Load MRWA data
DATASET_NAME = 'MRWA'
if SUFFIX == 'even_split':
    experiment_suffix = 'mrwa_final_even_split'
elif SUFFIX == 'no_bootstrap':
    experiment_suffix = 'mrwa_final_no_offset'
else:
    raise NotImplementedError(f'Unsupported SUFFIX: {SUFFIX!r}')

# The processed folder and the index files drop the '_no_offset' marker,
# while the feature and label files keep the full experiment suffix.
base_suffix = experiment_suffix.replace('_no_offset', '')
processed_dir = DATA_DIR / 'processed' / DATASET_NAME / base_suffix
wa_valid_feature = util.load_data(source=processed_dir / f'valid_flattened_data_{experiment_suffix}.csv')
wa_train_feature = util.load_data(source=processed_dir / f'train_flattened_data_{experiment_suffix}.csv')
wa_valid_labels = util.load_data(source=processed_dir / f'valid_flattened_labels_{experiment_suffix}.csv', header=[0, 1])
wa_train_labels = util.load_data(source=processed_dir / f'train_flattened_labels_{experiment_suffix}.csv', header=[0, 1])
wa_valid_index = util.load_data(source=processed_dir / f'valid_flattened_index_{base_suffix}.csv')
wa_train_index = util.load_data(source=processed_dir / f'train_flattened_index_{base_suffix}.csv')

# From here on the suffix names the trained-model artefacts.
experiment_suffix = 'mrwa_final_' + SUFFIX

# original projects
wa_projects = util.load_data(source=DATA_DIR / 'interim' / DATASET_NAME / 'cleaned_projects.csv')

model_dir = REPORT_DIR.parent.parent.parent / 'models' / 'trained' / DATASET_NAME / (experiment_suffix + '_dir')
# NOTE: pickle.load executes arbitrary code; only load artefacts produced by this project.
# Load prediction columns
with open(model_dir / f'train_labels_columns_{experiment_suffix}.pkl', 'rb') as f:
    wa_prediction_cols = pickle.load(f)
# Load model
with open(model_dir / f'train_XGB_timehorizon_{experiment_suffix}.pkl', 'rb') as f:
    wa_models = pickle.load(f)

# Only the validation split is evaluated below (these are aliases, not copies).
wa_features = wa_valid_feature
wa_index = wa_valid_index

# Years between planning and treatment. pd.to_datetime and an explicit
# 365.2425-day year replace the unit-less `astype(np.datetime64)` cast and the
# ambiguous `np.timedelta64(1, 'Y')`, both of which recent pandas rejects.
one_year = pd.Timedelta(days=365.2425)
year_offset = (
    pd.to_datetime(wa_projects['Date Treatment']) - pd.to_datetime(wa_projects['Date Planned'])
) / one_year

# Flag the time horizon of every project. Each bin is (lower, upper], so an
# offset of exactly 1 year is no longer left without any horizon.
# NOTE(review): confirm the label pipeline also places exactly 1 year in the first bin.
horizon_bounds = [
    ('Treatment within 1 year', -np.inf, 1),
    ('Treatment between 1 to 3 years', 1, 3),
    ('Treatment between 3 to 5 years', 3, 5),
    ('Treatment between 5 to 10 years', 5, 10),
    ('Treatment between 10 to 30 years', 10, 30),
]
for col, lower, upper in horizon_bounds:
    wa_projects[col] = (year_offset > lower) & (year_offset <= upper)

# cast to int to avoid floating point errors (e.g. 400 + e-32 > 400 + e-33);
# direct column assignment guarantees the integer dtype is actually adopted
wa_index[['Start', 'End']] = wa_index[['Start', 'End']].astype(int)
wa_projects[['Start', 'End']] = wa_projects[['Start', 'End']].astype(int)

## Load NZ 

In [None]:
# load data
from src import DATA_DIR

# Load NZTA data
DATASET_NAME = 'NZTA'
if SUFFIX == 'even_split':
    experiment_suffix = 'nzta_final_even_split'
elif SUFFIX == 'no_bootstrap':
    experiment_suffix = 'nzta_final_no_offset'
else:
    raise NotImplementedError(f'Unsupported SUFFIX: {SUFFIX!r}')

# The processed folder and the index files drop the '_no_offset' marker,
# while the feature and label files keep the full experiment suffix.
base_suffix = experiment_suffix.replace('_no_offset', '')
processed_dir = DATA_DIR / 'processed' / DATASET_NAME / base_suffix
nz_valid_feature = util.load_data(source=processed_dir / f'valid_flattened_data_{experiment_suffix}.csv')
nz_train_feature = util.load_data(source=processed_dir / f'train_flattened_data_{experiment_suffix}.csv')
nz_valid_labels = util.load_data(source=processed_dir / f'valid_flattened_labels_{experiment_suffix}.csv', header=[0, 1])
nz_train_labels = util.load_data(source=processed_dir / f'train_flattened_labels_{experiment_suffix}.csv', header=[0, 1])
nz_valid_index = util.load_data(source=processed_dir / f'valid_flattened_index_{base_suffix}.csv')
nz_train_index = util.load_data(source=processed_dir / f'train_flattened_index_{base_suffix}.csv')

# From here on the suffix names the trained-model artefacts.
experiment_suffix = 'nzta_final_' + SUFFIX

# original projects
nz_projects = util.load_data(source=DATA_DIR / 'interim' / DATASET_NAME / 'cleaned_projects.csv')
nz_projects = nz_projects.rename(columns={'Treatment Category': 'Treatment'})

model_dir = REPORT_DIR.parent.parent.parent / 'models' / 'trained' / DATASET_NAME / (experiment_suffix + '_dir')
# NOTE: pickle.load executes arbitrary code; only load artefacts produced by this project.
# Load prediction columns
with open(model_dir / f'train_labels_columns_{experiment_suffix}.pkl', 'rb') as f:
    nz_prediction_cols = pickle.load(f)
# Load model
with open(model_dir / f'train_XGB_timehorizon_{experiment_suffix}.pkl', 'rb') as f:
    nz_models = pickle.load(f)

# Only the validation split is evaluated below (these are aliases, not copies).
nz_features = nz_valid_feature
nz_index = nz_valid_index

# Years between planning and treatment. pd.to_datetime and an explicit
# 365.2425-day year replace the unit-less `astype(np.datetime64)` cast and the
# ambiguous `np.timedelta64(1, 'Y')`, both of which recent pandas rejects.
one_year = pd.Timedelta(days=365.2425)
year_offset = (
    pd.to_datetime(nz_projects['Date Treatment']) - pd.to_datetime(nz_projects['Date Planned'])
) / one_year

# Flag the time horizon of every project. Each bin is (lower, upper], so an
# offset of exactly 1 year is no longer left without any horizon.
# NOTE(review): confirm the label pipeline also places exactly 1 year in the first bin.
horizon_bounds = [
    ('Treatment within 1 year', -np.inf, 1),
    ('Treatment between 1 to 3 years', 1, 3),
    ('Treatment between 3 to 5 years', 3, 5),
    ('Treatment between 5 to 10 years', 5, 10),
    ('Treatment between 10 to 30 years', 10, 30),
]
for col, lower, upper in horizon_bounds:
    nz_projects[col] = (year_offset > lower) & (year_offset <= upper)

# cast to int to avoid floating point errors (e.g. 400 + e-32 > 400 + e-33);
# direct column assignment guarantees the integer dtype is actually adopted
nz_index[['Start', 'End']] = nz_index[['Start', 'End']].astype(int)
nz_projects[['Start', 'End']] = nz_projects[['Start', 'End']].astype(int)

## Load VIC

In [None]:
# load data
from src import DATA_DIR

# Load VIC data
DATASET_NAME = 'VIC'
if SUFFIX == 'even_split':
    experiment_suffix = 'vic_final_even_split'
elif SUFFIX == 'no_bootstrap':
    experiment_suffix = 'vic_final_no_offset'
else:
    raise NotImplementedError(f'Unsupported SUFFIX: {SUFFIX!r}')

# The processed folder and the index files drop the '_no_offset' marker,
# while the feature and label files keep the full experiment suffix.
base_suffix = experiment_suffix.replace('_no_offset', '')
processed_dir = DATA_DIR / 'processed' / DATASET_NAME / base_suffix
vic_valid_feature = util.load_data(source=processed_dir / f'valid_flattened_data_{experiment_suffix}.csv')
vic_train_feature = util.load_data(source=processed_dir / f'train_flattened_data_{experiment_suffix}.csv')
vic_valid_labels = util.load_data(source=processed_dir / f'valid_flattened_labels_{experiment_suffix}.csv', header=[0, 1])
vic_train_labels = util.load_data(source=processed_dir / f'train_flattened_labels_{experiment_suffix}.csv', header=[0, 1])
vic_valid_index = util.load_data(source=processed_dir / f'valid_flattened_index_{base_suffix}.csv')
vic_train_index = util.load_data(source=processed_dir / f'train_flattened_index_{base_suffix}.csv')

# From here on the suffix names the trained-model artefacts.
experiment_suffix = 'vic_final_' + SUFFIX

# original projects: rebuilt from the raw work program rather than an interim file
projects = util.load_data(DATA_DIR / 'raw' / 'VIC' / 'AAM6201 Data Reques' / 'Work Program' / 'Pavement Diary since 2014_2019.xlsx', sheet_name=0)
treatment_lookup = util.load_data(DATA_DIR.parent / "references" / "TreatmentCategory.csv")
treatment_lookup = treatment_lookup[treatment_lookup['Jurisdiction'] == 'VIC']
projects = projects.rename(columns={"Route Number": "Road_Number",
                                    "From Measure": "From_Measure"})
cleaned_projects = projects.dropna(
    subset=['Road_Number', 'Direction', 'From_Measure', 'Length', 'Treatment Date', 'Treatment Type']
).copy()

# normalise direction labels and keep only Forward / Reverse records
cleaned_projects.loc[cleaned_projects['Direction'].str.contains('Forward'), 'Direction'] = 'Forward'
cleaned_projects.loc[cleaned_projects['Direction'].str.contains('Reverse'), 'Direction'] = 'Reverse'
cleaned_projects = cleaned_projects[cleaned_projects['Direction'].isin({'Forward', 'Reverse'})]

# map jurisdiction-specific treatment types to generic categories; drop anything unmapped
specific_to_generic = dict(zip(treatment_lookup["Specific Category Value"], treatment_lookup["Generic Category"]))
cleaned_projects["Treatment Category"] = cleaned_projects["Treatment Type"]
cleaned_projects["Treatment Category"] = cleaned_projects["Treatment Category"].replace(specific_to_generic)
cleaned_projects = cleaned_projects.drop(index=cleaned_projects[~cleaned_projects["Treatment Category"].isin(treatment_lookup["Generic Category"])].index)
cleaned_projects = cleaned_projects[["Road_Number", "Route Name", "Direction", "From_Measure", "To Measure", "Length", "Treatment Date", "Treatment Category"]]
cleaned_projects = cleaned_projects.rename(columns={"Road_Number": "RoadID",
                                                    "From_Measure": "Start",
                                                    "Treatment Date": "Date Treatment"})
cleaned_projects = cleaned_projects.drop_duplicates()
cleaned_projects = cleaned_projects[cleaned_projects['Treatment Category'].notna()]
# pd.to_datetime replaces the unit-less `astype(np.datetime64)` cast that recent pandas rejects
cleaned_projects['Date Treatment'] = pd.to_datetime(cleaned_projects['Date Treatment'])
# The row index is deliberately left untouched: the 'treatment_idx' values
# consumed further down are used as `.loc` labels into this frame.
vic_projects = cleaned_projects
vic_projects = vic_projects.rename(columns={'Treatment Category': 'Treatment'})


model_dir = REPORT_DIR.parent.parent.parent / 'models' / 'trained' / DATASET_NAME / (experiment_suffix + '_dir')
# NOTE: pickle.load executes arbitrary code; only load artefacts produced by this project.
# Load prediction columns
with open(model_dir / f'train_labels_columns_{experiment_suffix}.pkl', 'rb') as f:
    vic_prediction_cols = pickle.load(f)
# Load model
with open(model_dir / f'train_XGB_timehorizon_{experiment_suffix}.pkl', 'rb') as f:
    vic_models = pickle.load(f)

# Only the validation split is evaluated below (these are aliases, not copies).
vic_features = vic_valid_feature
vic_index = vic_valid_index

# Year offset of every train + valid section. The index rows must be stacked
# in the same order as the offsets (train first, then valid); stacking the
# validation index twice would attach train offsets to validation rows.
vic_all_index = pd.concat([vic_train_index, vic_valid_index], axis=0, ignore_index=True)
vic_all_offsets = pd.concat([vic_train_feature['offset_month|idx=0'], vic_valid_feature['offset_month|idx=0']], ignore_index=True, axis=0)
assert len(vic_all_index) == len(vic_all_offsets), 'index and feature files are expected to be row-aligned'
year_offset = vic_all_offsets / 12
vic_all_index['year_offset'] = year_offset
vic_all_index = vic_all_index[vic_all_index['num_treatments'] > 0]
year_offset = vic_all_index['year_offset']

def parse_list_of_index_series(series):
    """Flatten an iterable of comma-separated integer strings into one list of ints.

    Each element of `series` (e.g. '3,17') is split on ',' and every piece is
    converted with `int`; the pieces are returned in encounter order.
    """
    parsed = []
    for joined in series:
        parsed.extend(int(token) for token in joined.split(','))
    return parsed

# Flag the time horizon of every VIC project. `year_offset` is months / 12, so
# an offset of exactly 12 months is common; each bin is therefore (lower, upper]
# so that such sections land in 'Treatment within 1 year' rather than in no bin.
# NOTE(review): confirm the label pipeline also places exactly 1 year in the first bin.
horizon_bounds = [
    ('Treatment within 1 year', -np.inf, 1),
    ('Treatment between 1 to 3 years', 1, 3),
    ('Treatment between 3 to 5 years', 3, 5),
    ('Treatment between 5 to 10 years', 5, 10),
    ('Treatment between 10 to 30 years', 10, 30),
]
for col, lower, upper in horizon_bounds:
    vic_projects[col] = False
    in_horizon = (year_offset > lower) & (year_offset <= upper)
    # 'treatment_idx' holds comma-separated row labels of vic_projects
    vic_projects.loc[
        parse_list_of_index_series(vic_all_index.loc[in_horizon, 'treatment_idx']),
        col
    ] = True

# derive the end chainage from start + length
vic_projects['End'] = vic_projects['Start'] + vic_projects['Length']
vic_index['End'] = vic_index['Start'] + vic_index['Length']
vic_train_index['End'] = vic_train_index['Start'] + vic_train_index['Length']

# cast to int to avoid floating point errors (e.g. 400 + e-32 > 400 + e-33);
# direct column assignment guarantees the new dtype is actually adopted
vic_index[['Start', 'End']] = vic_index[['Start', 'End']].astype(int)
vic_projects[['Start', 'End']] = vic_projects[['Start', 'End']].astype(int)

# cast roadid to str
vic_projects['RoadID'] = vic_projects['RoadID'].astype(str)
vic_index['RoadID'] = vic_index['RoadID'].astype(str)
vic_train_index['RoadID'] = vic_train_index['RoadID'].astype(str)

## Put into dict

In [None]:
def _jurisdiction_inputs(prediction_cols, features, models, index, h_index, projects):
    """Bundle everything the evaluation needs for one jurisdiction into a dict."""
    return {
        'prediction_cols': prediction_cols,
        'features': features,
        'models': models,
        'index': index,
        'h_index': h_index,
        'projects': projects,
    }

# One entry per jurisdiction; 'h_index' lists the columns that identify a road.
input_dict = {
    'WA': _jurisdiction_inputs(
        wa_prediction_cols, wa_features, wa_models, wa_index, ['RoadID', 'Direction'], wa_projects
    ),
    'NZ': _jurisdiction_inputs(
        nz_prediction_cols, nz_features, nz_models, nz_index, ['RoadID'], nz_projects
    ),
    'VIC': _jurisdiction_inputs(
        vic_prediction_cols, vic_features, vic_models, vic_index, ['RoadID', 'Direction'], vic_projects
    ),
}

# Time-horizon label columns paired with their display names, in plotting order.
_HORIZON_DISPLAY = (
    ('Treatment within 1 year', 'Year 1'),
    ('Treatment between 1 to 3 years', 'Year 2 - 3'),
    ('Treatment between 3 to 5 years', 'Year 4 - 5'),
    ('Treatment between 5 to 10 years', 'Year 6 - 10'),
)

# horizon column -> display name
year_map_dict = dict(_HORIZON_DISPLAY)

# horizon column -> sort position
year_order_dict = {
    horizon: position for position, (horizon, _display) in enumerate(_HORIZON_DISPLAY)
}

# Treatment types paired with their plot colours, in plotting order.
_TREATMENT_DISPLAY = (
    ('Resurfacing_SS', 'tab:purple'),
    ('Resurfacing_AC', 'tab:brown'),
    ('Major Patching', 'tab:gray'),
    ('Rehabilitation', 'tab:olive'),
    ('Retexturing', 'tab:cyan'),
    ('Regulation', 'tab:pink'),
)

# treatment type -> sort position
treatment_type_order = {
    treatment: position for position, (treatment, _colour) in enumerate(_TREATMENT_DISPLAY)
}

# treatment type -> matplotlib colour name
treatment_type_colors = dict(_TREATMENT_DISPLAY)

# UTIL

Define methods for:
* visualising clustered results against the true projects
* quantifying metrics for the sets

## Generic helpers

In [None]:
# input two dataframes on same hierarchical index
# cast them both to fixed sections
# groupby fixed sections and count, 
from typing import List, Tuple
from matplotlib.patches import Patch
import matplotlib.cm as cm
import matplotlib.colors as mpl_colors

def cast_to_fixed_section(df: pd.DataFrame, fixed_length: float=100):
    """Split every [Start, End] row of `df` into fixed-length bins.

    Each input row is repeated once per bin of width `fixed_length` that it
    touches. In the result, 'Start' / 'End' are the bin boundaries (not the
    original section boundaries) and 'Length_Contribution' is the length of the
    original section that falls inside that bin. All other columns are carried
    over unchanged and the result has a fresh 0..n-1 index.
    """
    sections = df.copy().reset_index(drop=True)

    # first and last bin touched by each section; an End lying exactly on a
    # bin boundary belongs to the bin before it
    first_bin = sections['Start'] // fixed_length
    last_bin = sections['End'] // fixed_length - (sections['End'] % fixed_length == 0)
    copies = last_bin - first_bin + 1

    # repeat each section once per bin and number the copies 0, 1, 2, ...
    row_positions = sections.index.repeat(copies)
    expanded = sections.loc[row_positions]
    step_in_row = expanded.groupby(level=0).cumcount().to_numpy()
    bin_number = first_bin.loc[row_positions].to_numpy() + step_in_row
    expanded = expanded.reset_index(drop=True)

    # calculate length contribution
    expanded['fixed_length_Start'] = bin_number * fixed_length
    expanded['fixed_length_End'] = expanded['fixed_length_Start'] + fixed_length
    overlap_end = expanded[['End', 'fixed_length_End']].min(axis=1)
    overlap_start = expanded[['Start', 'fixed_length_Start']].max(axis=1)
    expanded['Length_Contribution'] = overlap_end - overlap_start

    # the bin boundaries replace the original start / end columns
    return (
        expanded
        .drop(columns=['Start', 'End'])
        .rename(columns={'fixed_length_Start': 'Start', 'fixed_length_End': 'End'})
    )
    
def merge_intervals(lst: List[Tuple[float, float]]):
    """Merge overlapping or touching (start, end) intervals.

    Example: [(a, b), (b-1, c)] -> [(a, c)]. The intervals are ordered by their
    start first; an empty input is returned as-is.
    """
    if len(lst) == 0:
        return lst

    ordered = sorted(lst, key=lambda pair: pair[0])  # sort by start
    merged = []
    run_start, run_end = ordered[0]
    for candidate in ordered[1:]:
        cand_start, cand_end = candidate[0], candidate[1]
        if cand_start > run_end:
            # gap before the candidate: close the current run and open a new one
            merged.append((run_start, run_end))
            run_start, run_end = candidate
        elif cand_start <= run_end and cand_end > run_end:
            # candidate overlaps / touches the run and reaches further
            run_end = cand_end
        # otherwise the candidate lies inside the current run: nothing to do
    # close the last run
    merged.append((run_start, run_end))
    return merged

# inline sanity checks
assert merge_intervals([(1, 1), (2, 2)]) == [(1, 1), (2, 2)]
assert merge_intervals([(1, 2), (2, 3), (3, 4), (5, 7), (6, 8)]) == [(1, 4), (5, 8)]
assert merge_intervals([]) == []
assert merge_intervals([(1, 1), (1, 2), (2, 3), (3, 4)]) == [(1, 4)]


def extend_index(index_df: pd.DataFrame, h_index: tuple) -> pd.DataFrame:
    """Add a row for every chainage gap in each road's index.

    For every group of `h_index` values, the existing (Start, End) sections are
    merged and one extra row is appended per uncovered stretch: from 0 to the
    first section, and between consecutive merged sections. The extra rows only
    carry 'Start', 'End' and the `h_index` values; all other columns are NaN.

    Parameters
    ----------
    index_df : frame with 'Start', 'End' and the `h_index` columns.
    h_index : names of the columns that identify a road.

    Returns
    -------
    A new frame with a fresh 0..n-1 index; empty (same columns) if `index_df`
    yields no groups.
    """
    # groupby would read a tuple as one single key, so always hand it a list
    h_index = list(h_index)

    final_df_lst = []
    for index_tup, inner_index in index_df.groupby(h_index):
        # older pandas yields a bare scalar for a single grouping column;
        # wrap it so the zip below does not iterate over its characters
        if not isinstance(index_tup, tuple):
            index_tup = (index_tup,)

        intervals = list(inner_index[['Start', 'End']].itertuples(index=False, name=None))
        intervals = merge_intervals(intervals)
        missing_intervals = [(0, intervals[0][0])]
        missing_intervals.extend(
            (prev[1], nxt[0]) for prev, nxt in zip(intervals, intervals[1:])
        )

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        gap_rows = pd.DataFrame(missing_intervals, columns=['Start', 'End'])
        extended = pd.concat([inner_index, gap_rows], axis=0, ignore_index=True)

        # the gap rows have no road identifier yet
        for key, val in zip(h_index, index_tup):
            extended[key] = val

        final_df_lst.append(extended)

    if len(final_df_lst) == 0:
        # nothing to extend; pd.concat would raise on an empty list
        return index_df.iloc[0:0].copy()
    return pd.concat(final_df_lst, axis=0, ignore_index=True)

def check_inputted_frames(actual_df: pd.DataFrame, predicted_df: pd.DataFrame, h_index: tuple):
    """Assert that both frames describe one and the same road.

    Each frame may hold at most one distinct combination of the `h_index`
    columns (empty frames are allowed), and when both frames have rows the
    combination must be identical. Treatment / treatment-time uniqueness is
    not checked here.

    Raises
    ------
    AssertionError if any of the conditions is violated.
    """
    for frame in (actual_df, predicted_df):
        distinct_roads = frame.drop_duplicates(subset=h_index)
        assert distinct_roads.shape[0] <= 1  # can be empty

    both_have_rows = len(predicted_df) > 0 and len(actual_df) > 0
    if both_have_rows:
        predicted_road = predicted_df[h_index].iloc[0].values
        actual_road = actual_df[h_index].iloc[0].values
        assert all(predicted_road == actual_road)

def plot_compare_projects(
    actual_df: pd.DataFrame, 
    predicted_df: pd.DataFrame, 
    unclustered_df: pd.DataFrame=None,
    bin_width: float=20, # bin width should be smaller than condition section length
    h_index: List[str]=['RoadID', 'Direction'],
    index_mask: pd.DataFrame=None,
    axs: Tuple[plt.Axes]=None,
):
    """Plot the number of projects per fixed-size chainage bin, one panel per data set.

    Both data sets should contain projects for only one treatment-time pair on
    a single road.

    Parameters
    ----------
    actual_df : ground-truth projects with 'Start', 'End' and the `h_index` columns.
    predicted_df : projects produced by the clustering step, same columns.
    unclustered_df : optional raw section-level predictions (third panel).
    bin_width : width of the chainage bins.
    h_index : columns identifying the road.
    index_mask : optional 'Start' / 'End' ranges (same road) that are shaded on
        the last panel as hidden from the model.
    axs : optional axes to draw on; created here when omitted (one per data set).

    Returns
    -------
    (axs, stacked_split) where `stacked_split` is the binned data of all
    panels with a 'flag' column naming the panel, or (None, None) when both
    `actual_df` and `predicted_df` are empty.
    """
    if len(actual_df) == 0 and len(predicted_df) == 0:
        print("No project provided")
        return None, None

    h_index = list(h_index)
    if len(actual_df) > 0 and len(predicted_df) > 0:
        check_inputted_frames(actual_df, predicted_df, h_index)

    # plot range: 100 m of padding on both sides, never below chainage 0.
    # (np.nanmax(x, 0) would treat 0 as the `axis` argument and not clamp.)
    start = max(np.nanmin([actual_df['Start'].min(), predicted_df['Start'].min()]) - 100, 0)
    end = np.nanmax([actual_df['End'].max(), predicted_df['End'].max()]) + 100

    # round to nearest bin_width * n
    start = (start // bin_width) * bin_width
    end = (end // bin_width) * bin_width + (bin_width if end % bin_width > 0 else 0)
    assert end > start

    span_cols = h_index + ['Start', 'End']
    split_actual = cast_to_fixed_section(actual_df[span_cols], fixed_length=bin_width)
    split_predicted = cast_to_fixed_section(predicted_df[span_cols], fixed_length=bin_width)

    if len(split_actual) > 0:
        split_actual['flag'] = 'Ground truth'
    if len(split_predicted) > 0:
        split_predicted['flag'] = 'After clustering'
    all_splits = [split_actual, split_predicted]
    panel_flags = ['Ground truth', 'After clustering']

    if unclustered_df is not None:
        split_raw = cast_to_fixed_section(unclustered_df[span_cols], fixed_length=bin_width)
        split_raw['flag'] = 'Before clustering'
        all_splits.append(split_raw)
        panel_flags.append('Before clustering')

    stacked_split = pd.concat(all_splits, axis=0).reset_index(drop=True)
    stacked_count = stacked_split.groupby(['flag', 'Start']).count().iloc[:, 0].rename('Count').reset_index()

    colors = mpl_colors.ListedColormap(['#004259', '#002D3D', '#000F14'], name='austroads_cmap') # color correspond to flag index

    # make plot
    if axs is None:
        fig, axs = plt.subplots(len(all_splits), 1, figsize=(15, 7.5))

    available_flags = set(stacked_count['flag'])
    for i, flag in enumerate(panel_flags):
        if flag not in available_flags:
            continue
        flag_df = stacked_count[stacked_count['flag'] == flag].set_index('Start')['Count']
        axs[i].bar(
            x=flag_df.index, 
            height=flag_df.values,
            width=bin_width,
            align='edge',
            color=colors(i)
        )

    # if there is a mask, shade the masked ranges on the last panel
    if index_mask is not None and len(index_mask) > 0:
        # the mask must belong to the road being plotted; either frame may be empty
        reference_df = actual_df if len(actual_df) > 0 else predicted_df
        assert index_mask[h_index].drop_duplicates().shape[0] == 1
        assert index_mask[h_index].iloc[0].to_list() == reference_df[h_index].iloc[0].to_list()

        mask_intervals = list(index_mask.sort_values(by=['Start'])[['Start', 'End']].itertuples(index=False, name=None))
        mask_intervals = merge_intervals(mask_intervals)

        for mask_start, mask_end in mask_intervals:
            axs[-1].fill_betweenx(y=[0, 1], x1=mask_start, x2=mask_end, color='silver')

    # bars only mark presence: hide the y scale and share the chainage range
    for ax in axs:
        ax.set_yticklabels([])
        ax.set_ylabel(None)
        ax.set_ylim((0, 1))
        ax.set_xlim((start, end))
        ax.set_xlabel('Chainage (m)')
        ax.set_xticks(np.linspace(start, end, num=20))

    # title only the panels that exist (there are two when no unclustered_df is given)
    panel_titles = [
        'Ground truth projects',
        'Predicted projects using clustering',
        'Sections predicted to have treatment out of sections the model see',
    ]
    for ax, title in zip(axs, panel_titles):
        ax.set_title(title, loc='left')
    if len(axs) > 2:
        axs[2].legend(
            handles=[Patch(facecolor='black', alpha=0.2)],
            labels=['Sections hidden from model'],
            bbox_to_anchor=(1, 1),
            loc='lower right'
        )
    
    return axs, stacked_split

## Utilities for finding invalid labels and ranges

In [None]:
from matplotlib.patches import Patch

def clip_mask(start: float, end: float, df: pd.DataFrame):
    """Cut the masked range [start, end] out of the 'Start' / 'End' rows of `df`.

    * rows reaching into the range from the left are shortened to end at `start`
    * rows reaching out of the range to the right are shortened to begin at `end`
    * rows spanning the whole range are split into a left and a right part
    * rows lying entirely inside the range are removed

    Note: `df` itself is modified by the clipping steps; always use the
    returned frame, which is the only one holding the complete result.

    Returns
    -------
    The clipped frame. Its index is renumbered only when a row had to be split.
    """
    # clip end
    reaches_in = (df['Start'] < start) & (df['End'] > start) & (df['End'] <= end)
    df.loc[reaches_in, 'End'] = start

    # clip start
    reaches_out = (df['Start'] < end) & (df['Start'] >= start) & (df['End'] > end)
    df.loc[reaches_out, 'Start'] = end

    # clip start & end: a row covering the whole range becomes two rows
    spanning = df.loc[(df['Start'] < start) & (df['End'] > end)]
    if len(spanning) > 0:
        df.drop(index=spanning.index, inplace=True)
        # explicit copies so that neither part is written through a slice
        right_part = spanning.copy()
        left_part = spanning.copy()
        right_part['Start'] = end
        left_part['End'] = start
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        df = pd.concat([df, right_part, left_part], ignore_index=True)

    # throw away inside
    inside = (df['Start'] >= start) & (df['End'] <= end)
    return df[~inside]

def find_mask_ranges(projects: pd.DataFrame, index_df: pd.DataFrame, h_index: Tuple=['RoadID', 'Direction']):
    """Generate all intervals for which the condition data has no spatial index.

    Removing this set helps the results of all algorithms, as these stretches
    carry no information and cannot be predicted.

    Important: mask only indexes not available in both train AND valid set, as
    index in train set and valid set are evenly distributed and thus we expect
    clustering to do well.

    Parameters
    ----------
    projects : projects with 'Start', 'End' and the `h_index` columns.
    index_df : condition-data sections with the same columns.
    h_index : columns identifying a road. Their values are compared as
        strings, so both frames are expected to store them as `str`.

    Returns
    -------
    Frame with columns 'Start', 'End' and the `h_index` columns, one row per
    masked range with Start < End. Empty (but with those columns) when no road
    of `index_df` has any project.
    """
    h_index = list(h_index)
    # collect plain records and build the frame once instead of concatenating in the loop
    records = []

    for index in index_df[h_index].drop_duplicates().itertuples(index=False, name=None):
        index = [str(elem) for elem in index]
        filtered_index = index_df[(index_df[h_index].values == index).all(axis=1)]
        filtered_projects = projects[(projects[h_index].values == index).all(axis=1)]

        if len(filtered_projects) == 0:
            continue

        # condense all index sections of this road (merge_intervals sorts by start itself)
        merged = merge_intervals(list(filtered_index[['Start', 'End']].itertuples(index=False, name=None)))

        # inner gaps: ranges where the index is discontinuous and therefore no algorithm can do better
        for prev, curr in zip(merged, merged[1:]):
            assert curr[0] > prev[1]
            records.append((prev[1], curr[0], *index))

        # outer gaps (iff projects overshoot the index at either end; otherwise pruned below)
        records.append((filtered_projects['Start'].min(), filtered_index['Start'].min(), *index))
        records.append((filtered_index['End'].max(), filtered_projects['End'].max(), *index))

    # explicit columns keep the frame well-formed even when no record was collected
    disjoint = pd.DataFrame(records, columns=['Start', 'End', *h_index])
    disjoint = disjoint[disjoint['Start'] < disjoint['End']].reset_index(drop=True)
    return disjoint

## Method for making quantitative comparison dict

In [None]:
def calculate_compare_projects(
    actual_df: pd.DataFrame, 
    predicted_df: pd.DataFrame, 
    h_index: List[str]=['RoadID', 'Direction'],
):
    """Measure how well predicted project ranges cover the actual ones on one road.

    Rows with End <= Start are ignored and the remaining ranges of each frame
    are merged before they are compared.

    Returns a dict with
    * metr_true_area_total / metr_predicted_total: total merged length of each set
    * metr_matched_total: total length covered by both sets
    * metr_predicted_no_match_total / metr_true_no_prediction_total: the unmatched remainders
    * abs_true_matched_per_project, perc_true_matched_per_project, project_length:
      one entry per merged actual range
    * abs_predicted_matched_per_predicted_project,
      perc_predicted_matched_per_predicted_project: one entry per merged predicted range
    """
    check_inputted_frames(actual_df, predicted_df, h_index)

    def _merged_ranges(frame):
        # keep ranges of positive length only, then merge them
        positive = frame[frame['End'] > frame['Start']]
        return merge_intervals(list(positive[['Start', 'End']].itertuples(index=False, name=None)))

    actual_intervals = _merged_ranges(actual_df)
    predicted_intervals = _merged_ranges(predicted_df)

    # range -> list of the overlaps it takes part in
    actual_intersection : dict = {}
    predicted_intersection : dict = {}

    # two-pointer sweep over both sorted lists: always advance past the range that ends first
    a_pos, p_pos = 0, 0
    while a_pos < len(actual_intervals) and p_pos < len(predicted_intervals):
        actual = actual_intervals[a_pos]
        predicted = predicted_intervals[p_pos]
        a_start, a_end = actual
        p_start, p_end = predicted

        if p_start < a_end and p_end > a_start:
            overlap = (max(a_start, p_start), min(a_end, p_end))
            actual_intersection.setdefault((a_start, a_end), []).append(overlap)
            predicted_intersection.setdefault((p_start, p_end), []).append(overlap)

        if p_end <= a_end:
            p_pos += 1
        else:
            a_pos += 1

    # totals
    true_total = sum(hi - lo for lo, hi in actual_intervals)
    matched_total = sum(hi - lo for overlaps in actual_intersection.values() for lo, hi in overlaps)
    predicted_total = sum(hi - lo for lo, hi in predicted_intervals)
    result_dict = {
        'metr_true_area_total': true_total,
        'metr_matched_total': matched_total,
        'metr_predicted_total': predicted_total,
        'metr_predicted_no_match_total': predicted_total - matched_total,
        'metr_true_no_prediction_total': true_total - matched_total,
    }

    # per actual project: matched length, matched share and project length
    abs_true, perc_true, lengths = [], [], []
    for lo, hi in actual_intervals:
        matched = sum(o_hi - o_lo for o_lo, o_hi in actual_intersection[(lo, hi)]) if (lo, hi) in actual_intersection else 0
        abs_true.append(matched)
        perc_true.append(matched / (hi - lo))
        lengths.append(hi - lo)
    result_dict['abs_true_matched_per_project'] = abs_true
    result_dict['perc_true_matched_per_project'] = perc_true
    result_dict['project_length'] = lengths

    # per predicted project: matched length and matched share
    abs_predicted, perc_predicted = [], []
    for lo, hi in predicted_intervals:
        matched = sum(o_hi - o_lo for o_lo, o_hi in predicted_intersection[(lo, hi)]) if (lo, hi) in predicted_intersection else 0
        abs_predicted.append(matched)
        perc_predicted.append(matched / (hi - lo))
    result_dict['abs_predicted_matched_per_predicted_project'] = abs_predicted
    result_dict['perc_predicted_matched_per_predicted_project'] = perc_predicted

    return result_dict

In [None]:
from tqdm.notebook import tqdm

def make_quant_comparison(
    projects: pd.DataFrame, 
    clustered_projects: pd.DataFrame, 
    index_df: pd.DataFrame,
    prediction_cols: pd.Index, 
    h_index: Tuple=['RoadID', 'Direction'],
    mask: pd.DataFrame=None, 
    invalid_labels: set=set(),
):
    """
    Compare ground-truth projects against clustered (predicted) projects.

    For every spatial index (unique combination of the `h_index` columns in
    `index_df`) and every `(treatment_time, treatment)` pair in
    `prediction_cols`, the matching rows of `projects` and
    `clustered_projects` are passed to `calculate_compare_projects` and the
    returned metrics are collected.

    :params:
        projects: ground-truth projects; must hold the `h_index` columns, a
            'Treatment' column and one boolean column per treatment time.
        clustered_projects: predicted projects with the `h_index` columns plus
            'Treatment' and 'Treatment Time'. May be completely empty.
        index_df: frame whose unique `h_index` rows define what is evaluated.
        prediction_cols: iterable of `(treatment_time, treatment)` pairs.
        h_index: names of the hierarchical spatial index columns (any sequence).
        mask: optional frame with the `h_index` columns plus 'Start'/'End';
            every row is handed to `clip_mask` for both actual and predicted
            projects (presumably removing that range -- see `clip_mask`).
        invalid_labels: `(*index, treatment_time, treatment)` tuples to skip.

    :returns:
        `results[treatment][treatment_time]`: a DataFrame with one row of
        metrics per evaluated spatial index (column 'index' holds the index
        tuple). Combinations that do not occur in `prediction_cols` stay as
        empty lists.
    """
    # Column selection needs a list; a tuple would be read as a single key.
    h_index = list(h_index)

    # When nothing was predicted, `pd.DataFrame([])` has no columns at all and
    # every column lookup below would raise KeyError. Add the columns written
    # by the clustering step so the frame simply filters to "no predictions".
    if len(clustered_projects) == 0:
        expected_cols = [*h_index, 'Treatment', 'Treatment Time', 'Start', 'End']
        missing_cols = [col for col in expected_cols if col not in clustered_projects.columns]
        clustered_projects = clustered_projects.reindex(
            columns=[*clustered_projects.columns, *missing_cols]
        )

    results = {
        treatment: {
            treatment_time: [] for treatment_time in set(prediction_cols.get_level_values(0))
        } for treatment in set(prediction_cols.get_level_values(1))
    }

    index_lst = list(index_df[h_index].drop_duplicates().itertuples(index=False, name=None))
    use_mask = mask is not None and len(mask) > 0

    tqdm_index_lst = tqdm(desc='Index', total=len(index_lst), leave=False)
    tqdm_prediction = tqdm(desc='Prediction columns', total=len(prediction_cols), leave=False)

    # The spatial index is the outer loop so the (expensive) row selection per
    # index is done once instead of once per prediction column. Each result
    # list is still filled in `index_lst` order.
    for index in index_lst:
        tqdm_index_lst.update()
        tqdm_prediction.reset()

        projects_here = projects[(projects[h_index].values == index).all(axis=1)]
        clustered_here = clustered_projects[(clustered_projects[h_index].values == index).all(axis=1)]
        mask_here = mask[(mask[h_index].values == index).all(axis=1)] if use_mask else None

        for (treatment_time, treatment) in prediction_cols:
            tqdm_prediction.update()
            if (*index, treatment_time, treatment) in invalid_labels:
                continue

            actual_df = projects_here[
                (projects_here['Treatment'] == treatment) &
                (projects_here[treatment_time] == True)
            ].copy()
            predicted_df = clustered_here[
                (clustered_here['Treatment'] == treatment) &
                (clustered_here['Treatment Time'] == treatment_time)
            ].copy()

            # clip invalid regions: true treatments are ignored where labels
            # have no entries, since no algorithm could predict them anyway
            if mask_here is not None:
                for _, row in mask_here.iterrows():
                    actual_df = clip_mask(row['Start'], row['End'], actual_df)
                    predicted_df = clip_mask(row['Start'], row['End'], predicted_df)

            # nothing to compare against for this index
            if len(actual_df) == 0:
                continue

            r = calculate_compare_projects(
                actual_df=actual_df, predicted_df=predicted_df,
                h_index=h_index,
            )

            r['index'] = index
            results[treatment][treatment_time].append(r)

    tqdm_index_lst.close()
    tqdm_prediction.close()

    # One metrics table per evaluated (treatment, treatment time) pair.
    for (treatment_time, treatment) in prediction_cols:
        results[treatment][treatment_time] = pd.DataFrame(results[treatment][treatment_time])

    return results 

## Method plotting quantitative comparison

In [None]:
from collections import OrderedDict
from itertools import chain

# code from https://linuxtut.com/en/92c21048bacadce811ec/
def set_hierarchical_xlabels(index, ax=None,
                             bar_xmargin=0.1, # gap left at both ends of each group line, in x-axis units
                             bar_yinterval=0.1, # vertical spacing between label rows, as a fraction of the axes height
                            ):
    """
    Label the x axis of a categorical plot with a pandas MultiIndex.

    The innermost level becomes the tick labels. Each outer level is drawn
    below the axis as one centred text per run of consecutive equal entries,
    with a horizontal line spanning the ticks that belong to the run.

    :params:
        index: MultiIndex whose entries correspond to ticks 0, 1, 2, ...
        ax: axes to annotate; defaults to the current axes.
        bar_xmargin: amount by which each group line is shortened at both ends.
        bar_yinterval: distance between successive label rows (axes fraction).
    """
    from itertools import groupby
    from matplotlib.lines import Line2D

    ax = ax or plt.gca()

    assert isinstance(index, pd.MultiIndex)
    # innermost level -> ordinary, unrotated tick labels
    for tick_label in ax.set_xticklabels([entry[-1] for entry in index]):
        tick_label.set_rotation(0)

    # x in data coordinates, y in axes coordinates
    axis_transform = ax.get_xaxis_transform()

    for depth in range(1, len(index.codes)):
        level_values = index.levels[-depth - 1]
        line_y = bar_yinterval * -depth
        left_edge = -0.5 # left boundary of the current group
        # consecutive ticks sharing the same outer codes form one group
        for group_codes, members in groupby(zip(*index.codes[:-depth])):
            right_edge = left_edge + len(list(members)) # right boundary of the current group
            ax.text((left_edge+right_edge)/2, (bar_yinterval * (-depth-0.1)),
                    level_values[group_codes[-1]],
                    transform=axis_transform,
                    ha="center", va="top")
            ax.add_line(Line2D([left_edge+bar_xmargin, right_edge-bar_xmargin],
                               [line_y, line_y],
                               transform=axis_transform,
                               color="k", clip_on=False))
            left_edge = right_edge

def plot_quant_comparison(quant_results: dict, axs: List[plt.Axes]=None, ylim: tuple=None):
    """
    Draw one bar chart per project-level metric.

    :params:
        quant_results: `{treatment: {treatment_time: metrics}}` as returned by
            `make_quant_comparison`. Only 'Resurfacing_SS', 'Rehabilitation'
            and 'Resurfacing_AC' are plotted; 'Treatment between 10 to 30
            years' and empty entries are skipped.
        axs: one axes per metric; a new figure is created when omitted.
        ylim: optional y limits applied to every axes.

    :returns:
        `(axs, plot_df)`: the axes and the long-format table that was plotted
        (columns Treatment, Treatment Time, Metric, Value, Error).

    Relies on the notebook-level `year_order_dict`, `treatment_type_order`,
    `year_map_dict` and `treatment_type_colors`.
    """
    # Collect plain records and build the frame once: DataFrame.append is
    # quadratic in a loop and no longer exists in pandas >= 2.0.
    records = []

    for treatment, treatment_dict in quant_results.items():
        if treatment not in ['Resurfacing_SS', 'Rehabilitation', 'Resurfacing_AC']:
            continue
        for treatment_time, result_df in treatment_dict.items():
            if (treatment_time == 'Treatment between 10 to 30 years') or (len(result_df) == 0):
                continue

            common = {'Treatment': treatment, 'Treatment Time': treatment_time, 'Error': 0}
            matched_total = np.sum(result_df['metr_matched_total'])
            # per-project matched fractions, flattened over all spatial indices
            all_percs = np.array(list(chain.from_iterable(result_df['perc_true_matched_per_project'].to_list())))

            records.extend([
                # mean share of each actual project that was predicted
                {**common, 'Metric': 'Average percentage of actual project predicted',
                 'Value': np.mean(all_percs)},
                # share of actual projects with at least 30% of their length predicted
                {**common, 'Metric': 'Percentage of actual projects with more than 30% predicted in length',
                 'Value': np.sum(all_percs >= 0.3) / len(all_percs)},
                # total matched length over total length of all actual projects
                {**common, 'Metric': r'Recall by length ($\frac{overlap}{actual})$',
                 'Value': matched_total / np.sum(result_df['metr_true_area_total'])},
                # total matched length over total length of all predicted projects
                {**common, 'Metric': r'Precision by length ($\frac{overlap}{predicted}$)',
                 'Value': matched_total / np.sum(result_df['metr_predicted_total'])},
            ])

    plot_df = pd.DataFrame(records, columns=['Treatment', 'Treatment Time', 'Metric', 'Value', 'Error'])

    # order bars by the configured treatment / horizon order, then switch to display labels
    plot_df = plot_df.sort_values(by=['Treatment', 'Treatment Time'], 
        key=lambda x: x.replace(year_order_dict) if x.name == 'Treatment Time' else x.replace(treatment_type_order)
    )
    plot_df.loc[:, 'Treatment Time'] = plot_df['Treatment Time'].replace(year_map_dict)

    if axs is None:
        fig, axs = plt.subplots(nrows=plot_df['Metric'].nunique(), ncols=1, figsize=(12, 4 * plot_df['Metric'].nunique()))

    for i, metric in enumerate(plot_df['Metric'].unique()):
        inner_df = plot_df[plot_df['Metric'] == metric].set_index(['Treatment', 'Treatment Time'])
        inner_df.plot(
            kind='bar',
            y='Value',
            yerr='Error',
            ax=axs[i],
            xlabel=None,
            color=[treatment_type_colors[treatment] for treatment, _ in inner_df.index],
            alpha=0.6,
            legend=False,
        )
        set_hierarchical_xlabels(inner_df.index, ax=axs[i])
        axs[i].set_title(metric)
        axs[i].set_xlabel(None)
        axs[i].set_ylabel('Fraction')
        # fractions are shown on a common 0..1 scale unless the data exceeds it
        if axs[i].get_ylim()[1] < 1:
            axs[i].set_ylim((0, 1))
        axs[i].grid(True)

        if ylim is not None:
            # axes are addressed by position, not by metric name
            axs[i].set_ylim(ylim)

    # return the table built here (previously an unrelated notebook global)
    return axs, plot_df

# Define algorithms

We expect the validation set to occupy every second road section for any particular road ID. Thus, we fill the immediate predecessor and successor of each predicted treatment section with the same treatment.

## Algorithm 1: Linear scan

Given a hierarchical spatial index (RoadID and Direction), scan linearly along the continuous spatial index (chainage, as defined by Start and End) and group consecutive sections predicted for a specific treatment into projects.

This method merely uses the `merge_intervals` helper defined previously.

In [None]:
def linear_scan(sorted_index: pd.DataFrame, nudge_outer: bool=True, nudge_distance: float=100, **kwargs):
    """
    Turn predicted sections into elementary projects.

    :params:
        sorted_index: sections of one road / direction with 'Start' and 'End'
            columns, already sorted by 'Start'.
        nudge_outer: when True, widen every section by `nudge_distance` on
            both sides (the start is never moved below 0) before merging.
        nudge_distance: widening applied per side, in chainage units.
            Defaults to the 100 that was previously hard-coded.
        **kwargs: accepted and ignored, so both algorithms share a call shape.

    :returns:
        whatever `merge_intervals` returns for the (optionally widened)
        `(Start, End)` tuples -- presumably the merged intervals.
    """
    sorted_intervals = list(sorted_index[['Start', 'End']].itertuples(index=False, name=None))

    if nudge_outer:
        sorted_intervals = [
            (max(start - nudge_distance, 0), end + nudge_distance)
            for start, end in sorted_intervals
        ]
    return merge_intervals(sorted_intervals)

## Algorithm 2: Density Based Clustering

Use DBSCAN algorithm on all predicted sections (after merging consecutive runs where possible), with the 'distance' metric between any two predictions being the distance between their edges in meters.

Parameters:
-  `eps`, `min_samples`. Used in the DBSCAN algorithm as documented [here](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN). Intuitively, a clustered project greedily expands itself until no elementary project remains within `eps` meters of either edge.

In [None]:
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN

def cluster(sorted_index: pd.DataFrame, nudge_outer: bool=True, **kwargs):
    """
    Group elementary projects that lie close together into larger projects.

    Elementary projects come from `linear_scan`. DBSCAN is then run on a
    precomputed distance matrix in which the distance between two projects
    is the gap between their facing edges.

    :params:
        sorted_index: sections with 'Start' and 'End' columns, sorted by 'Start'.
        nudge_outer: forwarded to `linear_scan`.
        eps, min_samples: DBSCAN parameters, read from `kwargs`
            (defaults: 2000 and 1).

    :returns:
        result of `merge_intervals` applied to the (min Start, max End) extent
        of every cluster; points labelled as noise (-1) are dropped.
    """
    elementary_projects = linear_scan(sorted_index, nudge_outer)

    def edge_gap(first, second):
        # signed gap between two intervals, whichever order they are given in
        return max(second[0] - first[1], first[0] - second[1])

    n_jobs = 3 if len(elementary_projects) > 10000 else 1
    distance_mat = pairwise_distances(np.array(elementary_projects), metric=edge_gap, n_jobs=n_jobs)
    # the gap of an interval with itself is negative (minus its length), not 0
    np.fill_diagonal(distance_mat, 0)

    dbscan = DBSCAN(
        eps=kwargs.get('eps', 2000),
        min_samples=kwargs.get('min_samples', 1),
        metric='precomputed',
    )
    labels = dbscan.fit_predict(distance_mat)

    # one row per elementary project, tagged with its cluster
    projects = pd.DataFrame(elementary_projects, columns=['Start', 'End']).assign(cluster_labels=labels)

    # label -1 marks noise
    projects = projects.loc[projects['cluster_labels'] != -1]

    # a cluster spans from its smallest start to its largest end
    cluster_extents = projects.groupby(['cluster_labels']).agg({'Start': 'min', 'End': 'max'})
    sorted_intervals = list(cluster_extents[['Start', 'End']].itertuples(index=False, name=None))

    return merge_intervals(sorted_intervals)


# Perform algorithm

In [None]:
# Per-jurisdiction accumulators: clustering output goes into `result_dict`,
# quantitative comparison results into `plot_dict`.
result_dict = {
    juri: {'clustered_projects': []}
    for juri in ('WA', 'NZ', 'VIC')
}

plot_dict = {
    juri: dict()
    for juri in ('WA', 'NZ', 'VIC')
}

In [None]:
from tqdm.notebook import tqdm

# Cluster the section-level predictions of every jurisdiction into projects.
for juri in tqdm(['WA', 'NZ', 'VIC'], desc='Juri'):
    juri_inputs = input_dict[juri]
    juri_h_index = juri_inputs['h_index']
    juri_prediction_cols = juri_inputs['prediction_cols']

    # average the outputs of all models, then threshold at 0.5
    predictions = np.array([
        model.predict(juri_inputs['features']) for model in juri_inputs['models']
    ]).mean(axis=0)
    predictions = np.where(predictions >= 0.5, 1, 0)
    predictions = pd.DataFrame(predictions, columns=juri_prediction_cols)
    index_df = juri_inputs['index']

    # loop-invariant pieces: unique spatial indices and the raw index values
    unique_index = index_df[juri_h_index].drop_duplicates()
    index_values = index_df[juri_h_index].values

    # Collect into a fresh list and assign at the end, so re-running this cell
    # replaces the projects instead of appending duplicates to the old ones.
    clustered_rows = []
    for index_tup in tqdm(unique_index.itertuples(index=False, name=None), desc='Spatial Index', total=len(unique_index)):
        # sections on this road / direction; identical for every treatment
        at_index = np.all(index_values == index_tup, axis=1)

        for treatment_time, treatment in juri_prediction_cols:
            filtered_index = index_df[
                at_index & (predictions[(treatment_time, treatment)] == 1)
            ]
            # skip combinations without any predicted section
            if len(filtered_index) == 0:
                continue
            sorted_index = filtered_index.sort_values(['Start'])

            projects_intervals = cluster(sorted_index, nudge_outer=True)

            for start_chainage, end_chainage in projects_intervals:
                row = {
                    'Treatment': treatment, 
                    'Treatment Time': treatment_time, 
                    'Start': start_chainage,
                    'End': end_chainage,
                }
                row.update(zip(juri_h_index, index_tup))
                clustered_rows.append(row)

    result_dict[juri]['clustered_projects'] = clustered_rows

In [None]:
# Compute the mask for index and labels.
# The mask is built on train + validation index together: train sections are
# to be included in the final metrics without running predictions on them.
train_index_by_juri = {'WA': wa_train_index, 'NZ': nz_train_index, 'VIC': vic_train_index}

for juri in ['VIC', 'WA', 'NZ']:
    combined_index = pd.concat(
        [input_dict[juri]['index'], train_index_by_juri[juri]],
        ignore_index=True, axis=0,
    )
    result_dict[juri]['index_mask'] = find_mask_ranges(
        input_dict[juri]['projects'],
        combined_index,
        h_index=input_dict[juri]['h_index'],
    )

In [None]:
USE_MASK = True 

# Project-level comparison per jurisdiction; the index mask is applied only
# when USE_MASK is set.
for juri, val_dict in tqdm(result_dict.items(), desc='juri'):
    juri_inputs = input_dict[juri]
    juri_mask = val_dict['index_mask'] if USE_MASK else None
    results = make_quant_comparison(
        juri_inputs['projects'],
        pd.DataFrame(val_dict['clustered_projects']),
        juri_inputs['index'],
        juri_inputs['prediction_cols'],
        h_index=juri_inputs['h_index'],
        mask=juri_mask,
    )
    plot_dict[juri] = results

# Plot result

In [None]:
USE_MASK = True 

# Cache files: the comparison results are stored per mask setting, the
# clustering results are not.
plot_dict_path = REPORT_DIR / f'raw_plot_dict_{SUFFIX}_{"masked" if USE_MASK else "unmasked"}.pkl'
result_dict_path = REPORT_DIR / f'raw_result_dict_{SUFFIX}.pkl'

# write the current results ...
with open(plot_dict_path, 'wb') as f:
    pickle.dump(plot_dict, f)

with open(result_dict_path, 'wb') as f:
    pickle.dump(result_dict, f)

# ... and read them straight back (only files written by this notebook are loaded)
with open(plot_dict_path, 'rb') as f:
    plot_dict = pickle.load(f)

with open(result_dict_path, 'rb') as f:
    result_dict = pickle.load(f)

In [None]:
# One figure of project-level metrics per jurisdiction, saved next to the
# other cluster reports.
for juri, val_dict in tqdm(plot_dict.items()):
    axs, plot_dict_cumulative = plot_quant_comparison(val_dict)
    fig = axs[0].get_figure()

    # keep only the horizontal grid lines
    for ax in axs:
        ax.xaxis.grid(False)

    suffix_title = " ".join(map(str.capitalize, SUFFIX.split("_")))
    mask_tag = "masked" if USE_MASK else "unmasked"

    fig.suptitle(f'Performance on whole road network by clustering predictions on validation set of short sections.\n{juri} - {suffix_title}')
    # leave room for the suptitle
    fig.tight_layout(rect=[0, 0, 1, 0.98])
    fig.savefig(REPORT_DIR / f'{juri.lower()}_cluster_level_perf_{SUFFIX}_{mask_tag}.png', dpi=200)
    plt.show()

# Plot example 

In [None]:
# Jurisdiction and (treatment time, treatment) pair used for the example plot.
juri = 'WA'
treatment_tup = ('Treatment within 1 year', 'Resurfacing_SS')

# Average the outputs of all models and threshold the mean at 0.5.
ensemble_mean = np.mean(
    [model.predict(input_dict[juri]['features']) for model in input_dict[juri]['models']],
    axis=0,
)
juri_predictions = pd.DataFrame(
    np.where(ensemble_mean >= 0.5, 1, 0),
    columns=input_dict[juri]['prediction_cols'],
)
juri_index_df = input_dict[juri]['index']

In [None]:
# Masks for the example plot, built from the validation index only.
plot_mask = {}
for juri in ('WA', 'NZ'):
    juri_inputs = input_dict[juri]
    plot_mask[juri] = find_mask_ranges(
        juri_inputs['projects'],
        juri_inputs['index'],
        h_index=juri_inputs['h_index'],
    )

In [None]:
# Example figure: ground-truth vs clustered vs raw (unclustered) predictions
# for one WA road / direction. Relies on `treatment_tup`, `juri_predictions`
# and `juri_index_df` from the cells above.
juri = 'WA'
# NOTE(review): the next two lines look like leftover inspection code -- the
# filtered frame is neither displayed nor used further down in this cell.
x = plot_dict['WA']['Resurfacing_SS']['Treatment within 1 year']
x[x['index'] == ('H005', 'S')]
index_tup = ('H005', 'S')
h_index = input_dict[juri]['h_index']

axs, stacked_split = plot_compare_projects(
    # ground-truth projects on this road with the selected treatment and time
    actual_df=input_dict[juri]['projects'][
        np.all(input_dict[juri]['projects'][h_index].values == index_tup, axis=1) &\
        (input_dict[juri]['projects']['Treatment'] == treatment_tup[1]) &\
        (input_dict[juri]['projects'][treatment_tup[0]] == True)
    ],
    # clustered projects for the same road, treatment and time
    predicted_df=pd.DataFrame(result_dict[juri]['clustered_projects'])[
        np.all(pd.DataFrame(result_dict[juri]['clustered_projects'])[h_index].values == index_tup, axis=1) &\
        np.all(pd.DataFrame(result_dict[juri]['clustered_projects'])[['Treatment Time', 'Treatment']].values == treatment_tup, axis=1)
    ],
    # sections predicted positive before any clustering; the two boolean
    # Series are combined by index label -- assumes `juri_index_df` has a
    # default RangeIndex, TODO confirm
    unclustered_df=juri_index_df.loc[
        pd.Series(np.all(juri_index_df[h_index].values == index_tup, axis=1)) &\
        (juri_predictions[treatment_tup] == 1)
    ],
    h_index=h_index,
    # mask rows belonging to this road only
    index_mask=plot_mask[juri][
        np.all(plot_mask[juri][h_index].values == index_tup, axis=1)
    ]
) 

# Put a tick every `step` x-units and label the ticks 0, step, 2*step, ...,
# i.e. relative to the left edge of the plot rather than with the raw x value.
for ax in axs:
    step = 500
    xticks = np.arange(ax.get_xlim()[0], ax.get_xlim()[1], step=step)
    ax.set_xticks(xticks)
    ax.set_xticklabels([f'{step * x}' for x in range(len(xticks))])

fig = axs[0].get_figure()

fig.suptitle("Example comparison of clustered and ground truth projects")
fig.tight_layout()

fig.savefig(REPORT_DIR / f'example_{SUFFIX}.png', dpi=100)


# Compare recall and precision by length vs recall and precision by section prediction

In [None]:
# Load the validation confusion matrices and derive section-level metrics.
from src import DATA_DIR

suffix = SUFFIX 

report_dir = DATA_DIR.parent / 'reports' / 'raw_results'
model_dir = DATA_DIR.parent / 'models' / 'trained'

# Per jurisdiction: pickled confusion matrices and the label columns they refer to.
paths_dict = { 
    'conf_mat': {
        'WA': report_dir / 'MRWA' / f'mrwa_final_{suffix}_dir' / f'valid_XGB_rawconfmat_mrwa_final_{suffix}.pkl',
        'NZ': report_dir / 'NZTA' / f'nzta_final_{suffix}_dir' / f'valid_XGB_rawconfmat_nzta_final_{suffix}.pkl',
        'VIC': report_dir / 'VIC' / f'vic_final_{suffix}_dir' / f'valid_XGB_rawconfmat_vic_final_{suffix}.pkl',
    },
    'prediction_columns': {
        'WA': model_dir / 'MRWA' / f'mrwa_final_{suffix}_dir' / f'train_labels_columns_mrwa_final_{suffix}.pkl',
        'NZ': model_dir / 'NZTA' / f'nzta_final_{suffix}_dir' / f'train_labels_columns_nzta_final_{suffix}.pkl',
        'VIC': model_dir / 'VIC' / f'vic_final_{suffix}_dir' / f'train_labels_columns_vic_final_{suffix}.pkl',
    }
}

metric_dict = {}
for juri, running_conf_mat_path in paths_dict['conf_mat'].items():
    # only artefacts written by this project's own pipeline are unpickled here
    with open(paths_dict['prediction_columns'][juri], 'rb') as f:
        pred_cols = pickle.load(f)
    with open(running_conf_mat_path, 'rb') as f:
        running_conf_mat_arr = np.array(pickle.load(f))

    metric_dict[juri] = {}
    for i, (time_type, treatment_type) in enumerate(pred_cols):
        # 2x2 confusion matrices of label column i, one per entry along axis 0
        running_conf_mat = running_conf_mat_arr[:, i, :, :]

        # entry [1, 1] over the sum of column 1 / row 1 respectively
        # (assumes a [true, predicted] layout -- TODO confirm)
        hits = running_conf_mat[:, 1, 1]
        precision = hits / running_conf_mat[:, :, 1].sum(axis=1)
        recall = hits / running_conf_mat[:, 1, :].sum(axis=1)

        # nested {treatment: {time: {...}}} entry, created on first use
        scores = metric_dict[juri].setdefault(treatment_type, {}).setdefault(time_type, {})
        scores['Precision'] = np.mean(precision)
        scores['Recall'] = np.mean(recall)
        scores['F-Score'] = np.mean(2 / (1 / precision + 1 / recall))

In [None]:
def plot_compare_recall_precision_by_length_by_section(juri: str, project_plot_dict: dict, section_metric_dict: dict, axs=None, ylim=None, cmap=None):
    """
    Compare precision and recall by length vs precision and recall by section classification results.

    :params:
        juri: jurisdiction key into both dictionaries.
        project_plot_dict: `{juri: {treatment: {treatment_time: metrics}}}`
            holding the project-level (length-based) results.
        section_metric_dict: `{juri: {treatment: {treatment_time: {'Precision': ..., 'Recall': ...}}}}`
            holding the section-level classification metrics.
        axs: one axes per metric; a new figure is created when omitted.
        ylim: optional y limits applied to every axes.
        cmap: colormap for the bars (defaults to 'tab10').

    Only 'Resurfacing_SS', 'Resurfacing_AC' and 'Rehabilitation' are shown,
    and only for (treatment, treatment time) pairs that have both a
    project-level and a section-level value. Groups are ordered by the
    notebook-level `treatment_type_order` and `year_order_dict`; treatment
    times are displayed using `year_map_dict`.
    """
    plotted_treatments = ['Resurfacing_SS', 'Resurfacing_AC', 'Rehabilitation']

    plot_df = []
    for treatment, treatment_dict in project_plot_dict[juri].items():
        if treatment not in plotted_treatments:
            continue
        for treatment_time, result_df in treatment_dict.items():
            if (treatment_time == 'Treatment between 10 to 30 years') or (len(result_df) == 0):
                continue

            # total matched length over total length of all actual projects
            recall_row = {'Treatment': treatment, 'Treatment Time': treatment_time, 'Metric': 'Recall', 'By': 'project-level'}
            recall_row['Value'] = np.sum(result_df['metr_matched_total']) / np.sum(result_df['metr_true_area_total'])
            recall_row['Error'] = 0
            # total matched length over total length of all predicted projects
            precision_row = {'Treatment': treatment, 'Treatment Time': treatment_time, 'Metric': 'Precision', 'By': 'project-level'}
            precision_row['Value'] = np.sum(result_df['metr_matched_total']) / np.sum(result_df['metr_predicted_total'])
            precision_row['Error'] = 0

            plot_df.append(recall_row)
            plot_df.append(precision_row)
    
    for treatment, treatment_dict in section_metric_dict[juri].items():
        # Other treatments never have a project-level counterpart and would be
        # dropped below anyway; skipping them here keeps the sort keys uniform.
        if treatment not in plotted_treatments:
            continue
        for treatment_time, result_df in treatment_dict.items():
            if (treatment_time == 'Treatment between 10 to 30 years') or (len(result_df) == 0):
                continue

            # section-level recall, taken as stored
            recall_row = {'Treatment': treatment, 'Treatment Time': treatment_time, 'Metric': 'Recall', 'By': 'section-level'}
            recall_row['Value'] = result_df['Recall'] 
            recall_row['Error'] = 0

            # section-level precision, taken as stored
            precision_row = {'Treatment': treatment, 'Treatment Time': treatment_time, 'Metric': 'Precision', 'By': 'section-level'}
            precision_row['Value'] = result_df['Precision'] 
            precision_row['Error'] = 0

            plot_df.append(recall_row)
            plot_df.append(precision_row)

    def key(series: pd.Series):
        # treatments by treatment order, horizons by year order (these two
        # mappings were swapped before), project-level ahead of section-level
        if series.name == 'Treatment': return series.replace(treatment_type_order)
        if series.name == 'Treatment Time': return series.replace(year_order_dict)
        if series.name == 'By': return series.replace({'project-level': 0, 'section-level': 1})

    plot_df = pd.DataFrame(plot_df)
    # Sort on the raw treatment-time names first; `year_order_dict` is applied
    # to those names, so the display labels are substituted only afterwards.
    plot_df : pd.DataFrame = plot_df.sort_values(by=['Treatment', 'Treatment Time', 'By'], 
        key=key
    )
    plot_df.loc[:, 'Treatment Time'] = plot_df['Treatment Time'].replace(year_map_dict)

    # remove treatment, treatment_time if do not have both project-section level
    count_by = plot_df.groupby(['Treatment', 'Treatment Time', 'Metric'])['By'].transform('nunique')
    plot_df = plot_df[count_by > 1]

    if axs is None:
        fig, axs = plt.subplots(nrows=plot_df['Metric'].nunique(), ncols=1, figsize=(12, 5 * plot_df['Metric'].nunique()))

    for i, metric in enumerate(plot_df['Metric'].unique()):
        metric_df = plot_df.loc[plot_df['Metric'] == metric, :]
        # pivot() returns its index in sorted (alphabetical) order, which would
        # discard the ordering established above; restore it explicitly.
        ordered_groups = pd.MultiIndex.from_frame(
            metric_df[['Treatment', 'Treatment Time']].drop_duplicates()
        )
        inner_df = metric_df\
                .pivot(index=['Treatment', 'Treatment Time'], columns='By', values='Value')\
                .reindex(ordered_groups)
        inner_df.plot(
            kind='bar',
            ax=axs[i],
            xlabel=None,
            legend=False,
            cmap='tab10' if cmap is None else cmap
        )

        # two-level x labels: treatment time per tick, treatment per group
        set_hierarchical_xlabels(inner_df.index, ax=axs[i])
        axs[i].set_title((f'Jurisdiction: {juri}\nMethod: {" ".join(map(str.capitalize, SUFFIX.split("_")))}\n\n' if i == 0 else '') + metric + " by project level (using length) v.s. by section-level (using prediction)", loc='left')
        axs[i].set_xlabel(None)
        axs[i].set_ylabel('Fraction')
        axs[i].legend(loc='upper left')
        # fractions are shown on a common 0..1 scale unless the data exceeds it
        if axs[i].get_ylim()[1] < 1:
            axs[i].set_ylim((0, 1))

        if ylim is not None:
            # axes are addressed by position, not by metric name
            axs[i].set_ylim(ylim)
    
    plt.tight_layout()

In [None]:
colors = mpl_colors.ListedColormap(['#00729B', '#004259', '#002D3D', '#000F14'], name='austroads_cmap') # color correspond to flag index

# One comparison figure per jurisdiction, each saved under its own file name.
for juri, file_template in [
    ('NZ', 'NZ_compare_by_length_by_prediction_{}.png'),
    ('WA', 'WA_compare_by_length_by_prediction_{}.png'),
    ('VIC', 'VIC_compare_by_length_by_prediction_{}.png'),
]:
    plot_compare_recall_precision_by_length_by_section(juri, plot_dict, metric_dict, cmap=colors)
    plt.tight_layout()
    plt.savefig(REPORT_DIR / file_template.format(SUFFIX))