<div>
<img src='../../img/WSP_red.png' style='height: 95px; float: left' alt='WSP Logo'/>
<img src='../../img/austroads.png' style='height: 115px; float: right' alt='Client Logo'/>
</div>
<center><h2>AAM6201 Development of Machine-Learning Decision-Support tools for Pavement Asset Management<br>Case Study 1: Project Identification</h2></center>


In [None]:
# magic command to autoreload changes in src
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from src.nsw_configs.final_config import DATA_DIR
from tqdm.notebook import tqdm
import src.util as util
import src.features.preprocessing as preprocessing 

from src.nsw_configs.final_config import CONFIG, FeatureAdder
import src.nsw_configs.final_config as config
from IPython.display import display

# Data Preprocessing

This notebook preprocesses a dataset through the following steps, in order:

- Applying transformations
- Adding new columns
- Filtering the dataframe
- Dropping unused columns

Each step is carried out as detailed in the configuration.

In [None]:
# Load each configured source table, in the same order as the names on the left.
(
    cracking,
    dtims_in,
    dtims_out,
    profile,
    deflection,
) = map(
    util.load_data,
    (
        config.crack_data,
        config.dtims_in_data,
        config.dtims_out_data,
        config.profile_data,
        config.deflection_data,
    ),
)

In [None]:
class FeatureEncodeImputeNormalizeContainer:
    """
    Container for imputation, categorical encoding, and scaling operations, in that order.

    The imputers, the categorical encoding and the scaler are fitted the first time
    the container is called and are kept on the instance, so later calls (and code
    that reads the attributes afterwards) reuse the same fitted state.
    """

    def __init__(self):
        self.feature_encoding = None # remember feature encoding for future
        self.date_encoding = None # remember date encoding (never assigned by __call__)
        self.feature_scaling = None # remember feature scaling for future
        self.imputer_dict = {} # dictionary between columns and its imputer

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Impute, encode and scale ``df``, fitting whatever state is still missing.

        Args:
            df: dataframe to preprocess.

        Returns:
            The dataframe produced by the final ``preprocessing.scale`` step.

        Raises:
            KeyError: if fitting the scaler raises a ``KeyError``; it is re-raised
                with a message naming ``CONFIG['target']`` and chained to the
                original error.
        """
        # Perform imputation.
        # The group-by imputation is not cached: it runs on every call when configured.
        if CONFIG['preprocessing']['imputing']['groupby_first']['feature_list']:
            df = preprocessing.groupby_impute(df, CONFIG)

        # Fit the per-column imputers only once; an empty dict means "not fitted yet".
        if not self.imputer_dict:
            self.imputer_dict = preprocessing.fit_imputer(df, CONFIG)
        imputed_df = preprocessing.impute(self.imputer_dict, df)

        # encoding must be done after imputation, otherwise NA value is treated as a unique category unintentionally
        # Perform categorical encoding on specified variables
        if self.feature_encoding is None:
            self.feature_encoding = preprocessing.get_categorical_encoding(imputed_df, CONFIG)
        encoded_df = preprocessing.encode_categorical_features(imputed_df, CONFIG, self.feature_encoding)

        # Perform scaling
        if self.feature_scaling is None:
            try:
                self.feature_scaling = preprocessing.fit_scaler(encoded_df, CONFIG) # TODO: now we need to remember the scaler hasn't been fitted on CONFIG['target']. Is this good?
            except KeyError as err:
                # Chain the original error so the real missing key stays visible in the traceback.
                raise KeyError(f"Target column {CONFIG['target']} is not in the dataframe's columns!") from err
        encoded_df = preprocessing.scale(encoded_df, self.feature_scaling, CONFIG)

        return encoded_df

In [None]:
# perform filtering on samples by thresholding against features 
class SampleFilterByFeatureThresholdContainer:
    """
    Row filter driven by ``CONFIG['preprocessing']['filtering']``.

    Each entry of that mapping pairs a column name with a callable. The callable is
    given that column and its result is used to index the dataframe, so only the
    selected rows remain. The filters are applied one after another, and the
    remaining rows keep their original index labels.
    """
    def __call__(self, df: pd.DataFrame):
        rules = CONFIG['preprocessing']['filtering']
        for column_name in rules:
            # Evaluate the rule on the rows that survived the previous rules.
            keep_rows = rules[column_name](df[column_name])
            df = df[keep_rows]
        return df

In [None]:
# drop features
# TODO: Generalise this to accept configuration file
class FeatureRemovalContainer:
    """
    Column selector driven by ``CONFIG['preprocessing']['feature_removal']``.

    When the ``drop`` setting is truthy, the columns named in ``feature_list`` are
    removed; otherwise only those columns are kept (returned as a copy).
    """
    def __call__(self, df: pd.DataFrame):
        listed_columns = CONFIG['preprocessing']['feature_removal']['feature_list']
        should_drop = CONFIG['preprocessing']['feature_removal']['drop']
        if not should_drop:
            # keep-mode: retain only the listed columns
            return df[listed_columns].copy()
        # drop-mode: discard the listed columns
        return df.drop(columns=listed_columns)

In [None]:
# Candidate columns to drop up front (disabled; kept for reference):
# drop_cols = "AUSLINK_NETWORK DI_CLIMATE_ZONE DI_D0_STDEV DI_D1500 DI_D900 DI_DEFECT DI_IRI LGA_2 LINK_NO DI_NRM DI_PATCHES I_SPEED_TRUCK DI_SHLDWIDTH_LEFT DI_SHLDWIDTH_RIGHT I_AC_AGG_SIZE I_AC_BINDER I_AC_SPECIAL_TREATMENT I_AC_TYPE I_CONSISTENT_SURFACE_TYPE I_SS_CAT I_SS_COAT surf_function".split()
# With an empty list no column is removed; `df` starts as the full `dtims_in` table.
drop_cols = []
df = dtims_in.drop(columns=drop_cols)

In [None]:
# Derive both age columns in a single assign; each is 2020 minus the recorded year.
df = df.assign(
    age=2020 - df["DI_YEAR_CONSTRUCTION"],
    age_surface=2020 - df["DI_YEAR_SURF"],
)
# Rows without a value in COM_TRT get the explicit label "NoTreatment".
df["COM_TRT"] = df["COM_TRT"].fillna("NoTreatment")

In [None]:
# Run the FeatureAdder imported from the config module on the dataframe
# (presumably it adds the configured derived columns; see src.nsw_configs.final_config),
# then render the result for inspection.
df = FeatureAdder()(df)
display(df)

In [None]:
# Initialise class containers
feature_preprocess = FeatureEncodeImputeNormalizeContainer()
sample_fitler = SampleFilterByFeatureThresholdContainer()  # NOTE(review): "fitler" is a typo of "filter"; name left unchanged
feature_removal = FeatureRemovalContainer()

# Sequential processing.
# Rows are filtered first and columns removed second, so the imputers, encoding and
# scaler held by `feature_preprocess` are fitted only on the data that is kept.
filtered_df = sample_fitler(df)
col_filtered_df = feature_removal(filtered_df)
complete_df = feature_preprocess(col_filtered_df)

display(complete_df)

# Saving completed dataset
# The keyword arguments come straight from CONFIG['preprocessing']['save_complete'].
util.save_complete_data(complete_df, **CONFIG['preprocessing']['save_complete'])

# Saving preprocessing states for use on validation datasets
# These are the objects fitted by the `feature_preprocess` call above.
state_dict = {
    'config': CONFIG,
    'feature_encoder': feature_preprocess.feature_encoding,
    'scaler': feature_preprocess.feature_scaling,
    'imputer_dict': feature_preprocess.imputer_dict
}
util.pickle_data(state_dict, CONFIG['preprocessing']['state_save_path'], 'preprocessing_state_dict.sav')

In [None]:
# `projects` is a second name for the `dtims_out` dataframe, not a copy: columns
# added to `projects` in later cells are added to `dtims_out` as well.
projects = dtims_out
projects.head()

In [None]:
# Parse the year in `n_year` into a datetime column; only the year is supplied, so
# month and day presumably default to 1 January (confirm against the data).
projects["Treatment Date"] = pd.to_datetime(projects["n_year"], format="%Y")

In [None]:
# Load the treatment-category reference table from the "references" folder that
# sits beside DATA_DIR, and keep only the rows for the NSW jurisdiction.
treatment_lookup = util.load_data(DATA_DIR.parent / "references" / "TreatmentCategory.csv")
treatment_lookup = treatment_lookup[treatment_lookup["Jurisdiction"] == "NSW"]
# Bare expression: shows the filtered table as the cell output.
treatment_lookup

In [None]:
# Keep only projects that have a value in every field required below.
cleaned_projects = projects.dropna(
    subset=["road", "ElementID_subseg_no", "n_year", "Trt", "Length_km", "program", "ElementID"]
)

# Restrict to rows from the "dTIMS" program. The explicit copy makes the column
# assignment below operate on an independent frame rather than on a slice.
cleaned_projects = cleaned_projects[cleaned_projects["program"] == "dTIMS"].copy()

# Translate the specific treatment values into their generic categories.
specific_to_generic = dict(
    zip(treatment_lookup["Specific Category Value"], treatment_lookup["Generic Category"])
)
cleaned_projects["Treatment Category"] = cleaned_projects["Trt"].replace(specific_to_generic)

# Keep rows whose category is a known generic category and is not the explicit
# "drop" marker. Boolean masks are used instead of drop(index=...) so that rows are
# selected individually even if index labels are not unique.
is_known_category = cleaned_projects["Treatment Category"].isin(treatment_lookup["Generic Category"])
is_marked_drop = cleaned_projects["Treatment Category"] == "drop"
cleaned_projects = cleaned_projects[is_known_category & ~is_marked_drop]

# Reduce to the join key, the treatment date and the category.
cleaned_projects = cleaned_projects[["ElementID", "Treatment Date", "Treatment Category"]]
cleaned_projects = cleaned_projects.rename(columns={"ElementID": "Road_Number"})
# Shape before de-duplication (the only `old_shape` value that outlives this cell).
old_shape = cleaned_projects.shape
cleaned_projects = cleaned_projects.drop_duplicates()


In [None]:
# Spot-check five randomly chosen rows of the cleaned project table.
cleaned_projects.sample(5)

In [None]:
# Give the condition data the same key column name ("Road_Number") that
# `cleaned_projects` uses, so the two tables can be matched on it.
complete_df = complete_df.rename(columns={"ELEMENTID": "Road_Number"})
# `cleaned_df` is a second name for `complete_df`, not a copy.
cleaned_df = complete_df
cleaned_df.sample(5)

In [None]:
def make_label_mat(grouped_labels: pd.DataFrame, treatments: list, latest_condition_date) -> pd.DataFrame:
    """
    Build a 0/1 matrix of time windows (rows) by treatment categories (columns).

    A cell is 1 when at least one row of ``grouped_labels`` with that treatment
    category has a 'Treatment Date' whose offset from ``latest_condition_date``,
    measured in years, falls inside the row's window.

    Args:
        grouped_labels: rows with 'Treatment Category' and 'Treatment Date' columns.
        treatments: treatment categories; one output column per entry, in this order.
        latest_condition_date: reference date the offsets are measured from.

    Returns:
        A dataframe with five rows (one per time window) and one column per
        treatment, holding 0 or 1. All zeros when ``grouped_labels`` is empty.
    """
    # (row label, exclusive lower bound in years, inclusive upper bound in years).
    # The first window has no lower bound, so dates before the reference date
    # (negative offsets) also count as "within 1 year".
    horizons = [
        ('Treatment within 1 year', None, 1),
        ('Treatment between 1 to 3 years', 1, 3),
        ('Treatment between 3 to 5 years', 3, 5),
        ('Treatment between 5 to 10 years', 5, 10),
        ('Treatment between 10 to 30 years', 10, 30),
    ]
    label_mat = pd.DataFrame(0, index=[name for name, _, _ in horizons], columns=treatments)
    if len(grouped_labels) == 0:
        return label_mat

    # A calendar year is not a fixed duration, so numpy's 'Y' timedelta unit cannot
    # be used as a divisor for nanosecond timedeltas. Use the mean Gregorian year
    # (365.2425 days), which is the length numpy historically assigned to one 'Y'.
    one_year = pd.Timedelta(days=365.2425)

    for i, treatment in enumerate(treatments):
        category_labels = grouped_labels[grouped_labels['Treatment Category'] == treatment]
        if len(category_labels) == 0:
            continue

        year_offset = (category_labels['Treatment Date'] - latest_condition_date) / one_year

        for row, (_, lower, upper) in enumerate(horizons):
            in_window = year_offset <= upper
            if lower is not None:
                in_window = in_window & (year_offset > lower)
            if in_window.any():
                label_mat.iloc[row, i] = 1

    return label_mat

In [None]:

treatments = cleaned_projects['Treatment Category'].unique()
# Earliest planned treatment date; used as the reference date for every label matrix.
min_date_planned = cleaned_projects['Treatment Date'].min()

flattened_projects = []
# Not used in this cell; kept so the notebook namespace is unchanged.
flattened_idx = []
discarded_count = 0

# Group the projects by road once, instead of scanning the whole project table for
# every condition row.
projects_by_road = {road: rows for road, rows in cleaned_projects.groupby('Road_Number')}
no_projects = cleaned_projects.iloc[0:0]  # empty frame with the same columns

# Iterate the key column directly: only the road id of each row is needed, and this
# does not depend on the index labels being unique.
for road_id in tqdm(cleaned_df["Road_Number"], desc="index"):
    # find all projects in that section
    labels = projects_by_road.get(road_id, no_projects)

    # flatten label groups: one row whose columns are (time window, treatment) pairs
    label_mat = make_label_mat(labels, treatments, min_date_planned)
    label_mat = pd.melt(label_mat.reset_index(), id_vars='index').rename(columns={
        'index': 'key_type',
        'variable': 'Treatment Category',
        'value': 'boolean'
    }).set_index(['key_type', 'Treatment Category']).transpose()
    # 1 when no window/treatment cell is set for this row, otherwise 0
    label_mat['no_project_flag'] = 0 if (label_mat.values != 0).any() else 1

    # append labels
    flattened_projects.append(label_mat)

# Training features are the condition data without the join key.
flattened_data = cleaned_df.drop(columns="Road_Number")

In [None]:
# Features: `flattened_data` (the condition data without "Road_Number") wrapped in a DataFrame.
train_df = pd.DataFrame(flattened_data)
# Labels: the one-row label frames stacked vertically, in the order they were appended.
train_labels = pd.concat(flattened_projects, axis=0)

In [None]:
# Boolean mask (on a fresh 0..n-1 index) marking label rows whose "no_project_flag" is 1.
# NOTE(review): not referenced again in the cells visible here.
no_project_mask = train_labels.reset_index()["no_project_flag"] == 1

In [None]:
from pathlib import Path

# Output folder for the processed dataset.
save_dir = Path(DATA_DIR) / "processed" / "NSW" / "final"
# Create the folder (and any missing parents); exist_ok makes the cell safe to
# re-run and removes the gap between checking for the folder and creating it.
save_dir.mkdir(parents=True, exist_ok=True)

# All four tables are written without their index.
train_df.to_csv(save_dir / "train_all.csv", index=False)
# NOTE(review): train_labels has two-level column labels (from the melt/transpose
# step), so this CSV presumably gets a multi-row header - confirm readers expect it.
train_labels.to_csv(save_dir / "labels_all.csv", index=False)
cleaned_df.to_csv(save_dir / "cleaned_condition_data.csv", index=False)
cleaned_projects.to_csv(save_dir / "cleaned_projects.csv", index=False)