<div>
<img src='../../img/WSP_red.png' style='height: 95px; float: left' alt='WSP Logo'/>
<img src='../../img/austroads.png' style='height: 115px; float: right' alt='Client Logo'/>
</div>
<center><h2>AAM6201 Development of Machine-Learning Decision-Support tools for Pavement Asset Management<br>Case Study 1: Project Identification</h2></center>


In [None]:
# magic command to autoreload changes in src
%load_ext autoreload
%autoreload 2

import pandas as pd
import src.util as util
import src.features.preprocessing as preprocessing 
import numpy as np

from pathlib import Path

# Data Preprocessing

This notebook preprocesses a dataset through the following steps in order:

- Applying transformations
- Adding new columns
- Filtering the dataframe
- Dropping unused columns

Each step is applied as specified in the configuration (`CONFIG` from `src.nzta_configs.final_config`).

In [None]:
# load data
from src.nzta_configs.final_config import DATA_DIR
NZTA_DATADIR = Path(DATA_DIR) / 'interim' / 'NZTA'

DATASET = 'NZTA'
experiment_suffix = 'nzta_final'

# Raw per-table CSV exports, all read from the interim NZTA directory.
rutting_skid = pd.read_csv(NZTA_DATADIR / '100m_Rutting_Skid_new.csv')
structure = pd.read_csv(NZTA_DATADIR / '100m_Surface_Structure_new.csv')
carriageway = pd.read_csv(NZTA_DATADIR / '100m_Carriageway_new.csv')
pavement_layer = pd.read_csv(NZTA_DATADIR / '100m_Pavement_Layer_new.csv')
traffic = pd.read_csv(NZTA_DATADIR / '100m_Traffic_new.csv')
deflection_crack = pd.read_csv(NZTA_DATADIR / '100m_Deflection_Cracking_new.csv')

# Rutting/skid: harmonise the key and date column names, then parse the date.
# The cast names an explicit unit because pandas >= 2.0 rejects the unit-less
# `np.datetime64` dtype. Plain column assignment replaces the column so it takes
# the datetime dtype, whereas `.loc[:, col] = ...` may write into the existing
# column and keep its old dtype.
cleaned_rutting = rutting_skid.rename(columns={
    'Road Name': 'RoadID',
    'Year': 'Date of condition data'
})
cleaned_rutting['Date of condition data'] = cleaned_rutting['Date of condition data'].astype('datetime64[ns]')

cleaned_carriageway = carriageway.rename(columns={
    'Road': 'RoadID'
})

# Surface structure: harmonise names, parse the surfacing date, and keep only
# surfacings dated before 1 Jan 2020 (rows with a missing date fail the comparison).
cleaned_structure = structure.rename(columns={
    'Road ID': 'RoadID',
    'Age': 'Surface age',
})
cleaned_structure['Surfacing Date'] = cleaned_structure['Surfacing Date'].astype('datetime64[ns]')
cleaned_structure = cleaned_structure[cleaned_structure['Surfacing Date'] < pd.to_datetime('2020', format='%Y')]

cleaned_pavement = pavement_layer.rename(columns={
    'Road': 'RoadID',
    'Age': 'Pavement age',
})

# Deflection/cracking is used as loaded (same object, no renaming).
cleaned_def_crack = deflection_crack

In [None]:
# Make the traffic table "yearless": keep a single row per (Road, Start) segment,
# namely the one with the earliest 'Count Date', then discard the date column.
# The result is derived from `traffic` without mutating it, so the raw frame stays
# as loaded and this cell can be re-run safely. The datetime cast names an explicit
# unit because pandas >= 2.0 rejects the unit-less `np.datetime64` dtype.
cleaned_traffic = (
    traffic
    .assign(**{'Count Date': traffic['Count Date'].astype('datetime64[ns]')})
    .sort_values(by='Count Date')
    .drop_duplicates(subset=['Road', 'Start'], keep='first')
    .sort_values(by=['Road', 'Start'])
    .drop(columns=['Count Date'])
    .rename(columns={'Road': 'RoadID'})
)

# Merge all tables

In [None]:
# Tables that carry a date column (condition survey date / surfacing date).
cleaned_year_dfs = [cleaned_rutting, cleaned_structure]
# Tables treated as yearless: they are merged on the segment keys only.
cleaned_yearless_dfs = [cleaned_pavement, cleaned_carriageway, cleaned_traffic, cleaned_def_crack]
# Every cleaned table: the dated ones first, followed by the yearless ones.
cleaned_dfs = cleaned_year_dfs + cleaned_yearless_dfs

## Merge yearless dfs

In [None]:
# Coverage diagnostics for the yearless tables: which roads and which
# (RoadID, Start) segments occur in at least one table versus in every table.
# These sets are not referenced by the later cells of this notebook.
road_sets = [set(frame['RoadID']) for frame in cleaned_yearless_dfs]
segment_sets = [
    set(frame[['RoadID', 'Start']].itertuples(index=False, name=None))
    for frame in cleaned_yearless_dfs
]

all_roads = set.union(*road_sets)
shared_roads = set.intersection(*road_sets)
all_indices = set.union(*segment_sets)
shared_indices = set.intersection(*segment_sets)


In [None]:
from functools import reduce, partial

# Key every yearless table by (RoadID, Start), discarding rows that lack either key,
# then outer-join the tables pairwise on that index. `validate='1:1'` makes pandas
# raise if a key is duplicated on either side of a join.
index = ['RoadID', 'Start']
indexed_yearless_dfs = [
    frame.dropna(subset=index, how='any').set_index(index)
    for frame in cleaned_yearless_dfs
]
outer_join_on_index = partial(
    pd.merge, how='outer', left_index=True, right_index=True, validate='1:1'
)
cleaned_yearless = reduce(outer_join_on_index, indexed_yearless_dfs).reset_index()

In [None]:
# Pair every rutting/skid survey row with every surfacing record of the same
# (RoadID, Start) segment; the outer join also keeps rows without a partner.
# NOTE(review): the left frame is the rutting/skid table, yet its clashing columns
# receive the '_traffic' suffix. Renaming it would change downstream column names,
# so confirm against the config before touching it.
merged_rutting_structure = pd.merge(
    cleaned_rutting,
    cleaned_structure,
    on=['RoadID', 'Start'],
    how='outer',
    suffixes=['_traffic', '_structure'],
    validate='m:m',
)

In [None]:
# Keep only the survey/surfacing pairs in which the surfacing pre-dates the survey.
surfaced_before_survey = (
    merged_rutting_structure['Date of condition data']
    > merged_rutting_structure['Surfacing Date']
)
cleaned_rutting_structure = merged_rutting_structure[surfaced_before_survey]

# For each survey row (segment + survey date) keep the pair with the latest
# surfacing date: sort newest-first, then keep the first row per survey key.
survey_key = ['RoadID', 'Start', 'Date of condition data']
cleaned_rutting_structure = (
    cleaned_rutting_structure
    .sort_values(by='Surfacing Date', ascending=False)
    .drop_duplicates(subset=survey_key, keep='first')
)

In [None]:
# Attach the yearless segment attributes to each survey row; the left join keeps
# every survey row even when a segment has no yearless data.
cleaned_merge_all = pd.merge(
    cleaned_rutting_structure,
    cleaned_yearless,
    how='left',
    on=['RoadID', 'Start'],
)

In [None]:
from src.nzta_configs.final_config import CONFIG, FeatureAdder
from IPython.display import display
# Run the project's FeatureAdder over the merged table and show the result.
feature_adder = FeatureAdder()
cleaned_added_df = feature_adder(cleaned_merge_all)
display(cleaned_added_df)

In [None]:
# Persist the merged table (without the index column) next to the raw inputs.
merged_csv_path = NZTA_DATADIR / '100m_Merged_new.csv'
cleaned_added_df.to_csv(merged_csv_path, index=False)

# Normalising, Imputation, and Encoding Boilerplate Code

In [None]:
class FeatureEncodeImputeNormalizeContainer:
    """
    Container for imputation, categorical encoding, and scaling, applied in that order.

    The imputers, the categorical encoding and the scaler are fitted the first time
    the container is called and stored on the instance; later calls reuse them.
    """

    def __init__(self):
        self.feature_encoding = None # remember feature encoding for future
        self.date_encoding = None # remember date encoding (never assigned by this class)
        self.feature_scaling = None # remember feature scaling for future
        self.imputer_dict = {} # dictionary between columns and its imputer

    def __call__(self, df: pd.DataFrame, config: dict) -> pd.DataFrame:
        """
        Impute, encode and scale `df` according to `config`.

        Args:
            df: dataframe to preprocess.
            config: preprocessing configuration; this method reads
                config['preprocessing']['imputing']['groupby_first']['feature_list']
                and config['target'], and passes the whole dict to the
                `preprocessing` helpers.

        Returns:
            The imputed, encoded and scaled dataframe.

        Raises:
            KeyError: if fitting the scaler raises a KeyError; the original error
                is attached as the cause.
        """
        # Optional group-wise imputation, only when features are configured for it.
        if config['preprocessing']['imputing']['groupby_first']['feature_list']:
            df = preprocessing.groupby_impute(df, config)

        # Fit the imputers only while none are stored yet.
        if len(self.imputer_dict) == 0:
            self.imputer_dict = preprocessing.fit_imputer(df, config)
        imputed_df = preprocessing.impute(self.imputer_dict, df)

        # encoding must be done after imputation, otherwise NA value is treated as a unique category unintentionally
        # Perform categorical encoding on specified variables
        if self.feature_encoding is None:
            self.feature_encoding = preprocessing.get_categorical_encoding(imputed_df, config)
        encoded_df = preprocessing.encode_categorical_features(imputed_df, config, self.feature_encoding)

        # Perform scaling
        if self.feature_scaling is None:
            try:
                self.feature_scaling = preprocessing.fit_scaler(encoded_df, config) # TODO: now we need to remember the scaler hasn't been fitted on config['target']. Is this good?
            except KeyError as err:
                # Chain explicitly so the key that was actually missing stays visible in the traceback.
                raise KeyError(f"Target column {config['target']} is not in the dataframe's columns!") from err
        scaled_df = preprocessing.scale(encoded_df, self.feature_scaling, config)

        return scaled_df

In [None]:
# perform filtering on samples by thresholding against features 
class SampleFilterByFeatureThresholdContainer:
    """
    Container for filtering operations on the dataset to remove unwanted rows.

    `config['preprocessing']['filtering']` maps a column name to a callable. Each
    callable receives that column and is expected to return a boolean mask; rows
    for which the mask is False are dropped. Filters are applied one after another,
    so each one only sees rows that survived the previous filters.
    The index is not changed, however.
    """
    def __call__(self, df: pd.DataFrame, config: dict) -> pd.DataFrame:
        for col, key_fn in config['preprocessing']['filtering'].items():
            # keep only the rows accepted by this column's filter
            df = df[key_fn(df[col])]
        return df

In [None]:
# drop features
class FeatureRemovalContainer:
    """
    Container that removes unwanted features (columns) as configured.

    Reads `feature_list` and `drop` from config['preprocessing']['feature_removal']:
    when `drop` is truthy the listed columns are removed, otherwise only the listed
    columns are kept (as a copy).
    """
    def __call__(self, df: pd.DataFrame, config: dict):
        removal_settings = config['preprocessing']['feature_removal']
        listed_columns = removal_settings['feature_list']
        drop_listed = removal_settings['drop']

        # keep-mode: return just the listed columns, detached from the input frame
        if not drop_listed:
            return df[listed_columns].copy()

        # drop-mode: return everything except the listed columns
        return df.drop(columns=listed_columns)

# Split the data into training and validation sets

In [None]:
# Split by road segment rather than by row, so that all rows of a given
# (RoadID, Start) segment land on the same side of the split.
SPLIT_SEED = 42       # fixed seed so the split is reproducible across kernel restarts
TRAIN_FRACTION = 0.8  # share of segments assigned to the training set

grouped_index = cleaned_added_df.groupby(['RoadID', 'Start'])
split_rng = np.random.default_rng(SPLIT_SEED)
train_groups = split_rng.choice(
    grouped_index.ngroups,
    size=int(TRAIN_FRACTION * grouped_index.ngroups),
    replace=False,
)

# One boolean mask drives both halves: the validation set is exactly the complement
# of the training set, rows keep their original order, and the split does not
# depend on the index labels being unique.
in_train = grouped_index.ngroup().isin(train_groups)
train_df = cleaned_added_df[in_train]
valid_df = cleaned_added_df[~in_train]

# Preprocessing code

In [None]:
from src.nzta_configs.final_config import CONFIG, FeatureAdder

# Initialise class containers
feature_preprocess = FeatureEncodeImputeNormalizeContainer()
sample_fitler = SampleFilterByFeatureThresholdContainer()  # (sic) spelling kept: later cells call `sample_fitler`
feature_removal = FeatureRemovalContainer()

# Sequential processing: filter rows, select columns, then impute/encode/scale.
# This call fits the imputers, encoding and scaler on the training data.
filtered_df = sample_fitler(train_df, CONFIG)
col_filtered_df = feature_removal(filtered_df, CONFIG)
complete_df = feature_preprocess(col_filtered_df, CONFIG)

display(complete_df)

# Shared pieces of the output file names; wrapping DATA_DIR in Path matches how the
# load cell builds NZTA_DATADIR and also works if DATA_DIR is a plain string.
suffix_part = f'_{experiment_suffix}' if experiment_suffix else ''
interim_dir = Path(DATA_DIR) / 'interim' / DATASET

# Saving completed dataset
util.save_complete_data(complete_df, flag=True, save_path=interim_dir / f'train_processed{suffix_part}.csv', save_method='save_csv')

# Saving preprocessing states for use on validation datasets
state_dict = {
    'config': CONFIG,
    'feature_encoder': feature_preprocess.feature_encoding,
    'scaler': feature_preprocess.feature_scaling,
    'imputer_dict': feature_preprocess.imputer_dict
}
util.pickle_data(state_dict, CONFIG['preprocessing']['state_save_path'], f'preprocessing_state_dict{suffix_part}.sav')

## Run fitted preprocessing on valid data

In [None]:
import pickle

# Shared pieces of the file names; wrapping DATA_DIR in Path matches how the load
# cell builds NZTA_DATADIR and also works if DATA_DIR is a plain string.
suffix_part = f'_{experiment_suffix}' if experiment_suffix else ''
interim_dir = Path(DATA_DIR) / 'interim' / DATASET

# Reload the preprocessing state saved by the training cell.
# NOTE: pickle.load can execute arbitrary code; only load state files written by this project.
with open(CONFIG['preprocessing']['state_save_path'] / f'preprocessing_state_dict{suffix_part}.sav', 'rb') as f:
    saved_state_dict = pickle.load(f)

# Same pipeline as for training, but with the already-fitted imputers, encoder and
# scaler applied instead of fitting new ones on the validation rows.
filtered_valid_df = sample_fitler(valid_df, CONFIG)
col_filtered_valid_df = feature_removal(filtered_valid_df, CONFIG)
groupby_impute_valid_df = preprocessing.groupby_impute(col_filtered_valid_df, CONFIG)
imputed_valid_df = preprocessing.impute(saved_state_dict['imputer_dict'], groupby_impute_valid_df)
encoded_valid_df = preprocessing.encode_categorical_features(imputed_valid_df, CONFIG, saved_state_dict['feature_encoder'])
normalized_valid_df = preprocessing.scale(encoded_valid_df, saved_state_dict['scaler'], CONFIG)

# Save both the encoded (unscaled) and the fully processed validation sets.
encoded_valid_df.to_csv(interim_dir / 'encoded_valid.csv', index=False)
util.save_complete_data(normalized_valid_df, flag=True, save_path=interim_dir / f'valid_processed{suffix_part}.csv', save_method='save_csv')

Apply the MRWA preprocessing to the NZTA dataset. This can only be done after the MRWA preprocessing has been run, because it loads the saved MRWA preprocessing state.

In [None]:
from src.mrwa_configs.final_config import CONFIG as MRWA_CONFIG

# Load the preprocessing state saved for MRWA (same suffix with "nzta" swapped for "mrwa").
# NOTE: pickle.load can execute arbitrary code; only load state files written by this project.
mrwa_state_path = MRWA_CONFIG['preprocessing']['state_save_path'] / (f'preprocessing_state_dict_{experiment_suffix.replace("nzta", "mrwa")}.sav')
with open(mrwa_state_path, 'rb') as f:
    saved_state_dict = pickle.load(f)

# Filter and select columns with the NZTA config, then impute and encode with the MRWA-fitted state.
filtered_valid_df = sample_fitler(valid_df, CONFIG)
col_filtered_valid_df = feature_removal(filtered_valid_df, CONFIG)
groupby_impute_valid_df = preprocessing.groupby_impute(col_filtered_valid_df, CONFIG)
imputed_valid_df = preprocessing.impute(saved_state_dict['imputer_dict'], groupby_impute_valid_df)
encoded_valid_df = preprocessing.encode_categorical_features(imputed_valid_df, CONFIG, saved_state_dict['feature_encoder'])

# Scale with the MRWA config, since the NZ config does not contain the one-hot
# columns unique to WA, but without 'Direction' in the normalizing feature list.
# The adjusted config is a copy: removing the entry from MRWA_CONFIG in place would
# alter the imported config and make this cell fail with ValueError when re-run.
mrwa_preprocessing = MRWA_CONFIG['preprocessing']
mrwa_normalizing = mrwa_preprocessing['normalizing']
transfer_scale_config = {
    **MRWA_CONFIG,
    'preprocessing': {
        **mrwa_preprocessing,
        'normalizing': {
            **mrwa_normalizing,
            'feature_list': [name for name in mrwa_normalizing['feature_list'] if name != 'Direction'],
        },
    },
}
normalized_valid_df = preprocessing.scale(encoded_valid_df, saved_state_dict['scaler'], transfer_scale_config)

# Wrapping DATA_DIR in Path matches how the load cell builds NZTA_DATADIR and also
# works if DATA_DIR is a plain string.
interim_dir = Path(DATA_DIR) / 'interim' / DATASET
encoded_valid_df.to_csv(interim_dir / 'encoded_valid_transfer.csv', index=False)
util.save_complete_data(normalized_valid_df, flag=True, save_path=interim_dir / (f'valid_processed_{experiment_suffix}_TRANSFERED.csv'), save_method='save_csv')