<div>
<img src='../../img/WSP_red.png' style='height: 95px; float: left' alt='WSP Logo'/>
<img src='../../img/austroads.png' style='height: 115px; float: right' alt='Client Logo'/>
</div>
<center><h2>AAM6201 Development of Machine-Learning Decision-Support tools for Pavement Asset Management<br>Case Study 1: Project Identification</h2></center>


# Data Preprocessing

This notebook preprocesses a dataset by applying the following steps, in order, as detailed in the configuration:

- Applying transformations
- Adding new columns
- Filtering the dataframe
- Dropping unused columns

In [None]:
# magic command to autoreload changes in src
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import src.util as util
import src.features.preprocessing as preprocessing 

from src.mrwa_configs.final_config import CONFIG, FeatureAdder
from IPython.display import display
pd.set_option('display.max_rows', 1)

In [None]:
# load data
from src import DATA_DIR

DATASET = 'MRWA'
experiment_suffix = 'mrwa_final'

# Build the working frame in a single chain so that re-running this cell always
# starts again from the freshly loaded data.
all_df = (
    util.load_data(**CONFIG['data']['preprocessing'])
    # renaming
    .rename(columns={
        'Road': 'RoadID',
        'Start Chainage': 'Start',
        'Cway': 'Direction'
    })
    # convert km to m; `End` is computed from the already-converted `Start`
    # (assign evaluates keyword arguments in order) plus the length in metres
    .assign(
        Start=lambda frame: frame['Start'] * 1000,
        End=lambda frame: frame['Start'] + frame['Length'] * 1000,
    )
    .drop(columns=['Length'])
    # drop rows that are missing any identifying column or the condition date
    .dropna(subset=['RoadID', 'Direction', 'Start', 'End', 'Date of condition data'])
)

In [None]:
# Add the engineered features defined by the project's FeatureAdder.
added_df = FeatureAdder()(all_df)

# Show only the column headers. head(0) yields the same empty frame as
# head(1).drop([0]) but does not raise KeyError when row label 0 is absent
# (for example after the dropna in the loading cell removed the first row).
display(added_df.head(0))

## Train-Valid split

In [None]:
# Split at the level of (RoadID, Direction, Start) groups, so every row that
# shares a key lands on the same side of the split.
SPLIT_SEED = 42        # fixed so Restart & Run All reproduces the same split
TRAIN_FRACTION = 0.8   # share of groups assigned to the training set

grouped_index = added_df.groupby(['RoadID', 'Direction', 'Start'])
split_rng = np.random.default_rng(SPLIT_SEED)
train_group_ids = split_rng.choice(
    grouped_index.ngroups,
    size=int(TRAIN_FRACTION * grouped_index.ngroups),
    replace=False,
)

# One boolean mask drives both sides: the two frames are disjoint, together
# cover every row, and keep the original row order and index labels.
is_train_row = grouped_index.ngroup().isin(train_group_ids)
train_df = added_df[is_train_row]
valid_df = added_df[~is_train_row]

## Definition of transformers

In [None]:
class FeatureEncodeImputeNormalizeContainer:
    """
    Container for imputation, categorical encoding, and scaling operations, in that order.

    The first call fits the imputers, the categorical encoding and the scaler on the
    dataframe it receives and stores them on the instance. Later calls reuse the stored
    state instead of fitting again, so the same instance can be applied to other data.
    """

    def __init__(self):
        self.feature_encoding = None # remember feature encoding for future
        self.date_encoding = None # remember date encoding (never assigned inside this class)
        self.feature_scaling = None # remember feature scaling for future
        self.imputer_dict = {} # dictionary between columns and its imputer

    def __call__(self, df: pd.DataFrame, config: dict) -> pd.DataFrame:
        """
        Impute, encode and scale ``df`` according to ``config``.

        Args:
            df: dataframe to preprocess.
            config: configuration dictionary. This method reads
                ``config['preprocessing']['imputing']['groupby_first']['feature_list']``
                itself and passes the whole dictionary on to the ``preprocessing`` helpers.

        Returns:
            The dataframe returned by ``preprocessing.scale``.

        Raises:
            KeyError: if ``preprocessing.fit_scaler`` raises a KeyError. It is re-raised
                with a message naming ``config['target']`` and the original error is
                kept as its cause.
        """
        # Perform imputation.
        # The group-wise imputation only runs when features are configured for it.
        if config['preprocessing']['imputing']['groupby_first']['feature_list']:
            df = preprocessing.groupby_impute(df, config)

        # An empty dict doubles as the "not fitted yet" marker, so fitting is repeated
        # on a later call if fit_imputer returned no imputers.
        if len(self.imputer_dict) == 0:
            self.imputer_dict = preprocessing.fit_imputer(df, config)
        imputed_df = preprocessing.impute(self.imputer_dict, df)

        # encoding must be done after imputation, otherwise NA value is treated as a unique category unintentionally
        # Perform categorical encoding on specified variables
        if self.feature_encoding is None:
            self.feature_encoding = preprocessing.get_categorical_encoding(imputed_df, config)
        encoded_df = preprocessing.encode_categorical_features(imputed_df, config, self.feature_encoding)

        # Perform scaling
        if self.feature_scaling is None:
            try:
                self.feature_scaling = preprocessing.fit_scaler(encoded_df, config) # TODO: now we need to remember the scaler hasn't been fitted on config['target']. Is this good?
            except KeyError as err:
                # Chain the original error so the real missing key stays visible in the traceback.
                raise KeyError(f"Target column {config['target']} is not in the dataframe's columns!") from err
        encoded_df = preprocessing.scale(encoded_df, self.feature_scaling, config)

        return encoded_df

In [None]:
# perform filtering on samples by thresholding against features 
class SampleFilterByFeatureThresholdContainer:
    """
    Container for filtering operations on the dataset to remove unwanted rows.

    ``config['preprocessing']['filtering']`` maps a column name to a callable. Each
    callable receives that column and is expected to return a boolean mask; only the
    rows selected by the mask are kept. Filters are applied one after another, so a
    later filter only sees the rows that survived the earlier ones. The index is not
    changed, however.
    """
    def __call__(self, df: pd.DataFrame, config: dict):
        """
        Apply every configured filter to ``df`` and return the remaining rows.

        Args:
            df: dataframe to filter.
            config: configuration dictionary holding the column-to-callable mapping
                under ``config['preprocessing']['filtering']``.

        Returns:
            The filtered dataframe, with its original index labels.
        """
        for col, key_fn in config['preprocessing']['filtering'].items():
            # keep only the rows for which this column's filter holds
            df = df[key_fn(df[col])]
        return df

In [None]:
# drop features
class FeatureRemovalContainer:
    """
    Container that removes unwanted features (columns) from a dataframe.

    The columns are named in ``config['preprocessing']['feature_removal']['feature_list']``.
    The ``drop`` flag in the same section decides whether the listed columns are
    dropped (truthy) or are the only ones kept (falsy).
    """
    def __call__(self, df: pd.DataFrame, config: dict):
        # both settings live in the same config section
        removal_cfg = config['preprocessing']['feature_removal']
        listed_features = removal_cfg['feature_list']

        if not removal_cfg['drop']:
            # keep-mode: return a copy holding only the listed columns
            return df[listed_features].copy()

        # drop-mode: return everything except the listed columns
        return df.drop(columns=listed_features)

# Perform preprocessing

In [None]:
# Initialise class containers
feature_preprocess = FeatureEncodeImputeNormalizeContainer()
sample_fitler = SampleFilterByFeatureThresholdContainer()  # (sic) later cells use this spelling
feature_removal = FeatureRemovalContainer()

# Sequential processing: filter rows, select columns, then impute / encode / scale.
filtered_df = sample_fitler(train_df, CONFIG)
col_filtered_df = feature_removal(filtered_df, CONFIG)
complete_df = feature_preprocess(col_filtered_df, CONFIG)

# Show only the column headers of the processed frame. head(0) is used instead of
# head(1).drop(0): the training split keeps the original row labels, so label 0 is
# not guaranteed to be present and dropping it could raise KeyError.
display(complete_df.head(0))

# Saving completed dataset
util.save_complete_data(
    complete_df,
    flag=True,
    save_path=DATA_DIR / 'interim' / DATASET / ('train_processed' + (f'_{experiment_suffix}' if experiment_suffix else '') + '.csv'),
    save_method='save_csv',
)

# Saving preprocessing states for use on validation datasets
state_dict = {
    'config': CONFIG,
    'feature_encoder': feature_preprocess.feature_encoding,
    'scaler': feature_preprocess.feature_scaling,
    'imputer_dict': feature_preprocess.imputer_dict
}
util.pickle_data(
    state_dict,
    CONFIG['preprocessing']['state_save_path'],
    'preprocessing_state_dict' + (f'_{experiment_suffix}' if experiment_suffix else '') + '.sav',
)

## Run fitted preprocessing on valid data

In [None]:
import pickle

# Reload the state saved by the training cell, so the validation data is processed
# with what was actually persisted to disk.
# NOTE(review): pickle.load can execute arbitrary code; only load state files written by this project.
with open(CONFIG['preprocessing']['state_save_path'] / ('preprocessing_state_dict' + (f'_{experiment_suffix}' if experiment_suffix else '') + '.sav'), 'rb') as f:
    saved_state_dict = pickle.load(f)

filtered_valid_df = sample_fitler(valid_df, CONFIG)
col_filtered_valid_df = feature_removal(filtered_valid_df, CONFIG)
groupby_impute_valid_df = preprocessing.groupby_impute(col_filtered_valid_df, CONFIG)
# Pass the group-wise imputed frame on to the fitted imputers, in the same order as
# the training path, so the group-wise imputation result is not discarded.
imputed_valid_df = preprocessing.impute(saved_state_dict['imputer_dict'], groupby_impute_valid_df)
encoded_valid_df = preprocessing.encode_categorical_features(imputed_valid_df, CONFIG, saved_state_dict['feature_encoder'])
normalized_valid_df = preprocessing.scale(encoded_valid_df, saved_state_dict['scaler'], CONFIG)

# The encoded (unscaled) frame and the final scaled frame are both written out.
encoded_valid_df.to_csv(DATA_DIR / 'interim' / DATASET / 'encoded_valid.csv', index=False)
util.save_complete_data(normalized_valid_df, flag=True, save_path=DATA_DIR / 'interim' / DATASET / ('valid_processed' + (f'_{experiment_suffix}' if experiment_suffix else '') + '.csv'), save_method='save_csv')

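Apply the preprocessing state fitted on the NZTA dataset to the MRWA validation data (transfer evaluation). This cell can only be run after the NZTA preprocessing has been run, because it loads the state file saved by that run.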

In [None]:
# Transfer run: push the MRWA validation rows through preprocessing state that is
# loaded from the NZTA state file, mixing the two configurations as noted per line.
# NOTE: `pickle` is not imported here; it comes from the import in the previous cell.
# NOTE: this cell reuses (and overwrites) the variable names of the previous cell.
from src.nzta_configs.final_config import CONFIG as NZTA_CONFIG

# The file name is derived from the MRWA suffix by swapping "mrwa" for "nzta".
# NOTE(review): pickle.load can execute arbitrary code; only load trusted state files.
with open(NZTA_CONFIG['preprocessing']['state_save_path'] / (f'preprocessing_state_dict_{experiment_suffix.replace("mrwa", "nzta")}.sav'), 'rb') as f:
    saved_state_dict = pickle.load(f)

# Row filters come from the NZTA configuration.
filtered_valid_df = sample_fitler(valid_df, NZTA_CONFIG)
col_filtered_valid_df = feature_removal(filtered_valid_df, CONFIG) # cannot use NZTA CONFIG here because we want to keep our direction column
groupby_impute_valid_df = preprocessing.groupby_impute(col_filtered_valid_df, CONFIG) # cannot use NZ config here since group by condition is different
# Imputers, categorical encoding and scaler all come from the loaded NZTA state.
imputed_valid_df = preprocessing.impute(saved_state_dict['imputer_dict'], groupby_impute_valid_df)
encoded_valid_df = preprocessing.encode_categorical_features(imputed_valid_df, NZTA_CONFIG, saved_state_dict['feature_encoder'])
normalized_valid_df = preprocessing.scale(encoded_valid_df, saved_state_dict['scaler'], NZTA_CONFIG)

# Outputs are written under the MRWA dataset directory with transfer-specific file names.
encoded_valid_df.to_csv(DATA_DIR / 'interim' / DATASET / 'encoded_valid_transfer.csv', index=False)
util.save_complete_data(normalized_valid_df, flag=True, save_path=DATA_DIR / 'interim' / DATASET / (f'valid_processed_{experiment_suffix}_TRANSFERED.csv'), save_method='save_csv')