# Preprocessing

## Imports

In [162]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import torch
import random
import cv2
from tqdm import tqdm
from matplotlib import pyplot as plt
import segmentation_models_pytorch as smp
import albumentations as album
from PIL import Image
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import random_split
from sklearn.metrics import accuracy_score
import plotly.express as px
import torchmetrics
from torchmetrics import MeanAbsolutePercentageError
from glob import glob

%matplotlib inline

## Global Variables

In [163]:
ROOT_DIR_PATH = os.path.abspath('..')

AADT_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/')

AADT_PROCESSED_PATH = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/processed_aadt.csv')

NN_MODEL_PATH = os.path.join(ROOT_DIR_PATH, "models/nn_aadt_model.pth")

NORMALISE = True

DROP_UNNORMALISE = False

REMOVE_METADATA = True

## General Functions

In [164]:
# Function used to normalsise the count data
def normalise(clean_report):
    interger_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']
    for name in interger_cols:
        new_name = f"{name}_normalised"
        # for ever row in the report present the row's site id's mean volume
        mean = clean_report.groupby('site_id')[name].transform("mean")
        # normalise
        clean_report.loc[:, new_name] = clean_report[name] / mean
        # filter so we don't have rows with a small mean which causes a pole
    return clean_report[mean>1]

In [165]:
def drop_unnormalise(df):
    interger_cols = ['0-520cm', '521-660cm', '661-1160cm', '1160+cm', 'total_volume']
    df = df.drop(columns=interger_cols, axis=1)
    return df

In [166]:
def drop_metadata(df):
    metadata = ['site_id', 'time_period_ending', 'time_interval', 'daily_count', 'report_date', 'site_name', 'timestamp']
    df = df.drop(columns=metadata, axis=1)
    return df

## Data

In [167]:
pattern = os.path.join(ROOT_DIR_PATH, 'data/ground_truth_data/aadt_*.csv')

aadt_file_paths = [os.path.join(AADT_PATH, os.path.basename(x)) for x in glob(pattern)]
print("AADT files: {}".format(aadt_file_paths))

df = pd.read_csv(aadt_file_paths[0])

for i in range(1, len(aadt_file_paths)):
    df_aadt = pd.read_csv(aadt_file_paths[i])
    df = pd.concat([df, df_aadt], axis=0, ignore_index=True)

AADT files: ['/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M1_2297B_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M1_2297A_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M5_7465B_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_30360819_year_2018.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_M5_7465B_year_2017.csv', '/home/ah2719/FYP/Spatial_Finance_Transport/data/ground_truth_data/aadt_30360818_year_2018.csv']


In [168]:
df.head()

Unnamed: 0.1,Unnamed: 0,site_name,site_id,report_date,time_period_ending,time_interval,0-520cm,521-660cm,661-1160cm,1160+cm,avg_mph,total_volume,timestamp,year,month,day,hour,daily_count,aadt
0,0,M1/2297B,2857,2018-01-01,00:14:00,0,84,11,2,0,66.0,97,2018-01-01 00:14:00,2018,1,1,0,32110,51919.556474
1,1,M1/2297B,2857,2018-01-01,00:29:00,1,116,4,0,2,70.0,122,2018-01-01 00:29:00,2018,1,1,0,32110,51919.556474
2,2,M1/2297B,2857,2018-01-01,00:44:00,2,128,10,5,3,67.0,146,2018-01-01 00:44:00,2018,1,1,0,32110,51919.556474
3,3,M1/2297B,2857,2018-01-01,00:59:00,3,133,12,5,2,68.0,152,2018-01-01 00:59:00,2018,1,1,0,32110,51919.556474
4,4,M1/2297B,2857,2018-01-01,01:14:00,4,160,16,3,2,67.0,181,2018-01-01 01:14:00,2018,1,1,1,32110,51919.556474


## Corrupt values

In [169]:
df = df.dropna(axis=0)
print("df number of rows: {}".format(len(df)))
df.isnull().sum()

df number of rows: 4243


Unnamed: 0            0
site_name             0
site_id               0
report_date           0
time_period_ending    0
time_interval         0
0-520cm               0
521-660cm             0
661-1160cm            0
1160+cm               0
avg_mph               0
total_volume          0
timestamp             0
year                  0
month                 0
day                   0
hour                  0
daily_count           0
aadt                  0
dtype: int64

## Standardisation

In [170]:
if NORMALISE:
    df = normalise(df)

if DROP_UNNORMALISE:
    df = drop_unnormalise(df)

In [171]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0,site_name,site_id,report_date,time_period_ending,time_interval,0-520cm,521-660cm,661-1160cm,1160+cm,avg_mph,...,month,day,hour,daily_count,aadt,0-520cm_normalised,521-660cm_normalised,661-1160cm_normalised,1160+cm_normalised,total_volume_normalised
0,M1/2297B,2857,2018-01-01,00:14:00,0,84,11,2,0,66.0,...,1,1,0,32110,51919.556474,0.203784,0.232113,0.062825,0.0,0.191499
1,M1/2297B,2857,2018-01-01,00:29:00,1,116,4,0,2,70.0,...,1,1,0,32110,51919.556474,0.281416,0.084405,0.0,0.132425,0.240854
2,M1/2297B,2857,2018-01-01,00:44:00,2,128,10,5,3,67.0,...,1,1,0,32110,51919.556474,0.310527,0.211012,0.157062,0.198638,0.288236
3,M1/2297B,2857,2018-01-01,00:59:00,3,133,12,5,2,68.0,...,1,1,0,32110,51919.556474,0.322657,0.253215,0.157062,0.132425,0.300081
4,M1/2297B,2857,2018-01-01,01:14:00,4,160,16,3,2,67.0,...,1,1,1,32110,51919.556474,0.388159,0.33762,0.094237,0.132425,0.357333


## Metadata

In [172]:
if REMOVE_METADATA:
    df = drop_metadata(df)

df.head()

Unnamed: 0,0-520cm,521-660cm,661-1160cm,1160+cm,avg_mph,total_volume,year,month,day,hour,aadt,0-520cm_normalised,521-660cm_normalised,661-1160cm_normalised,1160+cm_normalised,total_volume_normalised
0,84,11,2,0,66.0,97,2018,1,1,0,51919.556474,0.203784,0.232113,0.062825,0.0,0.191499
1,116,4,0,2,70.0,122,2018,1,1,0,51919.556474,0.281416,0.084405,0.0,0.132425,0.240854
2,128,10,5,3,67.0,146,2018,1,1,0,51919.556474,0.310527,0.211012,0.157062,0.198638,0.288236
3,133,12,5,2,68.0,152,2018,1,1,0,51919.556474,0.322657,0.253215,0.157062,0.132425,0.300081
4,160,16,3,2,67.0,181,2018,1,1,1,51919.556474,0.388159,0.33762,0.094237,0.132425,0.357333


## Saving data

In [173]:
df.to_csv(AADT_PROCESSED_PATH)