# Import Modules

In [6]:
import pandas as pd
import numpy as np
import json
import glob

# Read Data

In [7]:
def dataset_name(path):
    """
    Return the file name of ``path`` without its directory and extension.

    Both '/' and '\\' are treated as directory separators, because glob
    returns OS-specific (and on Windows possibly mixed) separators. Splitting
    on '\\' alone leaves the relative directory in place on POSIX systems and
    produces an empty key once the name is cut at the first '.'.

    Everything after the first '.' of the file name is dropped.
    """
    filename = path.replace('\\', '/').rsplit('/', 1)[-1]
    return filename.split('.')[0]


# One DataFrame per CSV file, keyed by the bare file name.
# sorted() makes the key order independent of the file-system listing order.
data = {
    dataset_name(path): pd.read_csv(path)
    for path in sorted(glob.glob(r"../Data/t_taxi/*.csv"))
}
data.keys()

dict_keys(['processed_dep_h0', 'processed_dep_h120', 'processed_dep_h180', 'processed_dep_h30'])

# Model Evaluation Function

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time

def model_eval(y, y_pred, name=None, file=None, verbose=True, **kwargs):
    """
    Compute regression metrics for a set of predictions and log them.

    Parameters
    ----------
    y, y_pred : array-like
        True and predicted values of equal length. They are compared
        position by position (pandas indexes are ignored), so every metric
        uses the same pairing of observations.
        NOTE(review): the "% <N min" thresholds are N*60, so the values are
        presumably in seconds - confirm against the data set.
    name : str, optional
        Stored under ``report['name']`` and printed when ``verbose`` is true.
    file : str, optional
        When given, ``str(report)`` is appended to this file as one line.
    verbose : bool
        Print the name (if any) and the finished report.
    **kwargs
        Extra key/value pairs copied into the report after the metrics
        (a key equal to a metric name overrides that metric).

    Returns
    -------
    dict
        Keys "RMSE", "MAE", "% <2 min", "% <5 min", "% <7 min", "time"
        (unix timestamp rounded to seconds), optionally "name", plus kwargs.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or contain NaN/inf.
    """
    y_true = np.asarray(y, dtype=float).ravel()
    y_hat = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_hat.shape:
        raise ValueError(
            "y and y_pred differ in length: {} vs {}".format(y_true.size, y_hat.size))
    if y_true.size == 0:
        raise ValueError("cannot evaluate an empty set of predictions")
    if not (np.isfinite(y_true).all() and np.isfinite(y_hat).all()):
        raise ValueError("y and y_pred must not contain NaN or infinite values")

    # Computed once and shared by every metric below.
    abs_err = np.abs(y_true - y_hat)
    n_obs = abs_err.size

    report = {}
    if name:
        report['name'] = name
        if verbose:
            print(name)

    # RMSE is computed directly: the `squared=False` switch of
    # sklearn.metrics.mean_squared_error is not available in recent
    # scikit-learn releases. Metrics are cast to built-in floats so that
    # str(report) does not depend on the NumPy scalar repr.
    report["RMSE"] = float(np.sqrt(np.mean(abs_err ** 2)))
    report["MAE"] = float(np.mean(abs_err))
    for minutes in (2, 5, 7):
        within = np.count_nonzero(abs_err < minutes * 60)
        report["% <{} min".format(minutes)] = float(within / n_obs * 100)
    report["time"] = round(time.time())

    report.update(kwargs)

    if file is not None:
        with open(file, "a") as f:
            f.write(str(report)+"\n")
    if verbose:
        print(report)
    return report

# Manual Decision Tree

In [9]:
class manual_tree:
    """
    Hierarchical group-mean lookup model ("manual tree").

    Each transformation is one lookup level: the training data is grouped by
    a list of columns and the mean of the target column is stored per group.
    At prediction time the levels are tried in the order they were added; a
    row takes the value of the first level in which its key combination was
    seen during fit. Rows that match no level get ``fallback``, the overall
    training mean of the target column of the last transformation.

    Relies on the module-level ``pd`` (pandas) and ``np`` (numpy) imports.
    """

    # Temporary column name used for the looked-up value inside predict().
    _PRED_COL = '__manual_tree_pred__'

    def __init__(self):
        self.transformations = []
        self.fitted = False

    @staticmethod
    def _as_list(cols):
        """Return ``cols`` as a new list; a single column name becomes a one-element list."""
        return [cols] if isinstance(cols, str) else list(cols)

    def add_transformation(self, X_col, y_col, **kwargs):
        """
        Register one lookup level. Invalidates a previous fit.

        X_col: column name(s) to split along in the frame passed to predict
        y_col: name of the target column in the training frame

        optional: name: label stored with the fitted level
                  (default: position of the level, starting at 0)
        optional: X_col_train: column name(s) used for grouping the training
                  frame (default equal to X_col); must be as many as X_col

        Other keyword arguments are ignored.
        """
        X_col = self._as_list(X_col)
        X_col_train = self._as_list(kwargs.get('X_col_train', X_col))
        if len(X_col) != len(X_col_train):
            raise ValueError("X_col and X_col_train must have the same number of columns")
        name = kwargs.get('name', len(self.transformations))

        self.transformations.append({
            'name': name,
            'X_col': X_col,
            'y_col': y_col,
            'X_col_train': X_col_train,
        })

        self.fitted = False

    def fit(self, df_train):
        """
        Compute the per-group target means of every level from ``df_train``.

        Sets ``self.model`` (list of ``(name, DataFrame of group means)``),
        ``self.fallback`` and ``self.fitted``.

        Raises ValueError when no transformation has been added, because
        there is then no target column to compute the fallback from.
        """
        if not self.transformations:
            raise ValueError("add at least one transformation before calling fit")

        self.model = [
            (t['name'],
             df_train[t['X_col_train'] + [t['y_col']]].groupby(t['X_col_train']).mean())
            for t in self.transformations
        ]
        self.fallback = df_train[self.transformations[-1]['y_col']].mean()
        self.fitted = True

    def predict(self, df):
        """
        Predict the target for every row of ``df``.

        Only the X_col columns of each level are read, so ``df`` does not
        need to contain the target column. Returns a float Series named 'y'
        that carries the index of ``df``.

        Raises an exception with the message "Model Not Fitted" when fit has
        not been called since the last add_transformation.
        """
        if not self.fitted:
            raise RuntimeError("Model Not Fitted")

        pred = np.full(len(df), np.nan)
        for t, (_, group_means) in zip(self.transformations, self.model):
            lookup = group_means[t['y_col']].rename(self._PRED_COL).reset_index()
            # The lookup keys are unique (groupby result) and a left merge
            # keeps the order of the left rows, so the merged column lines up
            # with df row by row.
            merged = pd.merge(df[t['X_col']],
                              lookup,
                              how='left',
                              left_on=t['X_col'],
                              right_on=t['X_col_train'])
            filler = merged[self._PRED_COL].to_numpy(dtype=float)
            # Earlier levels take precedence: only fill what is still missing.
            missing = np.isnan(pred)
            pred[missing] = filler[missing]

        return pd.Series(pred, index=df.index, name='y').fillna(self.fallback)

In [10]:
# Lookup levels of the VTT model, most specific first. The second level is
# the fallback for rows whose depgnr is null or was not seen during training
# (author's note: it barely makes a difference).
VTT_LEVELS = (
    ['depgnr', 'trwy_ext', 'wtc'],
    ['trwy_ext', 'wtc'],
)
VTT_RESULTS_FILE = "./results/model_vtt.results"


def new_vtt():
    """Return an unfitted manual_tree with the VTT lookup levels registered."""
    tree = manual_tree()
    for level in VTT_LEVELS:
        tree.add_transformation(X_col=list(level), y_col='t_taxi')
    return tree


def split_xy(frame, subset):
    """
    Return (X, y) for the rows of ``frame`` whose 'dtype' equals ``subset``.

    X holds every column except 'dtype' and 't_taxi'; y is the 't_taxi'
    column. ``frame`` itself is left untouched.
    """
    rows = frame.loc[frame['dtype'] == subset].drop(columns='dtype')
    return rows.drop(columns='t_taxi'), rows['t_taxi']


# Reference model: fitted once on the h30 data set and evaluated on the test
# split of every data set in the loop below.
df = data['processed_dep_h30']
mt_h30 = new_vtt()
mt_h30.fit(df[df['dtype'].isin(["TRAIN", "VALIDATE"])])

for h in [0, 30, 120, 180]:
    dataset = 'processed_dep_h{}'.format(h)
    df = data[dataset]

    X_train, y_train = split_xy(df, "TRAIN")
    X_val, y_val = split_xy(df, "VALIDATE")
    X_test, y_test = split_xy(df, "TEST")

    # manual_tree expects features and target in one frame. TRAIN and
    # VALIDATE rows are pooled for fitting because no hyperparameters are
    # tuned; only the TEST rows are used for evaluation.
    df_mt_train = pd.concat([pd.concat([X_train, X_val]), pd.concat([y_train, y_val])], axis=1)
    df_mt_test = pd.concat([X_test, y_test], axis=1)

    mt = new_vtt()
    mt.fit(df_mt_train)

    y_test_pred_h = mt.predict(df_mt_test)
    y_test_pred_h30 = mt_h30.predict(df_mt_test)

    # Model fitted on the same data set it is tested on.
    model_eval(
        y_test,
        y_test_pred_h,
        name="vtt_h{}".format(h),
        file=VTT_RESULTS_FILE,
        dataset_train=dataset,
        dataset_test=dataset,
        model_type="VTT",
        eval_type="SIMPLE_TEST",
    )
    # h30 reference model tested on the current data set.
    model_eval(
        y_test,
        y_test_pred_h30,
        name="vtt_h30",
        file=VTT_RESULTS_FILE,
        dataset_train="processed_dep_h30",
        dataset_test=dataset,
        model_type="VTT",
        eval_type="SIMPLE_TEST",
    )
    

vtt_h0
{'name': 'vtt_h0', 'RMSE': 151.37257752428079, 'MAE': 108.3464180114934, '% <2 min': 66.57208056354797, '% <5 min': 95.6344875483679, '% <7 min': 98.5474749479115, 'time': 1618436352, 'dataset_train': 'processed_dep_h0', 'dataset_test': 'processed_dep_h0', 'model_type': 'VTT', 'eval_type': 'SIMPLE_TEST'}
vtt_h30
{'name': 'vtt_h30', 'RMSE': 152.10289203317856, 'MAE': 109.13842075860204, '% <2 min': 66.2029963290009, '% <5 min': 95.59083242385157, '% <7 min': 98.50183549955352, 'time': 1618436352, 'dataset_train': 'processed_dep_h30', 'dataset_test': 'processed_dep_h0', 'model_type': 'VTT', 'eval_type': 'SIMPLE_TEST'}
vtt_h30
{'name': 'vtt_h30', 'RMSE': 161.77193253803696, 'MAE': 115.45557620987479, '% <2 min': 64.40519521848957, '% <5 min': 94.3333930027647, '% <7 min': 97.83996658511845, 'time': 1618436353, 'dataset_train': 'processed_dep_h30', 'dataset_test': 'processed_dep_h30', 'model_type': 'VTT', 'eval_type': 'SIMPLE_TEST'}
vtt_h30
{'name': 'vtt_h30', 'RMSE': 161.7719325380