# Import Modules

In [3]:
import glob
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# Read Data

In [4]:
def dataset_name(path):
    """Return the file name of `path` without directory or final extension.

    Uses pathlib so the result does not depend on which path separator the
    operating system (and therefore `glob`) produces.
    """
    return Path(path).stem


# Load every CSV in ../Data/t_taxi into a dict keyed by dataset name
# (e.g. "processed_dep_h30"). The glob is sorted so the load order, and hence
# the key order shown below, is the same on every run.
data = {}
for path in sorted(glob.glob(r"../Data/t_taxi/*.csv")):
    data[dataset_name(path)] = pd.read_csv(path)
data.keys()

dict_keys(['processed_dep_h0', 'processed_dep_h120', 'processed_dep_h180', 'processed_dep_h30'])

# Model Evaluation Function

In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time

def model_eval(y, y_pred, name=None, file=None, verbose=True, **kwargs):
    """Score predictions against observed values and optionally log the report.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as `y`.
        Values are compared by position, not by pandas index label.
    name : str, optional
        If truthy, stored under ``"name"`` and printed when `verbose` is set.
    file : str or path-like, optional
        If given, ``str(report)`` is appended to this file as a single line.
        Missing parent directories are created first.
    verbose : bool, default True
        Print the name (if any) and the finished report.
    **kwargs
        Extra key/value pairs copied into the report after the metrics.

    Returns
    -------
    dict
        ``"RMSE"``, ``"MAE"``, the percentage of absolute errors below
        2*60, 5*60 and 7*60 (``"% <2 min"``, ``"% <5 min"``, ``"% <7 min"``;
        presumably the targets are in seconds - confirm against the data),
        ``"time"`` (Unix timestamp rounded to whole seconds), plus ``"name"``
        and any `kwargs`.
    """
    report = {}
    if name:
        report["name"] = name
        if verbose:
            print(name)

    # Plain float arrays: avoids pandas aligning two Series on their index
    # labels, which would silently pair up the wrong rows.
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)

    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root ourselves works on every version.
    # float(...) keeps NumPy scalar reprs out of the logged line.
    report["RMSE"] = float(np.sqrt(mean_squared_error(y_true, y_hat)))
    report["MAE"] = float(mean_absolute_error(y_true, y_hat))

    abs_err = np.abs(y_true - y_hat)
    for minutes in (2, 5, 7):
        within = abs_err < minutes * 60
        report["% <{} min".format(minutes)] = float(within.mean() * 100)

    report["time"] = round(time.time())
    report.update(kwargs)

    if file is not None:
        # Appending fails if the results directory has not been created yet.
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        with open(file, "a") as f:
            f.write(str(report) + "\n")
    if verbose:
        print(report)
    return report

# Mean Taxi Time Model

In [12]:
# Baseline model: predict one constant - the mean `t_taxi` of the TRAIN and
# VALIDATE rows - for every TEST row. Each test set is scored twice: once with
# the mean fitted on its own dataset and once with the mean fitted on the h30
# dataset.
# NOTE: model_eval appends to RESULTS_FILE, so re-running this cell adds
# another set of lines rather than replacing the previous ones.
RESULTS_FILE = "./results/model_average.results"


def split_xy(df, split):
    """Return ``(X, y)`` for the rows of `df` whose ``dtype`` equals `split`.

    `y` is the ``t_taxi`` column; `X` is every other column except ``dtype``.
    Nothing is mutated: `df` and the filtered rows are left untouched.
    """
    rows = df[df["dtype"] == split]
    return rows.drop(columns=["dtype", "t_taxi"]), rows["t_taxi"]


# Mean fitted once on the h30 dataset, reused for every test set below.
df_h30 = data["processed_dep_h30"]
avg_h30 = df_h30.loc[df_h30["dtype"].isin(["TRAIN", "VALIDATE"]), "t_taxi"].mean()

for h in [0, 30, 120, 180]:
    dataset = "processed_dep_h{}".format(h)
    df = data[dataset]

    # The feature frames are not needed by a constant model; they are kept
    # under their usual names alongside the targets.
    X_train, y_train = split_xy(df, "TRAIN")
    X_val, y_val = split_xy(df, "VALIDATE")
    X_test, y_test = split_xy(df, "TEST")

    avg_h = pd.concat([y_train, y_val]).mean()

    # Mean fitted on this dataset, tested on this dataset.
    model_eval(
        y_test,
        np.ones(len(y_test)) * avg_h,
        name="model_avg_h{}".format(h),
        file=RESULTS_FILE,
        dataset_train=dataset,
        dataset_test=dataset,
        model_type="Average",
        eval_type="SIMPLE_TEST",
    )
    # Mean fitted on h30, tested on this dataset.
    model_eval(
        y_test,
        np.ones(len(y_test)) * avg_h30,
        name="model_avg_h30",
        file=RESULTS_FILE,
        dataset_train="processed_dep_h30",
        dataset_test=dataset,
        model_type="Average",
        eval_type="SIMPLE_TEST",
    )

model_avg_h0
{'name': 'model_avg_h0', 'RMSE': 262.7578933186738, 'MAE': 205.94839328000802, '% <2 min': 34.55104673082647, '% <5 min': 77.39061414822899, '% <7 min': 91.57456096835004, 'time': 1618412291, 'dataset_train': 'processed_dep_h0', 'dataset_test': 'processed_dep_h0', 'model_type': 'Average', 'eval_type': 'SIMPLE_TEST'}
model_avg_h30
{'name': 'model_avg_h30', 'RMSE': 262.742998949917, 'MAE': 205.91180176808544, '% <2 min': 34.55104673082647, '% <5 min': 77.39061414822899, '% <7 min': 91.57456096835004, 'time': 1618412291, 'dataset_train': 'processed_dep_h30', 'dataset_eval': 'processed_dep_h0', 'model_type': 'Average', 'eval_type': 'SIMPLE_TEST'}
model_avg_h30
{'name': 'model_avg_h30', 'RMSE': 262.16427367945937, 'MAE': 205.84238223873606, '% <2 min': 34.55257871392486, '% <5 min': 77.38727449927401, '% <7 min': 91.57865425542494, 'time': 1618412291, 'dataset_train': 'processed_dep_h30', 'dataset_test': 'processed_dep_h30', 'model_type': 'Average', 'eval_type': 'SIMPLE_TEST'}
