In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.impute import KNNImputer
import datetime as dt
import sqlite3

import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from patsy import dmatrices

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.model_selection import cross_val_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
import joblib
from joblib import dump, load
import pickle
import bz2

import requests
import traceback
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Float, String, DateTime

# import graphviz
# from graphviz import Source

# hide ipykernel warnings 
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the modelling dataset (presumably MissForest-imputed with outliers removed, per the file name).
imputed_csv_path = "missforest_imputed_outliers_removed_all_features.csv"
df = pd.read_csv(imputed_csv_path)

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,ROUTEID,DIRECTION,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,...,HOUR_20,HOUR_21,HOUR_22,HOUR_23,weekend_true_1,peak_true_1,HOUR_1,HOUR_2,JOURNEYTIME,PLANNED_JT
0,102_8,1,9.81,5.49,8.69,10.0,1016,100,6.17,240,...,0,0,0,0,1,0,0,0,3391,3420
1,102_8,1,9.26,6.32,8.24,10.0,1016,93,3.6,220,...,0,0,0,0,1,0,0,0,3659,3420
2,102_8,1,9.76,5.5,8.43,10.0,1016,93,5.66,240,...,0,0,0,0,1,0,0,0,3490,3420
3,102_8,1,19.39,17.97,16.71,21.39,1027,64,3.09,100,...,0,0,0,0,0,0,0,0,3800,3738
4,102_8,1,13.93,14.14,13.0,15.0,1026,94,1.03,0,...,0,0,0,0,0,1,0,0,3582,3294


In [9]:
# Raise the row display limit to 70 so the per-column dtype listing is not truncated by pandas.
pd.set_option('display.max_rows', 70)
df.dtypes

ROUTEID                  object
DIRECTION                 int64
temp                    float64
feels_like              float64
temp_min                float64
temp_max                float64
pressure                  int64
humidity                  int64
wind_speed              float64
wind_deg                  int64
clouds_all                int64
weather_main_Clouds       uint8
weather_main_Drizzle      uint8
weather_main_Fog          uint8
weather_main_Mist         uint8
weather_main_Rain         uint8
weather_main_Smoke        uint8
weather_main_Snow         uint8
precipitation_true_1      uint8
MONTH_1                   uint8
MONTH_2                   uint8
MONTH_3                   uint8
MONTH_4                   uint8
MONTH_5                   uint8
MONTH_6                   uint8
MONTH_7                   uint8
MONTH_8                   uint8
MONTH_9                   uint8
MONTH_10                  uint8
MONTH_11                  uint8
WEEKDAY_1                 uint8
WEEKDAY_

In [10]:
# Columns that hold whole-number measurements and journey times.
int_features = ["pressure", "humidity", "wind_deg", "clouds_all", "JOURNEYTIME", "PLANNED_JT"]

# Indicator (0/1) columns, built group by group in the same order as before.
weather_flags = ["weather_main_" + kind for kind in ("Clouds", "Drizzle", "Fog", "Mist", "Rain", "Smoke", "Snow")]
month_flags = ["MONTH_{}".format(month) for month in range(1, 12)]
weekday_flags = ["WEEKDAY_{}".format(day) for day in range(1, 7)]
hour_flags = ["HOUR_{}".format(hour) for hour in range(1, 24)]
cat_features = weather_flags + month_flags + weekday_flags + hour_flags + ["precipitation_true_1", "weekend_true_1", "peak_true_1"]

# Cast each listed column to its target dtype: int64 for the integer columns, uint8 for the indicators.
for dtype, features in (("int64", int_features), ("uint8", cat_features)):
    for feature in features:
        df[feature] = df[feature].astype(dtype)

In [11]:
# Create list of most common subroutes; these are the routes we will model.
# Blank lines are skipped: an empty route ID would match no rows in the data
# and make the per-route modelling code fail on an empty subset.
with open("most_common_subroutes.txt") as file:
    most_common_subroutes = [line.strip() for line in file if line.strip()]

In [12]:
# Show the first ten route IDs as a quick check of the parsed list.
print(most_common_subroutes[:10])

['102_8', '102_9', '104_15', '104_16', '111_7', '111_8', '114_5', '114_6', '116_1', '116_3']


In [13]:
# List the columns available before choosing which ones to drop.
df.columns

Index(['ROUTEID', 'DIRECTION', 'temp', 'feels_like', 'temp_min', 'temp_max',
       'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all',
       'weather_main_Clouds', 'weather_main_Drizzle', 'weather_main_Fog',
       'weather_main_Mist', 'weather_main_Rain', 'weather_main_Smoke',
       'weather_main_Snow', 'precipitation_true_1', 'MONTH_1', 'MONTH_2',
       'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8',
       'MONTH_9', 'MONTH_10', 'MONTH_11', 'WEEKDAY_1', 'WEEKDAY_2',
       'WEEKDAY_3', 'WEEKDAY_4', 'WEEKDAY_5', 'WEEKDAY_6', 'HOUR_3', 'HOUR_4',
       'HOUR_5', 'HOUR_6', 'HOUR_7', 'HOUR_8', 'HOUR_9', 'HOUR_10', 'HOUR_11',
       'HOUR_12', 'HOUR_13', 'HOUR_14', 'HOUR_15', 'HOUR_16', 'HOUR_17',
       'HOUR_18', 'HOUR_19', 'HOUR_20', 'HOUR_21', 'HOUR_22', 'HOUR_23',
       'weekend_true_1', 'peak_true_1', 'HOUR_1', 'HOUR_2', 'JOURNEYTIME',
       'PLANNED_JT'],
      dtype='object')

<p>We will fit a random forest for a subset of routes and examine which features it rates as most important. This will help us choose the features to use as input for our models.</p>

In [14]:
# Exploration frame for the feature-importance check, without the columns excluded from it.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
test = df.drop(columns=["feels_like", "temp_min", "temp_max", "pressure", "clouds_all", "peak_true_1"])

In [15]:
# Fit one random forest per route (first ten routes only) and collect the ten
# most important features of each model.
top_feature_frames = []
for route in most_common_subroutes[:10]:
    subset = test.loc[test["ROUTEID"] == route]

    # Use a 1-D target: a single-column DataFrame makes scikit-learn emit a
    # DataConversionWarning and ravel it anyway.
    y = subset["JOURNEYTIME"]
    X = subset.drop(columns=["JOURNEYTIME", "ROUTEID", "PLANNED_JT", "DIRECTION"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    # max_features=1.0 (consider every feature at each split) is what the removed
    # "auto" option meant for regressors.
    rfr = RandomForestRegressor(n_estimators=100, max_features=1.0, oob_score=True, random_state=1)
    rfr.fit(X_train, y_train)

    importance = pd.DataFrame({"feature": X_train.columns, "importance": rfr.feature_importances_})
    importance_top_10 = importance.sort_values(by="importance", ascending=False).head(10)
    top_feature_frames.append(importance_top_10)

# Concatenate once after the loop: DataFrame.append was removed in pandas 2.0 and
# re-copies the accumulated frame on every iteration. Original row labels are kept.
top_features = (
    pd.concat(top_feature_frames)
    if top_feature_frames
    else pd.DataFrame(columns=["feature", "importance"])
)

In [16]:
# Rank every collected (feature, importance) row from most to least important.
top_features = top_features.sort_values(by="importance", ascending=False)

In [17]:
# Allow up to 200 displayed rows so the combined top-10 lists (ten rows per route) are shown in full.
pd.set_option("display.max_rows", 200)
top_features

Unnamed: 0,feature,importance
0,temp,0.426872
0,temp,0.263442
0,temp,0.19911
0,temp,0.188142
0,temp,0.169283
48,HOUR_22,0.15679
3,wind_deg,0.153131
1,humidity,0.143227
2,wind_speed,0.142757
0,temp,0.141775


In [18]:
# Mean importance per feature, highest first. Each mean only covers the routes
# where that feature made the top ten, since only those rows were collected.
grouped = (
    top_features
    .groupby("feature")
    .mean()
    .sort_values(by="importance", ascending=False)
)
grouped

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
temp,0.186025
wind_deg,0.09174
HOUR_22,0.088247
wind_speed,0.08399
humidity,0.080215
weekend_true_1,0.069485
HOUR_16,0.064492
MONTH_6,0.055463
MONTH_8,0.05538
HOUR_21,0.054298


In [19]:
# Re-list the columns of the full frame before building the trimmed modelling frame.
df.columns

Index(['ROUTEID', 'DIRECTION', 'temp', 'feels_like', 'temp_min', 'temp_max',
       'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all',
       'weather_main_Clouds', 'weather_main_Drizzle', 'weather_main_Fog',
       'weather_main_Mist', 'weather_main_Rain', 'weather_main_Smoke',
       'weather_main_Snow', 'precipitation_true_1', 'MONTH_1', 'MONTH_2',
       'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8',
       'MONTH_9', 'MONTH_10', 'MONTH_11', 'WEEKDAY_1', 'WEEKDAY_2',
       'WEEKDAY_3', 'WEEKDAY_4', 'WEEKDAY_5', 'WEEKDAY_6', 'HOUR_3', 'HOUR_4',
       'HOUR_5', 'HOUR_6', 'HOUR_7', 'HOUR_8', 'HOUR_9', 'HOUR_10', 'HOUR_11',
       'HOUR_12', 'HOUR_13', 'HOUR_14', 'HOUR_15', 'HOUR_16', 'HOUR_17',
       'HOUR_18', 'HOUR_19', 'HOUR_20', 'HOUR_21', 'HOUR_22', 'HOUR_23',
       'weekend_true_1', 'peak_true_1', 'HOUR_1', 'HOUR_2', 'JOURNEYTIME',
       'PLANNED_JT'],
      dtype='object')

In [20]:
# Work on a copy so the derived column and drops below leave `df` untouched.
copy = df.copy()

In [21]:
# Integer rain indicator: 1 where weather_main_Rain equals 1, otherwise 0.
copy["rain_true_1"] = copy["weather_main_Rain"].eq(1).astype(int)

In [22]:
# Final modelling frame: remove the unused weather measurements and the weather/precipitation
# indicator columns (rain is kept through the derived `rain_true_1` column).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
final_trimmed = copy.drop(columns=["feels_like", "temp_min", "temp_max", "pressure", "clouds_all", "weather_main_Clouds", "weather_main_Drizzle", "weather_main_Fog", "weather_main_Mist", "weather_main_Rain", "weather_main_Smoke", "weather_main_Snow", "precipitation_true_1"])

In [23]:
# Random forest regression per route: the helper computes the error metrics, the main
# function trains, saves and evaluates one regressor per route.
def _regression_metrics(y_true, y_pred):
    """Return MAE, MSE, RMSE, MAPE and R2 of `y_pred` against `y_true` as a dict."""
    mse = metrics.mean_squared_error(y_true, y_pred)
    return {
        "MAE": metrics.mean_absolute_error(y_true, y_pred),
        "MSE": mse,
        "RMSE": mse**0.5,
        "MAPE": metrics.mean_absolute_percentage_error(y_true, y_pred),
        "R2": metrics.r2_score(y_true, y_pred),
    }


def RFR_prediction(df, route_list, cv_folds=None):
    """Train, save and evaluate one random forest regressor per route.

    For every route ID in `route_list` the rows of `df` with that ROUTEID are split
    70/30 into train and test sets (random_state=1). A RandomForestRegressor is fitted
    to predict JOURNEYTIME from all remaining columns except ROUTEID, DIRECTION and
    PLANNED_JT, and the fitted model is written to
    ``final_rfr_joblib/<line>_<direction>.pickle`` (bz2-compressed joblib dump).
    Train and test metrics are printed for each route.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ROUTEID, DIRECTION, JOURNEYTIME and PLANNED_JT; every other
        column is used as a model input.
    route_list : iterable of str
        Route IDs of the form ``"<line>_<suffix>"``. Routes with no rows in `df`
        are reported and skipped.
    cv_folds : int, optional
        When given, the mean absolute error from `cv_folds`-fold cross-validation
        over all rows of the route is also computed, printed and returned in a
        ``"<cv_folds>_fold_cv_MAE"`` column. Off by default because it refits the
        forest `cv_folds` more times per route.

    Returns
    -------
    pandas.DataFrame
        One row per modelled route with the test-set metrics of the model and the
        MAE / MAPE of the planned journey time against the actual journey time on
        the same test rows.

    Notes
    -----
    NOTE(review): the model file name uses only the line and the direction, so two
    routes in `route_list` sharing both would overwrite each other's file - confirm
    the route list holds at most one route per line and direction.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    model_dir = "final_rfr_joblib"
    # joblib.dump does not create missing directories.
    os.makedirs(model_dir, exist_ok=True)

    metric_columns = ["model_type", "data", "route", "MAE_test", "MSE_test", "RMSE_test", "MAPE_test", "R2_test", "MAE_actual_vs_planned", "MAPE_actual_vs_planned"]
    cv_column = "{}_fold_cv_MAE".format(cv_folds) if cv_folds else None
    if cv_column:
        # Place the cross-validation column directly after the test-set metrics.
        metric_columns.insert(metric_columns.index("MAE_actual_vs_planned"), cv_column)

    metric_rows = []
    for route in route_list:
        print("ROUTE", route)
        current = df.loc[df["ROUTEID"] == route]
        if current.empty:
            print("No rows found for route {!r}; skipping.".format(route))
            print("*************************")
            continue

        direction = int(current["DIRECTION"].iloc[0])
        line = route.split("_")[0]

        # 1-D target; the planned journey time is kept aside as the baseline and
        # never used as a model input.
        y = current["JOURNEYTIME"]
        planned_jt = current["PLANNED_JT"]
        X = current.drop(columns=["ROUTEID", "JOURNEYTIME", "DIRECTION", "PLANNED_JT"])

        # Splitting the three objects together keeps their rows aligned.
        X_train, X_test, y_train, y_test, _, planned_jt_test = train_test_split(
            X, y, planned_jt, test_size=0.3, random_state=1
        )

        # max_features=1.0 (all features) is what the removed "auto" option meant for regressors.
        rfr = RandomForestRegressor(n_estimators=100, max_features=1.0, oob_score=True, random_state=1)
        rfr.fit(X_train, y_train)

        compressed_filename = os.path.join(model_dir, "{}_{}.pickle".format(line, direction))
        joblib.dump(rfr, compressed_filename, compress=("bz2", 7))

        train_scores = _regression_metrics(y_train, rfr.predict(X_train))
        test_scores = _regression_metrics(y_test, rfr.predict(X_test))

        # Baseline: how far the timetable's planned journey time is from the actual one.
        mae_actual_vs_planned = metrics.mean_absolute_error(y_test, planned_jt_test)
        mape_actual_vs_planned = metrics.mean_absolute_percentage_error(y_test, planned_jt_test)

        new_row_model_metrics = {
            "model_type": "RFR",
            "data": "test",
            "route": route,
            "MAE_test": test_scores["MAE"],
            "MSE_test": test_scores["MSE"],
            "RMSE_test": test_scores["RMSE"],
            "MAPE_test": test_scores["MAPE"],
            "R2_test": test_scores["R2"],
            "MAE_actual_vs_planned": mae_actual_vs_planned,
            "MAPE_actual_vs_planned": mape_actual_vs_planned,
        }

        print("Number of entries:", current.shape[0], "\n")
        print("MAE train:", train_scores["MAE"])
        print("MSE train:", train_scores["MSE"])
        print("RMSE train:", train_scores["RMSE"])
        print("MAPE train", train_scores["MAPE"])
        print("R2 train:", train_scores["R2"], "\n")
        print("MAE test:", test_scores["MAE"])
        print("MSE test:", test_scores["MSE"])
        print("RMSE test:", test_scores["RMSE"])
        print("MAPE test", test_scores["MAPE"])
        print("R2 test:", test_scores["R2"], "\n")

        if cv_column:
            # cross_val_score clones the estimator, so the fitted model saved above is not altered.
            scores = cross_val_score(rfr, X, y, cv=cv_folds, scoring="neg_mean_absolute_error")
            cross_val_mean = abs(scores.mean())
            new_row_model_metrics[cv_column] = cross_val_mean
            print("Mean MAE from {}-fold cross-validation is:".format(cv_folds), cross_val_mean, "\n")

        print("MAE actual vs planned:", mae_actual_vs_planned)
        print("MAPE actual vs planned:", mape_actual_vs_planned)
        print("*************************")

        metric_rows.append(new_row_model_metrics)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(metric_rows, columns=metric_columns)

In [24]:
# Trial run on a single route using the trimmed feature set.
# NOTE(review): this rebinds `test`, which earlier held the feature-importance exploration frame.
test = RFR_prediction(final_trimmed, ["102_8"])

ROUTE 102_8
ROUTE 102_8
Number of entries: 9305 

MAE train: 148.84725303594163
MSE train: 39632.31797213215
RMSE train: 199.07867282090302
MAPE train 0.042072617219024805
R2 train: 0.8349864544466213 

MAE test: 280.76658518101607
MSE test: 134196.97180674144
RMSE test: 366.3290485434392
MAPE test 0.0795142427058627
R2 test: 0.45615927136368073 

MAE actual vs planned: 338.4297994269341
MAPE actual vs planned: 0.09286484894944758
*************************


In [25]:
# Metrics row returned for the single trial route.
test

Unnamed: 0,model_type,data,route,MAE_test,MSE_test,RMSE_test,MAPE_test,R2_test,MAE_actual_vs_planned,MAPE_actual_vs_planned
0,RFR,test,102_8,280.766585,134196.971807,366.329049,0.079514,0.456159,338.429799,0.092865


In [47]:
# Same single-route run, but on the full frame `df` (all features) rather than `final_trimmed`.
test_metrics = RFR_prediction(df, ["102_8"])

ROUTE 102_8
Number of entries: 9305 

MAE train: 148.68062463418153
MSE train: 39601.94978271723
RMSE train: 199.00238637442826
MAPE train 0.042046097242493856
R2 train: 0.8351128957668317 

MAE test: 281.5158629630577
MSE test: 134027.65270534586
RMSE test: 366.0978731232208
MAPE test 0.07977400003342455
R2 test: 0.4568454464854833 

Mean MAE from 5-fold cross-validation is: 276.65820823456795 

MAE actual vs planned: 338.4297994269341
MAPE actual vs planned: 0.09286484894944758
*************************


In [48]:
# Metrics row for the all-features run on the single trial route.
test_metrics

Unnamed: 0,model_type,data,route,MAE_test,MSE_test,RMSE_test,MAPE_test,R2_test,5_fold_cv_MAE,MAE_actual_vs_planned,MAPE_actual_vs_planned
0,RFR,test,102_8,281.515863,134027.652705,366.097873,0.079774,0.456845,276.658208,338.429799,0.092865


<p>Amongst the metrics returned is the MAE for actual versus planned journey times (<code>MAE_actual_vs_planned</code>), i.e. the mean absolute error between the actual journey times and the planned journey times. The Dublin Bus journey planner uses static timetables, which correspond to the planned journey times in the historical data. By comparing the MAE of our predictions with the MAE of actual versus planned, we can judge whether our model, and therefore our approach, outperforms the approach currently used by Dublin Bus. In the dataframe above, where we modelled a single test route, our MAE is lower than the actual-versus-planned MAE, so for this route the journey times predicted by our model are an improvement on those given by the current Dublin Bus journey planner.</p>

In [22]:
# Train, save and evaluate a model for every route in the list, using the trimmed feature set.
final_routes_metrics = RFR_prediction(final_trimmed, most_common_subroutes)

ROUTE 102_8
Number of entries: 9305 

MAE train: 148.84725303594163
MSE train: 39632.31797213215
RMSE train: 199.07867282090302
MAPE train 0.042072617219024805
R2 train: 0.8349864544466213 

MAE test: 280.76658518101607
MSE test: 134196.97180674144
RMSE test: 366.3290485434392
MAPE test 0.0795142427058627
R2 test: 0.45615927136368073 

Mean MAE from 5-fold cross-validation is: 277.8607715357691 

MAE actual vs planned: 338.4297994269341
MAPE actual vs planned: 0.09286484894944758
*************************
ROUTE 102_9
Number of entries: 9052 

MAE train: 141.68084846272203
MSE train: 35834.98936528083
RMSE train: 189.3013189739597
MAPE train 0.04171906143416166
R2 train: 0.8548375759748311 

MAE test: 279.48433778227786
MSE test: 126280.93433409756
RMSE test: 355.3602880656441
MAPE test 0.08322889664620459
R2 test: 0.4959747119476947 

Mean MAE from 5-fold cross-validation is: 268.3090556176406 

MAE actual vs planned: 332.4337260677467
MAPE actual vs planned: 0.09567262820553511
******

ROUTE 123_34
Number of entries: 23741 

MAE train: 245.1448520431605
MSE train: 99615.92912844948
RMSE train: 315.61991243970886
MAPE train 0.07000638957255634
R2 train: 0.7365970435553151 

MAE test: 348.5040769149521
MSE test: 197909.3666869363
RMSE test: 444.8700559567213
MAPE test 0.09976236509307838
R2 test: 0.47544948443846025 

Mean MAE from 5-fold cross-validation is: 349.11558028169367 

MAE actual vs planned: 377.41162431559735
MAPE actual vs planned: 0.10339912099109876
*************************
ROUTE 123_36
Number of entries: 23986 

MAE train: 237.86615063297083
MSE train: 94720.54623811701
RMSE train: 307.767032409446
MAPE train 0.06656028644610334
R2 train: 0.7318191596938339 

MAE test: 339.00615123398006
MSE test: 188687.88173094107
RMSE test: 434.38218394743245
MAPE test 0.09558268471261527
R2 test: 0.47347727396660844 

Mean MAE from 5-fold cross-validation is: 344.26492327247854 

MAE actual vs planned: 398.7688993885492
MAPE actual vs planned: 0.10680723757005535
*

ROUTE 150_9
Number of entries: 13678 

MAE train: 142.78150307557786
MSE train: 34867.27096710427
RMSE train: 186.72779912777924
MAPE train 0.06011016286150925
R2 train: 0.793430285044521 

MAE test: 248.35207811631457
MSE test: 99584.32674264855
RMSE test: 315.5698444760661
MAPE test 0.10506972919249344
R2 test: 0.3957942155966505 

Mean MAE from 5-fold cross-validation is: 240.5191047580669 

MAE actual vs planned: 366.6985867446394
MAPE actual vs planned: 0.14454944656519145
*************************
ROUTE 151_15
Number of entries: 12048 

MAE train: 196.10370987600686
MSE train: 64640.00819803932
RMSE train: 254.2439934355172
MAPE train 0.04898986624677489
R2 train: 0.8162923059057743 

MAE test: 323.76783322026824
MSE test: 174934.81326848356
RMSE test: 418.25209296366177
MAPE test 0.08088498524355733
R2 test: 0.4958722159772412 

Mean MAE from 5-fold cross-validation is: 323.16259975225495 

MAE actual vs planned: 397.9585062240664
MAPE actual vs planned: 0.09525522682539833
****

ROUTE 16_24
Number of entries: 15471 

MAE train: 287.5801699638025
MSE train: 138710.86069139634
RMSE train: 372.4390697703402
MAPE train 0.051765462048596056
R2 train: 0.7713298910834426 

MAE test: 459.6118711044748
MSE test: 336239.1071674498
RMSE test: 579.86128269393
MAPE test 0.08378967232023381
R2 test: 0.4748554577377746 

Mean MAE from 5-fold cross-validation is: 459.9489864629342 

MAE actual vs planned: 573.3250753985351
MAPE actual vs planned: 0.0993053632980019
*************************
ROUTE 17A_15
Number of entries: 9842 

MAE train: 242.0586277519194
MSE train: 97909.92148358129
RMSE train: 312.90561114109363
MAPE train 0.05364096574348468
R2 train: 0.736898781810531 

MAE test: 389.3965247698552
MSE test: 243279.59311935483
RMSE test: 493.2338118168247
MAPE test 0.08735888896592514
R2 test: 0.33539562787658483 

Mean MAE from 5-fold cross-validation is: 374.25982512020573 

MAE actual vs planned: 435.91398577717575
MAPE actual vs planned: 0.09403362753714868
*********

ROUTE 238_11
Number of entries: 4546 

MAE train: 55.422834695160276
MSE train: 5321.405072344438
RMSE train: 72.94796139951025
MAPE train 0.03319046785731443
R2 train: 0.9250363871438589 

MAE test: 151.72469941348973
MSE test: 38853.9702648827
RMSE test: 197.11410468275145
MAPE test 0.09190015577171107
R2 test: 0.45402798633937136 

Mean MAE from 5-fold cross-validation is: 150.0516654698437 

MAE actual vs planned: 225.983137829912
MAPE actual vs planned: 0.1280996057427238
*************************
ROUTE 238_15
Number of entries: 4483 

MAE train: 53.46474187380497
MSE train: 4942.106564818356
RMSE train: 70.30011781511007
MAPE train 0.03377876878545511
R2 train: 0.9148414033618087 

MAE test: 147.27453531598513
MSE test: 36410.45710089219
RMSE test: 190.81524336617395
MAPE test 0.09286089723465644
R2 test: 0.40508646782049074 

Mean MAE from 5-fold cross-validation is: 146.29355041109255 

MAE actual vs planned: 289.48624535315986
MAPE actual vs planned: 0.16895184098759752
******

ROUTE 270_44
Number of entries: 5702 

MAE train: 73.98071160110248
MSE train: 8379.74606474568
RMSE train: 91.54095293771897
MAPE train 0.05288285793689077
R2 train: 0.8846622974237253 

MAE test: 205.71230859146698
MSE test: 61849.867744827585
RMSE test: 248.6963364121546
MAPE test 0.14568836572016977
R2 test: 0.11152692076411241 

Mean MAE from 5-fold cross-validation is: 204.39379048849116 

MAE actual vs planned: 251.23495032144945
MAPE actual vs planned: 0.16084916289915815
*************************
ROUTE 27A_4
Number of entries: 9259 

MAE train: 83.8269834148904
MSE train: 12945.482466681262
RMSE train: 113.77821613420234
MAPE train 0.04475901625900146
R2 train: 0.8893556619770082 

MAE test: 171.20054252802635
MSE test: 49754.80009237556
RMSE test: 223.05784023964628
MAPE test 0.09262966279756897
R2 test: 0.5745306466284696 

Mean MAE from 5-fold cross-validation is: 166.58686774404674 

MAE actual vs planned: 194.48596112311014
MAPE actual vs planned: 0.10172093120036828
****

ROUTE 31_15
Number of entries: 11020 

MAE train: 141.4507586031993
MSE train: 34078.046599833935
RMSE train: 184.6024013923815
MAPE train 0.050417323331303145
R2 train: 0.848126066763351 

MAE test: 268.2152142321364
MSE test: 114270.46504451585
RMSE test: 338.0391472071184
MAPE test 0.09521809033166896
R2 test: 0.4901396636504287 

Mean MAE from 5-fold cross-validation is: 259.6984572890233 

MAE actual vs planned: 310.2882637628554
MAPE actual vs planned: 0.10541122221752292
*************************
ROUTE 31_18
Number of entries: 11632 

MAE train: 125.01859886677774
MSE train: 27470.77038040844
RMSE train: 165.74308546786634
MAPE train 0.05106626045276074
R2 train: 0.8497806498854201 

MAE test: 242.26138068517764
MSE test: 96516.76131487534
RMSE test: 310.67146845965004
MAPE test 0.09874763167825439
R2 test: 0.4680259819178997 

Mean MAE from 5-fold cross-validation is: 225.07557644307244 

MAE actual vs planned: 266.52234957020056
MAPE actual vs planned: 0.104799173206255
******

ROUTE 37_14
Number of entries: 15959 

MAE train: 243.44934157035436
MSE train: 104467.38450604351
RMSE train: 323.2141465128708
MAPE train 0.05892799795414371
R2 train: 0.8082378174358212 

MAE test: 398.7976458653026
MSE test: 269157.65713319415
RMSE test: 518.804064299032
MAPE test 0.09837581472083377
R2 test: 0.494883270037647 

Mean MAE from 5-fold cross-validation is: 395.42288296900875 

MAE actual vs planned: 490.8801169590643
MAPE actual vs planned: 0.11292794153590092
*************************
ROUTE 37_15
Number of entries: 8079 

MAE train: 173.27818832727596
MSE train: 54822.105167542424
RMSE train: 234.1412077519513
MAPE train 0.0447224595222901
R2 train: 0.8157498201380037 

MAE test: 317.4486299194022
MSE test: 164246.22621791656
RMSE test: 405.2730267584022
MAPE test 0.08292736339530848
R2 test: 0.4564119627231932 

Mean MAE from 5-fold cross-validation is: 308.3417255972055 

MAE actual vs planned: 381.41584158415844
MAPE actual vs planned: 0.0961564430188021
*********

ROUTE 40B_65
Number of entries: 1846 

MAE train: 78.44771671826624
MSE train: 10509.729244349846
RMSE train: 102.51697051878702
MAPE train 0.029957751906329134
R2 train: 0.9256693936285652 

MAE test: 213.43281588447653
MSE test: 72233.21035740072
RMSE test: 268.7623678222097
MAPE test 0.08195359912786657
R2 test: 0.4918015825750317 

Mean MAE from 5-fold cross-validation is: 216.55258781220246 

MAE actual vs planned: 299.6046931407942
MAPE actual vs planned: 0.11051127163919734
*************************
ROUTE 40D_102
Number of entries: 8140 

MAE train: 157.3445430725763
MSE train: 42133.426370387664
RMSE train: 205.26428420547902
MAPE train 0.05178286804696513
R2 train: 0.7336628197793735 

MAE test: 256.9721995506862
MSE test: 108056.30493140011
RMSE test: 328.7191885658641
MAPE test 0.0856675422919472
R2 test: 0.30650676179170355 

Mean MAE from 5-fold cross-validation is: 258.98929163675865 

MAE actual vs planned: 370.78746928746926
MAPE actual vs planned: 0.11629489734212713
*

ROUTE 41_7
Number of entries: 15392 

MAE train: 213.21308319196893
MSE train: 79666.16718809368
RMSE train: 282.25195692518
MAPE train 0.05788306232293823
R2 train: 0.7950509619683384 

MAE test: 366.40628866838375
MSE test: 222203.17379069174
RMSE test: 471.3843164453944
MAPE test 0.10006537015204241
R2 test: 0.4342780534980041 

Mean MAE from 5-fold cross-validation is: 340.9485573460067 

MAE actual vs planned: 426.09138155045474
MAPE actual vs planned: 0.11127685587836492
*************************
ROUTE 42D_50
Number of entries: 171 

MAE train: 73.39344537815127
MSE train: 8264.991094117651
RMSE train: 90.91199642576139
MAPE train 0.024170851126358486
R2 train: 0.8801885559708663 

MAE test: 189.7457692307692
MSE test: 58430.94811923076
RMSE test: 241.72494310523842
MAPE test 0.06042199751960553
R2 test: 0.02120483000603457 

Mean MAE from 5-fold cross-validation is: 204.4981176470588 

MAE actual vs planned: 470.6730769230769
MAPE actual vs planned: 0.15251030098527876
*********

ROUTE 47_139
Number of entries: 5227 

MAE train: 140.96022814314352
MSE train: 39779.79004414992
RMSE train: 199.44871532338814
MAPE train 0.04069484019675303
R2 train: 0.9433121441539648 

MAE test: 325.3142196424778
MSE test: 190465.40669438473
RMSE test: 436.42342592301884
MAPE test 0.09663721511072802
R2 test: 0.7120985040321772 

Mean MAE from 5-fold cross-validation is: 332.4199133994824 

MAE actual vs planned: 394.17335882727855
MAPE actual vs planned: 0.11704889665548783
*************************
ROUTE 49_27
Number of entries: 10849 

MAE train: 157.53979061684663
MSE train: 45353.47317272787
RMSE train: 212.9635489296886
MAPE train 0.04810448145218328
R2 train: 0.9003934336307166 

MAE test: 289.03610153236417
MSE test: 143733.32112315702
RMSE test: 379.1217761131073
MAPE test 0.09009202282360661
R2 test: 0.6641378679674192 

Mean MAE from 5-fold cross-validation is: 291.40595589161745 

MAE actual vs planned: 347.5321044546851
MAPE actual vs planned: 0.10379150108156703
***

ROUTE 61_108
Number of entries: 5266 

MAE train: 81.52493488876833
MSE train: 11606.736761448725
RMSE train: 107.73456623316736
MAPE train 0.029062778060768138
R2 train: 0.9598117842607389 

MAE test: 219.00096202531645
MSE test: 82753.96712898734
RMSE test: 287.66989263561686
MAPE test 0.07804113834613376
R2 test: 0.6946060974231282 

Mean MAE from 5-fold cross-validation is: 228.02962743296015 

MAE actual vs planned: 325.2645569620253
MAPE actual vs planned: 0.11086803873950482
*************************
ROUTE 63_26
Number of entries: 5510 

MAE train: 167.5290500903326
MSE train: 52102.57591159088
RMSE train: 228.2598867773111
MAPE train 0.0557515231486617
R2 train: 0.7800728861708153 

MAE test: 345.72345265702563
MSE test: 198771.9354405814
RMSE test: 445.83846339294394
MAPE test 0.11450178000628464
R2 test: 0.22315510891819723 

Mean MAE from 5-fold cross-validation is: 330.2722837279981 

MAE actual vs planned: 435.82214156079857
MAPE actual vs planned: 0.13426757007686427
****

ROUTE 67_4
Number of entries: 11638 

MAE train: 195.0829774854752
MSE train: 68211.89244822836
RMSE train: 261.1740654204172
MAPE train 0.04526845734941836
R2 train: 0.8537686955635875 

MAE test: 364.9593650596333
MSE test: 221661.65835878576
RMSE test: 470.8095775988269
MAPE test 0.08486551494380458
R2 test: 0.5270975866690216 

Mean MAE from 5-fold cross-validation is: 358.1392742708409 

MAE actual vs planned: 451.4455899198167
MAPE actual vs planned: 0.0997586439070851
*************************
ROUTE 67_6
Number of entries: 12355 

MAE train: 174.66396124791092
MSE train: 55903.90819090794
RMSE train: 236.44007314943028
MAPE train 0.04215981803200489
R2 train: 0.8830866759503583 

MAE test: 328.1224602971213
MSE test: 191603.24931840348
RMSE test: 437.72508417773304
MAPE test 0.07883419494475623
R2 test: 0.5927612805094766 

Mean MAE from 5-fold cross-validation is: 325.67205942070876 

MAE actual vs planned: 505.08902077151333
MAPE actual vs planned: 0.11466416335926742
********

ROUTE 76A_28
Number of entries: 723 

MAE train: 203.7180830039526
MSE train: 69183.50583458497
RMSE train: 263.02757618657586
MAPE train 0.04464288106596955
R2 train: 0.8894692000722663 

MAE test: 502.22327188940096
MSE test: 413677.06662903226
RMSE test: 643.177321295638
MAPE test 0.10922814227815375
R2 test: 0.2504863460011172 

Mean MAE from 5-fold cross-validation is: 531.2975070881226 

MAE actual vs planned: 824.557603686636
MAPE actual vs planned: 0.16189436944517957
*************************
ROUTE 76A_29
Number of entries: 720 

MAE train: 147.43176587301588
MSE train: 40915.39649464286
RMSE train: 202.27554596303247
MAPE train 0.033959428968161336
R2 train: 0.9443383719678244 

MAE test: 381.09018518518525
MSE test: 257788.18569166673
RMSE test: 507.7284566494838
MAPE test 0.08826178968706783
R2 test: 0.665390899800566 

Mean MAE from 5-fold cross-validation is: 385.673125 

MAE actual vs planned: 563.2361111111111
MAPE actual vs planned: 0.1229892262746408
*****************

ROUTE 7_70
Number of entries: 6569 

MAE train: 234.22533473145674
MSE train: 94793.12130893426
RMSE train: 307.8849156891813
MAPE train 0.05342785126955153
R2 train: 0.7621441751670833 

MAE test: 470.84268346379645
MSE test: 361868.88757717237
RMSE test: 601.5553902818696
MAPE test 0.10598096421816144
R2 test: 0.11751960907128356 

Mean MAE from 5-fold cross-validation is: 456.6020120100775 

MAE actual vs planned: 594.1887366818873
MAPE actual vs planned: 0.12595246025488063
*************************
ROUTE 83_16
Number of entries: 14896 

MAE train: 277.46084253925
MSE train: 136156.86400804526
RMSE train: 368.99439563229856
MAPE train 0.05622590378176882
R2 train: 0.8046294541120926 

MAE test: 437.98556070119554
MSE test: 321597.6953205631
RMSE test: 567.0958431522515
MAPE test 0.08929944513251858
R2 test: 0.5426374648600734 

Mean MAE from 5-fold cross-validation is: 449.6280455491395 

MAE actual vs planned: 509.5867084358917
MAPE actual vs planned: 0.10006621662420201
*********

In [27]:
# Write the per-route metrics to CSV (without the index) so they can be reloaded without retraining.
final_routes_metrics.to_csv("rfr_final_metrics.csv", index=False)

In [26]:
# Reload the saved per-route metrics.
# NOTE(review): the cells below read a `5_fold_cv_MAE` column, which RFR_prediction as called
# above does not produce; they rely on a CSV saved by a run that included cross-validation.
final_routes_metrics = pd.read_csv("rfr_final_metrics.csv")

In [28]:
# Largest per-route MAE for predictions vs actual, taken from the 5-fold cross-validation column
final_routes_metrics["5_fold_cv_MAE"].max()

692.3713367147424

In [29]:
# Smallest per-route MAE for predictions vs actual, taken from the 5-fold cross-validation column
final_routes_metrics["5_fold_cv_MAE"].min()

96.0856314070998

In [30]:
# Unweighted mean across routes of the MAE for predictions vs actual (5-fold cross-validation column)
final_routes_metrics["5_fold_cv_MAE"].mean()

319.8247101675142

In [32]:
# Largest per-route MAE of the planned journey time against the actual journey time
final_routes_metrics["MAE_actual_vs_planned"].max()

1606.6205737916775

In [33]:
# Smallest per-route MAE of the planned journey time against the actual journey time
final_routes_metrics["MAE_actual_vs_planned"].min()

121.70691333982474

In [34]:
# Unweighted mean across routes of the MAE of planned against actual journey time
final_routes_metrics["MAE_actual_vs_planned"].mean()

462.2779995944215

<p>Above we can see the min, max and mean MAE for predicted vs actual and for planned vs actual. The same input features were used when modelling every route, and further improvement is likely if the input features were adjusted on a route-by-route basis. Even so, on average our predicted journey times are already more accurate than the Dublin Bus planned journey times (a mean MAE of roughly 320 versus 462), so overall our models are already a success.</p>

In [35]:
# Routes where the planned journey time has a lower MAE than our model's cross-validated MAE.
planned_beats_model = final_routes_metrics["MAE_actual_vs_planned"] < final_routes_metrics["5_fold_cv_MAE"]
final_routes_metrics.loc[planned_beats_model]

Unnamed: 0,model_type,data,route,MAE_test,MSE_test,RMSE_test,MAPE_test,R2_test,5_fold_cv_MAE,MAE_actual_vs_planned,MAPE_actual_vs_planned
101,RFR,test,31D_51,325.155094,167420.291015,409.170247,0.142171,0.020935,381.957689,377.09434,0.168076
132,RFR,test,39X_3,451.887513,309325.483154,556.170372,0.108687,-0.169306,424.751329,378.861925,0.088042
148,RFR,test,41D_26,460.755,214026.14605,462.629599,0.128751,-0.451458,561.08,384.0,0.116972
216,RFR,test,70D_71,277.785,132942.578906,364.612917,0.084181,0.166327,301.087529,284.648148,0.08247


<p>These routes will be excluded, and Google Maps predictions, which use Dublin Bus static GTFS information, will be used for them instead, unless their predictions can be improved with neural network modelling.</p>