In [51]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.impute import KNNImputer
import datetime as dt
import sqlite3

import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from patsy import dmatrices

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
# import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler  
from sklearn.tree import export_graphviz
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline, make_pipeline  
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import KFold
import joblib
from joblib import dump, load
import pickle
import bz2

import requests
import traceback
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Float, String, DateTime

# import graphviz
# from graphviz import Source

# hide ipykernel warnings 
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Read the subroutes we will model: one route id per line, surrounding whitespace removed.
with open("most_common_subroutes.txt") as file:
    most_common_subroutes = [line.strip() for line in file]

In [3]:
# Sanity check: show the first ten route ids that were read.
print(most_common_subroutes[:10])

['102_8', '102_9', '104_15', '104_16', '111_7', '111_8', '114_5', '114_6', '116_1', '116_3']


In [4]:
# Load the trips export; the file is semicolon-delimited.
trips = pd.read_csv("rt_trips_DB_2018.txt", sep=";")

In [5]:
# Dimensions (rows, columns) of the raw trips table.
trips.shape

(2182637, 16)

In [6]:
# Find overall most common routes to use for neural network experimentation.
# Count every ROUTEID once, then look up the routes of interest; this avoids one
# full scan of `trips` per route and the row-by-row DataFrame.append (removed in
# pandas 2.0). Routes that never occur in `trips` get a count of 0.
route_counts = trips["ROUTEID"].value_counts()
entries_per_trip = pd.DataFrame(
    {
        "ROUTEID": most_common_subroutes,
        "ENTRIES": [int(route_counts.get(route, 0)) for route in most_common_subroutes],
    },
    columns=["ROUTEID", "ENTRIES"],
)

In [9]:
# Order routes from most to fewest recorded trips and preview the busiest ones.
entries_per_trip = entries_per_trip.sort_values(by="ENTRIES", ascending=False)
entries_per_trip.head()

Unnamed: 0,ROUTEID,ENTRIES
167,46A_74,37182
166,46A_67,35759
27,145_102,29259
44,15_17,28471
28,145_105,27964


In [10]:
# Route ids in descending order of trip count.
entries_per_trip_list = entries_per_trip["ROUTEID"].unique().tolist()
print(entries_per_trip_list[:10])

['46A_74', '46A_67', '145_102', '15_17', '145_105', '40_31', '39A_43', '130_11', '40_27', '130_10']


In [13]:
# Keep every route except the last three rows of the sorted table (the three with the fewest trips).
routes_for_ann = entries_per_trip.head(-3)

In [14]:
# Route ids that will be used for the neural-network experiments.
routes_for_ann_list = routes_for_ann["ROUTEID"].unique().tolist()
print(routes_for_ann_list[:10])

['46A_74', '46A_67', '145_102', '15_17', '145_105', '40_31', '39A_43', '130_11', '40_27', '130_10']


In [15]:
# Load the pre-processed feature table (presumably imputed with outliers removed, going by the file name).
df = pd.read_csv("missforest_imputed_outliers_removed_all_features.csv")

In [16]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,ROUTEID,DIRECTION,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,...,HOUR_20,HOUR_21,HOUR_22,HOUR_23,weekend_true_1,peak_true_1,HOUR_1,HOUR_2,JOURNEYTIME,PLANNED_JT
0,102_8,1,9.81,5.49,8.69,10.0,1016,100,6.17,240,...,0,0,0,0,1,0,0,0,3391,3420
1,102_8,1,9.26,6.32,8.24,10.0,1016,93,3.6,220,...,0,0,0,0,1,0,0,0,3659,3420
2,102_8,1,9.76,5.5,8.43,10.0,1016,93,5.66,240,...,0,0,0,0,1,0,0,0,3490,3420
3,102_8,1,19.39,17.97,16.71,21.39,1027,64,3.09,100,...,0,0,0,0,0,0,0,0,3800,3738
4,102_8,1,13.93,14.14,13.0,15.0,1026,94,1.03,0,...,0,0,0,0,0,1,0,0,3582,3294


In [17]:
# Allow up to 70 rows to be displayed so the column dtype listing is not truncated.
pd.options.display.max_rows = 70
df.dtypes

ROUTEID                  object
DIRECTION                 int64
temp                    float64
feels_like              float64
temp_min                float64
temp_max                float64
pressure                  int64
humidity                  int64
wind_speed              float64
wind_deg                  int64
clouds_all                int64
weather_main_Clouds       int64
weather_main_Drizzle      int64
weather_main_Fog          int64
weather_main_Mist         int64
weather_main_Rain         int64
weather_main_Smoke        int64
weather_main_Snow         int64
precipitation_true_1      int64
MONTH_1                   int64
MONTH_2                   int64
MONTH_3                   int64
MONTH_4                   int64
MONTH_5                   int64
MONTH_6                   int64
MONTH_7                   int64
MONTH_8                   int64
MONTH_9                   int64
MONTH_10                  int64
MONTH_11                  int64
WEEKDAY_1                 int64
WEEKDAY_

In [18]:
# Columns stored as 64-bit integers.
int_features = ["pressure", "humidity", "wind_deg", "clouds_all", "JOURNEYTIME", "PLANNED_JT"]

# 0/1 indicator columns, stored as uint8 to keep the frame small.
cat_features = (
    ["weather_main_" + kind for kind in ("Clouds", "Drizzle", "Fog", "Mist", "Rain", "Smoke", "Snow")]
    + ["MONTH_{}".format(month) for month in range(1, 12)]
    + ["WEEKDAY_{}".format(day) for day in range(1, 7)]
    + ["HOUR_{}".format(hour) for hour in range(1, 24)]
    + ["precipitation_true_1", "weekend_true_1", "peak_true_1"]
)

# Apply both casts in one pass; columns not named here keep their dtype.
df = df.astype({**dict.fromkeys(int_features, "int64"), **dict.fromkeys(cat_features, "uint8")})

In [19]:
# Work on a copy so the typed feature table in `df` stays untouched.
# NOTE(review): the name `copy` would shadow the stdlib `copy` module if it were ever imported.
copy = df.copy()

In [20]:
# List the columns available before feature selection.
copy.columns

Index(['ROUTEID', 'DIRECTION', 'temp', 'feels_like', 'temp_min', 'temp_max',
       'pressure', 'humidity', 'wind_speed', 'wind_deg', 'clouds_all',
       'weather_main_Clouds', 'weather_main_Drizzle', 'weather_main_Fog',
       'weather_main_Mist', 'weather_main_Rain', 'weather_main_Smoke',
       'weather_main_Snow', 'precipitation_true_1', 'MONTH_1', 'MONTH_2',
       'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8',
       'MONTH_9', 'MONTH_10', 'MONTH_11', 'WEEKDAY_1', 'WEEKDAY_2',
       'WEEKDAY_3', 'WEEKDAY_4', 'WEEKDAY_5', 'WEEKDAY_6', 'HOUR_3', 'HOUR_4',
       'HOUR_5', 'HOUR_6', 'HOUR_7', 'HOUR_8', 'HOUR_9', 'HOUR_10', 'HOUR_11',
       'HOUR_12', 'HOUR_13', 'HOUR_14', 'HOUR_15', 'HOUR_16', 'HOUR_17',
       'HOUR_18', 'HOUR_19', 'HOUR_20', 'HOUR_21', 'HOUR_22', 'HOUR_23',
       'weekend_true_1', 'peak_true_1', 'HOUR_1', 'HOUR_2', 'JOURNEYTIME',
       'PLANNED_JT'],
      dtype='object')

In [21]:
# Binary rain flag: 1 where the weather category is Rain, 0 otherwise.
copy["rain_true_1"] = (copy["weather_main_Rain"] == 1).astype(int)

In [22]:
# Drop the weather columns that are not used as model inputs. `columns=` replaces the
# positional axis argument, which pandas >= 2.0 no longer accepts.
final_trimmed = copy.drop(
    columns=[
        "feels_like", "temp_min", "temp_max", "pressure", "clouds_all",
        "weather_main_Clouds", "weather_main_Drizzle", "weather_main_Fog",
        "weather_main_Mist", "weather_main_Rain", "weather_main_Smoke",
        "weather_main_Snow", "precipitation_true_1",
    ]
)

In [23]:
# Confirm which columns remain after trimming.
final_trimmed.columns

Index(['ROUTEID', 'DIRECTION', 'temp', 'humidity', 'wind_speed', 'wind_deg',
       'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6',
       'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10', 'MONTH_11', 'WEEKDAY_1',
       'WEEKDAY_2', 'WEEKDAY_3', 'WEEKDAY_4', 'WEEKDAY_5', 'WEEKDAY_6',
       'HOUR_3', 'HOUR_4', 'HOUR_5', 'HOUR_6', 'HOUR_7', 'HOUR_8', 'HOUR_9',
       'HOUR_10', 'HOUR_11', 'HOUR_12', 'HOUR_13', 'HOUR_14', 'HOUR_15',
       'HOUR_16', 'HOUR_17', 'HOUR_18', 'HOUR_19', 'HOUR_20', 'HOUR_21',
       'HOUR_22', 'HOUR_23', 'weekend_true_1', 'peak_true_1', 'HOUR_1',
       'HOUR_2', 'JOURNEYTIME', 'PLANNED_JT', 'rain_true_1'],
      dtype='object')

In [107]:
# Dimensions (rows, columns) of the trimmed modelling table.
final_trimmed.shape

(1844270, 51)

In [24]:
# Preview the trimmed modelling table.
final_trimmed.head()

Unnamed: 0,ROUTEID,DIRECTION,temp,humidity,wind_speed,wind_deg,MONTH_1,MONTH_2,MONTH_3,MONTH_4,...,HOUR_21,HOUR_22,HOUR_23,weekend_true_1,peak_true_1,HOUR_1,HOUR_2,JOURNEYTIME,PLANNED_JT,rain_true_1
0,102_8,1,9.81,100,6.17,240,1,0,0,0,...,0,0,0,1,0,0,0,3391,3420,0
1,102_8,1,9.26,93,3.6,220,1,0,0,0,...,0,0,0,1,0,0,0,3659,3420,0
2,102_8,1,9.76,93,5.66,240,1,0,0,0,...,0,0,0,1,0,0,0,3490,3420,0
3,102_8,1,19.39,64,3.09,100,0,0,0,0,...,0,0,0,0,0,0,0,3800,3738,1
4,102_8,1,13.93,94,1.03,0,0,0,0,0,...,0,0,0,0,1,0,0,3582,3294,0


In [94]:
def MLP_param_halving_random(df, route_list, n_jobs=12):
    """Tune an MLP journey-time regressor per route with successive-halving random search.

    For each route the rows of ``df`` with that ``ROUTEID`` are selected,
    ``JOURNEYTIME`` is the target and every column except ``ROUTEID``,
    ``DIRECTION``, ``JOURNEYTIME`` and ``PLANNED_JT`` is a feature. A
    ``StandardScaler`` + ``MLPRegressor`` pipeline is tuned with
    ``HalvingRandomSearchCV`` scored on negative mean absolute error.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing at least ``ROUTEID``, ``DIRECTION``,
        ``JOURNEYTIME`` and ``PLANNED_JT``.
    route_list : iterable of str
        ROUTEID values to tune. Routes with no rows in ``df`` are reported and skipped.
    n_jobs : int, default 12
        Number of parallel jobs handed to ``HalvingRandomSearchCV``.

    Returns
    -------
    pandas.DataFrame
        One row per tuned route with columns ``route``, ``direction``,
        ``hl_size``, ``alpha``, ``mx_iter``, ``lr_init`` and ``MAE_cv``.
        ``mx_iter`` is not part of the search space, so it is always empty.
    """
    result_columns = ["route", "direction", "hl_size", "alpha", "mx_iter", "lr_init", "MAE_cv"]

    # The search space is the same for every route, so build it once.
    param_grid = {
        "regressor__hidden_layer_sizes": [(50,), (60,), (70,), (80,), (90,), (100,), (110,), (120,), (130,), (140,)],
        "regressor__alpha": [0.4, 0.5, 0.6, 0.7, 0.8],
        "regressor__learning_rate_init": [0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07],
    }

    # Collect plain dicts and build the frame once at the end; DataFrame.append was
    # removed in pandas 2.0 and copies the whole frame on every call.
    rows = []

    for route in route_list:
        print("Starting route", route)

        current = df.loc[df["ROUTEID"] == route]
        if current.empty:
            # Without this guard the DIRECTION lookup below raises IndexError.
            print("No rows found for route", route, "- skipping", "\n")
            continue

        direction = int(current["DIRECTION"].iloc[0])

        # drop() returns a new frame, so the slice of `df` is never mutated.
        X = current.drop(columns=["ROUTEID", "DIRECTION", "JOURNEYTIME", "PLANNED_JT"])
        y = current["JOURNEYTIME"].to_numpy()

        pipe = Pipeline([("scaler", StandardScaler()), ("regressor", MLPRegressor(solver="adam", activation="relu", random_state=1))])

        # NOTE(review): the search is fitted on every row of the route, including the rows
        # that MLP_prediction later holds out as its test split, so those test metrics are
        # not fully independent of the tuning.
        grid = HalvingRandomSearchCV(pipe, param_grid, scoring="neg_mean_absolute_error", random_state=1, n_jobs=n_jobs).fit(X, y)

        current_params = grid.best_params_
        best_mae = abs(grid.best_score_)  # scorer is negated MAE; report it as a positive error

        rows.append({
            "route": route,
            "direction": direction,
            "hl_size": current_params.get("regressor__hidden_layer_sizes"),
            "alpha": current_params.get("regressor__alpha"),
            "lr_init": current_params.get("regressor__learning_rate_init"),
            "MAE_cv": best_mae,
        })

        print("Cross-val best score:", best_mae)
        print(grid.best_params_, "\n")

    return pd.DataFrame(rows, columns=result_columns)
    

In [97]:
# Tune hyperparameters for every route selected for the neural-network experiments.
tuned_params = MLP_param_halving_random(final_trimmed, routes_for_ann_list)

Starting route 46A_74
Cross-val best score: 400.2708811745275
{'regressor__alpha': 0.4, 'regressor__hidden_layer_sizes': (110,), 'regressor__learning_rate_init': 0.05} 

Starting route 46A_67
Cross-val best score: 447.17252495110296
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (110,), 'regressor__learning_rate_init': 0.05} 

Starting route 145_102
Cross-val best score: 401.88317678888154
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (130,), 'regressor__learning_rate_init': 0.04} 

Starting route 15_17
Cross-val best score: 512.4584190641526
{'regressor__alpha': 0.7, 'regressor__hidden_layer_sizes': (90,), 'regressor__learning_rate_init': 0.05} 

Starting route 145_105
Cross-val best score: 488.5487718826915
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.04} 

Starting route 40_31
Cross-val best score: 455.9150990328062
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning

Cross-val best score: 312.82488845248054
{'regressor__alpha': 0.5, 'regressor__hidden_layer_sizes': (120,), 'regressor__learning_rate_init': 0.06} 

Starting route 39_21
Cross-val best score: 428.0046689208919
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.04} 

Starting route 77A_29
Cross-val best score: 427.2292292153696
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.06} 

Starting route 11_40
Cross-val best score: 542.6665095214464
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (90,), 'regressor__learning_rate_init': 0.06} 

Starting route 41C_78
Cross-val best score: 335.5242271261348
{'regressor__alpha': 0.7, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.05} 

Starting route 18_4
Cross-val best score: 380.655757312813
{'regressor__alpha': 0.7, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.04} 

Startin

Cross-val best score: 521.836493248097
{'regressor__alpha': 0.7, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.07} 

Starting route 17_10
Cross-val best score: 334.43319386646584
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (110,), 'regressor__learning_rate_init': 0.06} 

Starting route 184_29
Cross-val best score: 190.54862914345875
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.07} 

Starting route 32_58
Cross-val best score: 214.3844381350853
{'regressor__alpha': 0.5, 'regressor__hidden_layer_sizes': (130,), 'regressor__learning_rate_init': 0.07} 

Starting route 184_28
Cross-val best score: 241.27901278128328
{'regressor__alpha': 0.4, 'regressor__hidden_layer_sizes': (120,), 'regressor__learning_rate_init': 0.06} 

Starting route 32_57
Cross-val best score: 270.77093433236433
{'regressor__alpha': 0.5, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.06} 

St

Cross-val best score: 386.91782959971187
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.06} 

Starting route 104_16
Cross-val best score: 230.64961747208378
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (80,), 'regressor__learning_rate_init': 0.06} 

Starting route 104_15
Cross-val best score: 249.11801308710446
{'regressor__alpha': 0.5, 'regressor__hidden_layer_sizes': (100,), 'regressor__learning_rate_init': 0.05} 

Starting route 185_56
Cross-val best score: 101.91779030672976
{'regressor__alpha': 0.5, 'regressor__hidden_layer_sizes': (140,), 'regressor__learning_rate_init': 0.04} 

Starting route 69_47
Cross-val best score: 298.771572504697
{'regressor__alpha': 0.4, 'regressor__hidden_layer_sizes': (130,), 'regressor__learning_rate_init': 0.06} 

Starting route 239_28
Cross-val best score: 424.9991126082509
{'regressor__alpha': 0.7, 'regressor__hidden_layer_sizes': (90,), 'regressor__learning_rate_init': 0.07} 

St

Cross-val best score: 240.80628090184732
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (130,), 'regressor__learning_rate_init': 0.01} 

Starting route 68A_86
Cross-val best score: 270.39006432766917
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (60,), 'regressor__learning_rate_init': 0.05} 

Starting route 67X_43
Cross-val best score: 534.4949331547542
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (130,), 'regressor__learning_rate_init': 0.07} 

Starting route 76A_28
Cross-val best score: 594.3030911910039
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (110,), 'regressor__learning_rate_init': 0.05} 

Starting route 31B_46
Cross-val best score: 362.1698336127802
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (120,), 'regressor__learning_rate_init': 0.07} 

Starting route 76A_29
Cross-val best score: 495.114587616319
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (70,), 'regressor__learning_rate_init': 0.05} 

Sta

Cross-val best score: 319.69992408567555
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (70,), 'regressor__learning_rate_init': 0.05} 

Starting route 42D_50
Cross-val best score: 329.75080872063
{'regressor__alpha': 0.8, 'regressor__hidden_layer_sizes': (60,), 'regressor__learning_rate_init': 0.04} 

Starting route 31D_51
Cross-val best score: 398.78123395817147
{'regressor__alpha': 0.4, 'regressor__hidden_layer_sizes': (90,), 'regressor__learning_rate_init': 0.04} 

Starting route 16D_30
Cross-val best score: 1381.8444442525354
{'regressor__alpha': 0.6, 'regressor__hidden_layer_sizes': (110,), 'regressor__learning_rate_init': 0.04} 



In [98]:
# Persist the tuned hyperparameters. hl_size holds tuples, which are written as text
# such as "(110,)" and therefore come back as strings when the CSV is read again.
tuned_params.to_csv("mlp_tuned_params.csv", index=False)

In [99]:
# Preview the tuned hyperparameters.
tuned_params.head()

Unnamed: 0,route,direction,hl_size,alpha,mx_iter,lr_init,MAE_cv
0,46A_74,1,"(110,)",0.4,,0.05,400.270881
1,46A_67,2,"(110,)",0.8,,0.05,447.172525
2,145_102,1,"(130,)",0.8,,0.04,401.883177
3,15_17,2,"(90,)",0.7,,0.05,512.458419
4,145_105,2,"(140,)",0.6,,0.04,488.548772


In [103]:
# Function to perform mlp regression, dump regressors to pickle files and return a dataframe of metrics for each regressor
def MLP_prediction(df, route_list, lrt, hls, al):
    """Fit, evaluate and persist one MLP journey-time regressor per route.

    For each route the rows of ``df`` with that ``ROUTEID`` are split 70/30
    (``random_state=1``). A ``StandardScaler`` + ``MLPRegressor`` pipeline is
    scored with 5-fold cross-validation on all rows of the route, fitted on
    the training split, evaluated on both splits and written to
    ``MLP_compressed_pickles_joblib/<line>_<direction>.pickle``.
    ``PLANNED_JT`` is never a model input; it is only compared against the
    actual journey time on the test split as a baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing at least ``ROUTEID``, ``DIRECTION``,
        ``JOURNEYTIME`` and ``PLANNED_JT``.
    route_list : iterable of str
        ROUTEID values to model. Routes with no rows in ``df`` are reported and skipped.
    lrt : float
        ``learning_rate_init`` for the regressor.
    hls : tuple of int or str
        ``hidden_layer_sizes`` for the regressor. The string form produced by a
        CSV round-trip, e.g. ``"(110,)"``, is also accepted.
    al : float
        ``alpha`` (L2 penalty) for the regressor.

    Returns
    -------
    pandas.DataFrame
        One row of test-set metrics per modelled route.
    """
    import os  # only needed to make sure the output directory exists

    metric_columns = ["model_type", "data", "route", "MAE_test", "MSE_test", "RMSE_test", "MAPE_test", "R2_test", "5_fold_cv_MAE", "MAE_actual_vs_planned", "MAPE_actual_vs_planned"]

    if isinstance(hls, str):
        # "(110,)" -> (110,), "(50, 50)" -> (50, 50)
        hls = tuple(int(part) for part in hls.strip("() ").split(",") if part.strip())

    # Collect plain dicts and build the frame once at the end; DataFrame.append was
    # removed in pandas 2.0.
    records = []

    for route in route_list:
        print("Route", route)
        current = df.loc[df["ROUTEID"] == route]
        if current.empty:
            # Without this guard the DIRECTION lookup below raises IndexError.
            print("No rows found for route", route, "- skipping")
            continue

        direction = int(current["DIRECTION"].iloc[0])
        line = route.split("_")[0]
        print("Direction", direction)

        # A 1-D target avoids the column-vector conversion warning from the regressor.
        y = current["JOURNEYTIME"]
        X = current.drop(columns=["ROUTEID", "JOURNEYTIME", "DIRECTION"])

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

        # Keep the timetable value for the baseline comparison, then remove it from the inputs.
        df_PLANNED_JT = X_test["PLANNED_JT"]
        X_train = X_train.drop(columns=["PLANNED_JT"])
        X_test = X_test.drop(columns=["PLANNED_JT"])

        X_for_cross_val = X.drop(columns=["PLANNED_JT"])

        pipe = make_pipeline(StandardScaler(), MLPRegressor(solver="adam", activation="relu", random_state=1, learning_rate_init=lrt, hidden_layer_sizes=hls, alpha=al))

        cv = KFold()
        all_cv_scores = cross_val_score(pipe, X_for_cross_val, y, cv=cv, scoring="neg_mean_absolute_error")
        mean_cv_score = abs(all_cv_scores.mean())

        pipe.fit(X_train, y_train)
        mlp_predictions_train = pipe.predict(X_train)
        mlp_predictions_test = pipe.predict(X_test)

        # NOTE(review): the file name uses only line and direction, so two subroutes of the
        # same line and direction would overwrite each other's model.
        compressed_filename = "MLP_compressed_pickles_joblib/{}_{}.pickle".format(line, direction)
        # Create the target directory up front so the dump cannot fail after the expensive fit.
        os.makedirs(os.path.dirname(compressed_filename), exist_ok=True)

        joblib.dump(pipe, compressed_filename, compress=("bz2", 7))

        mae_actual_vs_planned = metrics.mean_absolute_error(y_test, df_PLANNED_JT)
        mape_actual_vs_planned = metrics.mean_absolute_percentage_error(y_test, df_PLANNED_JT)

        mae_train = metrics.mean_absolute_error(y_train, mlp_predictions_train)
        mse_train = metrics.mean_squared_error(y_train, mlp_predictions_train)
        rmse_train = mse_train**0.5
        r2_train = metrics.r2_score(y_train, mlp_predictions_train)
        mape_train = metrics.mean_absolute_percentage_error(y_train, mlp_predictions_train)

        mae_test = metrics.mean_absolute_error(y_test, mlp_predictions_test)
        mse_test = metrics.mean_squared_error(y_test, mlp_predictions_test)
        rmse_test = mse_test**0.5
        r2_test = metrics.r2_score(y_test, mlp_predictions_test)
        mape_test = metrics.mean_absolute_percentage_error(y_test, mlp_predictions_test)

        records.append({"model_type":"MLP", "data":"test", "route":route, "MAE_test":mae_test, "MSE_test":mse_test, "RMSE_test":rmse_test, "MAPE_test":mape_test, "R2_test":r2_test, "5_fold_cv_MAE":mean_cv_score, "MAE_actual_vs_planned":mae_actual_vs_planned, "MAPE_actual_vs_planned":mape_actual_vs_planned})

        print("Number of entries:", current.shape[0], "\n")
        print("MAE train:", mae_train)
        print("MSE train:", mse_train)
        print("RMSE train:", rmse_train)
        print("MAPE train", mape_train)
        print("R2 train:", r2_train, "\n")
        print("MAE test:", mae_test)
        print("MSE test:", mse_test)
        print("RMSE test:", rmse_test)
        print("MAPE test", mape_test)
        print("R2 test:", r2_test, "\n")
        print("Mean MAE from 5-fold cross-validation is:", mean_cv_score, "\n")
        print("MAE actual vs planned:", mae_actual_vs_planned)
        print("MAPE actual vs planned:", mape_actual_vs_planned)
        print("*************************")

    return pd.DataFrame(records, columns=metric_columns)

In [93]:
# Smoke-test the prediction routine on a single route with hand-picked hyperparameters.
test = MLP_prediction(final_trimmed, ["61_108"], lrt=0.05, hls=(140,), al=0.4)

Route 61_108
Direction 2
Number of entries: 5266 

MAE train: 156.48434457165203
MSE train: 43056.03143967175
RMSE train: 207.49947334793828
MAPE train 0.05569681133080957
R2 train: 0.8509189002958002 

MAE test: 232.77727972942952
MSE test: 94891.59532232696
RMSE test: 308.04479434382097
MAPE test 0.08392113793070484
R2 test: 0.6498135905429034 

Mean MAE from 5-fold cross-validation is: 235.23234500014752 

MAE actual vs planned: 325.2645569620253
MAPE actual vs planned: 0.11086803873950482
*************************


In [104]:
# Empty accumulator for per-route metrics; columns match the frame returned by MLP_prediction.
mlp_metrics = pd.DataFrame(
    columns=[
        "model_type",
        "data",
        "route",
        "MAE_test",
        "MSE_test",
        "RMSE_test",
        "MAPE_test",
        "R2_test",
        "5_fold_cv_MAE",
        "MAE_actual_vs_planned",
        "MAPE_actual_vs_planned",
    ]
)

In [105]:
# Fit and evaluate one MLP per tuned route using that route's best hyperparameters.
# The per-route frames are collected and concatenated once; DataFrame.append was
# removed in pandas 2.0 and copies the accumulated frame on every iteration.
# hl_size is a tuple while tuned_params is still in memory; if tuned_params is
# reloaded from mlp_tuned_params.csv it is a string such as "(110,)" and has to be
# turned back into a tuple before it is used as hidden_layer_sizes.
route_metric_frames = []
for tuned in tuned_params.itertuples(index=False):
    route_metric_frames.append(
        MLP_prediction(final_trimmed, [tuned.route], tuned.lr_init, tuned.hl_size, tuned.alpha)
    )

mlp_metrics = pd.concat([mlp_metrics, *route_metric_frames])

Route 46A_74
Direction 1
Number of entries: 36946 

MAE train: 350.4262661480429
MSE train: 204729.27518033862
RMSE train: 452.47019258768705
MAPE train 0.081103546765742
R2 train: 0.6295323010385661 

MAE test: 378.66045802775244
MSE test: 235984.7322357465
RMSE test: 485.78259770780846
MAPE test 0.08836733256093086
R2 test: 0.5792793605889711 

Mean MAE from 5-fold cross-validation is: 394.44680364947476 

MAE actual vs planned: 443.97076867556837
MAPE actual vs planned: 0.09976953348387757
*************************
Route 46A_67
Direction 2
Number of entries: 35543 

MAE train: 393.9219136294077
MSE train: 253316.16056366017
RMSE train: 503.3052359787847
MAPE train 0.08603956514966206
R2 train: 0.6308663506624876 

MAE test: 421.88059538000715
MSE test: 289437.4455496291
RMSE test: 537.993908468887
MAPE test 0.0924398627109254
R2 test: 0.5732394740076734 

Mean MAE from 5-fold cross-validation is: 438.2520852102787 

MAE actual vs planned: 546.1009096877051
MAPE actual vs planned: 0.

Number of entries: 22643 

MAE train: 549.4613526159495
MSE train: 445641.08069550496
RMSE train: 667.5635405678661
MAPE train 0.10393077373993564
R2 train: 0.4890715069625451 

MAE test: 598.7689744212132
MSE test: 529914.7618367256
RMSE test: 727.9524447632041
MAPE test 0.11277227336072115
R2 test: 0.3782732710234138 

Mean MAE from 5-fold cross-validation is: 600.7529169767449 

MAE actual vs planned: 867.431767996467
MAPE actual vs planned: 0.14725851241579554
*************************
Route 140_19
Direction 1
Number of entries: 21567 

MAE train: 248.25870996990707
MSE train: 103227.44731395069
RMSE train: 321.29028512227177
MAPE train 0.06953446755061762
R2 train: 0.6050048798879697 

MAE test: 279.50974487474065
MSE test: 127562.96508989514
RMSE test: 357.15957930579873
MAPE test 0.07911281978327914
R2 test: 0.5049824934787719 

Mean MAE from 5-fold cross-validation is: 292.8830862751667 

MAE actual vs planned: 349.70190078813164
MAPE actual vs planned: 0.09522334079058752
****

Number of entries: 15290 

MAE train: 346.7296002822894
MSE train: 196504.4376541333
RMSE train: 443.288210596823
MAPE train 0.07969177341433688
R2 train: 0.6740196276678135 

MAE test: 391.1362887359052
MSE test: 243844.6494109801
RMSE test: 493.8062873343961
MAPE test 0.08942320582834533
R2 test: 0.6095009990107949 

Mean MAE from 5-fold cross-validation is: 406.4787023807524 

MAE actual vs planned: 470.7645519947678
MAPE actual vs planned: 0.10272590238403059
*************************
Route 150_8
Direction 1
Number of entries: 15213 

MAE train: 190.37205095351143
MSE train: 62216.50081419323
RMSE train: 249.43235719167075
MAPE train 0.0810938883859768
R2 train: 0.6111804085911461 

MAE test: 222.1596366664508
MSE test: 83373.10292110383
RMSE test: 288.74400932504875
MAPE test 0.09495935897618173
R2 test: 0.4838434712135874 

Mean MAE from 5-fold cross-validation is: 224.7755907205485 

MAE actual vs planned: 278.84180543382996
MAPE actual vs planned: 0.11203967719079226
**********

Number of entries: 8233 

MAE train: 304.6327204376222
MSE train: 159340.1947433621
RMSE train: 399.17439139223615
MAPE train 0.07850227993769893
R2 train: 0.7347631419981824 

MAE test: 376.2056136784652
MSE test: 234809.94299152738
RMSE test: 484.5719172543198
MAPE test 0.09761621518150061
R2 test: 0.6000128080467215 

Mean MAE from 5-fold cross-validation is: 399.12099274132083 

MAE actual vs planned: 480.3396761133603
MAPE actual vs planned: 0.11786582084334433
*************************
Route 15B_61
Direction 2
Number of entries: 12512 

MAE train: 264.83674301089366
MSE train: 116697.81443068643
RMSE train: 341.6106181468697
MAPE train 0.07808893351584491
R2 train: 0.7161191614664448 

MAE test: 305.41874077145224
MSE test: 155371.53556802304
RMSE test: 394.1719619252783
MAPE test 0.08989658024927985
R2 test: 0.6054077245350489 

Mean MAE from 5-fold cross-validation is: 312.9613857385655 

MAE actual vs planned: 431.96403835908364
MAPE actual vs planned: 0.11838452404863972
****

Number of entries: 11638 

MAE train: 286.8846806243801
MSE train: 136094.84840029708
RMSE train: 368.91035279630887
MAPE train 0.06709654980775018
R2 train: 0.7082425586747061 

MAE test: 347.1853081136387
MSE test: 201153.14692694522
RMSE test: 448.5009999174419
MAPE test 0.08105285003278503
R2 test: 0.5708513175657072 

Mean MAE from 5-fold cross-validation is: 353.00195997040777 

MAE actual vs planned: 451.4455899198167
MAPE actual vs planned: 0.0997586439070851
*************************
Route 41C_79
Direction 2
Number of entries: 11772 

MAE train: 275.8141689339331
MSE train: 131434.51761734503
RMSE train: 362.5389877204175
MAPE train 0.07654613602303896
R2 train: 0.6464417033121941 

MAE test: 331.93656579506484
MSE test: 189365.9409246284
RMSE test: 435.16197090810726
MAPE test 0.09128667493753892
R2 test: 0.5183074650258994 

Mean MAE from 5-fold cross-validation is: 329.0248773819125 

MAE actual vs planned: 419.85645526613814
MAPE actual vs planned: 0.11002308400152236
****

Number of entries: 9305 

MAE train: 222.23547633720287
MSE train: 81967.57158674437
RMSE train: 286.29979320066644
MAPE train 0.06353835808564053
R2 train: 0.658718936968566 

MAE test: 271.0114643765898
MSE test: 122595.3534286611
RMSE test: 350.1361926860191
MAPE test 0.07754825134501346
R2 test: 0.5031754782657412 

Mean MAE from 5-fold cross-validation is: 279.62626295784355 

MAE actual vs planned: 338.4297994269341
MAPE actual vs planned: 0.09286484894944758
*************************
Route 54A_12
Direction 2
Number of entries: 9084 

MAE train: 292.49137600327634
MSE train: 145373.35877611628
RMSE train: 381.27858420860235
MAPE train 0.07876365232238884
R2 train: 0.7060543910367855 

MAE test: 364.43190164837887
MSE test: 219789.05482118615
RMSE test: 468.81665373702987
MAPE test 0.09930958617812374
R2 test: 0.5546449504135121 

Mean MAE from 5-fold cross-validation is: 374.00350179106175 

MAE actual vs planned: 545.3253851797506
MAPE actual vs planned: 0.1388740762990008
*****

Number of entries: 7809 

MAE train: 279.26362966254675
MSE train: 133363.1017269503
RMSE train: 365.1891314469124
MAPE train 0.06063953733869938
R2 train: 0.8066346198155426 

MAE test: 351.417274860341
MSE test: 209004.03646348894
RMSE test: 457.16959267156966
MAPE test 0.07783025642793585
R2 test: 0.6846439861921578 

Mean MAE from 5-fold cross-validation is: 370.3033632460921 

MAE actual vs planned: 584.063593683312
MAPE actual vs planned: 0.12308403719740763
*************************
Route 13_60
Direction 1
Number of entries: 7662 

MAE train: 351.71911232159226
MSE train: 208272.66216447327
RMSE train: 456.36899781259604
MAPE train 0.0565821092148562
R2 train: 0.6378133218465731 

MAE test: 429.5093777647734
MSE test: 318970.1086115623
RMSE test: 564.7743873544216
MAPE test 0.06868839377091832
R2 test: 0.4584768931317976 

Mean MAE from 5-fold cross-validation is: 447.2837610821931 

MAE actual vs planned: 560.9090909090909
MAPE actual vs planned: 0.08574445981575732
***********

Number of entries: 6434 

MAE train: 247.04948058842888
MSE train: 120228.70405630892
RMSE train: 346.7401102501828
MAPE train 0.0684530538358641
R2 train: 0.7994405829980198 

MAE test: 350.65208220016245
MSE test: 227192.6707346706
RMSE test: 476.64732322197153
MAPE test 0.09853813374233825
R2 test: 0.6278716032984512 

Mean MAE from 5-fold cross-validation is: 346.30378928321346 

MAE actual vs planned: 433.5375453133092
MAPE actual vs planned: 0.1137869947088456
*************************
Route 33B_58
Direction 2
Number of entries: 6526 

MAE train: 219.61551208589796
MSE train: 82362.0492608962
RMSE train: 286.9878904429527
MAPE train 0.11339017462483769
R2 train: 0.4985868145789435 

MAE test: 281.6002633821738
MSE test: 132709.89823888187
RMSE test: 364.2936977754101
MAPE test 0.14740743386043564
R2 test: 0.1926678201156885 

Mean MAE from 5-fold cross-validation is: 284.23440537482224 

MAE actual vs planned: 407.3508682328907
MAPE actual vs planned: 0.18428166856990513
********

Number of entries: 2956 

MAE train: 216.96049084235196
MSE train: 96674.4682822309
RMSE train: 310.92518116458643
MAPE train 0.059169635657259086
R2 train: 0.8266402032354074 

MAE test: 345.0390483151031
MSE test: 251479.00001808832
RMSE test: 501.4768190236597
MAPE test 0.09524234475557332
R2 test: 0.5401870302261946 

Mean MAE from 5-fold cross-validation is: 336.2386054699265 

MAE actual vs planned: 748.489289740699
MAPE actual vs planned: 0.1922725491734452
*************************
Route 56A_31
Direction 1
Number of entries: 4981 

MAE train: 184.51290763769063
MSE train: 61331.74509795856
RMSE train: 247.65246838656498
MAPE train 0.055433325419585594
R2 train: 0.854498988579855 

MAE test: 292.7783737297617
MSE test: 150517.39493669738
RMSE test: 387.96571360971757
MAPE test 0.08796528864006169
R2 test: 0.6306518273770346 

Mean MAE from 5-fold cross-validation is: 289.0152194635054 

MAE actual vs planned: 475.01939799331103
MAPE actual vs planned: 0.13777202057740354
*******

Number of entries: 4070 

MAE train: 86.27105171200877
MSE train: 12343.482051321187
RMSE train: 111.1012243466344
MAPE train 0.06846438846062776
R2 train: 0.6572717557393546 

MAE test: 126.12124436141437
MSE test: 26216.490810609626
RMSE test: 161.91507283329005
MAPE test 0.09933184128771805
R2 test: 0.2571514029059997 

Mean MAE from 5-fold cross-validation is: 131.40818306588022 

MAE actual vs planned: 164.94512694512696
MAPE actual vs planned: 0.12314522969114035
*************************
Route 111_7
Direction 1
Number of entries: 3975 

MAE train: 139.17496982814328
MSE train: 36140.7372570317
RMSE train: 190.10717308147977
MAPE train 0.05415224970552697
R2 train: 0.8436333444395576 

MAE test: 225.87317661544878
MSE test: 97204.99709160729
RMSE test: 311.7771593488004
MAPE test 0.0874053882941826
R2 test: 0.5936466591673604 

Mean MAE from 5-fold cross-validation is: 230.97410558148403 

MAE actual vs planned: 303.7770326906957
MAPE actual vs planned: 0.11571638828589696
******

Number of entries: 3022 

MAE train: 161.70606820956456
MSE train: 45207.32488967579
RMSE train: 212.62014224827286
MAPE train 0.041437480328332645
R2 train: 0.856538475903463 

MAE test: 272.0478778707536
MSE test: 128356.53914604218
RMSE test: 358.2688085028366
MAPE test 0.07054298237592765
R2 test: 0.6049597218535001 

Mean MAE from 5-fold cross-validation is: 268.5352198347392 

MAE actual vs planned: 472.1389195148842
MAPE actual vs planned: 0.11679552694633293
*************************
Route 68_81
Direction 1
Number of entries: 3033 

MAE train: 220.7777272415454
MSE train: 88801.87345391436
RMSE train: 297.99643194829423
MAPE train 0.05004078057864257
R2 train: 0.7771490731801147 

MAE test: 354.24368700989066
MSE test: 211169.33649986202
RMSE test: 459.531649073121
MAPE test 0.07978081839717097
R2 test: 0.5299454148705691 

Mean MAE from 5-fold cross-validation is: 353.17018831150233 

MAE actual vs planned: 496.7813186813187
MAPE actual vs planned: 0.10765964219008652
********

Number of entries: 1378 

MAE train: 180.10265377869823
MSE train: 59644.71714253785
RMSE train: 244.22267941888165
MAPE train 0.06341567064573567
R2 train: 0.8090896223557578 

MAE test: 306.5062649290565
MSE test: 163983.42924481625
RMSE test: 404.9486748278308
MAPE test 0.11102514487217177
R2 test: 0.3969387227599842 

Mean MAE from 5-fold cross-validation is: 303.8500131730281 

MAE actual vs planned: 334.77294685990336
MAPE actual vs planned: 0.11594324197706571
*************************
Route 41B_52
Direction 1
Number of entries: 1368 

MAE train: 178.18324558608813
MSE train: 61836.288201246105
RMSE train: 248.6690334586237
MAPE train 0.05478267207559952
R2 train: 0.8688138779338778 

MAE test: 302.0547972340173
MSE test: 170741.72359575465
RMSE test: 413.2090555587506
MAPE test 0.09487666056326946
R2 test: 0.6193600456545394 

Mean MAE from 5-fold cross-validation is: 302.58267603029196 

MAE actual vs planned: 584.4598540145986
MAPE actual vs planned: 0.17381753265081984
*****

Number of entries: 94 

MAE train: 72.42781085522192
MSE train: 9294.358129378292
RMSE train: 96.40725143565857
MAPE train 0.04406592507291976
R2 train: 0.9050224827040868 

MAE test: 208.34696936233675
MSE test: 70137.40870870333
RMSE test: 264.83468184643664
MAPE test 0.1220094883038854
R2 test: 0.3337163701206187 

Mean MAE from 5-fold cross-validation is: 232.26301038750307 

MAE actual vs planned: 419.3448275862069
MAPE actual vs planned: 0.2330912441155599
*************************
Route 142_13
Direction 1
Number of entries: 819 

MAE train: 233.9883622013136
MSE train: 86278.58949547863
RMSE train: 293.73217306839
MAPE train 0.03950706255614385
R2 train: 0.6340780503377659 

MAE test: 294.6875215536396
MSE test: 143456.6322670804
RMSE test: 378.7566927026906
MAPE test 0.05052078263125913
R2 test: 0.38034710108233183 

Mean MAE from 5-fold cross-validation is: 335.4042285136693 

MAE actual vs planned: 595.8617886178862
MAPE actual vs planned: 0.09811736747140987
****************

Number of entries: 618 

MAE train: 287.04665348905377
MSE train: 141610.33352061265
RMSE train: 376.31148470464285
MAPE train 0.0527945035928198
R2 train: 0.6049411475840438 

MAE test: 340.71285629474426
MSE test: 182450.60602347812
RMSE test: 427.1423720768968
MAPE test 0.06452203524989444
R2 test: 0.4180929113618468 

Mean MAE from 5-fold cross-validation is: 396.0937794568773 

MAE actual vs planned: 615.8333333333334
MAPE actual vs planned: 0.11126076422472161
*************************
Route 66X_92
Direction 1
Number of entries: 603 

MAE train: 284.2464757066006
MSE train: 129141.07998815483
RMSE train: 359.362045837001
MAPE train 0.045827906387146684
R2 train: 0.5606291126272904 

MAE test: 376.3917653963542
MSE test: 230936.57018995014
RMSE test: 480.55860224321253
MAPE test 0.06182345001337959
R2 test: 0.10986503887237231 

Mean MAE from 5-fold cross-validation is: 438.8697832695666 

MAE actual vs planned: 582.9116022099447
MAPE actual vs planned: 0.09149411946279633
*******

Number of entries: 362 

MAE train: 210.62447673409596
MSE train: 73001.32368062569
RMSE train: 270.1875712919188
MAPE train 0.044929420402279624
R2 train: 0.6963799207545525 

MAE test: 391.60833727217664
MSE test: 241754.8875788441
RMSE test: 491.685761008842
MAPE test 0.0822331507233546
R2 test: -0.0830003977257916 

Mean MAE from 5-fold cross-validation is: 414.4756126981997 

MAE actual vs planned: 515.5412844036697
MAPE actual vs planned: 0.1043539011288936
*************************
Route 41X_121
Direction 1
Number of entries: 369 

MAE train: 326.6423199685292
MSE train: 163068.54843891796
RMSE train: 403.8174692096888
MAPE train 0.06078640700414259
R2 train: 0.49083505804809613 

MAE test: 474.3380777103181
MSE test: 393282.27418481134
RMSE test: 627.1222163062088
MAPE test 0.08887917527578756
R2 test: -0.2556385894652615 

Mean MAE from 5-fold cross-validation is: 519.7164876005696 

MAE actual vs planned: 833.3603603603603
MAPE actual vs planned: 0.14699767984660125
*********

Number of entries: 70 

MAE train: 142.19401235956062
MSE train: 31250.013415154484
RMSE train: 176.77673324041965
MAPE train 0.04283756581955949
R2 train: 0.3347086690032426 

MAE test: 625.9480770680111
MSE test: 524217.9947932901
RMSE test: 724.0290013482127
MAPE test 0.18452196225806142
R2 test: -11.967482908816006 

Mean MAE from 5-fold cross-validation is: 446.42585696750194 

MAE actual vs planned: 354.14285714285717
MAPE actual vs planned: 0.10434805676167572
*************************
Route 15D_63
Direction 2
Number of entries: 181 

MAE train: 367.88121101223174
MSE train: 246815.82325067045
RMSE train: 496.80561918185913
MAPE train 0.08063238212009707
R2 train: 0.4475173034652342 

MAE test: 455.2993468212865
MSE test: 351041.0231215369
RMSE test: 592.4871501741932
MAPE test 0.09972027472252946
R2 test: 0.3130952053440086 

Mean MAE from 5-fold cross-validation is: 553.3574758272089 

MAE actual vs planned: 596.2727272727273
MAPE actual vs planned: 0.12435307116787495
*******

In [106]:
# Write the mlp_metrics table to a CSV file in the working directory.
# index=False leaves the row index out of the file, so only the columns are saved.
# NOTE(review): mlp_metrics is built in an earlier cell — presumably a pandas
# DataFrame of per-route MLP metrics; confirm against that cell.
mlp_metrics.to_csv(
    path_or_buf="mlp_final_metrics_standardised.csv",
    index=False,
)