<a id="top"></a>

The purpose of this notebook is to train and save a predictive model for each bus route, i.e. one linear regression model per line and direction.

***

# Import Packages

In [2]:
import sqlite3
import json
import pandas as pd
import pickle
import math
import datetime

from sklearn.linear_model import LinearRegression
from sklearn import metrics

***

<a id="contents"></a>
# Contents

- [1. Create Connection to Database](#create_conn)
- [2. Load Line Routes Dictionary](#load_line_routes)
- [3. Train Models - Linear Regression](#linear_reg)
    - [3.1. Model Metrics - Linear Regression](#3.1)

***

<a id="create_conn"></a>
# 1. Create Connection to Database
[Back to contents](#contents)

In [3]:
# define helper that opens a connection to the SQLite database
def create_connection(db_file):
    """
    Create a database connection to the SQLite database specified by db_file.

    :param db_file: path to the database file
    :return: Connection object, or None if the connection could not be opened
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # report the failure and hand back None so the caller can check the result
        print(e)
        return None

In [4]:
# create connection to db
# NOTE(review): db_file is an absolute, machine-specific path -- adjust it when
# running this notebook anywhere else.
# conn is None if create_connection() could not open the file.
db_file = '/home/faye/Data-Analytics-CityRoute/dublinbus.db'
conn = create_connection(db_file)

<a id="load_line_routes"></a>
# 2. Load Line Routes Dictionary
[Back to contents](#contents)

In [5]:
# load in line_routes json
# The result is indexed later as line_routes[line][direction] to obtain the
# route ID used in the SQL query.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
with open('/home/faye/data/line_routes.json') as json_file:
    line_routes = json.load(json_file)

<a id="linear_reg"></a>
# 3. Train Models - Linear Regression
[Back to contents](#contents)

In [6]:
# set trip features
# Column list spliced into the SELECT clause; "T" is the alias given to the
# trips2 table in the query. ACTUALTIME_TRAVEL is the target feature, the other
# columns are model inputs.
trip_features = """
    T.ACTUALTIME_TRAVEL, T.MONTHOFSERVICE, T.DAYOFWEEK, T.HOUR, T.IS_HOLIDAY"""

In [7]:
# set weather features
# Column list spliced into the SELECT clause; "W" is the alias given to the
# weather table in the query.
weather_features = """
    W.temp, W.humidity, W.wind_speed, W.rain_1h, W.weather_main
"""

In [8]:
# set ordered feature list
# Master list of model input columns in a fixed order. Every route's dataframe
# is reordered to this list before training; columns a route does not have are
# added and filled with 0. The target (ACTUALTIME_TRAVEL) is deliberately not
# listed, so selecting by this list also drops it.
ordered_features = [
    # numeric hour of the trip
    'HOUR',
    
    # day-of-week dummy columns
    'DAYOFWEEK_Monday',
    'DAYOFWEEK_Tuesday',
    'DAYOFWEEK_Wednesday',
    'DAYOFWEEK_Thursday',
    'DAYOFWEEK_Friday',
    'DAYOFWEEK_Saturday',
    'DAYOFWEEK_Sunday',
       
    # month dummy columns
    'MONTHOFSERVICE_January',
    'MONTHOFSERVICE_February',
    'MONTHOFSERVICE_March',
    'MONTHOFSERVICE_April',
    'MONTHOFSERVICE_May',
    'MONTHOFSERVICE_June',
    'MONTHOFSERVICE_July',
    'MONTHOFSERVICE_August',
    'MONTHOFSERVICE_September',
    'MONTHOFSERVICE_October',
    'MONTHOFSERVICE_November',
    'MONTHOFSERVICE_December',
    
    # holiday flag dummy columns
    'IS_HOLIDAY_0',
    'IS_HOLIDAY_1',
    
    # numeric weather readings
    'humidity',
    'rain_1h',
    'temp',
    'wind_speed',
    
    # weather condition dummy columns
    'weather_main_Clear',
    'weather_main_Clouds',
    'weather_main_Drizzle',
    'weather_main_Fog',
    'weather_main_Mist',
    'weather_main_Rain',
    'weather_main_Smoke',
    'weather_main_Snow',
    
]

In [9]:
# set extra dummy features to drop
# One dummy column from each categorical group in ordered_features; these are
# removed from the dataframe just before the model is fitted.
# NOTE(review): presumably the reference level of each group, dropped to avoid
# perfectly collinear dummy columns -- confirm.
features_to_drop = [
    'DAYOFWEEK_Monday',
    'MONTHOFSERVICE_January', 
    'IS_HOLIDAY_0', 
    'weather_main_Clear', 
]

In [10]:
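# Structure of model_metrics (filled in by the training loop below):
# model_metrics = {
#     "<line>_<direction>" : {
#         "mean_absolute_error" : X,
#         "mean_squared_error" : X,
#         "r2_score" : X,
#         ...   (one entry per metric returned by get_metrics)
#     },
#
#     "<line>_<direction>" : {
#         ...
#     },
# }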

In [11]:
def evaluate_model(linear_model, df, tf):
    """
    Run the fitted model on df and pair its predictions with the actual values.

    :param linear_model: fitted estimator exposing predict()
    :param df: feature dataframe handed to predict()
    :param tf: actual target values (the training loop passes the
        ACTUALTIME_TRAVEL Series)
    :return: DataFrame built from tf with an added 'Predicted' column
    """
    predicted = linear_model.predict(df)

    # start from the actual values, then attach the predictions alongside them
    comparison = pd.DataFrame(tf)
    comparison['Predicted'] = predicted
    return comparison

def get_metrics(actual_vs_predicted_linear_reg_test):
    """
    Compute a set of scikit-learn regression metrics for one model.

    :param actual_vs_predicted_linear_reg_test: DataFrame holding the actual
        values in 'ACTUALTIME_TRAVEL' and the model output in 'Predicted'
    :return: dict mapping each metric name to its value
    """
    # pull the two columns out once instead of re-indexing for every metric
    y_true = actual_vs_predicted_linear_reg_test['ACTUALTIME_TRAVEL']
    y_pred = actual_vs_predicted_linear_reg_test['Predicted']

    # each metric appears exactly once (the original listed mean_squared_error
    # twice, computing it twice and discarding one result)
    metrics_dict = {
        "explained_variance_score": metrics.explained_variance_score(y_true, y_pred),
        "max_error": metrics.max_error(y_true, y_pred),
        "mean_absolute_error": metrics.mean_absolute_error(y_true, y_pred),
        "mean_squared_error": metrics.mean_squared_error(y_true, y_pred),
        "mean_squared_log_error": metrics.mean_squared_log_error(y_true, y_pred),
        "median_absolute_error": metrics.median_absolute_error(y_true, y_pred),
        "r2_score": metrics.r2_score(y_true, y_pred),
        "mean_poisson_deviance": metrics.mean_poisson_deviance(y_true, y_pred),
        "mean_gamma_deviance": metrics.mean_gamma_deviance(y_true, y_pred),
        "mean_absolute_percentage_error": metrics.mean_absolute_percentage_error(y_true, y_pred),
    }

    return metrics_dict

In [12]:
# train models for each line

# The statement is identical for every route, so it is built once. The route ID
# is bound as a parameter (sqlite3 "?" placeholder) instead of being formatted
# into the SQL text, so an unusual ID cannot break or alter the query.
query = f"""
    SELECT {trip_features}, {weather_features}
    FROM trips2 T, weather W
    WHERE ROUTEID = ? and T.dt = W.dt
"""

total_line = len(line_routes)
model_metrics = {}
for line_count, line in enumerate(line_routes, start=1):

    print(f"Line {line} -- {line_count}/{total_line}")

    for direction in line_routes[line]:

        routeID = line_routes[line][direction]

        # read in query to dataframe (route ID bound as text, matching the
        # quoted literal the query used previously)
        df = pd.read_sql(query, conn, params=(str(routeID),))

        # a route with no joined rows cannot be fitted; skip it rather than
        # letting fit() abort the whole run
        if df.empty:
            print(f"    no rows for route {routeID} ({direction}) -- skipped")
            continue

        # change hour to numerical
        df['HOUR'] = df['HOUR'].astype('int64')

        # get dummy variables
        df = pd.get_dummies(df)

        # separate target feature
        tf = df['ACTUALTIME_TRAVEL']

        # keep only the master features in their fixed order (this drops the
        # target); any master feature this route lacks is added, filled with 0
        df = df.reindex(columns=ordered_features, fill_value=0)

        # drop extra dummy variables
        df = df.drop(columns=features_to_drop)

        # train linear regression
        linear_reg = LinearRegression().fit(df, tf)

        # evaluate the model on the same rows it was fitted on
        # (in-sample: there is no hold-out set at this point)
        model_test = evaluate_model(linear_reg, df, tf)

        # get metrics
        all_metrics = get_metrics(model_test)

        # save metrics to dictionary
        model_metrics[f"{line}_{direction}"] = all_metrics

        # save model as pickle (currently disabled)
#         file_name = f"route_{line}_{direction}.pkl" 
#         file_path = f"/home/faye/Data-Analytics-CityRoute/route_models/{file_name}"
#         with open(file_path, 'wb') as handle:
#             pickle.dump(linear_reg, handle)


Line 1 -- 1/130
Line 102 -- 2/130
Line 104 -- 3/130
Line 11 -- 4/130
Line 111 -- 5/130
Line 114 -- 6/130
Line 116 -- 7/130
Line 118 -- 8/130
Line 120 -- 9/130
Line 122 -- 10/130
Line 123 -- 11/130
Line 13 -- 12/130
Line 130 -- 13/130
Line 14 -- 14/130
Line 140 -- 15/130
Line 142 -- 16/130
Line 145 -- 17/130
Line 14C -- 18/130
Line 15 -- 19/130
Line 150 -- 20/130
Line 151 -- 21/130
Line 15A -- 22/130
Line 15B -- 23/130
Line 15D -- 24/130
Line 16 -- 25/130
Line 161 -- 26/130
Line 16C -- 27/130
Line 16D -- 28/130
Line 17 -- 29/130
Line 17A -- 30/130
Line 18 -- 31/130
Line 184 -- 32/130
Line 185 -- 33/130
Line 220 -- 34/130
Line 236 -- 35/130
Line 238 -- 36/130
Line 239 -- 37/130
Line 25 -- 38/130
Line 25A -- 39/130
Line 25B -- 40/130
Line 25D -- 41/130
Line 25X -- 42/130
Line 26 -- 43/130
Line 27 -- 44/130
Line 270 -- 45/130
Line 27A -- 46/130
Line 27B -- 47/130
Line 27X -- 48/130
Line 29A -- 49/130
Line 31 -- 50/130
Line 31A -- 51/130
Line 31B -- 52/130
Line 31D -- 53/130
Line 32 -- 54/1

In [28]:
# Explained variance for a single model only: model_test is whatever the last
# iteration of the training loop left behind, not an aggregate over all routes.
from sklearn import metrics
metrics.explained_variance_score(model_test['ACTUALTIME_TRAVEL'], model_test['Predicted'])

0.28524644123634024

> The above produced 252 models for 130 lines. Two directions per line would give 130 × 2 = 260 models, so the 8 missing models must mean that 8 lines operate in only one direction.

In [13]:
# collect every line whose entry does not hold exactly two directions
one_direction_lines = [
    line_id
    for line_id in line_routes
    if len(line_routes[line_id]) != 2
]

print(one_direction_lines)

['118', '16D', '33E', '41A', '46E', '51X', '68X', '77X']


***

<a id="3.1"></a>
# 3.1.  Model Metrics - Linear Regression
[Back to contents](#contents)

Calculate the average MAE, MSE, and RMSE across all models for each direction.

#### Across All Models

In [24]:
# gather MAE and MSE from every trained model (both directions)
mae = [scores['mean_absolute_error'] for scores in model_metrics.values()]
mse = [scores['mean_squared_error'] for scores in model_metrics.values()]

In [25]:
# print metrics
# MAE and MSE are unweighted means over the per-model values collected in
# mae / mse; RMSE is the square root of that mean MSE (not a mean of
# per-model RMSEs).
print('Metrics for all models')
print('~'*30)
print('MAE  :', sum(mae)/len(mae))
print('MSE  :', sum(mse)/len(mse))
print('RMSE :', math.sqrt(sum(mse)/len(mse)))

Metrics for all models
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MAE  : 391.45292299434476
MSE  : 307096.0706617102
RMSE : 554.1624948169176


In [30]:
# interpreting RMSE in HH:MM:SS
# the value is derived from mse instead of being pasted from an earlier
# output, so it stays in step with the metrics whenever the models are re-trained
print(datetime.timedelta(seconds=math.sqrt(sum(mse)/len(mse))))

0:09:14.162495


#### Across Inbound Models

In [26]:
# calculate metrics for inbound models
direction = 'inbound'

# keys have the form "<line>_<direction>"; matching on the suffix keeps this
# working even if a line ID itself contains an underscore
inbound_mae = [
    scores['mean_absolute_error']
    for key, scores in model_metrics.items()
    if key.endswith(f"_{direction}")
]
inbound_mse = [
    scores['mean_squared_error']
    for key, scores in model_metrics.items()
    if key.endswith(f"_{direction}")
]

In [27]:
# print metrics
# MAE and MSE are unweighted means over the inbound models; RMSE is the square
# root of that mean MSE (not a mean of per-model RMSEs).
print('Metrics for inbound models')
print('~'*30)
print('MAE  :', sum(inbound_mae)/len(inbound_mae))
print('MSE  :', sum(inbound_mse)/len(inbound_mse))
print('RMSE :', math.sqrt(sum(inbound_mse)/len(inbound_mse)))

Metrics for inbound models
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MAE  : 389.32751611416313
MSE  : 303890.74850501155
RMSE : 551.2628669745601


In [31]:
# interpreting RMSE in HH:MM:SS
# the value is derived from inbound_mse instead of being pasted from an earlier
# output, so it stays in step with the metrics whenever the models are re-trained
print(datetime.timedelta(seconds=math.sqrt(sum(inbound_mse)/len(inbound_mse))))

0:09:11.262867


#### Across Outbound Models

In [28]:
# calculate metrics for outbound models
direction = 'outbound'

# keys have the form "<line>_<direction>"; matching on the suffix keeps this
# working even if a line ID itself contains an underscore
outbound_mae = [
    scores['mean_absolute_error']
    for key, scores in model_metrics.items()
    if key.endswith(f"_{direction}")
]
outbound_mse = [
    scores['mean_squared_error']
    for key, scores in model_metrics.items()
    if key.endswith(f"_{direction}")
]

In [29]:
# print metrics
# MAE and MSE are unweighted means over the outbound models; RMSE is the square
# root of that mean MSE (not a mean of per-model RMSEs).
print('Metrics for outbound models')
print('~'*30)
print('MAE  :', sum(outbound_mae)/len(outbound_mae))
print('MSE  :', sum(outbound_mse)/len(outbound_mse))
print('RMSE :', math.sqrt(sum(outbound_mse)/len(outbound_mse)))

Metrics for outbound models
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MAE  : 393.6468913867905
MSE  : 310404.7903073347
RMSE : 557.13983012107


In [32]:
# interpreting RMSE in HH:MM:SS
# the value is derived from outbound_mse instead of being pasted from an earlier
# output, so it stays in step with the metrics whenever the models are re-trained
print(datetime.timedelta(seconds=math.sqrt(sum(outbound_mse)/len(outbound_mse))))

0:09:17.139830


***

[Back to top](#top)