In [69]:
from math import sqrt

import pandas as pd

# this is the standard import if you're using "formula notation" (similar to R)
import statsmodels.formula.api as smf

In [70]:
# Columns to read from the store, in the order the rest of the notebook expects.
raw_columns = ['Timestamp', "Journey_Pattern_ID", "Time_Frame", "Vehicle_Journey_ID", "Week_Day", "Distance", "TravelTime", "TimeCategory", "Rain"]

# Load only the rows of journey pattern "01450001".
df = pd.read_hdf("cleaned_store.h5", key="table_name", where='Journey_Pattern_ID == "01450001"', columns=raw_columns)

# Fix the column order explicitly and rename by name (not by position), so a
# column can never pick up the wrong label if the store returns the columns in
# a different order than requested.
df = df[raw_columns].rename(columns={
    "Journey_Pattern_ID": "JourneyPatternId",
    "Time_Frame": "TimeFrame",
    "Vehicle_Journey_ID": "VehicleJourneyId",
    "Week_Day": "Day",
})

In [71]:
df.head()

Unnamed: 0,Timestamp,JourneyPatternId,TimeFrame,VehicleJourneyId,Day,Distance,TravelTime,TimeCategory,Rain
6715,2012-11-12 13:10:04,1450001,2012-11-12,6643,0,0.0,0.0,13:00:00,0.0
6716,2012-11-12 13:12:25,1450001,2012-11-12,6643,0,0.593437,141.0,13:00:00,0.0
6717,2012-11-12 13:14:24,1450001,2012-11-12,6643,0,1.391437,260.0,13:00:00,0.0
6718,2012-11-12 13:14:46,1450001,2012-11-12,6643,0,1.518467,282.0,13:00:00,0.0
6719,2012-11-12 13:15:04,1450001,2012-11-12,6643,0,1.640963,300.0,13:00:00,0.0


## Change Day Category

In [72]:
# Sort by TimeFrame, then VehicleJourneyId, then Timestamp, and replace the
# old row labels with a fresh 0..n-1 index (the old index is discarded).
df = (
    df.sort_values(['TimeFrame', 'VehicleJourneyId', 'Timestamp'], ascending=True)
      .reset_index(drop=True)
)

In [73]:
# Keep the weekday code as text; change_day below compares it with '0'..'6'.
df["Day"] = df["Day"].astype(str)

In [74]:
# Create group object to work with: one group per distinct TimeFrame value.
# group_keys=False keeps apply() from adding the group key to the result's index.
gb = df.groupby(["TimeFrame"], as_index=False, group_keys=False)

In [75]:
def change_day(group):
    """Replace the weekday code of one group with a coarse day label.

    The most frequent value in ``group.Day`` decides the label, which is then
    written to every row of the group:

    * '0' .. '4' -> 'Mon-Fri'
    * '5'        -> 'Sat'
    * '6'        -> 'Sun'

    Any other value leaves the group untouched.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must have a ``Day`` column holding the codes as
        strings. The column is overwritten on this object.

    Returns
    -------
    pandas.DataFrame
        The same ``group`` object that was passed in.
    """
    labels = {
        '0': 'Mon-Fri',
        '1': 'Mon-Fri',
        '2': 'Mon-Fri',
        '3': 'Mon-Fri',
        '4': 'Mon-Fri',
        '5': 'Sat',
        '6': 'Sun',
    }

    # value_counts() sorts by frequency, so the first index entry is the mode.
    most_common = group.Day.value_counts().index.tolist()[0]

    label = labels.get(most_common)
    if label is not None:
        group.Day = label

    return group

In [76]:
# Run change_day on every TimeFrame group and collect the results into one frame.
df = gb.apply(change_day)

In [77]:
# The day label has only a handful of distinct values, so store it as a category.
df["Day"] = df["Day"].astype("category")

In [78]:
df.dtypes

Timestamp           datetime64[ns]
JourneyPatternId            object
TimeFrame           datetime64[ns]
VehicleJourneyId            object
Day                       category
Distance                   float64
TravelTime                 float64
TimeCategory                object
Rain                       float32
dtype: object

## Change TimeCategory To New Speed Category

In [79]:
df.head()

Unnamed: 0,Timestamp,JourneyPatternId,TimeFrame,VehicleJourneyId,Day,Distance,TravelTime,TimeCategory,Rain
0,2012-11-06 23:19:58,1450001,2012-11-06,6628,Mon-Fri,0.0,0.0,23:30:00,0.0
1,2012-11-06 23:22:59,1450001,2012-11-06,6628,Mon-Fri,0.783226,181.0,23:30:00,0.0
2,2012-11-06 23:24:59,1450001,2012-11-06,6628,Mon-Fri,1.525179,301.0,23:30:00,0.0
3,2012-11-06 23:26:00,1450001,2012-11-06,6628,Mon-Fri,1.707548,362.0,23:30:00,0.0
4,2012-11-06 23:28:58,1450001,2012-11-06,6628,Mon-Fri,2.209265,540.0,23:30:00,0.0


In [80]:
# The TimeCategory column is overwritten below with a Fast/Medium/Slow label,
# so it is renamed to Speed. Renaming by name touches only that column and does
# not depend on the position of the columns in the frame.
df = df.rename(columns={"TimeCategory": "Speed"})

In [81]:
# Sort by TimeFrame, then VehicleJourneyId, then Timestamp, and replace the
# old row labels with a fresh 0..n-1 index (the old index is discarded).
df = (
    df.sort_values(['TimeFrame', 'VehicleJourneyId', 'Timestamp'], ascending=True)
      .reset_index(drop=True)
)

In [82]:
# Create group object to work with: one group per (TimeFrame, VehicleJourneyId) pair.
# group_keys=False keeps apply() from adding the group keys to the result's index.
gb = df.groupby(["TimeFrame", "VehicleJourneyId"], as_index=False, group_keys=False)

In [83]:
# Slots that map to 'Fast' for every day type.
_LATE_SLOTS = (
    "21:30:00", "22:00:00", "22:30:00", "23:00:00", "23:30:00",
    "00:00:00", "00:30:00", "01:00:00", "01:30:00", "02:00:00", "02:30:00",
)

# Half-hour slots grouped by the speed label they receive, per day label.
_SLOTS_BY_SPEED = {
    'Mon-Fri': {
        'Fast': ("06:00:00", "06:30:00") + _LATE_SLOTS,
        'Medium': (
            "07:00:00", "07:30:00",
            "09:30:00", "10:00:00", "10:30:00", "11:00:00", "11:30:00",
            "12:00:00", "12:30:00",
            "14:00:00", "14:30:00", "15:00:00", "15:30:00",
            "18:30:00", "19:00:00", "19:30:00", "20:00:00", "20:30:00",
            "21:00:00",
        ),
        'Slow': (
            "08:00:00", "08:30:00", "09:00:00",
            "13:00:00", "13:30:00",
            "16:00:00", "16:30:00", "17:00:00", "17:30:00", "18:00:00",
        ),
    },
    'Sat': {
        'Fast': (
            "06:00:00", "06:30:00", "07:00:00", "07:30:00",
            "21:00:00",
        ) + _LATE_SLOTS,
        'Medium': (
            "08:00:00", "08:30:00",
            "09:30:00", "10:00:00", "10:30:00", "11:00:00", "11:30:00",
            "12:00:00", "12:30:00",
            "13:30:00", "14:00:00", "14:30:00", "15:00:00", "15:30:00",
            "16:30:00", "17:00:00", "17:30:00", "18:00:00", "18:30:00",
            "19:00:00", "19:30:00", "20:00:00", "20:30:00",
        ),
        'Slow': ("09:00:00", "13:00:00", "16:00:00"),
    },
    'Sun': {
        'Fast': (
            "06:00:00", "06:30:00", "07:00:00", "07:30:00",
            "08:00:00", "08:30:00",
            "09:30:00", "10:00:00", "10:30:00",
            "21:00:00",
        ) + _LATE_SLOTS,
        'Medium': (
            "09:00:00",
            "11:00:00", "11:30:00", "12:00:00", "12:30:00",
            "13:30:00", "14:00:00", "14:30:00", "15:00:00", "15:30:00",
            "16:00:00", "16:30:00", "17:00:00", "17:30:00",
            "18:00:00", "18:30:00", "19:00:00", "19:30:00",
            "20:00:00", "20:30:00",
        ),
        'Slow': ("13:00:00",),
    },
}

# Flattened to {day label: {slot: speed label}} for direct lookup.
_SPEED_LOOKUP = {
    day_label: {slot: speed for speed, slots in by_speed.items() for slot in slots}
    for day_label, by_speed in _SLOTS_BY_SPEED.items()
}


def change_day(group):
    """Replace the time slot in ``group.Speed`` with a Fast/Medium/Slow label.

    The most frequent ``Day`` value and the most frequent ``Speed`` value (a
    half-hour slot such as "08:30:00") of the group are looked up in the
    day/slot table above; the resulting label is written to every row of the
    group. If the day label or the slot is not in the table, the group is
    left untouched.

    Note: this definition reuses the name ``change_day`` and therefore
    replaces the earlier day-relabelling function of the same name.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group, with ``Day`` and ``Speed`` columns. The ``Speed``
        column is overwritten on this object.

    Returns
    -------
    pandas.DataFrame
        The same ``group`` object that was passed in.
    """
    # value_counts() sorts by frequency, so the first index entry is the mode.
    day = group.Day.value_counts().index.tolist()[0]
    time_cat = group.Speed.value_counts().index.tolist()[0]

    speed = _SPEED_LOOKUP.get(day, {}).get(time_cat)
    if speed is not None:
        group.Speed = speed

    return group

In [84]:
# Run the speed relabelling on every (TimeFrame, VehicleJourneyId) group and
# collect the results into one frame.
df = gb.apply(change_day)

In [85]:
# Cast the modelling columns in one step: labels become categories,
# measurements become floats.
df = df.astype({
    "JourneyPatternId": "category",
    "Day": "category",
    "Speed": "category",
    "Distance": "float",
    "TravelTime": "float",
})

# Make Linear Regression

In [86]:
df.dtypes

Timestamp           datetime64[ns]
JourneyPatternId          category
TimeFrame           datetime64[ns]
VehicleJourneyId            object
Day                       category
Distance                   float64
TravelTime                 float64
Speed                     category
Rain                       float32
dtype: object

In [87]:
df.head()

Unnamed: 0,Timestamp,JourneyPatternId,TimeFrame,VehicleJourneyId,Day,Distance,TravelTime,Speed,Rain
0,2012-11-06 23:19:58,1450001,2012-11-06,6628,Mon-Fri,0.0,0.0,Fast,0.0
1,2012-11-06 23:22:59,1450001,2012-11-06,6628,Mon-Fri,0.783226,181.0,Fast,0.0
2,2012-11-06 23:24:59,1450001,2012-11-06,6628,Mon-Fri,1.525179,301.0,Fast,0.0
3,2012-11-06 23:26:00,1450001,2012-11-06,6628,Mon-Fri,1.707548,362.0,Fast,0.0
4,2012-11-06 23:28:58,1450001,2012-11-06,6628,Mon-Fri,2.209265,540.0,Fast,0.0


In [88]:
# Train test split: rows dated before the cutoff train the model,
# rows on or after it are held out for evaluation.
split_date = '2012-11-16'
train = df.loc[df['TimeFrame'] < split_date]
test = df.loc[df['TimeFrame'] >= split_date]

In [89]:
# # Optional -> Look at one day type? (Day now holds 'Mon-Fri', 'Sat' or 'Sun')
# train = train[train.Day == 'Mon-Fri']
# test = test[test.Day == 'Mon-Fri']

In [90]:
# # Optional -> Look at one Journey?
# # (Note: the load cell keeps only pattern "01450001", so this filter would select no rows.)
# train = train[train.JourneyPatternId == '00401001']
# test = test[test.JourneyPatternId == '00401001']

In [91]:
# Fit an ordinary least squares model on the training rows:
# TravelTime explained by the Speed category and the Distance.
lm = smf.ols(formula='TravelTime ~ Speed + Distance', data=train).fit()

In [92]:
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:             TravelTime   R-squared:                       0.861
Model:                            OLS   Adj. R-squared:                  0.861
Method:                 Least Squares   F-statistic:                 2.503e+04
Date:                Thu, 20 Jul 2017   Prob (F-statistic):               0.00
Time:                        14:09:47   Log-Likelihood:                -92320.
No. Observations:               12133   AIC:                         1.846e+05
Df Residuals:                   12129   BIC:                         1.847e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
Intercept          91.3621     12.869     

In [93]:
# Held-out target and the two predictors used in the model formula.
test_y = test["TravelTime"]
test_X = test[["Distance", "Speed"]]

In [94]:
# NOTE: the disabled filter below refers to TimeCategory, a column that was
# renamed to Speed earlier in this notebook and is not part of test_X.
# test_X = test_X[test_X.TimeCategory != '00:00']
# Predict TravelTime for the held-out rows with the fitted model.
predictions = lm.predict(test_X)

In [95]:
pd.DataFrame({"Real": test_y, "Predicted":predictions})

Unnamed: 0,Predicted,Real
12133,600.320759,0.0
12134,685.489493,319.0
12135,771.168534,440.0
12136,822.679169,479.0
12137,842.215235,522.0
12138,909.424685,600.0
12139,955.519542,800.0
12140,1039.547891,960.0
12141,1093.317291,1040.0
12142,1132.837350,1099.0


In [96]:
# Custom Metric: mean absolute error between real and predicted TravelTime.
# The error is taken on the signed difference. Comparing magnitudes instead
# (abs(real) - abs(predicted)) would under-report the error whenever a
# prediction and the real value have opposite signs.
mean_abs_error = abs(test_y - predictions).mean()

print("Our own metric:", mean_abs_error, "sec wrong on average.")

Our own metric: 351.0532215400461 sec wrong on average.


In [97]:
# Print the Mean Squared Error of the model on the held-out test set
# (test_y and predictions both come from the test split, not the training data)
mse = ((test_y - predictions)** 2).mean()
print("\nMean Squared Error:\n", mse)


Mean Squared Error:
 248812.78895092595


In [98]:
# Square root of the MSE, which puts the error back in the unit of TravelTime.
# (sqrt is imported in the notebook's import cell at the top.)
print("Root mean squared error:", sqrt(mse))

Root mean squared error: 498.81137612420787
