In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVR
# Allows plots to appear directly in the notebook.
%matplotlib inline



In [2]:
# read files into dataframe
df = pd.read_hdf("cleaned_store.h5", key="table_name", where='Journey_Pattern_ID == "00401001"', columns=['Timestamp', "Journey_Pattern_ID", "Time_Frame", "Vehicle_Journey_ID", "Week_Day", "Distance", "TravelTime", "TimeCategory"])
# read files into dataframe
#df = pd.read_csv("twelth_stage.csv")
df.head(1)

Unnamed: 0,Timestamp,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Week_Day,Distance,TravelTime,TimeCategory
868,2012-11-12 13:01:23,401001,2012-11-12,6462,0,10.693702,1953.0,2012-11-12 13:00:00


#look at only weekdays
#df = df.drop(df[(df.TimeFrame == 2012-11-11) | (df.TimeFrame == 2012-11-10 )].index)
df = df[df.TimeFrame != '2012-11-10']

#look at only 15s, 39s, and 18s
df = df[(df['LineId'] == 15)|(df['LineId'] == 39)]
#df.head(5)

In [3]:
#normalize cont data
def normalize(df):
    columns=['Distance']   
    for i in columns:
        df[i]= (df[i]-df[i].min()) / (df[i].max() - df[i].min())
    return df

normalize(df)

#show data frame
df.head(5)
df.tail(5)

Unnamed: 0,Timestamp,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Week_Day,Distance,TravelTime,TimeCategory
745662,2013-01-27 19:56:26,401001,2013-01-27,15946,6,0.335989,3980.0,2013-01-27 20:00:00
745663,2013-01-27 19:57:05,401001,2013-01-27,15946,6,0.340195,4019.0,2013-01-27 20:00:00
745664,2013-01-27 19:57:47,401001,2013-01-27,15946,6,0.34452,4061.0,2013-01-27 20:00:00
745665,2013-01-27 19:58:26,401001,2013-01-27,15946,6,0.34786,4100.0,2013-01-27 20:00:00
745666,2013-01-27 19:59:07,401001,2013-01-27,15946,6,0.352638,4141.0,2013-01-27 20:00:00


In [4]:
#convert from readable to datetime
df['Timestamp'] =pd.to_datetime(df['Timestamp'])

In [5]:
#make new column called hour and set to null
Hour = np.nan
df['Hour'] = Hour

In [6]:
#find the timestamp from the int64
#for index, row in df.iterrows():
    #hour = convert_and_pull(row['Timestamp'])
    #df.set_value(index,'Hour',hour)
df['Hour'] = df['Timestamp'].dt.hour

In [7]:
df['Hour'].value_counts()


18    20496
9     18896
10    18872
19    17732
17    17271
12    16519
11    16260
7     16109
13    15773
16    15372
8     15263
14    13465
15    12783
20    12513
6     11586
21     9844
22     9664
23     4216
0        17
Name: Hour, dtype: int64

In [8]:
#bin the hours into AM rush hour, normal, PM Rush hour, and nightlinks
def binning(col, cut_points, labels=None):
    #Define min and max values:
    min = 0
    max = 23
    #create list by adding min and max to cut_points
    break_points = [min] + cut_points + [max]
    if not labels:
        labels = range(len(cut_points)+1)
    bins = pd.cut(col,bins=break_points,labels=labels,include_lowest=True)
    return bins

In [9]:
cut_points = [2,7,9,15,18]
labels = ["NiteLink","Morning","AMRush","Day","PMRush","Evening"]
df["Hour_Bins"] = binning(df["Hour"], cut_points, labels)
print (pd.value_counts(df["Hour_Bins"], sort=False))

NiteLink       17
Morning     27695
AMRush      34159
Day         93672
PMRush      53139
Evening     53969
Name: Hour_Bins, dtype: int64


In [10]:
#remove non-essential information for RF from df_new
#this info is either non-dummied categorical or identification information
#del df['LineId']
del df['TimeCategory']

#this was added
#remove non-essential information for RF from df_new
#this info is either non-dummied categorical or identification information
del df['LineId']
del df['Long']
del df['Lat']
del df['BlockId']
del df['VehicleId']
del df['AtStop']

In [11]:
#get dummies
#JPID_dummies = pd.get_dummies(df.JourneyPatternId, prefix='JourneyPatternId')
#busop_dummies = pd.get_dummies(df.BusOperator, prefix='BusOperator')
weekday_dummies = pd.get_dummies(df['Week_Day'])
hour_bin_dummies = pd.get_dummies(df.Hour_Bins, prefix='Hour_Bins')
# concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns)
df= pd.concat([df,weekday_dummies,hour_bin_dummies], axis=1)
del df['Week_Day']
del df['Hour']
del df['Journey_Pattern_ID']
del df['Hour_Bins']
del df['Timestamp']
#get dummies
#JPID_dummies = pd.get_dummies(df.JourneyPatternId, prefix='JourneyPatternId')
#hour_bin_dummies = pd.get_dummies(df.Hour_Bins, prefix='Hour_Bins')
# concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns)
#df= pd.concat([df,JPID_dummies, hour_bin_dummies], axis=1)
#del df['Weekday']
#del df['Hour']
#del df['JourneyPatternId']
#del df['BusOperator']
#del df['VehicleJourneyId']
#del df['StopId']
#del df['Hour_Bins']
#del df['Timestamp']
df.head(1)

Unnamed: 0,Time_Frame,Vehicle_Journey_ID,Distance,TravelTime,0,1,2,3,4,5,6,Hour_Bins_NiteLink,Hour_Bins_Morning,Hour_Bins_AMRush,Hour_Bins_Day,Hour_Bins_PMRush,Hour_Bins_Evening
868,2012-11-12,6462,0.167223,1953.0,1,0,0,0,0,0,0,0,0,0,1,0,0


In [12]:

#df['Delay'] = df['Delay'].astype(np.int64)
df['Distance'] = df['Distance'].astype(np.int64)

In [13]:
#make the target feature it's own df
df_time = pd.concat([df['TravelTime'], df['Time_Frame']], axis=1)
#df_time = pd.concat([df['TravelTime'], df['TimeFrame']], axis=1)

In [14]:
#must remove target feature to fit to y later
del df['TravelTime']
del df['Time_Frame']
#del df['TimeFrame']

In [15]:
# Prepare the descriptive features
X = df
y = df_time.TravelTime

In [16]:
# Split the data into train and test sets
# Take a third (random) data samples as test data, rest as training data
# Note that this training set if very small and the model will not be very reliable due to this sample size problem.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("Training data:\n", pd.concat([X_train, y_train], axis=1))
print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

Training data:
         Vehicle_Journey_ID  Distance  0  1  2  3  4  5  6  Hour_Bins_NiteLink  \
761512                5612         0  0  0  0  0  1  0  0                   0   
1005779               5569         0  0  0  1  0  0  0  0                   0   
1519159               5455         0  1  0  0  0  0  0  0                   0   
890035                5539         0  0  0  0  0  1  0  0                   0   
1587756               5555         0  0  0  0  1  0  0  0                   0   
496337                5651         0  0  0  1  0  0  0  0                   0   
300533                6455         0  0  1  0  0  0  0  0                   0   
203716                6409         0  0  1  0  0  0  0  0                   0   
971300               11731         0  0  0  0  0  0  1  0                   0   
605046               15894         0  0  0  0  0  0  0  1                   0   
1758463               5512         0  0  0  1  0  0  0  0                   0   
1001716     

In [17]:
X_train.dtypes

Vehicle_Journey_ID    object
Distance               int64
0                      uint8
1                      uint8
2                      uint8
3                      uint8
4                      uint8
5                      uint8
6                      uint8
Hour_Bins_NiteLink     uint8
Hour_Bins_Morning      uint8
Hour_Bins_AMRush       uint8
Hour_Bins_Day          uint8
Hour_Bins_PMRush       uint8
Hour_Bins_Evening      uint8
dtype: object

In [None]:
clf= SVR(C=1.0, epsilon=0.2, kernel='linear')

In [None]:
svm_train = clf.fit(X_train, y_train) 

In [None]:
print (clf.score(X_test,y_test))

In [None]:
predictions =svm_train.predict(X_test)

In [None]:
df_true_vs_predicted = pd.DataFrame({'ActualTime': y_test, 'PredictedTime': predictions})
df_true_vs_predicted

In [None]:
# Custom Metric
abs(abs(y_test) - abs(predictions)).mean() 

In [None]:
# Print the Mean Squared Error of the model on the training set
mse = ((y_test - predictions)** 2).mean()
print("\nMean Squared Error:\n", mse)

In [None]:
from math import sqrt

sqrt(mse)