In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
# Allows plots to appear directly in the notebook.
%matplotlib inline

In [103]:
# Convert csv and json files into dataframes
df= pd.read_csv('sprint2_cleaned.csv')
df.head(1)

Unnamed: 0,Timestamp_x,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,BusOperator,Congestion,Long,Lat,Delay,...,AtStop,Distance,TravelTime,Weekday,TimeCategory,Time_hour,Rain,Temp,windSpeed,Timestamp_y
0,2012-11-08 07:50:33,15,015B1001,2012-11-08,3277,RD,0,-6.32627,53.271095,28,...,1.0,0.0,0,3,07:30,2012-11-08 07:00:00,0.0,7.1,6.62,1352358000


In [104]:
#look at only 15s,18s, and 39s
df1 = df[df.JourneyPatternId == "00401001"]
del df1['Timestamp_y']

In [105]:
#get dummies
#weekday_dummies = pd.get_dummies(df1.Weekday, prefix='Weekday')
#JPID_dummies = pd.get_dummies(df1.JourneyPatternId, prefix='JourneyPatternId')
busop_dummies = pd.get_dummies(df1.BusOperator, prefix='BusOperator')
time_cat_dum = pd.get_dummies(df1.TimeCategory, prefix='TimeCategory')
# concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns)
df_new = pd.concat([df1,busop_dummies,time_cat_dum], axis=1)
del df_new['Weekday']
del df_new['JourneyPatternId']
del df_new['BusOperator']
del df_new['TimeCategory']
df_new.head(1)

Unnamed: 0,Timestamp_x,LineId,TimeFrame,VehicleJourneyId,Congestion,Long,Lat,Delay,BlockId,VehicleId,...,TimeCategory_19:00,TimeCategory_19:30,TimeCategory_20:00,TimeCategory_20:30,TimeCategory_21:00,TimeCategory_21:30,TimeCategory_22:00,TimeCategory_22:30,TimeCategory_23:00,TimeCategory_23:30
1485,2012-11-08 07:59:32,40,2012-11-08,6421,0,-6.395383,53.351982,0,40105,33153,...,0,0,0,0,0,0,0,0,0,0


should we normalize continuous features???

In [106]:
#remove non-essential information for RF from df_new
#this info is either non-dummied categorical or identification information
del df_new['Timestamp_x']
del df_new['Time_hour']
del df_new['LineId']
del df_new['VehicleJourneyId']
del df_new['Long']
del df_new['Lat']
del df_new['BlockId']
del df_new['VehicleId']
del df_new['AtStop']
del df_new['StopId']
#must remove target feature to fit to y later
del df_new['TravelTime']

In [107]:
train = df_new[df_new.TimeFrame == '2012-11-08']
test = df_new[df_new.TimeFrame == '2012-11-09']
time_train = df1[df1.TimeFrame == '2012-11-08']
time_test = df1[df1.TimeFrame == '2012-11-09']

In [108]:
del train['TimeFrame']
del test['TimeFrame']

In [109]:
# Prepare all features
X = train
y = time_train.TravelTime

In [110]:
# Instantiate estimator, fit with training set
# Train a classification tree with max_depth=3 on all data
dtc = DecisionTreeClassifier(max_depth=50, random_state=1)
dtc.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [111]:
# Compute the importance of each feature based on the trained decision tree classifier
pd.DataFrame({'feature': X.columns, 'importance': dtc.feature_importances_})

Unnamed: 0,feature,importance
0,Congestion,0.0
1,Delay,0.280369
2,Distance,0.380359
3,Rain,0.0
4,Temp,0.049513
5,windSpeed,0.053437
6,BusOperator_CD,0.018828
7,BusOperator_HN,0.021241
8,TimeCategory_06:00,0.001425
9,TimeCategory_06:30,0.002653


In [112]:
#regressor time
RF_train = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False)
fitted_RF = RF_train.fit(X, y)

In [113]:

X = test
y = time_test.TravelTime

In [114]:
#This is never worse than 2 minutes off but that's not much better than Dublin Bus atm
RFtest_predictions = fitted_RF.predict(X)

RFtest_predictions

array([ 4530.45,  4577.5 ,  4645.3 , ...,  4213.25,  4413.15,  4426.05])

In [115]:
len(RFtest_predictions)

2886

In [116]:
time_test = time_test.reset_index()
del time_test['index']

In [117]:
df_true_vs_predicted = pd.DataFrame({'ActualTime': time_test.TravelTime, 'PredictedTime': RFtest_predictions})
df_true_vs_predicted

Unnamed: 0,ActualTime,PredictedTime
0,4795,4530.45
1,4836,4577.50
2,4895,4645.30
3,4954,4673.15
4,5016,4723.30
5,5077,4740.30
6,5116,4804.75
7,5194,4837.25
8,5237,4865.05
9,5275,4882.55


In [118]:
how_wrong_is_my_data = df_true_vs_predicted['ActualTime'].sub(df_true_vs_predicted['PredictedTime'], axis=0)
how_wrong_is_my_data

0        264.55
1        258.50
2        249.70
3        280.85
4        292.70
5        336.70
6        311.25
7        356.75
8        371.95
9        392.45
10       561.90
11       638.95
12       749.55
13      1064.25
14       739.65
15      1030.05
16       967.95
17       993.65
18       270.85
19       538.10
20       696.00
21       726.55
22      1127.70
23      1282.35
24      1344.20
25       154.70
26       167.30
27        69.85
28       120.30
29       228.50
         ...   
2856      14.30
2857    -597.05
2858    -506.20
2859    -165.55
2860    -179.35
2861    -305.45
2862    -241.50
2863    -731.10
2864    -697.85
2865    -738.15
2866    -823.95
2867    -353.85
2868     -71.80
2869    -126.00
2870    -276.95
2871    -420.05
2872    -437.40
2873    -403.45
2874    -296.10
2875    -258.40
2876    -204.90
2877    -125.70
2878      -0.65
2879      35.30
2880      18.40
2881      21.85
2882       0.90
2883       8.75
2884     -32.15
2885      -5.05
dtype: float64

In [119]:
how_wrong_is_my_data.mean()

216.62300762300828

I'm not sure if that's actually a bad difference. Let me know what y'all think