In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
# Allows plots to appear directly in the notebook.
%matplotlib inline



In [2]:
# Convert csv and json files into dataframes
df= pd.read_csv('eighth_stage.csv')
df.head(1)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Timestamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,BusOperator,Congestion,Long,Lat,Delay,BlockId,VehicleId,StopId,AtStop,Distance,TravelTime,Weekday,TimeCategory
0,2012-11-07 07:44:11,15,015A0002,2012-11-07,3174,RD,0,-6.233317,53.342152,0,15032,24549,395,1,0.0,0,2,07:30


In [3]:
#look at only weekdays
#df = df.drop(df[(df.TimeFrame == 2012-11-11) | (df.TimeFrame == 2012-11-10 )].index)
df = df[df.TimeFrame != '2012-11-11']
df = df[df.TimeFrame != '2012-11-10']

In [4]:
df = df.dropna()

In [5]:
#normalize cont data
def normalize(df):
    columns=['Distance','Delay']   
    for i in columns:
        df[i]= (df[i]-df[i].min()) / (df[i].max() - df[i].min())
    return df

normalize(df)

#show data frame
df.head(5)
df.tail(5)

Unnamed: 0,Timestamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,BusOperator,Congestion,Long,Lat,Delay,BlockId,VehicleId,StopId,AtStop,Distance,TravelTime,Weekday,TimeCategory
733278,2012-11-09 20:26:23,104,1040001,2012-11-09,4727,SL,0,-6.249307,53.390411,0.266984,104001,43061,222,0,0.096999,1502,4,20:00
733279,2012-11-09 20:27:00,104,1040001,2012-11-09,4727,SL,0,-6.253744,53.391075,0.26623,104001,43061,223,0,0.099651,1539,4,20:00
733280,2012-11-09 20:27:41,104,1040001,2012-11-09,4727,SL,0,-6.255539,53.390472,0.26623,104001,43061,224,0,0.100813,1580,4,20:00
733281,2012-11-09 20:28:01,104,1040001,2012-11-09,4727,SL,0,-6.258351,53.390812,0.26623,104001,43061,225,0,0.102417,1600,4,20:00
733282,2012-11-09 20:36:29,104,1040001,2012-11-09,4727,SL,0,-6.262133,53.391201,0.284192,104001,43061,226,1,0.104564,2108,4,20:30


In [6]:
#convert from readable to datetime
df['Timestamp'] =pd.to_datetime(df['Timestamp'])

In [7]:
#make new column called hour and set to null
Hour = np.nan
df['Hour'] = Hour

In [8]:
#find the timestamp from the int64
#for index, row in df.iterrows():
    #hour = convert_and_pull(row['Timestamp'])
    #df.set_value(index,'Hour',hour)
df['Hour'] = df['Timestamp'].dt.hour

In [9]:
# find number of rows associated with each hour - this gives us an idea of how many busses there are and thus the traffic
#it stands to reason that if dublin bus is sending out many busses, then this is when they expect the most people to be traveling which is rush hour
df["Hour"].value_counts()

7     61454
8     58582
9     51460
17    50205
18    49053
16    46117
10    41806
11    40097
12    39666
15    39623
14    37945
13    37425
19    36114
20    31631
6     30040
21    28136
22    27825
23    24464
0      1629
1         8
Name: Hour, dtype: int64

In [10]:
#bin the hours into AM rush hour, normal, PM Rush hour, and nightlinks
def binning(col, cut_points, labels=None):
    #Define min and max values:
    min = 0
    max = 23
    #create list by adding min and max to cut_points
    break_points = [min] + cut_points + [max]
    if not labels:
        labels = range(len(cut_points)+1)
    bins = pd.cut(col,bins=break_points,labels=labels,include_lowest=True)
    return bins

In [11]:
cut_points = [2,6,9,15,18]
labels = ["NiteLink","Morning","AMRush","Day","PMRush","Evening"]
df["Hour_Bins"] = binning(df["Hour"], cut_points, labels)
print (pd.value_counts(df["Hour_Bins"], sort=False))

NiteLink      1637
Morning      30040
AMRush      171496
Day         236562
PMRush      145375
Evening     148170
Name: Hour_Bins, dtype: int64


In [12]:
#remove non-essential information for RF from df_new
#this info is either non-dummied categorical or identification information
del df['LineId']
del df['Long']
del df['Lat']
del df['BlockId']
del df['VehicleId']
del df['AtStop']

In [13]:
#get dummies
JPID_dummies = pd.get_dummies(df.JourneyPatternId, prefix='JourneyPatternId')
#busop_dummies = pd.get_dummies(df.BusOperator, prefix='BusOperator')
hour_bin_dummies = pd.get_dummies(df.Hour_Bins, prefix='Hour_Bins')
# concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns)
df= pd.concat([df,JPID_dummies, hour_bin_dummies], axis=1)
del df['Weekday']
del df['Hour']
del df['JourneyPatternId']
del df['BusOperator']
del df['TimeCategory']
del df['VehicleJourneyId']
del df['StopId']
del df['Hour_Bins']
del df['Timestamp']
df.head(1)

Unnamed: 0,TimeFrame,Congestion,Delay,Distance,TravelTime,JourneyPatternId_00010001,JourneyPatternId_00010002,JourneyPatternId_00011001,JourneyPatternId_00011002,JourneyPatternId_00040001,...,JourneyPatternId_084X0002,JourneyPatternId_084X1001,JourneyPatternId_084X1002,JourneyPatternId_OL77X101,Hour_Bins_NiteLink,Hour_Bins_Morning,Hour_Bins_AMRush,Hour_Bins_Day,Hour_Bins_PMRush,Hour_Bins_Evening
0,2012-11-07,0,0.283956,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [16]:
df['Delay'] = df['Delay'].astype(np.int64)
df['Distance'] = df['Distance'].astype(np.int64)

In [17]:
#make the target feature it's own df
df_time = pd.concat([df['TravelTime'], df['TimeFrame']], axis=1)


In [18]:
df_time.dtypes

TravelTime     int64
TimeFrame     object
dtype: object

In [19]:
#must remove target feature to fit to y later
del df['TravelTime']

In [20]:
train = df[df.TimeFrame != '2012-11-09']
test = df[df.TimeFrame == '2012-11-09']
time_train = df_time[df_time.TimeFrame != '2012-11-09']
time_test = df[df_time.TimeFrame == '2012-11-09']
train

Unnamed: 0,TimeFrame,Congestion,Delay,Distance,JourneyPatternId_00010001,JourneyPatternId_00010002,JourneyPatternId_00011001,JourneyPatternId_00011002,JourneyPatternId_00040001,JourneyPatternId_00040002,...,JourneyPatternId_084X0002,JourneyPatternId_084X1001,JourneyPatternId_084X1002,JourneyPatternId_OL77X101,Hour_Bins_NiteLink,Hour_Bins_Morning,Hour_Bins_AMRush,Hour_Bins_Day,Hour_Bins_PMRush,Hour_Bins_Evening
0,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,2012-11-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [21]:
train.dtypes

TimeFrame                    object
Congestion                    int64
Delay                         int64
Distance                      int64
JourneyPatternId_00010001     uint8
JourneyPatternId_00010002     uint8
JourneyPatternId_00011001     uint8
JourneyPatternId_00011002     uint8
JourneyPatternId_00040001     uint8
JourneyPatternId_00040002     uint8
JourneyPatternId_00041001     uint8
JourneyPatternId_00041002     uint8
JourneyPatternId_00070001     uint8
JourneyPatternId_00070002     uint8
JourneyPatternId_00070003     uint8
JourneyPatternId_00071001     uint8
JourneyPatternId_00071002     uint8
JourneyPatternId_00080001     uint8
JourneyPatternId_00081001     uint8
JourneyPatternId_00090001     uint8
JourneyPatternId_00090002     uint8
JourneyPatternId_00091001     uint8
JourneyPatternId_00091002     uint8
JourneyPatternId_00110001     uint8
JourneyPatternId_00111001     uint8
JourneyPatternId_00111002     uint8
JourneyPatternId_00130001     uint8
JourneyPatternId_00130002   

In [22]:
time_train.dtypes

TravelTime     int64
TimeFrame     object
dtype: object

In [23]:
del train['TimeFrame']
del test['TimeFrame']

In [24]:
# Prepare all features
X = train
y = time_train.TravelTime

In [None]:
#regressor time
RF_train = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False)
fitted_RF = RF_train.fit(X, y)

In [None]:
#This is never worse than 2 minutes off but that's not much better than Dublin Bus atm
RFtest_predictions = fitted_RF.predict(X)

RFtest_predictions

In [None]:
len(RFtest_predictions)

In [None]:
time_test = time_test.reset_index()
del time_test['index']

In [None]:
df_true_vs_predicted = pd.DataFrame({'ActualTime': time_test.TravelTime, 'PredictedTime': RFtest_predictions})
df_true_vs_predicted

In [None]:
how_wrong_is_my_data = df_true_vs_predicted['ActualTime'].sub(df_true_vs_predicted['PredictedTime'], axis=0)
how_wrong_is_my_data

In [None]:
how_wrong_is_my_data.mean()

I'm not sure if that's actually a bad difference. Let me know what y'all think