In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import cPickle as pickle
from model import utilities as ut

# Reading in static schedule data
stops_full = pd.read_csv('data/google_transit/stops.txt', index_col='stop_id')
routes = pd.read_csv('data/google_transit/routes.txt', index_col='route_id')
trips = pd.read_csv('data/google_transit/trips.txt', index_col='trip_id')
stop_times = pd.read_csv('data/google_transit/stop_times.txt')
shapes = pd.read_csv('data/google_transit/shapes.txt')

# Some of these stops are named "Not a public stop" but are still in trips.
# Luckily, in the few trips they appear in, they're only either at the
# beginning or the end, so we can remove them now and we'll still build
# a nice graph with the connections we expect.
stops = stops_full[~stops_full.index.isin([7520, 7530, 7531, 7540])]
stop_times = stop_times[~stop_times['stop_id'].isin([7520, 7530, 7531, 7540])]

# Oh and some stops are in stops.txt but not used in trips... let's remove 'em
used_stops = set(stop_times['stop_id'].unique())
stops = stops[stops.index.isin(used_stops)]

# Let's make some sorted stop-timepoint lists for each stop_id to
# make lookup faster for things
all_stop_timepoints = {}
for stopid in used_stops:
    node_names = stop_times[stop_times['stop_id'] == stopid].\
                    apply(lambda x: '{0}_{1}'.\
                              format(stopid, x['arrival_time']),\
                          axis=1)
    all_stop_timepoints[stopid] = sorted(list(set(node_names)))


In [2]:
blocks = pd.read_csv('../project/lookUpBlockIDToBlockNumNam.csv', index_col='BLOCKID')    
raw_test = pd.read_csv('../project/test3.csv')
raw_test['TRAIN_ASSIGNMENT'] = raw_test['TRAIN_ASSIGNMENT'].apply(lambda x: int(x) if str(x)[0].isdigit() else 'F')
raw_test = raw_test[raw_test['TRAIN_ASSIGNMENT'] != 'F']

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
s = []
for uh in raw_test.TRAIN_ASSIGNMENT.unique():
    s.append(type(uh))

In [18]:
np.unique(np.array(s))

array([<type 'int'>], dtype=object)

In [3]:
raw_test.TRAIN_ASSIGNMENT.unique()[:100]

array([6003, 6004, 6007, 5903, 5908, 6005, 5907, 6010, 6002, 5904, 6006,
       5906, 9805, 9801, 9803, 9804, 9806, 9821, 9822, 9814, 9816, 9999,
       9808, 9809, 9811, 9802, 9812, 9817, 9810, 9815, 9819, 9813, 9818,
       5901, 9401, 9409, 9405, 9419, 9506, 9713, 9508, 9603, 9304, 9504,
       9704, 9706, 9606, 9611, 9621, 9604, 9502, 9410, 9717, 9421, 9303,
       9413, 9501, 9609, 9703, 9718, 9503, 9511, 9705, 9724, 9509, 9414,
       9310, 9708, 9507, 9605, 9412, 9712, 9721, 9607, 9522, 9608, 9716,
       9711, 9702, 9305, 9407, 9404, 9601, 9723, 9306, 9308, 9710, 9521,
       9422, 9416, 9402, 9415, 9725, 9602, 9510, 9610, 9709, 9406, 9420,
       9321], dtype=object)

In [4]:
test_block = raw_test[raw_test.TRAIN_ASSIGNMENT == 6005]
print test_block.info()
test_block.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 850 to 224175
Data columns (total 9 columns):
REV                 154 non-null int64
REPORT_TIME         154 non-null object
VEHICLE_TAG         154 non-null object
LONGITUDE           154 non-null float64
LATITUDE            154 non-null float64
SPEED               154 non-null float64
HEADING             154 non-null float64
TRAIN_ASSIGNMENT    154 non-null object
PREDICTABLE         154 non-null int64
dtypes: float64(4), int64(2), object(3)
memory usage: 12.0+ KB
None


Unnamed: 0,REV,REPORT_TIME,VEHICLE_TAG,LONGITUDE,LATITUDE,SPEED,HEADING,TRAIN_ASSIGNMENT,PREDICTABLE
850,1506,01/06/2016 00:23:35,6,-122.42086,37.8069,0.0,0.0,6005,1
851,1506,01/06/2016 00:34:05,6,-122.42088,37.80688,0.0,0.0,6005,0
852,1506,01/06/2016 00:37:05,6,-122.42114,37.80698,0.278,148.0,6005,0
853,1506,01/06/2016 00:38:51,6,-122.42033,37.80523,3.889,173.0,6005,1
854,1506,01/06/2016 00:44:51,6,-122.41624,37.79388,3.889,80.0,6005,1


In [28]:
test_block.VEHICLE_TAG.unique()

array([6, 15, 19, 59, 8208], dtype=object)

In [30]:
test_block[test_block.VEHICLE_TAG == 19]

Unnamed: 0,REV,REPORT_TIME,VEHICLE_TAG,LONGITUDE,LATITUDE,SPEED,HEADING,TRAIN_ASSIGNMENT,PREDICTABLE
87696,1506,01/06/2016 08:06:46,19,-122.41211,37.79491,0.278,153.0,6005,1
87697,1506,01/06/2016 08:44:16,19,-122.41176,37.79505,0.000,299.0,6005,0
87698,1506,01/06/2016 08:50:16,19,-122.41168,37.79500,0.000,267.0,6005,0
87699,1506,01/06/2016 08:53:16,19,-122.41157,37.79491,0.000,306.0,6005,0
87700,1506,01/06/2016 08:56:16,19,-122.41163,37.79483,0.278,206.0,6005,0
87701,1506,01/06/2016 09:00:46,19,-122.41155,37.79493,0.000,151.0,6005,0
87702,1506,01/06/2016 09:05:16,19,-122.41172,37.79491,0.000,78.0,6005,0
87703,1506,01/06/2016 09:08:16,19,-122.41175,37.79491,0.000,62.0,6005,0
87704,1506,01/06/2016 09:11:16,19,-122.41169,37.79488,0.000,63.0,6005,0
87705,1506,01/06/2016 09:17:16,19,-122.41164,37.79485,0.000,113.0,6005,0


In [25]:
trips[trips.block_id == 6005].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 6899745 to 6899926
Data columns (total 6 columns):
route_id         96 non-null int64
service_id       96 non-null int64
trip_headsign    96 non-null object
direction_id     96 non-null int64
block_id         96 non-null int64
shape_id         96 non-null int64
dtypes: int64(5), object(1)
memory usage: 5.2+ KB


In [38]:
trips[trips.block_id == 3101].head(10)

Unnamed: 0_level_0,route_id,service_id,trip_headsign,direction_id,block_id,shape_id
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6886268,11075,1,Ferry Plaza,1,3101,135771
6886254,11075,1,Ferry Plaza,1,3101,135771
6886243,11075,1,Ferry Plaza,1,3101,135771
6886233,11075,1,Ferry Plaza,1,3101,135771
6886224,11075,1,Ferry Plaza,1,3101,135771
6886216,11075,1,Ferry Plaza,1,3101,135771
6886195,11075,1,Ferry Plaza,1,3101,135771
6886397,11075,2,Ferry Plaza,1,3101,135771
6886392,11075,2,Ferry Plaza,1,3101,135771
6886386,11075,2,Ferry Plaza,1,3101,135771


In [41]:
trips[trips.block_id == 2504]['shape_id'].unique()

array([135725, 135726])

In [7]:
shapes.head()

Unnamed: 0,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled
0,135594,-122.446805,37.787266,1,0
1,135594,-122.448481,37.787054,2,149
2,135594,-122.450131,37.786842,3,296
3,135594,-122.450238,37.786822,4,306
4,135594,-122.451771,37.786624,5,443


In [8]:
stop_times[stop_times.trip_id == 6900999]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
714586,6900999,22:26:00,22:26:00,6063,1,,,,
714587,6900999,22:27:21,22:27:21,6068,2,,,,
714588,6900999,22:28:01,22:28:01,6058,3,,,,
714589,6900999,22:28:40,22:28:40,6072,4,,,,
714590,6900999,22:29:20,22:29:20,6075,5,,,,
714591,6900999,22:30:04,22:30:04,6047,6,,,,
714592,6900999,22:30:44,22:30:44,6069,7,,,,
714593,6900999,22:31:35,22:31:35,6049,8,,,,
714594,6900999,22:31:59,22:31:59,6073,9,,,,
714595,6900999,22:32:36,22:32:36,6051,10,,,,


In [10]:
stop_times[stop_times.trip_id == 6900999]['stop_id'].values

array([6063, 6068, 6058, 6072, 6075, 6047, 6069, 6049, 6073, 6051, 6077,
       6059, 5156, 5164, 5152, 5154, 5150, 5082, 5062, 5090, 5074, 5088,
       5069, 5073, 5077, 5066, 5059, 5080, 5064])

In [12]:
trips[trips.block_id == 4422]['shape_id'].unique()

array([135815, 135819])

In [13]:
trips[trips.block_id == 4422].head()

Unnamed: 0_level_0,route_id,service_id,trip_headsign,direction_id,block_id,shape_id
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6894179,11085,1,Golden Gate Park & Third Street,0,4422,135815
6894165,11085,1,Golden Gate Park & Third Street,0,4422,135815
6894147,11085,1,Golden Gate Park & Third Street,0,4422,135815
6894136,11085,1,Golden Gate Park & Third Street,0,4422,135815
6894191,11085,1,Golden Gate Park & Third Street,0,4422,135815


In [14]:
stop_times[stop_times.trip_id == 6894191]['stop_id'].values

array([3826, 3181, 3185, 3178, 7538, 3202, 4772, 3221, 3213, 3215, 3216,
       3218, 4853, 3188, 5253, 4878, 5247, 6933, 6935, 6940, 6938, 5834,
       5831, 5833, 3694, 3699, 3696, 5330, 6394, 6415, 6410, 6402, 6407,
       6399, 6418, 6404, 6408, 6412, 6421, 6397, 7288, 6419, 6416, 5880,
       5877, 5864, 5876, 5874, 5233, 5487, 5488, 6983, 5051, 5045, 5108,
       7187, 5503, 5501, 4550, 4558, 7077, 7428, 5041])

In [15]:
routes.head()

Unnamed: 0_level_0,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
route_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11047,SFMTA,1,CALIFORNIA,,3,,,
1033,SFMTA,1AX,CALIFORNIA A EXPRESS,,3,,,
1034,SFMTA,1BX,CALIFORNIA B EXPRESS,,3,,,
1031,SFMTA,31AX,BALBOA A EXPRESS,,3,,,
1032,SFMTA,31BX,BALBOA B EXPRESS,,3,,,


In [16]:
trips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29087 entries, 6858659 to 6916562
Data columns (total 6 columns):
route_id         29087 non-null int64
service_id       29087 non-null int64
trip_headsign    29071 non-null object
direction_id     29087 non-null int64
block_id         29087 non-null int64
shape_id         29087 non-null int64
dtypes: int64(5), object(1)
memory usage: 1.6+ MB


In [21]:
29087.*5/3600

40.39861111111111

In [39]:
routes.loc[11075]

agency_id            SFMTA
route_short_name        31
route_long_name     BALBOA
route_desc                
route_type               3
route_url                 
route_color               
route_text_color          
Name: 11075, dtype: object