In [1]:
import pandas as pd
import numpy as np

from shapely.geometry import Point
from shapely.ops import nearest_points
from geopy.distance import great_circle
import matplotlib.pyplot as plt
from tqdm import tqdm as tqdm_base, tqdm_notebook as tqdm

tqdm_base.pandas()

In [16]:
import os
import gtfs_transformer as gt

In [2]:
%load_ext autoreload
%autoreload 2

## Read positions from August 2018

In [4]:
locations_df = pd.read_hdf('../data/intermed/positions_201808.h5', key='df')

In [5]:
locations_df.head()

Unnamed: 0_level_0,route_id,timestamp,trip_id,vehicle_id,vehicle_lat,vehicle_long,time_utc,time_pct
timestamp_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-07-31 23:55:06,100495,1533081306,39570076,8109,47.606529,-122.322571,2018-07-31 23:55:06,2018-07-31 16:55:06-07:00
2018-07-31 23:55:06,100128,1533081306,39494444,1154,47.75499,-122.155212,2018-07-31 23:55:06,2018-07-31 16:55:06-07:00
2018-07-31 23:55:07,102574,1533081307,39515435,7006,47.599838,-122.329063,2018-07-31 23:55:07,2018-07-31 16:55:07-07:00
2018-07-31 23:55:09,100061,1533081309,39571262,7167,47.383316,-122.230255,2018-07-31 23:55:09,2018-07-31 16:55:09-07:00
2018-07-31 23:55:31,100146,1533081331,39496283,6900,47.596931,-122.328285,2018-07-31 23:55:31,2018-07-31 16:55:31-07:00


In [6]:
locations_df.size

79683368

In [7]:
locations_df.shape

(9960421, 8)

In [8]:
pd.to_datetime('August 2018')

Timestamp('2018-08-01 00:00:00')

In [11]:
os.listdir('..')

['01_transform_source_data.ipynb',
 '03_e_segment_analysis.html',
 '.DS_Store',
 'unpack_pb_files.py',
 '04_transform_d_locations.ipynb',
 '02_transform_e_locations.html',
 'data_transformation',
 'README.md',
 'download_raw_locations.sh',
 '.gitignore',
 'route_shape_process',
 '03_e_segment_analysis.ipynb',
 '02_transform_e_locations.ipynb',
 '.ipynb_checkpoints',
 '.git',
 'data']

In [15]:
for entry in os.scandir('..'):
    print(entry.name, entry.path)

01_transform_source_data.ipynb ../01_transform_source_data.ipynb
03_e_segment_analysis.html ../03_e_segment_analysis.html
.DS_Store ../.DS_Store
unpack_pb_files.py ../unpack_pb_files.py
04_transform_d_locations.ipynb ../04_transform_d_locations.ipynb
02_transform_e_locations.html ../02_transform_e_locations.html
data_transformation ../data_transformation
README.md ../README.md
download_raw_locations.sh ../download_raw_locations.sh
.gitignore ../.gitignore
route_shape_process ../route_shape_process
03_e_segment_analysis.ipynb ../03_e_segment_analysis.ipynb
02_transform_e_locations.ipynb ../02_transform_e_locations.ipynb
.ipynb_checkpoints ../.ipynb_checkpoints
.git ../.git
data ../data


In [34]:
!ls ../data/source/

[34mgtfs_20180118[m[m [34mgtfs_20180611[m[m [34mgtfs_20180718[m[m
[34mgtfs_20180511[m[m [34mgtfs_20180717[m[m [34mgtfs_20180815[m[m


## Create GTFS objects for July and August 2018 and compare their tables

The first few rows of `routes` and `stops` look the same.

In [85]:
#Initializes GTFS object with tables in given directory
gtfs_jul18 = gt.StaticGTFS('../data/source/gtfs_20180718')
gtfs_aug18 = gt.StaticGTFS('../data/source/gtfs_20180815') #, post_date='August 15, 2018')

In [86]:
gtfs_aug18.__dict__.keys() #Each table is stored as an attribute

dict_keys(['directory', 'table_names', 'fare_attributes', 'block_trip', 'agency', 'fare_rules', 'calendar_dates', 'stop_times', 'shapes', 'trips', 'stops', 'block', 'calendar', 'routes', 'post_date'])

In [87]:
gtfs_aug18.post_date

Timestamp('2018-08-15 07:00:00+0000', tz='UTC')

In [88]:
#gtfs_aug18.table_names.sort()
gtfs_aug18.table_names

['agency',
 'block',
 'block_trip',
 'calendar',
 'calendar_dates',
 'fare_attributes',
 'fare_rules',
 'routes',
 'shapes',
 'stop_times',
 'stops',
 'trips']

In [25]:
'abcdefg'[:-3]

'abcd'

In [42]:
gtfs_jul18.routes.head()

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,100001,KCM,1,,Kinnear - Downtown Seattle,3,http://metro.kingcounty.gov/schedules/001/n0.html,,
1,100002,KCM,10,,Capitol Hill - Downtown Seattle,3,http://metro.kingcounty.gov/schedules/010/n0.html,,
2,100003,KCM,101,,Renton Transit Center - Downtown Seattle,3,http://metro.kingcounty.gov/schedules/101/n0.html,,
3,100004,KCM,105,,Renton Highlands - Renton Transit Center,3,http://metro.kingcounty.gov/schedules/105/n0.html,,
4,100005,KCM,106,,Renton Transit Center - Skyway - Downtown Seattle,3,http://metro.kingcounty.gov/schedules/106/n0.html,,


In [41]:
gtfs_aug18.routes.head()

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,100001,KCM,1,,Kinnear - Downtown Seattle,3,http://metro.kingcounty.gov/schedules/001/n0.html,,
1,100002,KCM,10,,Capitol Hill - Downtown Seattle,3,http://metro.kingcounty.gov/schedules/010/n0.html,,
2,100003,KCM,101,,Renton Transit Center - Downtown Seattle,3,http://metro.kingcounty.gov/schedules/101/n0.html,,
3,100004,KCM,105,,Renton Highlands - Renton Transit Center,3,http://metro.kingcounty.gov/schedules/105/n0.html,,
4,100005,KCM,106,,Renton Transit Center - Skyway - Downtown Seattle,3,http://metro.kingcounty.gov/schedules/106/n0.html,,


In [55]:
gtfs_jul18.stops.head(10)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone
0,1000,,Pine St & 9th Ave,,47.613415,-122.332138,21,,0,,America/Los_Angeles
1,10000,,NE 55th St & 43rd Ave NE,,47.668575,-122.283653,1,,0,,America/Los_Angeles
2,10005,,40th Ave NE & NE 51st St,,47.665886,-122.284897,1,,0,,America/Los_Angeles
3,10010,,NE 55th St & 39th Ave NE,,47.668579,-122.285667,1,,0,,America/Los_Angeles
4,10020,,NE 55th St & 37th Ave NE,,47.668579,-122.2883,1,,0,,America/Los_Angeles
5,10030,,NE 55th St & 35th Ave NE,,47.668579,-122.290512,1,,0,,America/Los_Angeles
6,10040,,NE 55th St & 33rd Ave NE,,47.668583,-122.293015,1,,0,,America/Los_Angeles
7,10050,,NE 55th St & 30th Ave NE,,47.668591,-122.295448,1,,0,,America/Los_Angeles
8,10060,,NE 55th St & 27th Ave NE,,47.668594,-122.298859,1,,0,,America/Los_Angeles
9,10070,,NE 55th St & 25th Ave NE,,47.668594,-122.30098,1,,0,,America/Los_Angeles


In [56]:
gtfs_aug18.stops.head(10)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone
0,1000,,Pine St & 9th Ave,,47.613415,-122.332138,21,,0,,America/Los_Angeles
1,10000,,NE 55th St & 43rd Ave NE,,47.668575,-122.283653,1,,0,,America/Los_Angeles
2,10005,,40th Ave NE & NE 51st St,,47.665886,-122.284897,1,,0,,America/Los_Angeles
3,10010,,NE 55th St & 39th Ave NE,,47.668579,-122.285667,1,,0,,America/Los_Angeles
4,10020,,NE 55th St & 37th Ave NE,,47.668579,-122.2883,1,,0,,America/Los_Angeles
5,10030,,NE 55th St & 35th Ave NE,,47.668579,-122.290512,1,,0,,America/Los_Angeles
6,10040,,NE 55th St & 33rd Ave NE,,47.668583,-122.293015,1,,0,,America/Los_Angeles
7,10050,,NE 55th St & 30th Ave NE,,47.668591,-122.295448,1,,0,,America/Los_Angeles
8,10060,,NE 55th St & 27th Ave NE,,47.668594,-122.298859,1,,0,,America/Los_Angeles
9,10070,,NE 55th St & 25th Ave NE,,47.668594,-122.30098,1,,0,,America/Los_Angeles


### There are 7 more stops in July than in August

In [46]:
print(gtfs_jul18.stops.shape, gtfs_aug18.stops.shape)

(7686, 11) (7679, 11)


In [49]:
gtfs_jul18.stops.index.symmetric_difference(gtfs_aug18.stops.index)

Int64Index([7679, 7680, 7681, 7682, 7683, 7684, 7685], dtype='int64')

### Check whether the rows up to 7679 are equal in the stops dataframes -- they're not

In [52]:
(gtfs_jul18.stops[:7679] == gtfs_aug18.stops).all()

stop_id           False
stop_code         False
stop_name         False
stop_desc         False
stop_lat          False
stop_lon          False
zone_id           False
stop_url          False
location_type      True
parent_station    False
stop_timezone      True
dtype: bool

### The first 100 rows are not equal either, but the first 10 are, as displayed above

In [53]:
(gtfs_jul18.stops[:100] == gtfs_aug18.stops[:100]).all()

stop_id           False
stop_code         False
stop_name         False
stop_desc         False
stop_lat          False
stop_lon          False
zone_id           False
stop_url          False
location_type      True
parent_station    False
stop_timezone      True
dtype: bool

In [54]:
(gtfs_jul18.stops[:10] == gtfs_aug18.stops[:10]).all()

stop_id            True
stop_code         False
stop_name          True
stop_desc         False
stop_lat           True
stop_lon           True
zone_id            True
stop_url          False
location_type      True
parent_station    False
stop_timezone      True
dtype: bool

## Merge the 2 stops dataframes together with an outer join, and mark which table each row came from

In [57]:
merged_stops = gtfs_jul18.stops.merge(gtfs_aug18.stops, how='outer', indicator=True)
merged_stops.shape

(7705, 12)

In [58]:
merged_stops.head(10)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,_merge
0,1000,,Pine St & 9th Ave,,47.613415,-122.332138,21,,0,,America/Los_Angeles,both
1,10000,,NE 55th St & 43rd Ave NE,,47.668575,-122.283653,1,,0,,America/Los_Angeles,both
2,10005,,40th Ave NE & NE 51st St,,47.665886,-122.284897,1,,0,,America/Los_Angeles,both
3,10010,,NE 55th St & 39th Ave NE,,47.668579,-122.285667,1,,0,,America/Los_Angeles,both
4,10020,,NE 55th St & 37th Ave NE,,47.668579,-122.2883,1,,0,,America/Los_Angeles,both
5,10030,,NE 55th St & 35th Ave NE,,47.668579,-122.290512,1,,0,,America/Los_Angeles,both
6,10040,,NE 55th St & 33rd Ave NE,,47.668583,-122.293015,1,,0,,America/Los_Angeles,both
7,10050,,NE 55th St & 30th Ave NE,,47.668591,-122.295448,1,,0,,America/Los_Angeles,both
8,10060,,NE 55th St & 27th Ave NE,,47.668594,-122.298859,1,,0,,America/Los_Angeles,both
9,10070,,NE 55th St & 25th Ave NE,,47.668594,-122.30098,1,,0,,America/Los_Angeles,both


### There are 45 rows in the symmetric difference of stops -- display them below

In [60]:
merged_stops[merged_stops._merge != 'both'].shape

(45, 12)

In [61]:
merged_stops[merged_stops._merge != 'both']

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,_merge
78,1082,,Convention Place Tunnel Station - Bay I,,47.614113,-122.331726,1,,0,,America/Los_Angeles,left_only
80,1083,,Convention Place Tunnel Station - Bay C,,47.61422,-122.331795,1,,0,,America/Los_Angeles,left_only
82,1084,,Convention Place Tunnel Station - Bay D,,47.614277,-122.331863,1,,0,,America/Los_Angeles,left_only
85,1086,,Convention Place Tunnel Station - Bay E,,47.614574,-122.331688,1,,0,,America/Los_Angeles,left_only
101,11011,,Broadway E & E Roy St,,47.625248,-122.321114,1,,0,,America/Los_Angeles,left_only
179,1192,,Convention Place Tunnel Station - Bay A,,47.613941,-122.33152,1,,0,,America/Los_Angeles,left_only
433,15640,,SW Admiral Way & 59th Ave SW,,47.576111,-122.408051,1,,0,,America/Los_Angeles,left_only
562,17440,,Meridian Ave N & N 49th St,,47.664352,-122.333626,1,,0,,America/Los_Angeles,left_only
653,18610,,Nickerson St & Warren Ave N,,47.648422,-122.354568,1,,0,,America/Los_Angeles,left_only
1274,27180,,Lakeside Ave S & S Day St,,47.590302,-122.286514,1,,0,,America/Los_Angeles,left_only


## Experiment with pandas Timestamps

In [62]:
pd.to_datetime('August 15, 2018')

Timestamp('2018-08-15 00:00:00')

In [74]:
pd.to_datetime('20180815').tz_localize('US/Pacific')

Timestamp('2018-08-15 00:00:00-0700', tz='US/Pacific')

In [75]:
pd.to_datetime('20180815').tz_localize('US/Pacific').tz_convert('UTC')

Timestamp('2018-08-15 07:00:00+0000', tz='UTC')

In [64]:
pd.to_datetime('20180302')

Timestamp('2018-03-02 00:00:00')

In [72]:
pd.to_datetime('gtfs_20180718', format='%Y%m%d', exact=False)

Timestamp('2018-07-18 00:00:00')

In [73]:
pd.Timestamp.max

Timestamp('2262-04-11 23:47:16.854775807')

In [82]:
# Are Timestamps hashable?
t1 = pd.to_datetime('20180302')
t2 = pd.to_datetime('20180815').tz_localize('US/Pacific').tz_convert('UTC')
d = {t1: 'naive', t2: 'aware'}

In [84]:
#Yes, Timestamps are hashable!
d

{Timestamp('2018-03-02 00:00:00'): 'naive',
 Timestamp('2018-08-15 07:00:00+0000', tz='UTC'): 'aware'}