In [1]:
from enum import Enum
import requests, yaml, time, os
import numpy as np
import pandas as pd
from google.transit import gtfs_realtime_pb2
from datetime import datetime
import duckdb
import fsspec

In [3]:
# Connect to the DuckDB database file 'transit.db' (relative to the current working
# directory). This one connection is reused by every query in the notebook.
database = duckdb.connect('transit.db')

In [5]:
# Schema setup. Every statement uses CREATE TABLE IF NOT EXISTS, so this cell can be
# re-run against an existing database without raising or dropping data.

# rt_position: vehicle position reports; rows are unique per
# (trip_id, vehicle_id, timestamp).
# NOTE(review): latitude/longitude are declared FLOAT, which DuckDB documents as a
# single-precision type — confirm that is precise enough for the distance maths
# done later on consecutive positions.
database.sql('''CREATE TABLE IF NOT EXISTS rt_position
            (trip_id VARCHAR,
            start_date DATE,
            schedule_relationship INTEGER,
            route_id VARCHAR,
            direction_id VARCHAR,
            latitude FLOAT,
            longitude FLOAT,
            current_stop_sequence VARCHAR,
            current_status VARCHAR,
            timestamp TIMESTAMP,
            stop_id VARCHAR,
            vehicle_id VARCHAR,
            vehicle_label VARCHAR,
            PRIMARY KEY (trip_id, vehicle_id, timestamp))
            ''')

# rt_trip: per-trip information about the next stop (arrival/departure delay and
# time); rows are unique per (trip_id, next_stop_id, next_stop_arrival_time).
database.sql('''
            CREATE TABLE IF NOT EXISTS rt_trip (
                trip_id VARCHAR,
                start_date DATE,
                schedule_relationship INTEGER,
                route_id VARCHAR,
                direction_id VARCHAR,
                next_stop_sequence VARCHAR,
                next_stop_arrival_delay INTERVAL,
                next_stop_arrival_time TIMESTAMP,
                next_stop_departure_delay INTERVAL,
                next_stop_departure_time TIMESTAMP,
                next_stop_id VARCHAR,
                next_stop_schedule_relationship VARCHAR,
                PRIMARY KEY (trip_id, next_stop_id, next_stop_arrival_time)
            )
            ''')

# rt_alerts: one row per alert_id, with the affected routes/route types/stops stored
# as VARCHAR lists.
database.sql('''
            CREATE TABLE IF NOT EXISTS rt_alerts (
                alert_id VARCHAR,
                active_period_start TIMESTAMP,
                active_period_end TIMESTAMP,
                affected_route_ids VARCHAR[],
                affected_route_type VARCHAR[],
                affected_stop_ids VARCHAR[],
                cause VARCHAR,
                effect VARCHAR,
                description VARCHAR,
                severity_level VARCHAR,
                PRIMARY KEY (alert_id)
            )
            ''')

In [783]:
# database.sql('SELECT * FROM rt_position').show()

In [70]:
# database.sql('SELECT * FROM rt_position WHERE current_status != 2').show()

In [68]:
# database.sql('SELECT * FROM rt_position WHERE schedule_relationship != 0').show()

In [785]:
# database.sql('SELECT * FROM rt_trip').show()

In [66]:
# database.sql('SELECT * FROM rt_trip where trip_id == 13997591').show()

In [64]:
# database.sql('SELECT * FROM rt_alerts').show()

### Most popular stop:
51475 (Westbound W Georgia St @ Denman St)

In [7]:
# Count position reports per stop_id. The ORDER BY is ascending, so the stop with the
# most reports is the LAST row of the result, not the first.
popular_stops = database.sql('SELECT stop_id, count(*) FROM rt_position GROUP BY stop_id ORDER BY count(*)')
popular_stops

┌─────────┬──────────────┐
│ stop_id │ count_star() │
│ varchar │    int64     │
├─────────┼──────────────┤
│ 11964   │            1 │
│ 2911    │            1 │
│ 7900    │            1 │
│ 3561    │            1 │
│ 3772    │            1 │
│ 6245    │            1 │
│ 8833    │            1 │
│ 9252    │            1 │
│ 2785    │            1 │
│ 7905    │            1 │
│  ·      │            · │
│  ·      │            · │
│  ·      │            · │
│ 11673   │          106 │
│ 4461    │          109 │
│ 12155   │          110 │
│ 8283    │          112 │
│ 81      │          118 │
│ 4608    │          126 │
│ 8282    │          130 │
│ 11427   │          135 │
│ 11251   │          140 │
│ 1487    │          170 │
├─────────┴──────────────┤
│  7105 rows (20 shown)  │
└────────────────────────┘

In [40]:
#Adapted from Deduplicator stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula/21623206
#Calculates speed between two points

def speed(data):
    """Estimate the speed, in metres per second, between consecutive position fixes.

    The fixes are put in chronological order, the great-circle (haversine) distance
    from each fix to the next one is computed, and that distance is divided by the
    elapsed time between the two fixes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``latitude`` and ``longitude`` (decimal degrees)
        and ``timestamp`` (a datetime column). The frame is not modified.

    Returns
    -------
    pandas.Series
        One value per input row, labelled with the input's index and ordered by
        ``timestamp``. Each value is the speed from that fix to the next fix.
        The value is NaN for the last fix (there is no next fix) and wherever the
        elapsed time is not strictly positive, e.g. duplicated timestamps.
    """
    earth_radius_m = 6371 * 1000  # mean Earth radius: 6371 km, expressed in metres

    # Speeds are only meaningful between chronologically adjacent fixes, and the
    # caller's row order is not guaranteed. mergesort keeps ties in input order.
    ordered = data.sort_values('timestamp', kind='mergesort')

    # Work in float64 radians; the source columns may be single precision.
    lat1 = np.radians(ordered['latitude'].astype('float64'))
    lon1 = np.radians(ordered['longitude'].astype('float64'))
    lat2 = lat1.shift(-1)
    lon2 = lon1.shift(-1)

    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    # Rounding can push `a` marginally outside [0, 1], which would make the square
    # roots below return NaN for a perfectly valid pair of points.
    a = a.clip(lower=0.0, upper=1.0)

    distance_m = earth_radius_m * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    elapsed_s = (ordered['timestamp'].shift(-1) - ordered['timestamp']).dt.total_seconds()

    # A zero elapsed time would give inf (or 0/0); report it as missing instead so a
    # later dropna() removes it along with the other undefined speeds.
    return (distance_m / elapsed_s).where(elapsed_s > 0)

def deg2rad(deg):
    """Convert an angle (scalar, array or Series) from degrees to radians."""
    radians_per_degree = np.pi / 180
    return deg * radians_per_degree

In [11]:
# Load the whole rt_position table into a pandas DataFrame. The query has no
# ORDER BY, so the row order is whatever DuckDB happens to return.
position_df = database.sql('SELECT * FROM rt_position').df()

In [170]:
# position_df

In [13]:
# Per-segment speeds for every (trip_id, route_id) group. Only the columns that
# `speed` reads are selected, so the grouping keys are not handed to the applied
# function (pandas 2.2 deprecates applying over the grouping columns). The result
# keeps the same layout as before: trip_id, route_id, level_2 (original row label)
# and the speed in column 0.
speeds = pd.DataFrame(
    position_df
    .groupby(['trip_id', 'route_id'])[['latitude', 'longitude', 'timestamp']]
    .apply(speed)
).reset_index()

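Many trips have only one recorded location, which makes it impossible to calculate a speed for them. The last recorded location of every trip also has no following point to compare against. Both cases produce a missing speed (NaN), so we will have to drop those rows.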

In [15]:
# Preview the per-segment speeds (column 0); level_2 is the row label carried over
# from position_df.
speeds

Unnamed: 0,trip_id,route_id,level_2,0
0,13997801,6612,40483,1.010278
1,13997801,6612,41147,0.056557
2,13997801,6612,41859,0.055394
3,13997801,6612,42558,0.018465
4,13997801,6612,43250,0.046513
...,...,...,...,...
52236,14317324,6617,50660,4.137038
52237,14317324,6617,51224,3.035010
52238,14317324,6617,52156,
52239,14317325,6617,51742,


In [17]:
# Discard the rows whose speed could not be computed, then list the remaining
# segments from slowest to fastest.
speeds = speeds.dropna(axis=0, how='any')
speeds.sort_values(by=0)
# speeds['route_id'].nunique()
#speeds[speeds[0] == 0]

Unnamed: 0,trip_id,route_id,level_2,0
19047,14018995,6657,21203,0.000000e+00
28821,14033876,6704,35167,0.000000e+00
28822,14033876,6704,39347,0.000000e+00
28823,14033876,6704,40045,0.000000e+00
28824,14033876,6704,40738,0.000000e+00
...,...,...,...,...
5913,14003219,6621,5255,4.931975e+05
20556,14021292,6664,39957,5.863188e+05
43823,14056409,38311,15043,7.250348e+05
32416,14037127,6713,19411,1.544070e+06


In [34]:
# Mean segment speed per (trip_id, route_id); the mean stays in column 0.
averages = speeds.groupby(['trip_id', 'route_id'])[0].mean().reset_index()

In [36]:
# Per-trip mean speeds, slowest first, to expose implausible values at both ends.
averages.sort_values(by=0)

Unnamed: 0,trip_id,route_id,0
1588,14037578,29038,0.000000
1506,14035959,6708,0.000000
554,14010840,6637,0.000000
1051,14022783,30037,0.000000
1550,14036756,17707,0.000000
...,...,...,...
1295,14029839,36799,36958.755268
1565,14037127,6713,59703.621794
1199,14027783,6677,65216.837204
258,14003219,6621,116451.037282


Some speeds are clearly wrong: the fastest trip average shown above is about 116,451 m/s (roughly 419,000 km/h!). This appears to be caused by invalid coordinates, such as the `0.0, 0.0` position in the trip shown below, so we will have to remove those too.

In [24]:
# Look at the raw position reports of a single trip to check its coordinates.
position_df.loc[position_df['trip_id'] == '14056422']

Unnamed: 0,trip_id,start_date,schedule_relationship,route_id,direction_id,latitude,longitude,current_stop_sequence,current_status,timestamp,stop_id,vehicle_id,vehicle_label
50228,14056422,2024-11-30,0,38311,1,49.311169,-123.071785,3,2,2024-12-01 04:11:35,4194,19044,19044
51033,14056422,2024-11-30,0,38311,1,0.0,0.0,4,2,2024-12-01 04:16:36,10865,19044,19044
51574,14056422,2024-11-30,0,38311,1,49.311382,-123.08123,4,2,2024-12-01 04:17:38,10865,19044,19044


In [26]:
# Keep only the reports inside a fixed latitude/longitude bounding box (bounds are
# inclusive; presumably chosen to cover the service area — TODO confirm). This
# removes placeholder fixes such as (0.0, 0.0).
position_df = position_df[
    position_df['latitude'].between(49, 49.5)
    & position_df['longitude'].between(-123.5, -122.3)
]

In [38]:
# Recompute the per-segment speeds on the filtered positions. Despite its name,
# `trip_averages` holds one row per consecutive pair of fixes at this point. Only
# the columns that `speed` reads are selected, so the grouping keys are not handed
# to the applied function (pandas 2.2 deprecates applying over the grouping
# columns). Resulting columns: trip_id, route_id, level_2 and the speed in column 0.
trip_averages = pd.DataFrame(
    position_df
    .groupby(['trip_id', 'route_id'])[['latitude', 'longitude', 'timestamp']]
    .apply(speed)
).reset_index()

In [42]:
# Drop the rows whose speed could not be computed, then preview what is left.
trip_averages = trip_averages.dropna()
trip_averages

Unnamed: 0,trip_id,route_id,level_2,0
0,13997801,6612,40483,1.010278
1,13997801,6612,41147,0.056557
2,13997801,6612,41859,0.055394
3,13997801,6612,42558,0.018465
4,13997801,6612,43250,0.046513
...,...,...,...,...
52062,14317323,6617,51087,2.046627
52063,14317323,6617,51624,2.973273
52065,14317324,6617,50068,4.577747
52066,14317324,6617,50660,4.137038


In [44]:
# Mean segment speed per (trip_id, route_id). Despite its name, `route_averages`
# still has one row per trip here; the mean stays in column 0.
route_averages = trip_averages.groupby(['trip_id', 'route_id'])[0].mean().reset_index()

In [46]:
# Per-trip mean speeds after the coordinate filter, slowest first.
route_averages.sort_values(by=0)

Unnamed: 0,trip_id,route_id,0
1343,14032729,6700,0.000000
1881,14044612,6738,0.000000
821,14017652,6651,0.000000
1878,14044570,6738,0.000000
1384,14033842,6704,0.000000
...,...,...,...
1937,14045663,20667,19.091364
1632,14038387,6715,20.663357
963,14021096,6663,20.860456
1607,14038261,6715,21.627020


Some trips show an average speed of less than 1 m/s. These all appear to be very short trips that either didn't leave the station or didn't have their coordinates updated, so we'll drop them.

In [48]:
# Look at the raw position reports of one of the trips whose average speed is 0.
position_df.loc[position_df['trip_id'] == '14044570']

Unnamed: 0,trip_id,start_date,schedule_relationship,route_id,direction_id,latitude,longitude,current_stop_sequence,current_status,timestamp,stop_id,vehicle_id,vehicle_label
47593,14044570,2024-11-30,0,6738,0,49.188751,-122.849152,1,2,2024-12-01 02:12:33,5122,18127,18127
48281,14044570,2024-11-30,0,6738,0,49.188751,-122.849152,1,2,2024-12-01 02:13:33,5122,18127,18127
48968,14044570,2024-11-30,0,6738,0,49.188751,-122.849152,1,2,2024-12-01 02:14:33,5122,18127,18127
49645,14044570,2024-11-30,0,6738,0,49.188751,-122.849152,1,2,2024-12-01 02:15:33,5122,18127,18127


In [50]:
# Keep only the trips whose mean speed (column 0, in m/s) is at least 1.
route_averages = route_averages.loc[route_averages[0].ge(1)]

In [52]:
# Remaining per-trip mean speeds, slowest first.
route_averages.sort_values(by=0)

Unnamed: 0,trip_id,route_id,0
157,14001106,6618,1.023404
96,13999435,6615,1.038347
1790,14043126,6728,1.043438
1585,14037581,29038,1.049638
983,14021497,6665,1.050193
...,...,...,...
1937,14045663,20667,19.091364
1632,14038387,6715,20.663357
963,14021096,6663,20.860456
1607,14038261,6715,21.627020


In [54]:
# Collapse to one row per route: the mean of that route's per-trip mean speeds
# (still in column 0).
route_averages = route_averages.groupby('route_id')[0].mean().reset_index()

In [56]:
# Add the speed in km/h (m/s * 3.6) and give the m/s column a descriptive name.
route_averages = (
    route_averages
    .assign(**{'km/h': route_averages[0] * 3.6})
    .rename(columns={0: 'm/s'})
)

In [406]:
# average_routes.sort_values('km/h')

In [58]:
# Load the route metadata (names, types, ...) from a path relative to the current
# working directory.
# NOTE(review): the merged output further down shows an 'Unnamed: 0' column coming
# from this file, which suggests it was saved with its index — confirm whether
# index_col=0 should be passed here.
routes = pd.read_csv('transit_cleaned/routes.txt')

In [62]:
# Attach the route metadata to the per-route speeds. Only the join key is cast to
# string so that both sides compare equal (route_id is stored as VARCHAR in the
# database; its dtype in the CSV is presumably numeric — verify). Casting the whole
# frames would turn the numeric 'm/s' and 'km/h' columns into text.
routes_speeds = pd.merge(
    route_averages.astype({'route_id': 'string'}),
    routes.astype({'route_id': 'string'}),
    on='route_id',
    how='left',
)

In [64]:
# Make sure km/h is numeric so the ranking is by value rather than by text.
routes_speeds = routes_speeds.astype({'km/h': 'float64'})

### 3 fastest bus routes: 

- 555 (Carvolth Exchange/Lougheed Station)

- 749 (Ruskin/Haney Place)

- 620 (Tsawwassen Ferry/Bridgeport Station)

### 3 slowest bus routes:

- 005 (Robson/Downtown)

- 006 (Davie/Downtown)

- 229 (Lynn Valley/Lonsdale Quay)

In [68]:
# Rank the routes from fastest to slowest by average speed.
routes_speeds.sort_values(by='km/h', ascending=False)

Unnamed: 0.1,route_id,m/s,km/h,Unnamed: 0,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
11,20667,17.48101514689686,62.931655,14,TL,555,Carvolth Exchange/Lougheed Station,,3,,,
162,6805,11.342863889456392,40.834310,202,TL,749,Ruskin/Haney Place,,3,,,
151,6747,10.419751795491194,37.511106,184,TL,620,Tsawwassen Ferry/Bridgeport Station,,3,,,
155,6779,10.171957560733375,36.619047,195,TL,262,Lions Bay/Caulfeild,,3,,,
139,6715,9.749896031618155,35.099626,160,TL,351,White Rock Centre/Bridgeport Station,,3,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
113,6678,2.9665820007023056,10.679695,130,TL,230,Upper Lonsdale/Lonsdale Quay,,3,,,
157,6790,2.849767177014494,10.259162,197,TL,103,New Westminster Station/Quayside,,3,,,
112,6677,2.71412427673272,9.770847,129,TL,229,Lynn Valley/Lonsdale Quay,,3,,,
62,6616,1.949562353036469,7.018424,77,TL,006,Davie/Downtown,,3,,,
