# Introduction

In this file, we would like to answer the question: "how frequently are buses directly mentioned in unplanned alerts as an alternative mode of transport?".

# Load data

In [8]:
import passenger_pred.load as ppl
import passenger_pred.preprocess as ppp
import os
import pandas as pd
import json
import datetime
from datetime import timedelta
import numpy as np

In [11]:
file_names = os.listdir('../data/unplanned_alerts')
file_names = ['../data/unplanned_alerts/' + i for i in file_names if 'json' in i]

In [12]:
alert_df = ppl.load_alerts(file_names)
alert_df.shape

(187250, 9)

In [13]:
alert_df = ppp.remove_unplanned_alert(alert_df)

In [14]:
alert_df.shape

(27144, 9)

In [15]:
alert_df = alert_df.drop_duplicates()
alert_df.shape

(7258, 9)

In [16]:
alert_df.to_csv("unplanned_alert_20211028_20211121.csv")

In [17]:
unique_alerts = alert_df.statusDescription.unique()
len(unique_alerts)

2044

## Get all routes

In [18]:
from google.transit import gtfs_realtime_pb2
from urllib.request import Request, urlopen

BUS_API = "410b671c-ff0e-41d5-bf67-77a0e5aba917"

feed = gtfs_realtime_pb2.FeedMessage()
GTFS_BUS_URL = "http://gtfsrt.prod.obanyc.com/tripUpdates?key=" + BUS_API
req = Request(GTFS_BUS_URL)
feed.ParseFromString(urlopen(req).read())

2214916

In [19]:
routes = set()

for route in feed.entity:
    id = route.trip_update.trip.route_id
    routes.add(id)

len(routes)

252

## Find mention of routes

In [20]:
from collections import defaultdict

mention_count = 0
mentioned_routes = defaultdict(lambda: 0)

for alert in unique_alerts:
    for route in routes:
        if route in alert:
            mentioned_routes[route] += 1
            mention_count += 1

mention_count

213

We find that around 10% of alerts mention a bus route. This is promising.

In [21]:
route_count = pd.Series(mentioned_routes)
route_count.describe()

count    55.000000
mean      3.872727
std       3.266296
min       1.000000
25%       1.000000
50%       3.000000
75%       6.000000
max      16.000000
dtype: float64

We find 54 unique bus routes mentioned. One bus route is mentioned 16 times. This would probably be an interesting one to investigate.

In [22]:
route_count[route_count == route_count.max()]

B6    16
dtype: int64

Let's take a look at the alerts for which the B6 has been mentioned.

In [23]:
unique_alert_df = alert_df.drop_duplicates(subset=['statusDescription'])

In [24]:
B6 = unique_alert_df[unique_alert_df.statusDescription.str.contains("B6")]
B6.shape

(16, 9)

In [25]:
B6.statusDescription.iloc[0]

'<p>Northbound [D][N][R] trains are not running between <b>36 St</b> and <b>DeKalb Av</b> while emergency teams respond to someone who was struck by a train at <b>Union St</b>.</p><p style="min-height:10px"></p><p>Extremely limited southbound [D][N][R] service has resumed in Brooklyn.</p><p style="min-height:10px"></p><p>The last stop on northbound [D][N][R] trains will be <b>36 St</b> in Brooklyn, where they\'ll turn back for southbound service.</p><p style="min-height:10px"></p><p>Some southbound [D] trains are ending service at <b>Broadway-Lafayette St</b>, where they\'ll turn back for northbound service.</p><p style="min-height:10px"></p><p>Northbound [N] trains are running on the [Q] line between <b>Coney Island-Stillwell Av</b> and <b>DeKalb Av</b>.</p><p style="min-height:10px"></p><p>Some southbound [R] trains are ending service at <b>Whitehall St-South Ferry</b>, where they\'ll turn back for northbound service.</p><p style="min-height:10px"></p><p>See a station agent to reques

In [26]:
%%html

<p>Northbound [D][N][R] trains are not running between <b>36 St</b> and <b>DeKalb Av</b> while emergency teams respond to someone who was struck by a train at <b>Union St</b>.</p><p style="min-height:10px"></p><p>Extremely limited southbound [D][N][R] service has resumed in Brooklyn.</p><p style="min-height:10px"></p><p>The last stop on northbound [D][N][R] trains will be <b>36 St</b> in Brooklyn, where they\'ll turn back for southbound service.</p><p style="min-height:10px"></p><p>Some southbound [D] trains are ending service at <b>Broadway-Lafayette St</b>, where they\'ll turn back for northbound service.</p><p style="min-height:10px"></p><p>Northbound [N] trains are running on the [Q] line between <b>Coney Island-Stillwell Av</b> and <b>DeKalb Av</b>.</p><p style="min-height:10px"></p><p>Some southbound [R] trains are ending service at <b>Whitehall St-South Ferry</b>, where they\'ll turn back for northbound service.</p><p style="min-height:10px"></p><p>See a station agent to request a free courtesy pass for local <strong>B37</strong>, <strong>B63</strong>, <strong>B64</strong>, <strong>B68</strong>, or <strong>B103</strong> buses. Track buses near you at <a target="_blank" href="//bt.mta.info" rel="noopener noreferrer nofollow">bt.mta.info</a></p>

In [27]:
B6.statusDescription.iloc[1]

'<p>[N] [Q] [R] [W] trains are <strong>rerouted</strong> and experiencing <strong>major delays </strong>because of multiple incidents:</p><ul><li><p>We\'re working to repair a signal problem that has activated multiple trains\' brakes in the tunnel from <strong>Queens</strong> to <strong>Manhattan</strong>.</p></li><li><p>We\'re working to get trains back on schedule after someone was struck by a train on the <strong>Manhattan Bridge</strong></p></li></ul><p>Here are details on how service is running:</p><p style="min-height:10px"></p><p>[N] <strong>Service</strong></p><p>Southbound [N] trains are running at slow speeds in the tunnel from <strong>Queens</strong> to <strong>Manhattan</strong>.</p><p style="min-height:10px"></p><p>[Q] <strong>Service</strong></p><p>Trains are experiencing major delays in both directions.</p><p style="min-height:10px"></p><p>[R]<strong> Service</strong></p><p>There is no southbound [R] service at <b>Queens Plaza</b>, <b>Lexington Av/59 St</b> or <b>5 Av/5

In [28]:
%%html
'<p>[N] [Q] [R] [W] trains are <strong>rerouted</strong> and experiencing <strong>major delays </strong>because of multiple incidents:</p><ul><li><p>We\'re working to repair a signal problem that has activated multiple trains\' brakes in the tunnel from <strong>Queens</strong> to <strong>Manhattan</strong>.</p></li><li><p>We\'re working to get trains back on schedule after someone was struck by a train on the <strong>Manhattan Bridge</strong></p></li></ul><p>Here are details on how service is running:</p><p style="min-height:10px"></p><p>[N] <strong>Service</strong></p><p>Southbound [N] trains are running at slow speeds in the tunnel from <strong>Queens</strong> to <strong>Manhattan</strong>.</p><p style="min-height:10px"></p><p>[Q] <strong>Service</strong></p><p>Trains are experiencing major delays in both directions.</p><p style="min-height:10px"></p><p>[R]<strong> Service</strong></p><p>There is no southbound [R] service at <b>Queens Plaza</b>, <b>Lexington Av/59 St</b> or <b>5 Av/59 St</b>.</p><p style="min-height:10px"></p><p>Southbound [R] trains are running on the [F] line from <b>36 St</b> (Queens) to <b>Lexington Av/63 St</b> and then on the [Q] line to <b>57 St-7 Av</b>.</p><p style="min-height:10px"></p><p>[W]<strong> Service</strong></p><p>[W] service is extremely limited.</p><p style="min-height:10px"></p><p><strong>Alternatives - Between Queens and Manhattan </strong>[N] [R] [W]</p><p>Take <strong>M60-SBS</strong> buses between <b>Astoria Blvd</b> and [A] [B] [C] [D], [2] [3], [4] [5] [6] stations on <strong>125 St</strong>.</p><p style="min-height:10px"></p><p>Take <strong>M102</strong> buses for local service in <strong>Astoria</strong>.</p><p style="min-height:10px"></p><p>Transfer to/from [7] trains at <b>Queensboro Plaza</b> for service to/from <strong>Manhattan</strong>.</p><p style="min-height:10px"></p><p>For <b>Queens Plaza</b>, take \u200c[M] trains by transferring at \u200c<strong>36 St\u200c </strong>or \u200c<strong>34 St-Herald Sq\u200c</strong>.</p><p style="min-height:10px"></p><p><strong>Alternatives- Manhattan</strong></p><p>Take nearby [4] [5] [6], [B] [D] [F] [M] and [1] [2] [3] trains. Transfer at <b>Lexington Av/59 St</b>, <b>34 St-Herald Sq</b> and <b>Times Sq-42 St</b>.</p><p style="min-height:10px"></p><p>For service to/from<strong> \u200cLexington Av/59 St\u200c</strong>, take \u200c[6] trains to/from \u200c<strong>51 St\u200c </strong>for \u200c[M] service or \u200c<strong>Grand Central-42 St\u200c </strong>for \u200c[7] service.</p><p style="min-height:10px"></p><p>For service to \u200c<strong>5 Av/59 St\u200c</strong>, use nearby \u200c<b>57 St</b> ([F]) where southbound \u200c[R] service is rerouted.</p><p style="min-height:10px"></p><p><strong>Alternatives - Brooklyn</strong></p><p>For [Q] service, take [B] [2] [5] trains.</p><p style="min-height:10px"></p><p>For [N] service, take [D] trains.</p><p style="min-height:10px"></p><p>For [R] service, take <strong>B37</strong> and <strong>B63</strong> buses.</p>'

For how long were these alerts active? Note that `endDate` is `None` in every case. This is due to the fact that for these unplanned disruptions, it is unknown at what time it ended - they will simply have disappeared at some point. We thus need to infer this from the original data.

In [29]:
file_name[34:44]

NameError: name 'file_name' is not defined

In [30]:
time_alerts = []

for file_name in file_names:
    time = file_name[34:44]
    with open (file_name, 'r') as f:
        data = json.load(f)
        for dic in data:
            dic['time'] = time
        time_alerts += data

In [31]:
time_df = pd.DataFrame(time_alerts)
time_df.shape

(187250, 10)

In [32]:
alert_times = time_df.groupby("id").time.unique()
alert_times.head()

id
lmm:alert:103638                 [1635356402, 1635356173, 1635357002]
lmm:alert:103648     [1635356402, 1635356173, 1635357002, 1635357602]
lmm:alert:103650    [1635356402, 1635357002, 1635358802, 163535820...
lmm:alert:103656                                         [1635357602]
lmm:alert:103657                                         [1635358202]
Name: time, dtype: object

In [33]:
time_df = time_df.merge(alert_times, on='id')
time_df.shape

(187250, 11)

In [34]:
time_df = time_df.rename({"time_x": "time", "time_y": "time_list"}, axis=1)
time_df.columns

Index(['startDate', 'direction', 'endDate', 'statusSummary', 'id', 'priority',
       'line', 'creationDate', 'statusDescription', 'time', 'time_list'],
      dtype='object')

In [35]:
time_df['end_time'] = time_df.time_list.apply(max)

In [36]:
time_df = time_df[['id', 'end_time']].drop_duplicates(subset='id')

In [37]:
B6_time = B6.merge(time_df[['id', 'end_time']], on='id',)
B6_time.shape

(16, 10)

In [38]:
B6_time.end_time

0     1636852802
1     1637621402
2     1636845001
3     1635407402
4     1636882202
5     1636978202
6     1636957202
7     1636399201
8     1637616002
9     1637430002
10    1636980002
11    1636847401
12    1637617202
13    1637618402
14    1636401002
15    1637616602
Name: end_time, dtype: object

In [39]:
B6_time['endDate'] = B6_time.end_time.astype(int).apply(lambda x: datetime.datetime.fromtimestamp(x))

In [40]:
B6_time['startDate'] = pd.to_datetime(B6_time.startDate)

In [41]:
B6_time['endDate'] = B6_time.endDate.dt.tz_localize('EST')

In [42]:
B6_time['endDate'] = B6_time.endDate.dt.tz_convert('UTC')

In [43]:
B6_time['startDate'] = pd.to_datetime(B6_time.startDate, utc=True)

In [44]:
B6_time['endDate'] = B6_time.endDate + timedelta(minutes=10)

In [45]:
B6_time.endDate - B6_time.startDate

0    0 days 01:39:08
1    0 days 01:00:02
2    0 days 00:19:19
3    0 days 01:52:14
4    0 days 01:52:47
5    0 days 00:38:55
6    0 days 01:15:34
7    0 days 00:13:03
8    0 days 00:28:38
9    0 days 00:34:38
10   0 days 00:35:34
11   0 days 00:45:56
12   0 days 00:14:38
13   0 days 00:25:05
14   0 days 00:15:48
15   0 days 00:12:59
dtype: timedelta64[ns]

In [46]:
B6_time['alert_minutes'] = (B6_time.endDate - B6_time.startDate) / pd.Timedelta(minutes=1)
B6_time.alert_minutes

0      99.133333
1      60.033333
2      19.316667
3     112.233333
4     112.783333
5      38.916667
6      75.566667
7      13.050000
8      28.633333
9      34.633333
10     35.566667
11     45.933333
12     14.633333
13     25.083333
14     15.800000
15     12.983333
Name: alert_minutes, dtype: float64

In [47]:
pd.to_datetime(alert_df.startDate.max()) - pd.to_datetime(alert_df.startDate.min())

TypeError: Timestamp subtraction must have the same timezones or no timezones

In [48]:
alert_df.startDate.min()

'2021-10-27T12:43:24-0400'

In [49]:
total_minutes = (pd.to_datetime(alert_df.startDate.max()).tz_convert("EST") - pd.to_datetime(alert_df.startDate.min()).tz_convert("EST")) / pd.Timedelta(minutes=1)
total_minutes

37990.53333333333

In [50]:
B6_time.alert_minutes.sum() / total_minutes * 100

1.959172285025392

In [52]:
B6_time.to_csv("B6.csv")

So, we find that for about 2% of 'time', there has been an alert active in which there was a mention of the B6 line as a rerouting. This number is relatively low, and that is for the line for which the alerts were most frequent. It thus remains an open question whether adding a 'disruption' feature to our dataset would be useful. 