# Introduction

In this file, we would like to answer the question: "how frequently are buses directly mentioned in unplanned alerts as an alternative mode of transport?".

# Load data

In [4]:
import passenger_pred.load as ppl
import passenger_pred.preprocess as ppp
import os
import pandas as pd
import json
import datetime
from datetime import timedelta
import numpy as np

In [5]:
file_names = os.listdir('../data/unplanned_alerts')
file_names = ['../data/unplanned_alerts/' + i for i in file_names if 'json' in i]
len(file_names)

5197

In [6]:
alert_df = ppl.load_alerts(file_names)
alert_df.shape

(265854, 9)

In [7]:
alert_df = ppp.remove_unplanned_alert(alert_df)

In [8]:
alert_df.shape

(35664, 9)

In [9]:
alert_df = alert_df.drop_duplicates()
alert_df.shape

(10032, 9)

In [264]:
alert_df.loc[724].statusDescription

'<p>Northbound [D][N][R] trains are not running between <b>36 St</b> and <b>DeKalb Av</b> while emergency teams respond to someone who was struck by a train at <b>Union St</b>.</p><p style="min-height:10px"></p><p>Extremely limited southbound [D][N][R] service has resumed in Brooklyn.</p><p style="min-height:10px"></p><p>The last stop on northbound [D][N][R] trains will be <b>36 St</b> in Brooklyn, where they\'ll turn back for southbound service.</p><p style="min-height:10px"></p><p>Some southbound [D] trains are ending service at <b>Broadway-Lafayette St</b>, where they\'ll turn back for northbound service.</p><p style="min-height:10px"></p><p>Northbound [N] trains are running on the [Q] line between <b>Coney Island-Stillwell Av</b> and <b>DeKalb Av</b>.</p><p style="min-height:10px"></p><p>Some southbound [R] trains are ending service at <b>Whitehall St-South Ferry</b>, where they\'ll turn back for northbound service.</p><p style="min-height:10px"></p><p>See a station agent to reques

In [266]:
%%html
<p>Northbound [D][N][R] trains are not running between <b>36 St</b> and <b>DeKalb Av</b> while emergency teams respond to someone who was struck by a train at <b>Union St</b>.</p><p style="min-height:10px"></p><p>Extremely limited southbound [D][N][R] service has resumed in Brooklyn.</p><p style="min-height:10px"></p><p>The last stop on northbound [D][N][R] trains will be <b>36 St</b> in Brooklyn, where they\'ll turn back for southbound service.</p><p style="min-height:10px"></p><p>Some southbound [D] trains are ending service at <b>Broadway-Lafayette St</b>, where they\'ll turn back for northbound service.</p><p style="min-height:10px"></p><p>Northbound [N] trains are running on the [Q] line between <b>Coney Island-Stillwell Av</b> and <b>DeKalb Av</b>.</p><p style="min-height:10px"></p><p>Some southbound [R] trains are ending service at <b>Whitehall St-South Ferry</b>, where they\'ll turn back for northbound service.</p><p style="min-height:10px"></p><p>See a station agent to request a free courtesy pass for local <strong>B37</strong>, <strong>B63</strong>, <strong>B64</strong>, <strong>B68</strong>, or <strong>B103</strong> buses. Track buses near you at <a target="_blank" href="//bt.mta.info" rel="noopener noreferrer nofollow">bt.mta.info</a></p>

In [11]:
alert_df.to_csv("unplanned_alert_20211028_20211201.csv")

In [12]:
unique_alerts = alert_df.statusDescription.unique()
len(unique_alerts)

2825

In [45]:
unique_alerts = [i.replace(",", " ") for i in unique_alerts]

In [78]:
[i for i in unique_alerts if "M7 " in i][0]

'<p>[N] trains are delayed and partially suspended while FDNY responds to a track fire in the Coney Island train <a target="_blank" href="//yard.There" rel="noopener noreferrer nofollow">yard.</a></p><p style="min-height:10px"></p><p><a target="_blank" href="//yard.There" rel="noopener noreferrer nofollow">There</a> are no [N] trains running between <b>36 St</b> (Bklyn) and <b>Coney Island-Stillwell Av</b>.</p><p style="min-height:10px"></p><p>Travel Alternatives</p><p>For service between <b>45 St</b> and <b>Coney Island-Stillwell Av</b>  take nearby B4  B6  B9  B11 service for [D] or [F] trains. </p><p style="min-height:10px"></p><p style="min-height:10px"></p>'

In [79]:
%%html
'<p>[N] trains are delayed and partially suspended while FDNY responds to a track fire in the Coney Island train <a target="_blank" href="//yard.There" rel="noopener noreferrer nofollow">yard.</a></p><p style="min-height:10px"></p><p><a target="_blank" href="//yard.There" rel="noopener noreferrer nofollow">There</a> are no [N] trains running between <b>36 St</b> (Bklyn) and <b>Coney Island-Stillwell Av</b>.</p><p style="min-height:10px"></p><p>Travel Alternatives</p><p>For service between <b>45 St</b> and <b>Coney Island-Stillwell Av</b>  take nearby M3  B6  B9  B11 service for [D] or [F] trains. </p><p style="min-height:10px"></p><p style="min-height:10px"></p>'

## Get all routes

In [51]:
from google.transit import gtfs_realtime_pb2
from urllib.request import Request, urlopen

BUS_API = "410b671c-ff0e-41d5-bf67-77a0e5aba917"

feed = gtfs_realtime_pb2.FeedMessage()
GTFS_BUS_URL = "http://gtfsrt.prod.obanyc.com/tripUpdates?key=" + BUS_API
req = Request(GTFS_BUS_URL)
feed.ParseFromString(urlopen(req).read())

3540397

In [60]:
routes = set()

for route in feed.entity:
    id = route.trip_update.trip.route_id
    routes.add(id)

len(routes)

300

In [69]:
routes = {route + " " for route in routes}

In [70]:
len(routes)

300

In [92]:
{i for i in routes if "B6" in i}

{'B6 ', 'B60 ', 'B61 ', 'B62 ', 'B63 ', 'B64 ', 'B65 ', 'B67 ', 'B68 ', 'B69 '}

## Find mention of routes

In [95]:
from collections import defaultdict

mention_count = 0
mentioned_routes = defaultdict(lambda: 0)

for alert in unique_alerts:
    for route in routes:
        if route in alert:
            if route == "B6 ":
                print(alert)
            mentioned_routes[route] += 1
            mention_count += 1

mention_count

<p>[N] trains are delayed and partially suspended while FDNY responds to a track fire in the Coney Island train <a target="_blank" href="//yard.There" rel="noopener noreferrer nofollow">yard.</a></p><p style="min-height:10px"></p><p><a target="_blank" href="//yard.There" rel="noopener noreferrer nofollow">There</a> are no [N] trains running between <b>36 St</b> (Bklyn) and <b>Coney Island-Stillwell Av</b>.</p><p style="min-height:10px"></p><p>Travel Alternatives</p><p>For service between <b>45 St</b> and <b>Coney Island-Stillwell Av</b>  take nearby B4  B6  B9  B11 service for [D] or [F] trains. </p><p style="min-height:10px"></p><p style="min-height:10px"></p>
<p>[N] trains are delayed and partially suspended while FDNY responds to a track fire in the Coney Island train <a target="_blank" href="//yard.There" rel="noopener noreferrer nofollow">yard.</a></p><p style="min-height:10px"></p><p><a target="_blank" href="//yard.There" rel="noopener noreferrer nofollow">There</a> are no [N] tr

58

In [96]:
round(mention_count / len(unique_alerts) * 100, 2)

2.05

We find that only 2% of alerts mention a bus route. This is not a whole lot. 

In [98]:
route_count = pd.Series(mentioned_routes)
route_count.describe()

count    29.00000
mean      2.00000
std       1.46385
min       1.00000
25%       1.00000
50%       1.00000
75%       2.00000
max       6.00000
dtype: float64

We find 54 unique bus routes mentioned. One bus route is mentioned 16 times. This would probably be an interesting one to investigate.

In [99]:
route_count[route_count == route_count.max()]

M7     6
dtype: int64

In [127]:
route_count.sort_values(ascending=False).head(10)

M7       6
M5       5
M3       5
B4       4
M100     4
B6       3
Q66      3
Q24      2
B11      2
B9       2
dtype: int64

Let's take a look at the alerts for which the M3 has been mentioned.

In [80]:
unique_alert_df = alert_df.drop_duplicates(subset=['statusDescription'])

In [82]:
unique_alert_df.shape

(2825, 9)

In [124]:
unique_alert_df.statusDescription = unique_alert_df.statusDescription.str.replace(",", " ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [209]:
M3 = unique_alert_df[unique_alert_df.statusDescription.str.contains("M100 ", regex=False)]
M3.shape

(4, 9)

For how long were these alerts active? Note that `endDate` is `None` in every case. This is due to the fact that for these unplanned disruptions, it is unknown at what time it ended - they will simply have disappeared at some point. We thus need to infer this from the original data.

In [210]:
file_name[34:44]

'1636692601'

In [179]:
time_alerts = []

for file_name in file_names:
    time = file_name[34:44]
    with open (file_name, 'r') as f:
        data = json.load(f)
        for dic in data:
            dic['time'] = time
        time_alerts += data

In [156]:
time_df = pd.DataFrame(time_alerts)
time_df.shape

(265854, 10)

In [157]:
alert_times = time_df.groupby("id").time.unique()
alert_times.head()

id
lmm:alert:103638                 [1635356402, 1635356173, 1635357002]
lmm:alert:103648     [1635356402, 1635356173, 1635357002, 1635357602]
lmm:alert:103650    [1635356402, 1635357002, 1635358802, 163535820...
lmm:alert:103656                                         [1635357602]
lmm:alert:103657                                         [1635358202]
Name: time, dtype: object

In [158]:
time_df = time_df.merge(alert_times, on='id')
time_df.shape

(265854, 11)

In [159]:
time_df = time_df.rename({"time_x": "time", "time_y": "time_list"}, axis=1)
time_df.columns

Index(['startDate', 'direction', 'endDate', 'statusSummary', 'id', 'priority',
       'line', 'creationDate', 'statusDescription', 'time', 'time_list'],
      dtype='object')

In [160]:
time_df['end_time'] = time_df.time_list.apply(max)

In [161]:
time_df = time_df[['id', 'end_time']].drop_duplicates(subset='id')

In [211]:
M3_time = M3.merge(time_df[['id', 'end_time']], on='id',)
M3_time.shape

(4, 10)

In [212]:
M3_time.end_time

0    1638328802
1    1638325801
2    1638330602
3    1637172602
Name: end_time, dtype: object

In [213]:
M3_time['endDate'] = M3_time.end_time.astype(int).apply(lambda x: datetime.datetime.fromtimestamp(x))

In [214]:
M3_time['startDate'] = pd.to_datetime(M3_time.startDate)

In [215]:
M3_time['endDate'] = M3_time.endDate.dt.tz_localize('EST')

In [216]:
M3_time['endDate'] = M3_time.endDate.dt.tz_convert('UTC')

In [217]:
M3_time['startDate'] = pd.to_datetime(M3_time.startDate, utc=True)

In [218]:
M3_time['endDate'] = M3_time.endDate + timedelta(minutes=10)

In [219]:
M3_time.endDate - M3_time.startDate

0   0 days 00:59:54
1   0 days 00:59:07
2   0 days 00:29:28
3   0 days 00:14:10
dtype: timedelta64[ns]

In [220]:
M3_time['alert_minutes'] = (M3_time.endDate - M3_time.startDate) / pd.Timedelta(minutes=1)
M3_time.alert_minutes

0    59.900000
1    59.116667
2    29.466667
3    14.166667
Name: alert_minutes, dtype: float64

In [172]:
pd.to_datetime(alert_df.startDate.max()) - pd.to_datetime(alert_df.startDate.min())

TypeError: Timestamp subtraction must have the same timezones or no timezones

In [221]:
alert_df.startDate.min()

'2021-10-27T12:43:24-0400'

In [222]:
total_minutes = (pd.to_datetime(alert_df.startDate.max()).tz_convert("EST") - pd.to_datetime(alert_df.startDate.min()).tz_convert("EST")) / pd.Timedelta(minutes=1)
total_minutes

51991.066666666666

In [223]:
M3_time.alert_minutes.sum() / total_minutes * 100

0.312842206225172

In [224]:
M3_time.to_csv("M100.csv")

So, we find that for about 0.7% of 'time', there has been an alert active in which there was a mention of the M3 line as a rerouting. This number is relatively low, and that is for the line for which the alerts were most frequent. It thus remains an open question whether adding a 'disruption' feature to our dataset would be useful. 