In [1]:
import os

# The working directory must be the project root before the `src.*` imports
# below can be resolved, so this call stays ahead of them.
os.chdir("/home/canyon/Bus-Weather-Impacts")
# Star import kept first so the explicit imports that follow still take
# precedence over any name it brings in.
from src.utils import *

import contextily as ctx
import geopandas as gpd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import osmnx as ox
import pandas as pd
from geopy.distance import geodesic
from shapely.ops import linemerge
from sklearn.neighbors import KDTree

from src.api import parse_zipped_gtfs
from src.gtfs_segments import GTFS_shape_processor

# Plotting grammar (ggplot, aes, geom_*, ...) used by the plotting cells.
from plotnine import *

pd.options.mode.chained_assignment = None
# The second float_format setting below replaces the first one.
pd.set_option('display.float_format', '{:.02f}'.format)
pd.set_option('display.float_format', lambda x: '%.6f' % x)
pd.set_option('display.max_columns', None)

In [2]:
# Socrata (SODA) JSON endpoint for dataset 58t6-89vi on data.ny.gov.
# `requests.get` needs the address as a string, and `response.json()` below
# needs the `.json` resource rather than the `.csv` one.
url = "https://data.ny.gov/resource/58t6-89vi.json"

# SoQL query parameters: rows with 2023-10-01 <= Timestamp < 2023-10-31.
# Timestamp literals are written in ISO 8601 form (no space before the `T`).
params = {
    "$where": "Timestamp >= '2023-10-01T00:00:00.000' AND Timestamp < '2023-10-31T00:00:00.000'",
    "$limit": 1000000000,
}

# NOTE(review): `requests` is not imported explicitly in this notebook;
# presumably it arrives through `from src.utils import *` - confirm.
response = requests.get(url, params=params)

# Fail loudly on an HTTP error instead of printing the status code and
# leaving `mta_data` undefined for the cells that follow.
response.raise_for_status()
mta_data = pd.DataFrame(response.json())
# NOTE(review): later cells read `mta_speeds`, which no visible cell defines;
# confirm whether it is meant to be this frame.

In [3]:
# Speed records returned by the project helper for the two date bounds.
# NOTE(review): `days` and `flood_days` further down reference
# 2023-09-25 .. 2023-09-29, which lie outside these bounds - confirm the range.
gtfs_speed = read_speeds_between_dates(
    "2023-10-01",
    "2023-10-30",
)

In [4]:
# Archived GTFS downloads on transitfeeds.com (MTA feeds 80-85, snapshots
# dated 2023-09-18 / 2023-09-19 according to the URLs).
feeds = [
    "https://transitfeeds.com/p/mta/80/20230918/download",
    "https://transitfeeds.com/p/mta/81/20230918/download",
    "https://transitfeeds.com/p/mta/83/20230918/download",
    "https://transitfeeds.com/p/mta/82/20230919/download",
    "https://transitfeeds.com/p/mta/84/20230919/download",
    "https://transitfeeds.com/p/mta/85/20230918/download",
]
nyc_gtfs_trips = []
nyc_stops = []  # never filled in the visible cells; kept in case other cells use it
for feed in feeds:
    gtfs = parse_zipped_gtfs(feed)
    nyc_gtfs_trips.append(gtfs['trips.txt'])

# One row per (route_id, trip_id). Without de-duplication, a pair present in
# more than one feed would multiply the matching rows in the later
# `gtfs_speed.merge(full_trips)`.
full_trips = (
    pd.concat(nyc_gtfs_trips)[["route_id", "trip_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

full_trips.to_parquet("/home/data/bus-weather/nyc_trip_shapes.parquet")

In [5]:
# Keep rows whose speed is strictly between 0 and 70 mph, then join the
# trip/route lookup on whatever columns the two frames have in common.
gtfs_speed = pd.merge(
    gtfs_speed.loc[lambda d: (d["speed_mph"] < 70) & (d["speed_mph"] > 0)],
    full_trips,
)

In [6]:
# Calendar features derived from the interpolated timestamp of each record.
interp_ts = gtfs_speed["interpolated_time"]
gtfs_speed["hour_of_day"] = interp_ts.dt.hour
gtfs_speed["month"] = interp_ts.dt.month
gtfs_speed["year"] = interp_ts.dt.year
gtfs_speed["day_of_week"] = interp_ts.dt.day_name()
gtfs_speed["date"] = pd.to_datetime(interp_ts.dt.date)

# Timestamp truncated to the hour: the calendar date plus the hour offset.
hour_offset = pd.to_timedelta(gtfs_speed['hour_of_day'], unit='h')
gtfs_speed['combined_datetime'] = gtfs_speed['date'] + hour_offset

In [7]:
# Mean speed and number of distinct trips per route / stop pair / hour /
# weekday / month, with the stop columns renamed to the `timepoint_*` /
# `next_timepoint_*` names used by `hourly_speeds_mta`.
hourly_speeds_gtfs = (
    gtfs_speed
    .groupby(
        [
            "route_id",
            "year",
            "month",
            "hour_of_day",
            "day_of_week",
            "stop_id",
            "prev_stop_id",
            "stop_name",
            "prev_stop_name",
        ]
    )
    .agg(
        avg_speed_gtfs=("speed_mph", "mean"),
        gtfs_bus_count=("unique_trip_id", "nunique"),
    )
    .reset_index()
    .rename(
        columns={
            "stop_id": "next_timepoint_stop_id",
            "prev_stop_id": "timepoint_stop_id",
            "prev_stop_name": "timepoint_stop_name",
            "stop_name": "next_timepoint_stop_name",
        }
    )
)

In [8]:
# Columns of the MTA speed table needed for the comparison with the
# GTFS-derived speeds.
# NOTE(review): `mta_speeds` is not defined in any visible cell - confirm
# where it is loaded.
hourly_speeds_mta = mta_speeds.loc[
    :,
    [
        "route_id",
        "year",
        "month",
        "hour_of_day",
        "day_of_week",
        "timepoint_stop_id",
        "next_timepoint_stop_id",
        "timepoint_stop_name",
        "next_timepoint_stop_name",
        "average_road_speed",
        "bus_trip_count",
    ],
]

In [9]:
# Inner join on every column name the two hourly tables share, so only rows
# present in both sources remain.
test_merged = pd.merge(
    hourly_speeds_gtfs,
    hourly_speeds_mta,
    how="inner",
)

In [None]:
# 2x2 correlation matrix between the MTA road speed and the GTFS-derived speed.
np.corrcoef(
    x=test_merged["average_road_speed"],
    y=test_merged["avg_speed_gtfs"],
)

In [11]:
# Distinct route ids present in the MTA table.
mta_routes = hourly_speeds_mta["route_id"].drop_duplicates()

In [12]:
# Distinct route ids present in the GTFS-derived table.
gtfs_routes = hourly_speeds_gtfs["route_id"].drop_duplicates()

In [None]:
# Build the segment table of every feed and stack them into one frame.
# NOTE(review): 4326 and 2263 are presumably the source and target EPSG
# codes expected by GTFS_shape_processor - confirm against its signature.
segments = [
    GTFS_shape_processor(feed_url, 4326, 2263).process_shapes()
    for feed_url in feeds
]

full_segments = pd.concat(segments)

In [14]:
# Persist the segments without the per-trip column, one row per distinct
# combination of the remaining columns.
(
    full_segments
    .drop(columns="trip_id")
    .drop_duplicates()
    .to_parquet("/home/data/bus-weather/bus_segments_2023_09_18.parquet")
)

In [15]:
# GeoJSON export of dataset d3qk-pfyz from data.cityofnewyork.us, reprojected
# to EPSG:2263.
ntas_url = "https://data.cityofnewyork.us/api/geospatial/d3qk-pfyz?method=export&format=GeoJSON"
ntas = gpd.read_file(ntas_url)
ntas = ntas.to_crs(epsg=2263)

In [None]:
# Intersect the segments with the NTA polygons and keep the first resulting
# row for each (stop_id, prev_stop_id) pair.
full_seg_nta = gpd.overlay(full_segments, ntas).drop_duplicates(
    subset=["stop_id", "prev_stop_id"]
)

In [52]:
# Attach the NTA name, code and borough of each stop pair to the speed
# records (join on the shared columns), then drop exact duplicate rows.
gtfs_speed = pd.merge(
    gtfs_speed,
    full_seg_nta[["stop_id", "prev_stop_id", "ntaname", "ntacode", "boroname"]],
).drop_duplicates()

In [98]:
# Hourly mean speed and distinct-trip count per NTA; hours with five or
# fewer distinct trips are discarded.
hourly_speeds = (
    gtfs_speed
    .groupby(
        [
            "combined_datetime",
            "ntaname",
            "boroname",
            "ntacode",
            "date",
            "day_of_week",
            "hour_of_day",
        ]
    )
    .agg(
        avg_speed_gtfs=("speed_mph", "mean"),
        gtfs_bus_count=("unique_trip_id", "nunique"),
    )
    .reset_index()
    .loc[lambda d: d["gtfs_bus_count"] > 5]
)

In [None]:
# Density of the hourly NTA-level GTFS speeds, one curve per weekday.
(
    ggplot(
        hourly_speeds,
        aes(x="avg_speed_gtfs", color="day_of_week"),
    )
    + geom_density()
)

In [None]:
# Density of the MTA road speeds for month 10, one curve per weekday.
(
    ggplot(
        mta_speeds[mta_speeds["month"] == 10],
        aes(x="average_road_speed", color="day_of_week"),
    )
    + geom_density()
)

In [99]:
# NTA codes treated as the flood zone in the cells below.
codes = [
    "BK73",
    "BK72",
    "BK61",
    "BK63",
    "BK91",
    "BK42",
]
# Dates 2023-09-25 .. 2023-09-29 as ISO strings.
days = [f"2023-09-{day_of_month}" for day_of_month in range(25, 30)]

In [102]:
# Hours 10 through 16, and the matching hourly timestamps on 2023-09-29.
comparison_hours = list(range(10, 17))
flood_days = [f"2023-09-29 {flood_hour}:00:00" for flood_hour in range(10, 17)]

In [None]:
# Weekday-only records with two boolean flags:
#   flood      - the record's hour is one of the `flood_days` timestamps
#   flood_zone - the record's NTA code is one of `codes`
# `flood_days` holds strings while `combined_datetime` is datetime64, so the
# strings are converted explicitly instead of relying on `isin` to cast them.
# `.assign` builds the flags on a fresh frame rather than writing into the
# result of `.query`.
gtfs_flood = (
    gtfs_speed
    .query("~day_of_week.isin(['Sunday', 'Saturday'])")
    .assign(
        flood=lambda d: d["combined_datetime"].isin(pd.to_datetime(flood_days)),
        flood_zone=lambda d: d["ntacode"].isin(codes),
    )
)

In [None]:
# Speed density for Brooklyn records, coloured by flood-zone membership and
# faceted by the flood flag.
# The sample is capped at the number of available rows (a plain
# `sample(100000)` raises when fewer rows exist) and seeded so the figure is
# reproducible across runs.
(
    ggplot(
        gtfs_flood
        .query("boroname == 'Brooklyn'")
        .pipe(lambda d: d.sample(n=min(100000, len(d)), random_state=0)),
        aes(x="speed_mph", color="flood_zone"),
    )
    + geom_density()
    + facet_wrap("~flood")
    + theme(figure_size=(10, 10))
)

In [None]:
# Speed box plots during the comparison hours, split by flood-zone
# membership and flood flag, one panel per borough; y axis limited to 0-20.
(
    ggplot(
        gtfs_flood[gtfs_flood["hour_of_day"].isin(comparison_hours)],
        aes(x="flood_zone", y="speed_mph", color="flood"),
    )
    + geom_boxplot()
    + facet_wrap("~boroname")
    + theme(figure_size=(10, 10))
    + ylim(0, 20)
)

In [None]:
# Speed density coloured by the flood flag, one panel per borough.
# The sample is capped at the number of available rows (a plain
# `sample(100000)` raises when fewer rows exist) and seeded so the figure is
# reproducible across runs.
(
    ggplot(
        gtfs_flood.sample(n=min(100000, len(gtfs_flood)), random_state=0),
        aes(x="speed_mph", color="flood"),
    )
    + geom_density()
    + facet_wrap("~boroname")
    + theme(figure_size=(10, 10))
)

In [None]:
# Hourly distinct-trip counts for the flood-zone NTAs on the dates in `days`,
# one line per NTA.
# `days` holds strings while `date` is datetime64, so the strings are
# converted explicitly before the membership test. The plotnine import that
# used to sit in this cell belongs to the import cell at the top of the
# notebook, since earlier cells already call `ggplot`.
(
    ggplot(
        hourly_speeds.loc[
            lambda d: d["ntacode"].isin(codes) & d["date"].isin(pd.to_datetime(days))
        ],
        aes(x="combined_datetime", y="gtfs_bus_count", color="ntaname"),
    )
    + geom_line()
    + theme(figure_size=(10, 10))
)

In [None]:
# Distinct hourly timestamps available for the flood-zone NTAs.
hourly_speeds.loc[
    hourly_speeds["ntacode"].isin(codes),
    "combined_datetime",
].drop_duplicates()

In [None]:
# Distinct calendar dates present in the speed records, in ascending order.
gtfs_speed["date"].drop_duplicates().sort_values()

In [None]:
# Sanity check: rows of the hourly table with a negative average speed.
hourly_speeds[hourly_speeds["avg_speed_gtfs"] < 0]