In [1]:
import os 
os.chdir("/home/canyon/Bus-Weather-Impacts")
from src.utils import *
from src.bus_functions import *
import pandas as pd
import os
import osmnx as ox
import numpy as np
import geopandas as gpd
import networkx as nx
from sklearn.neighbors import KDTree
import numpy as np
pd.options.mode.chained_assignment = None
pd.set_option('display.float_format', '{:.02f}'.format)
from geopy.distance import geodesic
from shapely.geometry import Point
calculated_pair_path = "data/node_pairs.parquet"
pd.set_option('display.max_columns', None)
from plotnine import *

In [9]:
def get_bus_stops(path = "/home/data/test/cities/C3562/stops.geojson"):
    bus_stops = gpd.read_file(path)
    bus_agencies = ["MTA NYCT", "MTABC", "MTA NYCT,MTABC"]
    bus_stops = bus_stops.query("agency_ids_serviced.isin(@bus_agencies)")[["stop_id", "stop_name", "stop_lat", "stop_lon", "geometry"]].rename({"stop_lat" : "lat", "stop_lon" : "lon"}, axis = 1)
    bus_stops["stop_id"] = [f"MTA_{stop_id}" for stop_id in bus_stops["stop_id"]]
    bus_stops = prep_coords(bus_stops, "lat", "lon")

    return bus_stops

def bus_stops_nodes(bus_stops, tree, nodes):
    stops_with_nodes = tag_feed_with_nodes(bus_stops, tree, nodes)
    stops_with_nodes["dist_to_node"] = calculate_distance_to_node(stops_with_nodes)
    stops_with_nodes = stops_with_nodes.query("dist_to_node < 200")
    stops_with_nodes = stops_with_nodes[["stop_id", "stop_name", "osmid", "dist_to_node"]]

    return stops_with_nodes

def get_stop_pairs(bus_stops, raw_GTFS_path):
    gtfs_rt = read_parquet_from_tar_gz(raw_GTFS_path)
    gtfs_rt = gtfs_rt.merge(bus_stops, left_on="next_stop_id", right_on = "stop_id", how = "left")
    gtfs_rt = gtfs_rt[["trip_id", "route_long", "timestamp", "origin_id", "next_stop_id"]].sort_values(["trip_id", "timestamp"]).drop_duplicates(["trip_id", "origin_id", "next_stop_id"]).dropna()
    gtfs_rt["prev_stop_id"] = gtfs_rt["next_stop_id"].shift(1)
    gtfs_rt.loc[gtfs_rt["prev_stop_id"].isna(), 'prev_stop_id'] = gtfs_rt["origin_id"]

    stop_pairs = gtfs_rt[["prev_stop_id", "next_stop_id"]]

    stop_pairs = stop_pairs.merge(bus_stops[["stop_id", "stop_name", "osmid"]], left_on = "prev_stop_id", right_on = "stop_id").merge(bus_stops[["stop_id", "stop_name", "osmid"]], left_on = "next_stop_id", right_on = "stop_id", suffixes = ["_prev", "_next"]).rename({"osmid_next" : "osmid", "osmid_prev" : "prev_osmid"}, axis = 1)
    stop_pairs["osmid"] = stop_pairs["osmid"].astype(int)
    stop_pairs["prev_osmid"] = stop_pairs["prev_osmid"].astype(int)

    return stop_pairs.drop_duplicates()

def get_pair_paths(stop_pairs, G, nodes, calculated_pair_path = "data/node_pairs.parquet"):
    precalculate_node_pair_distances(stop_pairs[["osmid", "prev_osmid"]], calculated_pair_path=calculated_pair_path, G = G, nodes = nodes)
    node_pair_dists = pd.read_parquet(calculated_pair_path)
    stop_pairs = stop_pairs[["osmid", "prev_osmid", "next_stop_id", "prev_stop_id", "stop_name_prev", "stop_name_next"]].merge(node_pair_dists)
    
    return stop_pairs

def full_process_stops(tree, nodes, G, GTFS_PATH, calculated_pair_path = "data/node_pairs.parquet", stops_path = "/home/data/test/cities/C3562/stops.geojson"):
    bus_stops = get_bus_stops(stops_path)
    bus_stops = bus_stops_nodes(bus_stops, tree, nodes)
    stop_pairs = get_stop_pairs(bus_stops, GTFS_PATH)
    stop_pairs = get_pair_paths(stop_pairs, G, nodes, calculated_pair_path)

    return stop_pairs[["next_stop_id", "prev_stop_id",  "stop_name_prev", "stop_name_next", "shortest_path"]].rename({"shortest_path" : "shortest_path_stops"}, axis = 1)

In [3]:
GTFS_PATH = "https://urbantech-public.s3.amazonaws.com/DO-NOT-DELETE-BUSOBSERVATORY-PUBLIC-DATASET/one-system-day.tar.gz"
tree, nodes, G = get_node_data()

In [10]:
full_process_stops(tree, nodes, G, GTFS_PATH, calculated_pair_path = "data/node_pairs.parquet", stops_path = "/home/data/test/cities/C3562/stops.geojson")

(0, 6)
No new pairs to calculate


Unnamed: 0,next_stop_id,prev_stop_id,stop_name_prev,stop_name_next,shortest_path_stops
0,MTA_203720,MTA_201572,FOREST AV/GRANDVIEW AV,SOUTH AV/BRABANT ST,"[42981287.0, 42971135.0, 42971130.0, 42971125...."
1,MTA_201525,MTA_203720,SOUTH AV/BRABANT ST,SOUTH AV/ARLINGTON PL,"[42971120.0, 42971112.0]"
2,MTA_200008,MTA_201525,SOUTH AV/ARLINGTON PL,RICHMOND TER/SOUTH AV,"[42971112.0, 42971108.0]"
3,MTA_202608,MTA_200008,RICHMOND TER/SOUTH AV,HAMILTON AV/EGMONT PL,"[42971108.0, 42962136.0, 42981210.0, 42967005...."
4,MTA_200761,MTA_202608,HAMILTON AV/EGMONT PL,HAMILTON AV/ST MARKS PL,"[42962456.0, 42962450.0, 2494322125.0, 4295570..."
...,...,...,...,...,...
60450,MTA_201934,MTA_200456,YUKON AV/RICHMOND AV,VICTORY BL/ALBERTA AV,"[447772098.0, 447761758.0, 4209669876.0, 44776..."
60451,MTA_403621,MTA_202590,VICTORY BL/RICHMOND AV (NEAR),W 57 ST/AV OF THE AMERICAS,"[42978137.0, 4407796037.0, 42978151.0, 4297815..."
60452,MTA_201256,MTA_404873,5 AV/W 48 ST,RICHMOND TER/ST GEORGE FERRY,"[42452353.0, 42434072.0, 42432693.0, 42443556...."
60453,MTA_200176,MTA_200582,VICTORY BL/ALBERT ST,BAY ST/NICK LAPORTE PL,"[42955040.0, 42978304.0, 42960625.0, 42955372...."


In [None]:
stop_paths = NYC_gtfs_rt.merge(stops_with_dists, how = "outer")[["trip_id", "origin_id", "next_stop_id","stop_name_prev", "stop_name_next", "shortest_path"]].rename({"shortest_path" : "shortest_path_between_stops"}, axis = 1)

In [None]:
processed_buses = gpd.read_parquet("/home/canyon/Bus-Weather-Impacts/data/buses_with_segmented.parquet")
processed_buses

In [None]:
processed_buses = processed_buses.merge(stop_paths)

In [None]:
def check_in_bus_path(row):
    if isinstance(row["shortest_path_between_stops"], (list, np.ndarray)):
        return row["from"] in row["shortest_path_between_stops"] and row["to"] in row["shortest_path_between_stops"]
    return False

processed_buses["in_bus_path"] = processed_buses.apply(check_in_bus_path, axis=1)

In [None]:
processed_buses.groupby("route")["in_bus_path"].mean().reset_index().sort_values("in_bus_path")

In [None]:
processed_buses.groupby("in_bus_path")["speed_osm"].describe()

In [None]:
NYC_stops.query("stop_id == '303117'")

In [None]:
processed_buses.query("route == 'M96''")

In [None]:
processed_buses["speed_truncated"] = processed_buses.speed_osm.apply(lambda x: x if x < 70 else 70)

In [None]:
(ggplot(processed_buses.query("route.str.startswith('M96')"), aes(x = "speed_truncated", color = "in_bus_path")) + geom_density())

In [None]:
processed_buses.query("route.str.startswith('M96')").plot()