## Importing Packages

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict

## Importing the Dataset

### MTA Station dataset

In [2]:
# Load the MTA station reference table from disk and preview the first rows
STATIONS_CSV = 'data/MTA_Stations.csv'
mta_stations = pd.read_csv(STATIONS_CSV)
mta_stations.head(5)

Unnamed: 0,GTFS Stop ID,Station ID,Complex ID,Division,Line,Stop Name,Borough,CBD,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label,ADA,ADA Northbound,ADA Southbound,ADA Notes,Georeference
0,R01,1,1,BMT,Astoria,Astoria-Ditmars Blvd,Q,False,N W,Elevated,40.775036,-73.912034,Last Stop,Manhattan,0,0,0,,POINT (-73.912034 40.775036)
1,R03,2,2,BMT,Astoria,Astoria Blvd,Q,False,N W,Elevated,40.770258,-73.917843,Astoria,Manhattan,1,1,1,,POINT (-73.917843 40.770258)
2,R04,3,3,BMT,Astoria,30 Av,Q,False,N W,Elevated,40.766779,-73.921479,Astoria,Manhattan,0,0,0,,POINT (-73.921479 40.766779)
3,R05,4,4,BMT,Astoria,Broadway,Q,False,N W,Elevated,40.76182,-73.925508,Astoria,Manhattan,0,0,0,,POINT (-73.925508 40.76182)
4,R06,5,5,BMT,Astoria,36 Av,Q,False,N W,Elevated,40.756804,-73.929575,Astoria,Manhattan,0,0,0,,POINT (-73.929575 40.756804)


After looking through the station dataset, we identified the columns `Stop Name`, `Station ID`, `Complex ID`, and `Borough` as the ones most relevant to the questions we are trying to answer.

In [3]:
# Keep only the columns that are later attached to graph nodes
station_columns = ['Complex ID', 'Stop Name', 'Station ID', 'Borough']
mta_stations = mta_stations.loc[:, station_columns]

### MTA Ridership Dataset

In [4]:
# Load the origin-destination ridership estimates and preview the first rows
RIDES_CSV = 'data/MTA_Passengers.csv'
mta_rides = pd.read_csv(RIDES_CSV)
mta_rides.head(5)

Unnamed: 0,Year,Month,Day of Week,Hour of Day,Timestamp,Origin Station Complex ID,Origin Station Complex Name,Origin Latitude,Origin Longitude,Destination Station Complex ID,Destination Station Complex Name,Destination Latitude,Destination Longitude,Estimated Average Ridership,Origin Point,Destination Point
0,2024,1,Monday,1,01/08/2024 01:00:00 AM,26,"DeKalb Av (B,Q,R)",40.690635,-73.981824,355,"Winthrop St (2,5)",40.656652,-73.9502,0.5556,POINT (-73.981824 40.690635),POINT (-73.9502 40.656652)
1,2024,1,Monday,1,01/08/2024 01:00:00 AM,231,"Grand St (B,D)",40.718267,-73.993753,284,Nassau Av (G),40.724635,-73.951277,0.3068,POINT (-73.993753 40.718267),POINT (-73.951277 40.724635)
2,2024,1,Monday,1,01/08/2024 01:00:00 AM,313,"72 St (1,2,3)",40.778453,-73.98197,71,8 Av (N),40.635064,-74.011719,0.3012,POINT (-73.98197 40.778453),POINT (-74.011719 40.635064)
3,2024,1,Monday,1,01/08/2024 01:00:00 AM,320,23 St (1),40.744081,-73.995657,309,103 St (1),40.799446,-73.968379,0.9,POINT (-73.995657 40.744081),POINT (-73.968379 40.799446)
4,2024,1,Monday,1,01/08/2024 01:00:00 AM,399,68 St-Hunter College (6),40.768141,-73.96387,618,"14 St (A,C,E)/8 Av (L)",40.740335,-74.002134,0.294,POINT (-73.96387 40.768141),POINT (-74.002134 40.740335)


Similarly, we cut down our dataset to only include the columns that are relevant to our question.

In [5]:
# Keep the edge attributes (day, hour, ridership) plus the two endpoint IDs
ride_columns = [
    'Day of Week',
    'Hour of Day',
    'Estimated Average Ridership',
    'Origin Station Complex ID',
    'Destination Station Complex ID',
]
mta_rides = mta_rides.loc[:, ride_columns]
mta_rides

Unnamed: 0,Day of Week,Hour of Day,Estimated Average Ridership,Origin Station Complex ID,Destination Station Complex ID
0,Monday,1,0.5556,26,355
1,Monday,1,0.3068,231,284
2,Monday,1,0.3012,313,71
3,Monday,1,0.9000,320,309
4,Monday,1,0.2940,399,618
...,...,...,...,...,...
9169120,Sunday,12,0.3322,426,379
9169121,Sunday,12,0.7305,427,613
9169122,Sunday,12,0.3130,32,263
9169123,Sunday,12,0.8602,324,407


## Creating Graph-Based Representation

In [6]:
# Create an empty directed multigraph *instance* (the original bound `G` to the
# class itself, so `G` was a type here and a graph only after the next cell).
# Per the networkx docs, `create_using` accepts an instance as well as a class:
# an instance is cleared and then populated, so the next cell behaves the same.
G = nx.MultiDiGraph()

To construct the graph-based representation, we turn each row of the `ridership data` into an **edge**, and use its `origin station` and `destination station` as the **nodes** that edge connects.

In [7]:
# One edge per ridership row: origin complex -> destination complex.
# edge_attr=True copies the remaining columns onto each edge as attributes.
G = nx.from_pandas_edgelist(
    mta_rides,
    source='Origin Station Complex ID',
    target='Destination Station Complex ID',
    edge_attr=True,
    create_using=G,
    edge_key=None,
)

In [8]:
# Attach every station row to the node keyed by its Complex ID.
# If several rows share a Complex ID, later rows overwrite the attributes set
# by earlier ones (add_node updates an existing node's attribute dict).
for station_record in mta_stations.to_dict(orient='records'):
    G.add_node(station_record['Complex ID'], **station_record)

In [9]:
# Show the attribute dictionary stored on one node (Complex ID 440)
example_complex_id = 440
G.nodes[example_complex_id]

{'Complex ID': 440, 'Stop Name': '116 St', 'Station ID': 440, 'Borough': 'M'}

In [10]:
# Print the first few edges together with their attribute dictionaries
max_edges_shown = 5
for position, (src, dst, edge_attrs) in enumerate(G.edges(data=True), start=1):
    print(f"Edge ({src}, {dst}) has attributes {edge_attrs}")
    if position >= max_edges_shown:
        break


Edge (26, 355) has attributes {'Day of Week': 'Monday', 'Hour of Day': 1, 'Estimated Average Ridership': 0.5556}
Edge (26, 355) has attributes {'Day of Week': 'Monday', 'Hour of Day': 13, 'Estimated Average Ridership': 1.4878}
Edge (26, 355) has attributes {'Day of Week': 'Monday', 'Hour of Day': 14, 'Estimated Average Ridership': 1.2784}
Edge (26, 355) has attributes {'Day of Week': 'Monday', 'Hour of Day': 15, 'Estimated Average Ridership': 2.1476}
Edge (26, 355) has attributes {'Day of Week': 'Monday', 'Hour of Day': 16, 'Estimated Average Ridership': 2.8818}


## Problem 1

### 1.a

In [11]:
# Running ridership totals, keyed first by origin borough and then by origin station
rides_by_borough = defaultdict(lambda: defaultdict(int))

# A single pass over the edges: credit each edge's ridership to its origin station
for origin_id, _dest_id, edge_attrs in G.edges(data=True):
    origin_borough = G.nodes[origin_id].get('Borough', 'Unknown')
    rides_by_borough[origin_borough][origin_id] += edge_attrs.get('Estimated Average Ridership')

# Rank the stations inside each borough and keep the five largest totals
top_stations_per_borough = {
    borough_code: (
        pd.DataFrame(list(station_totals.items()), columns=['Station', 'Ride Count'])
        .sort_values(by='Ride Count', ascending=False)
        .head(5)
    )
    for borough_code, station_totals in rides_by_borough.items()
}

# Report one ranking table per borough
for borough_code, ranking in top_stations_per_borough.items():
    print(f"Top 5 stations in {borough_code}:")
    print(ranking)
    print()


Top 5 stations in Bk:
     Station   Ride Count
45       617  180918.4360
51       120  150664.2967
86       636  139917.7360
41       620  113707.1244
106      345   93257.7031

Top 5 stations in M:
    Station   Ride Count
36      611  772832.8065
18      610  578428.9966
29      607  421029.6784
23      602  382816.6320
49      164  332532.6990

Top 5 stations in Bx:
    Station  Ride Count
32      604  75334.2277
5       434  59474.6468
12      366  56023.6398
16      382  35939.9912
0       371  35910.6288

Top 5 stations in Q:
    Station   Ride Count
25      616  277384.3949
35      447  254239.2993
7       450  117368.9540
24      279  113661.6204
17      451  112363.4206



### 1.b

In [12]:
# Only edges recorded on these days contribute to the totals
days_to_include = {'Monday', 'Tuesday', 'Wednesday'}

# Total ridership per origin station over the selected days
station_rides = defaultdict(int)
for origin_id, _dest_id, edge_attrs in G.edges(data=True):
    if edge_attrs.get('Day of Week') not in days_to_include:
        continue
    station_rides[origin_id] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the five busiest origins
df = pd.DataFrame(list(station_rides.items()), columns=['Station', 'Ride Count'])
top_5_stations = df.sort_values(by='Ride Count', ascending=False).head(5)

print("Top 5 origin stations for Monday, Tuesday, and Wednesday combined:")
print(top_5_stations)


Top 5 origin stations for Monday, Tuesday, and Wednesday combined:
    Station   Ride Count
84      611  366682.6978
38      610  299994.9464
68      607  198093.2732
54      602  178115.1026
56      628  158768.4294


### 1.c

In [13]:
# Weekend filter: only Saturday and Sunday edges are counted
weekend_days = {'Saturday', 'Sunday'}

# Total ridership per origin station over the weekend
weekend_station_rides = defaultdict(int)
for origin_id, _dest_id, edge_attrs in G.edges(data=True):
    is_weekend = edge_attrs.get('Day of Week') in weekend_days
    if is_weekend:
        weekend_station_rides[origin_id] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the five busiest origins
df_weekend = pd.DataFrame(list(weekend_station_rides.items()), columns=['Station', 'Ride Count'])
ranked_weekend = df_weekend.sort_values(by='Ride Count', ascending=False)
top_5_weekend_stations = ranked_weekend.head(5)

print("Top 5 origin stations for Saturday and Sunday combined:")
print(top_5_weekend_stations)


Top 5 origin stations for Saturday and Sunday combined:
     Station   Ride Count
84       611  151484.7149
68       607   81496.3893
38       610   78418.8155
54       602   77392.2122
118      164   69408.9595


### 1.d

In [14]:
# Inclusive hour window: 'Hour of Day' values 1 through 5
start_time = 1
end_time = 5

# Total ridership per origin station inside the window
early_morning_rides = defaultdict(int)
for origin_id, _dest_id, edge_attrs in G.edges(data=True):
    hour = edge_attrs.get('Hour of Day')
    # Skip edges with a missing/zero hour, then apply the window test
    if hour and start_time <= hour <= end_time:
        early_morning_rides[origin_id] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the five busiest origins
df_early_morning = pd.DataFrame(list(early_morning_rides.items()), columns=['Station', 'Ride Count'])
top_5_early_morning_stations = (
    df_early_morning
    .sort_values(by='Ride Count', ascending=False)
    .head(5)
)

print("Top 5 origin stations for rides between 1 am and 5 am across all days and boroughs:")
print(top_5_early_morning_stations)

Top 5 origin stations for rides between 1 am and 5 am across all days and boroughs:
     Station  Ride Count
84       611  23851.8459
183      616  14400.2977
260      447  11501.9799
63       450  11346.5569
237      278   8971.7651


### 1.e

In [15]:
# Inclusive hour window: 'Hour of Day' values 6 through 9
start_time = 6
end_time = 9

# Total ridership per origin station inside the window
early_morning_rides = defaultdict(int)
for origin_id, _dest_id, edge_attrs in G.edges(data=True):
    hour = edge_attrs.get('Hour of Day')
    if not hour:
        # Missing (or zero) hour: nothing to compare against the window
        continue
    if start_time <= hour <= end_time:
        early_morning_rides[origin_id] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the five busiest origins
df_early_morning = pd.DataFrame(list(early_morning_rides.items()), columns=['Station', 'Ride Count'])
ranked_by_rides = df_early_morning.sort_values(by='Ride Count', ascending=False)
top_5_early_morning_stations = ranked_by_rides.head(5)

print("Top 5 origin stations for rides between 6 am and 9 am across all days and boroughs:")
print(top_5_early_morning_stations)

Top 5 origin stations for rides between 6 am and 9 am across all days and boroughs:
     Station   Ride Count
84       611  173476.3812
38       610  120589.1905
183      616   89547.9364
100      318   86708.2752
260      447   84534.1125


## Problem 2

### 2.a

In [16]:
# Running ridership totals, keyed first by destination borough and then by destination station
arrivals_by_borough = defaultdict(lambda: defaultdict(int))

# A single pass over the edges: credit each edge's ridership to its destination station
for _origin_id, dest_id, edge_attrs in G.edges(data=True):
    dest_borough = G.nodes[dest_id].get('Borough', 'Unknown')
    arrivals_by_borough[dest_borough][dest_id] += edge_attrs.get('Estimated Average Ridership')

# Rank the stations inside each borough and keep the five largest totals
top_destinations_per_borough = {}
for borough_code, station_totals in arrivals_by_borough.items():
    totals_table = pd.DataFrame(list(station_totals.items()), columns=['Station', 'Ride Count'])
    ranked_table = totals_table.sort_values(by='Ride Count', ascending=False)
    top_destinations_per_borough[borough_code] = ranked_table.head(5)

# Report one ranking table per borough
for borough_code, ranking in top_destinations_per_borough.items():
    print(f"Top 5 destination stations in {borough_code}:")
    print(ranking)
    print()

Top 5 destination stations in Bk:
     Station   Ride Count
69       617  182108.8651
12       120  155106.0882
78       636  142927.4383
125      620  117848.1965
80       345   86573.2395

Top 5 destination stations in M:
    Station   Ride Count
35      611  755731.0109
90      610  553936.6710
18      607  427831.4631
21      602  408713.8706
7       628  324693.1087

Top 5 destination stations in Q:
    Station   Ride Count
12      616  273041.5919
3       447  238424.8161
14      606  113193.4213
29      450  109201.5524
2       451  103706.4346

Top 5 destination stations in Bx:
    Station  Ride Count
0       604  72982.0239
33      434  58186.8629
11      366  51492.9785
3       603  34738.4729
12      382  33780.4641



### 2.b

In [17]:
# Only edges recorded on these days contribute to the totals
days_to_include = {'Thursday', 'Friday'}

# Total ridership per destination station over the selected days
station_rides = defaultdict(int)
for _origin_id, dest_id, edge_attrs in G.edges(data=True):
    if edge_attrs.get('Day of Week') not in days_to_include:
        continue
    station_rides[dest_id] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the five busiest destinations
df = pd.DataFrame(list(station_rides.items()), columns=['Station', 'Ride Count'])
top_5_stations = df.sort_values(by='Ride Count', ascending=False).head(5)

print("Top 5 destination stations for Thursday and Friday combined:")
print(top_5_stations)

Top 5 destination stations for Thursday and Friday combined:
     Station   Ride Count
81       611  254027.9515
257      610  193929.8707
43       607  146122.4552
55       602  137625.2515
21       628  112550.0857


### 2.c

In [18]:
# Single-day filter: Saturday only
days_to_include = {'Saturday'}

# Total ridership per destination station on the selected day
station_rides = defaultdict(int)
for _origin_id, dest_id, edge_attrs in G.edges(data=True):
    on_selected_day = edge_attrs.get('Day of Week') in days_to_include
    if on_selected_day:
        station_rides[dest_id] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the five busiest destinations
df = pd.DataFrame(list(station_rides.items()), columns=['Station', 'Ride Count'])
ranked_by_rides = df.sort_values(by='Ride Count', ascending=False)
top_5_stations = ranked_by_rides.head(5)

print("Top 5 destination stations for Saturday only:")
print(top_5_stations)

Top 5 destination stations for Saturday only:
     Station  Ride Count
80       611  82215.3906
42       607  48560.3881
54       602  48271.3167
250      610  40886.4699
194      164  35605.2535


### 2.d

In [19]:
# Hours counted as "12 am - 5 am".
# The original list was named `range`, which shadowed the builtin for every
# later cell; it also encoded midnight only as 24.
# NOTE(review): the source data's encoding of midnight is not visible in this
# notebook. Both common encodings (0 on a 0-23 clock, 24 on a 1-24 clock) are
# accepted so the 12 am hour is counted either way - confirm against the
# dataset's data dictionary.
late_night_hours = {0, 24, 1, 2, 3, 4, 5}

# Total ridership per destination station inside the late-night window
early_morning_rides = defaultdict(int)

# Iterate over the edges and keep those whose hour falls in the window
for _origin_station, destination_station, data in G.edges(data=True):
    time_of_ride = data.get('Hour of Day')
    # Test for None explicitly: a bare truthiness check would treat hour 0
    # (midnight on a 0-23 clock) as "missing" and silently drop it.
    if time_of_ride is not None and time_of_ride in late_night_hours:
        ride_count = data.get('Estimated Average Ridership')
        early_morning_rides[destination_station] += ride_count

# Convert to DataFrame for sorting
df_early_morning = pd.DataFrame(early_morning_rides.items(), columns=['Station', 'Ride Count'])

# Sort by 'Ride Count' and select the top 5 stations
top_5_early_morning_stations = df_early_morning.sort_values(by='Ride Count', ascending=False).head(5)

# Display the results
print("Top 5 destination stations for rides between 12 am and 5 am across all days and boroughs:")
print(top_5_early_morning_stations)

Top 5 destination stations for rides between 12 am and 5 am across all days and boroughs:
     Station  Ride Count
81       611  20514.5787
221      610  17061.6508
43       607  11541.9589
21       628  10544.6630
173      164   9657.5863


### 2.e

In [20]:
# Hours counted as "6 pm - 9 pm" ('Hour of Day' values 18 through 21).
# The original list was named `range`, shadowing the builtin for every later
# cell, and the accumulators carried copy-pasted `early_morning_*` names even
# though this cell analyses the evening.
evening_hours = {18, 19, 20, 21}

# Total ridership per destination station inside the evening window
evening_rides = defaultdict(int)

# Iterate over the edges and keep those whose hour falls in the window
for _origin_station, destination_station, data in G.edges(data=True):
    time_of_ride = data.get('Hour of Day')
    # Explicit None test so that a legitimate hour value of 0 is never
    # mistaken for a missing attribute
    if time_of_ride is not None and time_of_ride in evening_hours:
        ride_count = data.get('Estimated Average Ridership')
        evening_rides[destination_station] += ride_count

# Convert to DataFrame for sorting
df_evening = pd.DataFrame(evening_rides.items(), columns=['Station', 'Ride Count'])

# Sort by 'Ride Count' and select the top 5 stations
top_5_evening_stations = df_evening.sort_values(by='Ride Count', ascending=False).head(5)

# Display the results
print("Top 5 destination stations for rides between 6 pm and 9 pm across all days and boroughs:")
print(top_5_evening_stations)

Top 5 destination stations for rides between 6 pm and 9 pm across all days and boroughs:
     Station   Ride Count
81       611  130974.9728
257      610   67513.9508
43       607   63408.8806
105      616   62489.0931
55       602   60015.5027


## Problem 3

### 3.a

In [21]:
# Filter settings: a single weekday and an inclusive hour window
target_day = 'Monday'
start_hour = 13  # 1 pm
end_hour = 14    # 2 pm

# Total ridership per (source, destination) pair
congested_pairs = defaultdict(int)

for src, dst, edge_attrs in G.edges(data=True):
    if edge_attrs.get('Day of Week') != target_day:
        continue
    if not (start_hour <= edge_attrs.get('Hour of Day') <= end_hour):
        continue
    # NOTE(review): a missing ridership value counts as 1 in this cell only;
    # the other cells use no default - confirm which behaviour is intended.
    congested_pairs[(src, dst)] += edge_attrs.get('Estimated Average Ridership', 1)

# Tabulate the totals, rank them, and keep the ten busiest pairs
df = pd.DataFrame(list(congested_pairs.items()), columns=['Source-Destination Pair', 'Ride Count'])
ranked_pairs = df.sort_values(by='Ride Count', ascending=False)
top_10_congested_pairs = ranked_pairs.head(10)

print("Top 10 congested source-destination pairs on Monday between 1 pm and 2 pm:")
print(top_10_congested_pairs)


Top 10 congested source-destination pairs on Monday between 1 pm and 2 pm:
      Source-Destination Pair  Ride Count
11557              (610, 611)    282.2022
69208              (447, 450)    273.7716
17144              (628, 610)    272.3798
69188              (447, 451)    271.3252
11490              (610, 602)    270.6618
69162              (447, 616)    259.8570
16366              (602, 610)    255.8078
38850              (451, 447)    239.9478
24238              (611, 610)    232.1258
11634              (610, 628)    231.0762


### 3.b

In [22]:
# Filter settings: one weekday, an inclusive hour window, and a single borough
target_day = 'Friday'
start_hour = 18  # 6 pm
end_hour = 21    # 9 pm
target_borough = 'Q'

# Total ridership per (source, destination) pair
congested_pairs = defaultdict(int)

for src, dst, edge_attrs in G.edges(data=True):
    hour = edge_attrs.get('Hour of Day')

    # Day must match and the hour must be present (non-zero) and inside the window
    if edge_attrs.get('Day of Week') != target_day or not hour:
        continue
    if not (start_hour <= hour <= end_hour):
        continue

    # Both endpoints must lie in the target borough
    if G.nodes[src].get('Borough') != target_borough:
        continue
    if G.nodes[dst].get('Borough') != target_borough:
        continue

    congested_pairs[(src, dst)] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the ten busiest pairs
df = pd.DataFrame(list(congested_pairs.items()), columns=['Source-Destination Pair', 'Ride Count'])
top_10_congested_pairs = df.sort_values(by='Ride Count', ascending=False).head(10)

print("Top 10 congested source-destination pairs in Queens on Fridays between 6 pm and 9 pm:")
print(top_10_congested_pairs)


Top 10 congested source-destination pairs in Queens on Fridays between 6 pm and 9 pm:
     Source-Destination Pair  Ride Count
1870              (447, 616)    645.8218
1890              (447, 450)    644.1210
1883              (447, 451)    593.8203
952               (451, 447)    433.6342
1887              (447, 452)    385.5596
1423              (616, 447)    345.3137
389               (450, 447)    278.0807
1893              (447, 449)    240.2326
2809              (453, 447)    235.2895
1426              (616, 254)    222.0617


### 3.c

In [23]:
# Filter settings: an inclusive hour window (all days) and a single borough
start_hour = 1  # 1 am
end_hour = 5    # 5 am
target_borough = 'Bk'

# Total ridership per (source, destination) pair
congested_pairs = defaultdict(int)

for src, dst, edge_attrs in G.edges(data=True):
    # The hour must fall inside the window
    if not (start_hour <= edge_attrs.get('Hour of Day') <= end_hour):
        continue

    # Both endpoints must lie in the target borough
    endpoint_boroughs = (G.nodes[src].get('Borough'), G.nodes[dst].get('Borough'))
    if endpoint_boroughs == (target_borough, target_borough):
        congested_pairs[(src, dst)] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the ten busiest pairs
df = pd.DataFrame(list(congested_pairs.items()), columns=['Source-Destination Pair', 'Ride Count'])
ranked_pairs = df.sort_values(by='Ride Count', ascending=False)
top_10_congested_pairs = ranked_pairs.head(10)

print("Top 10 congested source-destination pairs in Brooklyn between 1 am and 5 am:")
print(top_10_congested_pairs)


Top 10 congested source-destination pairs in Brooklyn between 1 am and 5 am:
      Source-Destination Pair  Ride Count
10504              (345, 617)    136.0707
4022               (359, 617)    107.7769
5317               (120, 630)    104.9992
10571              (345, 620)    104.1418
1391               (630, 120)     93.2904
10495              (345, 337)     92.5102
12202              (188, 636)     92.4959
4755                (617, 32)     86.2556
5308               (120, 127)     86.0058
767                (629, 630)     77.3768


### 3.d

In [24]:
# Filter settings: weekdays, an inclusive hour window, and a borough-to-borough direction
days_to_include = {'Monday', 'Tuesday', 'Wednesday', 'Thursday'}
start_hour = 6  # 6 am
end_hour = 7    # 7 am
starting_borough = 'Bk'
target_borough = 'M'

# Total ridership per (source, destination) pair
congested_pairs = defaultdict(int)

for src, dst, edge_attrs in G.edges(data=True):
    hour = edge_attrs.get('Hour of Day')

    # Day must be selected and the hour must be present (non-zero) and inside the window
    if edge_attrs.get('Day of Week') not in days_to_include or not hour:
        continue
    if not (start_hour <= hour <= end_hour):
        continue

    # The trip must start in the starting borough and end in the target borough
    leaves_from_start = G.nodes[src].get('Borough') == starting_borough
    arrives_in_target = G.nodes[dst].get('Borough') == target_borough
    if leaves_from_start and arrives_in_target:
        congested_pairs[(src, dst)] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the ten busiest pairs
df = pd.DataFrame(list(congested_pairs.items()), columns=['Source-Destination Pair', 'Ride Count'])
top_10_congested_pairs = df.sort_values(by='Ride Count', ascending=False).head(10)

print("Top 10 congested source-destination pairs from Brooklyn to Manhattan, Monday-Thursday from 6am to 7am:")
print(top_10_congested_pairs)

Top 10 congested source-destination pairs from Brooklyn to Manhattan, Monday-Thursday from 6am to 7am:
      Source-Destination Pair  Ride Count
4869               (617, 414)   1029.6123
11295              (345, 610)    692.6152
14953               (51, 607)    655.3457
4401               (620, 610)    653.3663
4069               (359, 610)    589.2357
11288              (345, 628)    580.2716
4061               (359, 628)    557.0981
5527               (120, 610)    555.5914
14954               (51, 225)    503.5513
1739                (54, 225)    478.5522


### 3.e

In [25]:
# Filter settings: weekdays, an inclusive hour window, and a borough-to-borough direction
days_to_include = {'Monday', 'Tuesday', 'Wednesday', 'Thursday'}
start_hour = 6  # 6 am
end_hour = 7    # 7 am
starting_borough = 'Bx'
target_borough = 'M'

# Total ridership per (source, destination) pair
congested_pairs = defaultdict(int)

for src, dst, edge_attrs in G.edges(data=True):
    hour = edge_attrs.get('Hour of Day')
    day_selected = edge_attrs.get('Day of Week') in days_to_include

    # Require a selected day and a present (non-zero) hour inside the window
    if not (day_selected and hour):
        continue
    if hour < start_hour or hour > end_hour:
        continue

    # The trip must start in the starting borough and end in the target borough
    route = (G.nodes[src].get('Borough'), G.nodes[dst].get('Borough'))
    if route == (starting_borough, target_borough):
        congested_pairs[(src, dst)] += edge_attrs.get('Estimated Average Ridership')

# Tabulate the totals, rank them, and keep the ten busiest pairs
df = pd.DataFrame(list(congested_pairs.items()), columns=['Source-Destination Pair', 'Ride Count'])
ranked_pairs = df.sort_values(by='Ride Count', ascending=False)
top_10_congested_pairs = ranked_pairs.head(10)

print("Top 10 congested source-destination pairs from Bronx to Manhattan, Monday-Thursday from 6am to 7am:")
print(top_10_congested_pairs)

Top 10 congested source-destination pairs from Bronx to Manhattan, Monday-Thursday from 6am to 7am:
     Source-Destination Pair  Ride Count
1352              (366, 610)    562.2564
1375              (366, 602)    402.3675
1404              (366, 392)    395.6825
1359              (366, 399)    383.7979
1427              (366, 397)    372.9343
1362              (366, 612)    357.8659
1366              (366, 628)    346.6705
1426              (366, 622)    320.3933
3589              (604, 614)    306.0047
5117              (378, 397)    302.6310


### 3.f

In [26]:
# Filter settings: weekdays, an inclusive hour window, and a borough-to-borough direction
days_to_include = {'Monday', 'Tuesday', 'Wednesday', 'Thursday'}
start_hour = 6  # 6 am
end_hour = 7    # 7 am
starting_borough = 'SI'
target_borough = 'M'

# Dictionary to accumulate ride counts for each source-destination pair
congested_pairs = defaultdict(int)

# Iterate over the edges and filter by day_of_week, time_of_ride, and borough
for u, v, data in G.edges(data=True):
    day_of_week = data.get('Day of Week')
    time_of_ride = data.get('Hour of Day')

    # Keep edges on a selected day whose hour is present and inside the window
    if day_of_week in days_to_include and time_of_ride:
        if start_hour <= time_of_ride <= end_hour:
            source_borough = G.nodes[u].get('Borough')
            destination_borough = G.nodes[v].get('Borough')
            # The trip must start in the starting borough and end in the target borough
            if source_borough == starting_borough and destination_borough == target_borough:
                ride_count = data.get('Estimated Average Ridership')
                congested_pairs[(u, v)] += ride_count

# Convert to DataFrame for easy sorting
df = pd.DataFrame(congested_pairs.items(), columns=['Source-Destination Pair', 'Ride Count'])

# Sort by 'Ride Count' and select the top 10 pairs
top_10_congested_pairs = df.sort_values(by='Ride Count', ascending=False).head(10)

# Display the results (message fixed: the borough is "Staten Island", not "State Island")
print("Top 10 congested source-destination pairs from Staten Island to Manhattan, Monday-Thursday from 6am to 7am:")
print(top_10_congested_pairs)

Top 10 congested source-destination pairs from State Island to Manhattan, Monday-Thursday from 6am to 7am:
Empty DataFrame
Columns: [Source-Destination Pair, Ride Count]
Index: []


The empty DataFrame means that no source-destination pair satisfies the requirements of the query. Therefore, there are no source-destination pairs from Staten Island to Manhattan from 6am to 7am. 