In [1]:
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sns.set()

In [2]:
# Subway Stations near CCNY
ccny_old_station_ids = ['N019', 'N024', 'N026', 'R176']
ccny_station_ids = ["151", "152", "153", "305"]
ccny_station_complex_name = ['145 St (A,B,C,D)', '135 St (B,C)', '125 St (A,B,C,D)', '137 St-City College (1)']

In [3]:
df_2023 = pd.read_csv('data/ridership_daily_station_2023.csv')
df_2023.head()

Unnamed: 0,transit_timestamp,station_complex_id,station_complex,borough,Georeference,ridership,transfers
0,2023-01-01,1,"Astoria-Ditmars Blvd (N,W)",Queens,POINT (-73.91203308105469 40.7750358581543),3848,88
1,2023-01-01,10,"49 St (N,R,W)",Manhattan,POINT (-73.98413848876953 40.7598991394043),12783,28
2,2023-01-01,100,"Hewes St (M,J)",Brooklyn,POINT (-73.95343017578125 40.706871032714844),745,2
3,2023-01-01,101,"Marcy Av (M,J,Z)",Brooklyn,POINT (-73.95775604248047 40.70835876464844),4313,72
4,2023-01-01,103,"Bowery (J,Z)",Manhattan,POINT (-73.99391174316406 40.720279693603516),2596,2


In [4]:
df_2023.dtypes

transit_timestamp     object
station_complex_id    object
station_complex       object
borough               object
Georeference          object
ridership              int64
transfers              int64
dtype: object

In [5]:
# Generating station_id dictionary for MTA Dataset for Ridership since Feb 2022

df_stations = df_2023.groupby(['station_complex_id', 'station_complex', 'borough', 'Georeference']).first().reset_index()
df_stations = df_stations[df_stations['transit_timestamp'] == '2023-01-01']
duplicate_rows = df_stations[df_stations.duplicated(subset=['station_complex_id'])]
df_stations.drop_duplicates(subset=['station_complex_id'], inplace=True)
df_stations.drop(columns=['transit_timestamp','ridership','transfers'], inplace=True)
df_stations.head()

Unnamed: 0,station_complex_id,station_complex,borough,Georeference
1,1,"Astoria-Ditmars Blvd (N,W)",Queens,POINT (-73.91203308105469 40.7750358581543)
2,10,"49 St (N,R,W)",Manhattan,POINT (-73.98413848876953 40.7598991394043)
5,100,"Hewes St (M,J)",Brooklyn,POINT (-73.95343017578125 40.706871032714844)
7,101,"Marcy Av (M,J,Z)",Brooklyn,POINT (-73.95775604248047 40.70835876464844)
9,103,"Bowery (J,Z)",Manhattan,POINT (-73.99391174316406 40.720279693603516)


In [6]:
df_2023_ccny = df_2023[df_2023['station_complex_id'].isin(ccny_station_ids)]
df_2023_ccny['transit_timestamp'] = pd.to_datetime(df_2023_ccny['transit_timestamp'])
df_2023_ccny['weekday'] = df_2023_ccny['transit_timestamp'].dt.day_name()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2023_ccny['transit_timestamp'] = pd.to_datetime(df_2023_ccny['transit_timestamp'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2023_ccny['weekday'] = df_2023_ccny['transit_timestamp'].dt.day_name()


In [7]:
df_2023_ccny.head(8)

Unnamed: 0,transit_timestamp,station_complex_id,station_complex,borough,Georeference,ridership,transfers,weekday
40,2023-01-01,151,"145 St (A,C,B,D)",Manhattan,POINT (-73.9442138671875 40.82478332519531),6073,60,Sunday
41,2023-01-01,152,"135 St (C,B)",Manhattan,POINT (-73.94764709472656 40.817893981933594),1179,11,Sunday
42,2023-01-01,153,"125 St (A,C,B,D)",Manhattan,POINT (-73.95234680175781 40.81110763549805),7590,302,Sunday
181,2023-01-01,305,137 St-City College (1),Manhattan,POINT (-73.95367431640625 40.82200622558594),3631,25,Sunday
518,2023-01-02,151,"145 St (A,C,B,D)",Manhattan,POINT (-73.9442138671875 40.82478332519531),7692,107,Monday
519,2023-01-02,152,"135 St (C,B)",Manhattan,POINT (-73.94764709472656 40.817893981933594),1485,31,Monday
520,2023-01-02,153,"125 St (A,C,B,D)",Manhattan,POINT (-73.95234680175781 40.81110763549805),9619,450,Monday
659,2023-01-02,305,137 St-City College (1),Manhattan,POINT (-73.95367431640625 40.82200622558594),4741,28,Monday


In [8]:
df_feb_first = df_2023_ccny[df_2023_ccny['transit_timestamp'] == '2023-02-01']
df_feb_first

Unnamed: 0,transit_timestamp,station_complex_id,station_complex,borough,Georeference,ridership,transfers,weekday
14917,2023-02-01,151,"145 St (A,C,B,D)",Manhattan,POINT (-73.9442138671875 40.82478332519531),16213,259,Wednesday
14918,2023-02-01,152,"135 St (C,B)",Manhattan,POINT (-73.94764709472656 40.817893981933594),3841,64,Wednesday
14919,2023-02-01,153,"125 St (A,C,B,D)",Manhattan,POINT (-73.95234680175781 40.81110763549805),17613,789,Wednesday
15058,2023-02-01,305,137 St-City College (1),Manhattan,POINT (-73.95367431640625 40.82200622558594),9905,55,Wednesday


In [9]:
# Temporal Plot of Ridership for CCNY Stations

fig = px.line(df_2023_ccny, x='transit_timestamp', y='ridership', color='station_complex')
fig.update_layout(title='Ridership Count for CCNY Stations', xaxis_title='Date', yaxis_title='Ridership Count')
fig.show()

In [None]:
df_2023_ccny.to_csv('data/ridership_daily_station_2023_ccny.csv', index=False)