- assign zone_id to each row in the subway dataset
- combine the subway and taxi datasets.
- running this script generates a CSV file containing the combined dataframe.


In [114]:
import json
from shapely.geometry import shape, Point
import os
!pip install python-dotenv
from dotenv import load_dotenv
import geopandas as gpd
import pandas as pd
import numpy as np
import seaborn as sns
!pip install optuna
import optuna
import folium
from folium.plugins import HeatMap
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline




In [115]:
# Load the environment variables stored in the Colab .env file
# (the download cell reads them with os.getenv).
load_dotenv(dotenv_path='/content/.env')

True

In [None]:
# Download the three input files (taxi-zone GeoJSON, cleaned subway data,
# cleaned taxi data) from Google Drive with gdown.
# Each environment variable is expected to hold the Drive file ID for one
# file -- TODO confirm these are IDs rather than full URLs.
# NOTE(review): 'taxi-geojason' looks like a misspelling of "geojson", but it
# has to match the key defined in .env, so it is left unchanged here.
!gdown {os.getenv('taxi-geojason')}
!gdown {os.getenv('cleaned-subway-data')}
!gdown {os.getenv('cleaned-taxi-data')}

In [117]:
# Read the taxi CSV into a DataFrame and preview its first rows.
taxi_df = pd.read_csv(filepath_or_buffer='/content/combined_taxi_df.csv')

taxi_df.head(n=5)

Unnamed: 0,datetime_formatted,hour,day_of_week,week,month,day_of_month,year_month,zone,passenger_count
0,2021-01-01 00:00:00,0,4,53,0,1,2021-01,4,4
1,2021-01-01 00:00:00,0,4,53,0,1,2021-01,13,3
2,2021-01-01 00:00:00,0,4,53,0,1,2021-01,24,3
3,2021-01-01 00:00:00,0,4,53,0,1,2021-01,41,12
4,2021-01-01 00:00:00,0,4,53,0,1,2021-01,42,2


In [118]:
# Keep only datetime_formatted, zone and passenger_count, then renumber the
# index from 0.
taxi_df = taxi_df.loc[:, ["datetime_formatted", "zone", "passenger_count"]].reset_index(drop=True)

In [119]:
# Align the taxi column names with the subway data:
# datetime_formatted -> transit_timestamp, zone -> zone_id.
taxi_df = taxi_df.rename(
    columns={'datetime_formatted': 'transit_timestamp', 'zone': 'zone_id'}
)

In [120]:
# Preview the taxi frame after the column selection and rename.
taxi_df.head(n=5)

Unnamed: 0,transit_timestamp,zone_id,passenger_count
0,2021-01-01 00:00:00,4,4
1,2021-01-01 00:00:00,13,3
2,2021-01-01 00:00:00,24,3
3,2021-01-01 00:00:00,41,12
4,2021-01-01 00:00:00,42,2


In [121]:
# (rows, columns) of the taxi frame after the column selection and rename.
# NOTE(review): no groupby has been applied in this notebook at this point; the
# per-timestamp/zone aggregation happens in a later cell.
taxi_df.shape

(1745586, 3)

In [122]:
# Parse transit_timestamp into datetime values.
taxi_df = taxi_df.assign(
    transit_timestamp=pd.to_datetime(taxi_df['transit_timestamp'])
)

In [123]:
# Cast zone_id to an integer dtype (not categorical).
taxi_df = taxi_df.astype({'zone_id': 'int'})

In [124]:
# Check the column dtypes of the taxi frame after the conversions.
taxi_df.dtypes

transit_timestamp    datetime64[ns]
zone_id                       int64
passenger_count               int64
dtype: object

In [125]:
# Aggregate the taxi frame: sum passenger_count for every
# (transit_timestamp, zone_id) pair so each pair appears exactly once.
taxi_df = taxi_df.groupby(by=['transit_timestamp', 'zone_id'], as_index=False).sum()

In [126]:
# Number of fully duplicated rows left in the taxi frame.
taxi_df.duplicated(keep='first').sum()

0

In [127]:
# Earliest and latest timestamps covered by the taxi data, one per line.
print(
    taxi_df['transit_timestamp'].min(),
    taxi_df['transit_timestamp'].max(),
    sep='\n',
)

2021-01-01 00:00:00
2024-04-01 00:00:00


In [128]:
# Read the cleaned subway CSV into a DataFrame and preview its first rows.
subway_df = pd.read_csv(filepath_or_buffer='cleaned-subway-data-1.csv')

subway_df.head(n=5)

Unnamed: 0,transit_timestamp,station_complex_id,station_complex,fare_class_category,ridership,latitude,longitude
0,2024-04-25 17:00:00,222,Roosevelt Island (F),Metrocard - Seniors & Disability,21,40.7591,-73.9533
1,2024-04-25 06:00:00,310,"96 St (1,2,3)",Metrocard - Full Fare,49,40.7939,-73.9723
2,2024-04-25 10:00:00,313,"72 St (1,2,3)",Metrocard - Unlimited 7-Day,90,40.7785,-73.982
3,2024-04-25 12:00:00,329,Rector St (1),OMNY - Seniors & Disability,2,40.7075,-74.0138
4,2024-04-25 16:00:00,146,181 St (A),OMNY - Seniors & Disability,7,40.8517,-73.938


In [129]:
# Keep only the timestamp, the station coordinates and the ridership count.
subway_df = subway_df.loc[:, ['transit_timestamp', 'latitude', 'longitude', 'ridership']]

subway_df.head(n=5)

Unnamed: 0,transit_timestamp,latitude,longitude,ridership
0,2024-04-25 17:00:00,40.7591,-73.9533,21
1,2024-04-25 06:00:00,40.7939,-73.9723,49
2,2024-04-25 10:00:00,40.7785,-73.982,90
3,2024-04-25 12:00:00,40.7075,-74.0138,2
4,2024-04-25 16:00:00,40.8517,-73.938,7


In [130]:
# Parse transit_timestamp into datetime values.
subway_df = subway_df.assign(
    transit_timestamp=pd.to_datetime(subway_df['transit_timestamp'])
)

In [131]:
# Check the column dtypes of the subway frame after parsing the timestamp.
subway_df.dtypes

transit_timestamp    datetime64[ns]
latitude                    float64
longitude                   float64
ridership                     int64
dtype: object

In [132]:
# Earliest and latest timestamps covered by the subway data, one per line.
print(
    subway_df['transit_timestamp'].min(),
    subway_df['transit_timestamp'].max(),
    sep='\n',
)

2022-02-01 00:00:00
2024-05-23 23:00:00


In [133]:
# Trim the subway data to the end of the taxi data's range: keep rows stamped
# on or before 2024-04-01 00:00:00.
subway_df = subway_df.loc[subway_df['transit_timestamp'] <= '2024-04-01 00:00:00']

In [134]:
# Trim the taxi data to the start of the subway data's range: keep rows stamped
# on or after 2022-02-01 00:00:00.
taxi_df = taxi_df.loc[taxi_df['transit_timestamp'] >= '2022-02-01 00:00:00']

In [135]:
#assign taxi zones to subway locations.
# Load GeoJSON data into a GeoDataFrame
def load_geojson_gpd(filepath):
    """Read the file at ``filepath`` with geopandas and return the result."""
    zones = gpd.read_file(filepath)
    return zones

# Function to find zones using spatial join in geopandas
def assign_zones(df, gdf):
    """Find the zone polygon that contains each point of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``longitude`` and ``latitude`` columns.
    gdf : geopandas.GeoDataFrame
        Zone polygons with a ``location_id`` column.

    Returns
    -------
    pandas.Series
        ``location_id`` of the matching polygon, indexed like ``df``.
        Points that fall inside no polygon get NaN (left join).
    """
    # Build point geometries from the coordinates. The points are labelled with
    # the polygons' CRS (no reprojection), i.e. the coordinates are assumed to
    # already be expressed in that CRS. Passing crs= to the constructor also
    # works when gdf has no CRS, where set_crs(None) would raise.
    gdf_points = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df.longitude, df.latitude),
        crs=gdf.crs,
    )

    # Spatial join points to polygons. `predicate` replaces the deprecated `op`
    # keyword, which emits a FutureWarning and is removed in geopandas 1.0.
    joined = gpd.sjoin(gdf_points, gdf, how="left", predicate='within')
    zone_ids = joined['location_id']

    # A point lying in several overlapping polygons yields several joined rows
    # with the same index label; keep the first match so the result can be
    # assigned back to df without an index-alignment error.
    if df.index.is_unique and not zone_ids.index.is_unique:
        zone_ids = zone_ids[~zone_ids.index.duplicated(keep='first')]
    return zone_ids

# Read the taxi-zone polygons from the downloaded GeoJSON file.
geo_df = load_geojson_gpd('/content/NYC Taxi Zones.geojson')

# Tag every subway row with the taxi zone found by the spatial join.
subway_df = subway_df.assign(zone_id=assign_zones(subway_df, geo_df))

subway_df.head(n=5)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,transit_timestamp,latitude,longitude,ridership,zone_id
265,2024-02-28 14:00:00,40.7104,-74.0066,145,209
707,2024-02-14 00:00:00,40.7031,-74.013,32,88
4708,2024-03-16 05:00:00,40.7382,-73.9962,7,234
6123,2024-03-16 11:00:00,40.7195,-73.9999,132,144
6906,2024-03-30 18:00:00,40.7986,-73.9416,53,74


In [136]:
# The coordinates are no longer needed once zone_id is assigned.
subway_df = subway_df.drop(columns=['latitude', 'longitude'])

In [137]:
# Aggregate the subway frame: sum ridership for every
# (transit_timestamp, zone_id) pair so each pair appears exactly once.
subway_df = subway_df.groupby(by=['transit_timestamp', 'zone_id'], as_index=False).sum()

In [138]:
# Number of fully duplicated rows left in the subway frame.
subway_df.duplicated(keep='first').sum()

0

In [139]:
# Cast zone_id to an integer dtype (not categorical) so it matches the taxi frame.
subway_df = subway_df.astype({'zone_id': 'int'})

In [140]:
# Check the column dtypes of the aggregated subway frame.
subway_df.dtypes

transit_timestamp    datetime64[ns]
zone_id                       int64
ridership                     int64
dtype: object

In [141]:
# Show the first rows of the aggregated subway frame.
subway_df.head(n=5)

Unnamed: 0,transit_timestamp,zone_id,ridership
0,2022-02-01,113,101
1,2022-02-01,114,292
2,2022-02-01,116,50
3,2022-02-01,125,71
4,2022-02-01,127,34


In [142]:
# Spot check: subway rows for zone_id 113.
subway_df.loc[subway_df['zone_id'] == 113]

Unnamed: 0,transit_timestamp,zone_id,ridership
0,2022-02-01 00:00:00,113,101
52,2022-02-01 01:00:00,113,31
104,2022-02-01 02:00:00,113,17
155,2022-02-01 03:00:00,113,6
205,2022-02-01 04:00:00,113,6
...,...,...,...
977104,2024-03-31 20:00:00,113,1124
977156,2024-03-31 21:00:00,113,1066
977207,2024-03-31 22:00:00,113,828
977259,2024-03-31 23:00:00,113,430


In [143]:
# Spot check: taxi rows for zone_id 113, to compare with the subway rows.
taxi_df.loc[taxi_df['zone_id'] == 113]

Unnamed: 0,transit_timestamp,zone_id,passenger_count
578429,2022-02-01 00:00:00,113,58
578489,2022-02-01 01:00:00,113,17
578550,2022-02-01 02:00:00,113,17
578607,2022-02-01 03:00:00,113,1
578665,2022-02-01 04:00:00,113,6
...,...,...,...
1745293,2024-03-31 20:00:00,113,189
1745357,2024-03-31 21:00:00,113,164
1745420,2024-03-31 22:00:00,113,201
1745482,2024-03-31 23:00:00,113,80


In [144]:
# Combine the taxi and subway frames into df by left-joining subway ridership
# onto the taxi rows on (transit_timestamp, zone_id).
# NOTE(review): how='left' keeps only key pairs present in taxi_df, so subway
# rows whose timestamp/zone has no taxi record are not carried into df --
# confirm this is intended.
# validate='one_to_one' makes pandas raise MergeError if either frame contains
# repeated keys, instead of silently multiplying rows and inflating ridership.
df = pd.merge(
    taxi_df,
    subway_df,
    on=['transit_timestamp', 'zone_id'],
    how='left',
    validate='one_to_one',
)
# Remove fully duplicated rows (not columns), if any.
df = df.drop_duplicates()
df.head()

Unnamed: 0,transit_timestamp,zone_id,passenger_count,ridership
0,2022-02-01,4,20,
1,2022-02-01,12,1,
2,2022-02-01,13,23,
3,2022-02-01,24,20,25.0
4,2022-02-01,41,32,154.0


In [145]:
# Column dtypes of the merged frame; ridership is float64 in the recorded
# output because unmatched rows hold NaN.
df.dtypes

transit_timestamp    datetime64[ns]
zone_id                       int64
passenger_count               int64
ridership                   float64
dtype: object

In [146]:
# (rows, columns) of the merged frame.
df.shape

(1167176, 4)

In [147]:
# Give the two ridership columns names that identify their source.
df = df.rename(
    columns={'passenger_count': 'taxi_ridership', 'ridership': 'subway_ridership'}
)

In [148]:
# Percentage of NaN values in every column of df. The label now matches what
# is printed: a per-column Series, not a single "ridership" column (which no
# longer exists under that name after the rename).
print(f"percentage of NaN values per column:\n{(df.isna().sum() / df.shape[0]) * 100}")

percentage of NaN values in the ridership column is transit_timestamp     0.000000
zone_id               0.000000
taxi_ridership        0.000000
subway_ridership     19.138845
dtype: float64


In [149]:
# Rows without a subway match get a subway ridership of 0 instead of NaN.
df = df.fillna({'subway_ridership': 0})

In [150]:
# total_ridership is the sum of the taxi and subway ridership of each row.
df = df.assign(total_ridership=df['taxi_ridership'].add(df['subway_ridership']))

In [151]:
# Cast total_ridership and subway_ridership to an integer dtype.
df = df.astype({'total_ridership': 'int', 'subway_ridership': 'int'})

In [152]:
# Show the first rows of the combined frame.
df.head(n=5)

Unnamed: 0,transit_timestamp,zone_id,taxi_ridership,subway_ridership,total_ridership
0,2022-02-01,4,20,0,20
1,2022-02-01,12,1,0,1
2,2022-02-01,13,23,0,23
3,2022-02-01,24,20,25,45
4,2022-02-01,41,32,154,186


In [153]:
# Number of rows that repeat an earlier (transit_timestamp, zone_id) pair.
df.duplicated(subset=['transit_timestamp', 'zone_id']).sum()

0

In [105]:
# Write the combined frame to a CSV file, without the index column.
df.to_csv(path_or_buf='taxi_subway_zones_df.csv', index=False)