In [19]:
import pandas as pd

rides = pd.read_parquet("../data/transformed/validated_rides_2024_01.parquet")

rides[(rides["pickup_datetime"] >= "2024-01-01 00:00:00") & (rides["pickup_location_id"] == 1)].head(60)

Unnamed: 0,pickup_datetime,pickup_location_id
23213,2024-01-01 05:54:58,1
24321,2024-01-01 06:57:44,1
24322,2024-01-01 06:57:44,1
24323,2024-01-01 06:59:32,1
24678,2024-01-01 06:56:45,1
37075,2024-01-01 13:51:30,1
37364,2024-01-01 13:44:04,1
38566,2024-01-01 13:46:08,1
40509,2024-01-01 14:58:25,1
42009,2024-01-01 14:27:42,1


# Work with hourly frequency 
## Predict next hour mai konsa user drive book karega

In [7]:
# dt is a datetime object
rides["pickup_hour"] = rides["pickup_datetime"].dt.floor("H")
rides = rides.sort_values(by="pickup_hour")
rides

  rides["pickup_hour"] = rides["pickup_datetime"].dt.floor("H")


Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
0,2024-01-01 00:57:55,186,2024-01-01 00:00:00
4416,2024-01-01 00:29:35,239,2024-01-01 00:00:00
4415,2024-01-01 00:29:35,239,2024-01-01 00:00:00
4414,2024-01-01 00:39:39,100,2024-01-01 00:00:00
4413,2024-01-01 00:53:11,68,2024-01-01 00:00:00
...,...,...,...
2822545,2024-01-31 23:22:04,148,2024-01-31 23:00:00
2822546,2024-01-31 23:23:52,138,2024-01-31 23:00:00
2822547,2024-01-31 23:45:25,144,2024-01-31 23:00:00
2822576,2024-01-31 23:19:13,79,2024-01-31 23:00:00


## For every pickup hour and location, fetch the number of rides

- When you use groupby() with multiple columns like rides.groupby(["pickup_hour", "pickup_location_id"]), it only creates groups for combinations that actually exist in your data.


## Why This Matters for Your Project
For a taxi demand predictor, including zero-count combinations is often important because:
Your model needs to learn that certain locations have no demand during certain hours
When making predictions, you'll want to predict for all location-hour combinations, not just the ones that had rides in the past
Time series analysis typically requires a complete time series without gaps
Including these zero-count combinations gives you a more complete picture of demand patterns across all locations and times.

In [8]:
# rides.groupby(["pickup_hour", "pickup_location_id"]) - Groups all the individual ride records by both the hour and the pickup location ID
# .size() - Counts the number of rides in each group (each unique hour-location combination)
# .reset_index(name="rides_count") - Converts the result into a DataFrame with three columns:
# pickup_hour: The hourly timestamp (e.g., "2024-01-01 00:00:00")
# pickup_location_id: The location identifier
# rides_count: The number of rides that occurred at that location during that hour
agg_rides = rides.groupby(["pickup_hour", "pickup_location_id"]).size().reset_index(name="rides_count")
agg_rides

Unnamed: 0,pickup_hour,pickup_location_id,rides_count
0,2024-01-01 00:00:00,4,25
1,2024-01-01 00:00:00,7,4
2,2024-01-01 00:00:00,9,1
3,2024-01-01 00:00:00,10,6
4,2024-01-01 00:00:00,12,4
...,...,...,...
77525,2024-01-31 23:00:00,260,2
77526,2024-01-31 23:00:00,261,12
77527,2024-01-31 23:00:00,262,9
77528,2024-01-31 23:00:00,263,53


## Adding zeroes for missing rows

In [9]:
from tqdm import tqdm

def add_missing_rows(agg_rides:pd.DataFrame) -> pd.DataFrame:
    location_ids = agg_rides["pickup_location_id"].unique()
    full_range = pd.date_range(
        agg_rides["pickup_hour"].min(),
        agg_rides["pickup_hour"].max(),
        freq="H"
    )
    print(full_range)
    output = pd.DataFrame()
    for location_id in tqdm(location_ids):
        # keep only rides for this 'location_id' (filter operation)
        agg_rides_i = agg_rides.loc[agg_rides.pickup_location_id == location_id,['pickup_hour','rides_count']]

        # quick way to add missing dates with 0 in a series (reindex operation)
        agg_rides_i.set_index("pickup_hour", inplace=True)
        agg_rides_i.index = pd.to_datetime(agg_rides_i.index)
        agg_rides_i = agg_rides_i.reindex(full_range, fill_value=0)

        # add back the location_id column
        agg_rides_i["pickup_location_id"] = location_id

        output = pd.concat([output, agg_rides_i])
    
    output = output.reset_index().rename(columns={"index":"pickup_hour"})
    return output



In [14]:
agg_rides_all_slots = add_missing_rows(agg_rides)


'H' is deprecated and will be removed in a future version, please use 'h' instead.



DatetimeIndex(['2024-01-01 00:00:00', '2024-01-01 01:00:00',
               '2024-01-01 02:00:00', '2024-01-01 03:00:00',
               '2024-01-01 04:00:00', '2024-01-01 05:00:00',
               '2024-01-01 06:00:00', '2024-01-01 07:00:00',
               '2024-01-01 08:00:00', '2024-01-01 09:00:00',
               ...
               '2024-01-31 14:00:00', '2024-01-31 15:00:00',
               '2024-01-31 16:00:00', '2024-01-31 17:00:00',
               '2024-01-31 18:00:00', '2024-01-31 19:00:00',
               '2024-01-31 20:00:00', '2024-01-31 21:00:00',
               '2024-01-31 22:00:00', '2024-01-31 23:00:00'],
              dtype='datetime64[ns]', length=744, freq='h')


100%|██████████| 260/260 [00:00<00:00, 264.79it/s]


Unnamed: 0,pickup_hour,rides_count,pickup_location_id
0,2024-01-01 00:00:00,25,4
1,2024-01-01 01:00:00,29,4
2,2024-01-01 02:00:00,34,4
3,2024-01-01 03:00:00,31,4
4,2024-01-01 04:00:00,32,4
5,2024-01-01 05:00:00,8,4
6,2024-01-01 06:00:00,6,4
7,2024-01-01 07:00:00,4,4
8,2024-01-01 08:00:00,0,4
9,2024-01-01 09:00:00,1,4


In [23]:
agg_rides_all_slots[agg_rides_all_slots["pickup_location_id"] == 4].head(30)

Unnamed: 0,pickup_hour,rides_count,pickup_location_id
0,2024-01-01 00:00:00,25,4
1,2024-01-01 01:00:00,29,4
2,2024-01-01 02:00:00,34,4
3,2024-01-01 03:00:00,31,4
4,2024-01-01 04:00:00,32,4
5,2024-01-01 05:00:00,8,4
6,2024-01-01 06:00:00,6,4
7,2024-01-01 07:00:00,4,4
8,2024-01-01 08:00:00,0,4
9,2024-01-01 09:00:00,1,4


# PLOTLY TO VISUALIZE TIME SERIES

In [11]:
import plotly.express as px
from typing import Optional, List

def plot_rides(df:pd.DataFrame, locations: Optional[List[int]] = None):
    if locations is None:
        locations = df["pickup_location_id"].unique()
    for location in locations:
        df_location = df[df["pickup_location_id"] == location]
        fig = px.line(df_location, x="pickup_hour", y="rides_count", title=f"Location {location}")
        fig.show()

plot_rides(agg_rides_all_slots, [130,1])

In [12]:
# agg_rides_all_slots.to_parquet("../data/transformed/ts_data_2024_01.parquet")