In [5]:
import pandas as pd

rides = pd.read_parquet("../data/transformed/validated_rides_2024_01.parquet")

rides.head(10)

Unnamed: 0,pickup_datetime,pickup_location_id
0,2024-01-01 00:57:55,186
1,2024-01-01 00:03:00,140
2,2024-01-01 00:17:06,236
3,2024-01-01 00:36:38,79
4,2024-01-01 00:46:51,211
5,2024-01-01 00:54:08,148
6,2024-01-01 00:49:44,138
7,2024-01-01 00:30:40,246
8,2024-01-01 00:26:01,161
9,2024-01-01 00:28:08,113


# Work with hourly frequency 
## Predict next hour mai konsa user drive book karega

In [17]:
rides["pickup_hour"] = rides["pickup_datetime"].dt.floor("H")
rides


  rides["pickup_hour"] = rides["pickup_datetime"].dt.floor("H")


Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
0,2024-01-01 00:57:55,186,2024-01-01 00:00:00
1,2024-01-01 00:03:00,140,2024-01-01 00:00:00
2,2024-01-01 00:17:06,236,2024-01-01 00:00:00
3,2024-01-01 00:36:38,79,2024-01-01 00:00:00
4,2024-01-01 00:46:51,211,2024-01-01 00:00:00
...,...,...,...
2964619,2024-01-31 23:45:59,107,2024-01-31 23:00:00
2964620,2024-01-31 23:13:07,114,2024-01-31 23:00:00
2964621,2024-01-31 23:19:00,211,2024-01-31 23:00:00
2964622,2024-01-31 23:07:23,107,2024-01-31 23:00:00


## For ecery pickoup hour and location, fetch the number of rides

- When you use groupby() with multiple columns like rides.groupby(["pickup_hour", "pickup_location_id"]), it only creates groups for combinations that actually exist in your data.


## Why This Matters for Your Project
For a taxi demand predictor, including zero-count combinations is often important because:
Your model needs to learn that certain locations have no demand during certain hours
When making predictions, you'll want to predict for all location-hour combinations, not just the ones that had rides in the past
Time series analysis typically requires a complete time series without gaps
Including these zero-count combinations gives you a more complete picture of demand patterns across all locations and times.

In [18]:
agg_rides = rides.groupby(["pickup_hour", "pickup_location_id"]).size().reset_index(name="rides_count")
agg_rides

Unnamed: 0,pickup_hour,pickup_location_id,rides_count
0,2024-01-01 00:00:00,4,25
1,2024-01-01 00:00:00,7,4
2,2024-01-01 00:00:00,9,1
3,2024-01-01 00:00:00,10,6
4,2024-01-01 00:00:00,12,4
...,...,...,...
77525,2024-01-31 23:00:00,260,2
77526,2024-01-31 23:00:00,261,12
77527,2024-01-31 23:00:00,262,9
77528,2024-01-31 23:00:00,263,53


## Adding zeroes for missing rows

In [19]:
from tqdm import tqdm

def add_missing_rows(agg_rides:pd.DataFrame) -> pd.DataFrame:
    
    
    location_ids = agg_rides["pickup_location_id"].unique()
    full_range = pd.date_range(
        agg_rides["pickup_hour"].min(),
        agg_rides["pickup_hour"].max(),
        freq="H"
    )
    output = pd.DataFrame()

    for location_id in tqdm(location_ids):
        # keep only rides for this 'location_id' (filter operation)
        agg_rides_i = agg_rides.loc[agg_rides.pickup_location_id == location_id,['pickup_hour','rides_count']]

        # quick way to add missing dates with 0 in a series (reindex operation)
        agg_rides_i.set_index("pickup_hour", inplace=True)
        agg_rides_i.index = pd.to_datetime(agg_rides_i.index)
        agg_rides_i = agg_rides_i.reindex(full_range, fill_value=0)

        # add back the location_id column
        agg_rides_i["pickup_location_id"] = location_id

        output = pd.concat([output, agg_rides_i])
    
    output = output.reset_index().rename(columns={"index":"pickup_hour"})
    return output



In [20]:
agg_rides_all_slots = add_missing_rows(agg_rides)

  full_range = pd.date_range(
100%|██████████| 260/260 [00:00<00:00, 291.15it/s]
