In [3]:
import pandas as pd
rides = pd.read_parquet('../data/transformed/validated_rides_2022_01.parquet')

rides.head()

Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:53:29,142
1,2022-01-01 00:42:07,236
2,2022-01-01 01:02:19,166
3,2022-01-01 00:35:23,114
4,2022-01-01 01:14:20,68


In [4]:
rides['pickup_hour'] = rides['pickup_datetime'].dt.floor('H')

In [5]:
rides.head()

Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
0,2022-01-01 00:53:29,142,2022-01-01 00:00:00
1,2022-01-01 00:42:07,236,2022-01-01 00:00:00
2,2022-01-01 01:02:19,166,2022-01-01 01:00:00
3,2022-01-01 00:35:23,114,2022-01-01 00:00:00
4,2022-01-01 01:14:20,68,2022-01-01 01:00:00


In [6]:
agg_rides = rides.groupby(['pickup_hour','pickup_location_id']).size().reset_index()
agg_rides.rename(columns = {0:'rides'},inplace = True)
agg_rides.head()

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2022-01-01,4,5
1,2022-01-01,7,5
2,2022-01-01,12,2
3,2022-01-01,13,8
4,2022-01-01,24,8


In [8]:
from tqdm import tqdm

def add_missing_slots(agg_rides: pd.DataFrame) -> pd.DataFrame:
    
    location_ids = agg_rides['pickup_location_id'].unique()
    full_range = pd.date_range(
        agg_rides['pickup_hour'].min(), agg_rides['pickup_hour'].max(), freq='H')
    output = pd.DataFrame()
    for location_id in tqdm(location_ids):

        # keep only rides for this 'location_id'
        agg_rides_i = agg_rides.loc[agg_rides.pickup_location_id == location_id, ['pickup_hour', 'rides']]
            
        # quick way to add missing dates with 0 in a Series
        # taken from https://stackoverflow.com/a/19324591
        agg_rides_i.set_index('pickup_hour', inplace=True)
        agg_rides_i.index = pd.DatetimeIndex(agg_rides_i.index)
        agg_rides_i = agg_rides_i.reindex(full_range, fill_value=0)
        
        # add back `location_id` columns
        agg_rides_i['pickup_location_id'] = location_id

        output = pd.concat([output, agg_rides_i])
    
    # move the purchase_day from the index to a dataframe column
    output = output.reset_index().rename(columns={'index': 'pickup_hour'})
    
    return output

In [9]:
agg_rides_all_slots = add_missing_slots(agg_rides)

100%|██████████| 257/257 [00:00<00:00, 376.65it/s]


In [10]:
from typing import Optional, List
import plotly.express as px

def plot_rides(
    rides: pd.DataFrame,
    locations: Optional[List[int]] = None
    ):
    """
    Plot time-series data
    """
    rides_to_plot = rides[rides.pickup_location_id.isin(locations)] if locations else rides

    fig = px.line(
        rides_to_plot,
        x="pickup_hour",
        y="rides",
        color='pickup_location_id',
        template='none',
    )

    fig.show()

In [11]:
plot_rides(agg_rides_all_slots, locations=[43])

In [12]:
agg_rides_all_slots.to_parquet('../data/transformed/ts_data_2022_01.parquet')