In [1]:
import pandas as pd
from tqdm import tqdm
from typing import Optional, List
import plotly.express as px

In [2]:
rides = pd.read_parquet('../data/transformed/validated_rides_2023-05.parquet')
rides.head()

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-05-01 00:33:13,138
1,2023-05-01 00:42:49,138
2,2023-05-01 00:56:34,138
3,2023-05-01 00:00:52,138
4,2023-05-01 00:05:50,138


In [3]:
rides.sample(20)

Unnamed: 0,pickup_datetime,pickup_location_id
758207,2023-05-07 18:37:33,132
39565,2023-05-01 13:34:47,170
1029396,2023-05-10 12:32:51,238
3199589,2023-05-29 22:07:59,142
2219346,2023-05-20 09:43:10,113
2068696,2023-05-18 23:11:31,148
3004267,2023-05-27 15:54:55,143
2883971,2023-05-26 11:08:54,263
552400,2023-05-05 21:07:02,48
1599986,2023-05-15 09:55:33,239


In [4]:
rides['pickup_hour'] = rides['pickup_datetime'].dt.floor('H')
rides

  rides['pickup_hour'] = rides['pickup_datetime'].dt.floor('H')


Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
0,2023-05-01 00:33:13,138,2023-05-01 00:00:00
1,2023-05-01 00:42:49,138,2023-05-01 00:00:00
2,2023-05-01 00:56:34,138,2023-05-01 00:00:00
3,2023-05-01 00:00:52,138,2023-05-01 00:00:00
4,2023-05-01 00:05:50,138,2023-05-01 00:00:00
...,...,...,...
3513644,2023-05-31 23:59:36,186,2023-05-31 23:00:00
3513645,2023-05-31 23:52:44,224,2023-05-31 23:00:00
3513646,2023-05-31 23:26:09,161,2023-05-31 23:00:00
3513647,2023-05-31 23:03:44,162,2023-05-31 23:00:00


In [5]:
agg_rides = rides.groupby(['pickup_hour', 'pickup_location_id']).size().reset_index()
agg_rides.rename(columns={0: 'rides'}, inplace=True)
agg_rides

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2023-05-01 00:00:00,4,1
1,2023-05-01 00:00:00,7,2
2,2023-05-01 00:00:00,10,2
3,2023-05-01 00:00:00,24,2
4,2023-05-01 00:00:00,25,1
...,...,...,...
76445,2023-05-31 23:00:00,261,10
76446,2023-05-31 23:00:00,262,10
76447,2023-05-31 23:00:00,263,53
76448,2023-05-31 23:00:00,264,50


In [6]:
agg_rides.sample(20)

Unnamed: 0,pickup_hour,pickup_location_id,rides
63810,2023-05-26 16:00:00,151,57
33293,2023-05-14 11:00:00,220,1
61371,2023-05-25 17:00:00,164,192
38561,2023-05-16 14:00:00,228,1
69233,2023-05-28 23:00:00,13,3
28243,2023-05-12 11:00:00,17,1
34212,2023-05-14 20:00:00,79,135
68183,2023-05-28 13:00:00,18,1
5557,2023-05-03 07:00:00,225,3
32777,2023-05-14 06:00:00,263,29


In [7]:
def add_missing_slots(agg_rides: pd.DataFrame) -> pd.DataFrame:
    
    location_ids = agg_rides['pickup_location_id'].unique()
    full_range = pd.date_range(agg_rides['pickup_hour'].min(), agg_rides['pickup_hour'].max(), freq='H')
    output = pd.DataFrame()
    for location_id in tqdm(location_ids):
        
        # Filter the rides for the current location
        agg_rides_i = agg_rides.loc[agg_rides['pickup_location_id'] == location_id, ['pickup_hour', 'rides']]

        # Add missing slots and fill with 0 
        # stackoverflow.com/questions/a/19324591
        agg_rides_i.set_index('pickup_hour', inplace=True)
        agg_rides_i.index = pd.DatetimeIndex(agg_rides_i.index)
        agg_rides_i = agg_rides_i.reindex(full_range, fill_value=0)

        # Reset the index and add the location_id
        agg_rides_i['pickup_location_id'] = location_id

        output = pd.concat([output, agg_rides_i])

    # Reset the index and rename the columns
    output = output.reset_index().rename(columns={'index': 'pickup_hour'})

    return output

In [8]:
agg_rides_all_slots = add_missing_slots(agg_rides)

  full_range = pd.date_range(agg_rides['pickup_hour'].min(), agg_rides['pickup_hour'].max(), freq='H')
100%|██████████| 261/261 [00:01<00:00, 179.96it/s]


In [9]:
agg_rides_all_slots.sample(5)

Unnamed: 0,pickup_hour,rides,pickup_location_id
15749,2023-05-06 05:00:00,0,93
162743,2023-05-23 23:00:00,0,154
61578,2023-05-24 18:00:00,0,179
173845,2023-05-21 13:00:00,0,183
161705,2023-05-11 17:00:00,0,250


In [10]:
def plot_rides(rides: pd.DataFrame, locations: Optional[List[int]] = None):
    '''
    Plot time-series data of rides for the specified locations.
    '''

    rides_to_plot = rides[rides['pickup_location_id'].isin(locations)] if locations else rides

    fig = px.line(rides_to_plot, x='pickup_hour', y='rides', color='pickup_location_id', template='none')
    fig.show()

In [11]:
plot_rides(agg_rides_all_slots, locations=[138])

In [12]:
agg_rides_all_slots.to_parquet('../data/transformed/timeseries_data_2023-05.parquet')