In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
ts_data = pd.read_parquet('../data/transformed/timeseries_data_2023-05.parquet')
ts_data.head()

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2023-05-01 00:00:00,1,4
1,2023-05-01 01:00:00,0,4
2,2023-05-01 02:00:00,2,4
3,2023-05-01 03:00:00,0,4
4,2023-05-01 04:00:00,0,4


In [3]:
ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == 186, :].reset_index(drop=True)
ts_data_one_location.head(24)

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2023-05-01 00:00:00,26,186
1,2023-05-01 01:00:00,19,186
2,2023-05-01 02:00:00,16,186
3,2023-05-01 03:00:00,23,186
4,2023-05-01 04:00:00,8,186
5,2023-05-01 05:00:00,52,186
6,2023-05-01 06:00:00,130,186
7,2023-05-01 07:00:00,232,186
8,2023-05-01 08:00:00,223,186
9,2023-05-01 09:00:00,258,186


In [4]:
def get_cutoff_indices(data: pd.DataFrame, n_features: int, step_size: int) -> list:
    stop_position = len(data) - 1

    # Begin subsequence at index 0
    subsequent_first_idx = 0
    subsequent_middle_idx = n_features
    subsequent_last_idx = n_features + 1
    indices = []

    while subsequent_last_idx < stop_position:
        indices.append((subsequent_first_idx, subsequent_middle_idx, subsequent_last_idx))
        subsequent_first_idx += step_size
        subsequent_middle_idx += step_size
        subsequent_last_idx += step_size

    return indices


In [5]:
n_features = 24
step_size = 1

indices = get_cutoff_indices(ts_data_one_location, n_features, step_size)
indices[:10]

[(0, 24, 25),
 (1, 25, 26),
 (2, 26, 27),
 (3, 27, 28),
 (4, 28, 29),
 (5, 29, 30),
 (6, 30, 31),
 (7, 31, 32),
 (8, 32, 33),
 (9, 33, 34)]

In [6]:
n_examples = len(indices)
x = np.ndarray(shape=(n_examples, n_features), dtype=np.float32)
y = np.ndarray(shape=(n_examples), dtype=np.float32)
pickup_hours = []

for i, idx in enumerate(indices):
    x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
    pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values


In [7]:
print(f'x shape: {x.shape}')
print(f'x={x}')
print(f'pickup_hours={pickup_hours[:3]}')

x shape: (718, 24)
x=[[ 26.  19.  16. ... 137. 135. 144.]
 [ 19.  16.  23. ... 135. 144.  64.]
 [ 16.  23.   8. ... 144.  64.  17.]
 ...
 [182. 134. 127. ... 159. 179. 194.]
 [134. 127. 174. ... 179. 194. 163.]
 [127. 174. 259. ... 194. 163. 176.]]
pickup_hours=[Timestamp('2023-05-02 00:00:00'), Timestamp('2023-05-02 01:00:00'), Timestamp('2023-05-02 02:00:00')]


In [8]:
features_one_location = pd.DataFrame(x, columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))])
features_one_location

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,26.0,19.0,16.0,23.0,8.0,52.0,130.0,232.0,223.0,258.0,...,176.0,190.0,186.0,196.0,200.0,173.0,136.0,137.0,135.0,144.0
1,19.0,16.0,23.0,8.0,52.0,130.0,232.0,223.0,258.0,192.0,...,190.0,186.0,196.0,200.0,173.0,136.0,137.0,135.0,144.0,64.0
2,16.0,23.0,8.0,52.0,130.0,232.0,223.0,258.0,192.0,191.0,...,186.0,196.0,200.0,173.0,136.0,137.0,135.0,144.0,64.0,17.0
3,23.0,8.0,52.0,130.0,232.0,223.0,258.0,192.0,191.0,165.0,...,196.0,200.0,173.0,136.0,137.0,135.0,144.0,64.0,17.0,12.0
4,8.0,52.0,130.0,232.0,223.0,258.0,192.0,191.0,165.0,190.0,...,200.0,173.0,136.0,137.0,135.0,144.0,64.0,17.0,12.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713,212.0,229.0,182.0,134.0,127.0,174.0,259.0,39.0,34.0,7.0,...,242.0,289.0,325.0,244.0,237.0,221.0,183.0,203.0,152.0,159.0
714,229.0,182.0,134.0,127.0,174.0,259.0,39.0,34.0,7.0,5.0,...,289.0,325.0,244.0,237.0,221.0,183.0,203.0,152.0,159.0,179.0
715,182.0,134.0,127.0,174.0,259.0,39.0,34.0,7.0,5.0,4.0,...,325.0,244.0,237.0,221.0,183.0,203.0,152.0,159.0,179.0,194.0
716,134.0,127.0,174.0,259.0,39.0,34.0,7.0,5.0,4.0,69.0,...,244.0,237.0,221.0,183.0,203.0,152.0,159.0,179.0,194.0,163.0


In [9]:
targets_one_location = pd.DataFrame(y, columns=['target_rides_next_hour'])
targets_one_location

Unnamed: 0,target_rides_next_hour
0,64.0
1,17.0
2,12.0
3,6.0
4,9.0
...,...
713,179.0
714,194.0
715,163.0
716,176.0


In [10]:
def transform_timeseries_data_into_features_target(ts_data: pd.DataFrame, input_sequence_length: int, step_size: int) -> pd.DataFrame:
    '''
    Transforms timeseries data into features and target dataframes to train ML models
    '''

    assert set(ts_data.columns) == {'pickup_location_id', 'rides', 'pickup_hour'}

    location_ids = ts_data['pickup_location_id'].unique()
    features = pd.DataFrame()
    targets = pd.DataFrame()

    for location_id in tqdm(location_ids):

        # Get data for one location
        ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == location_id, ['pickup_hour', 'rides']]
        # Get indices for subsequences
        indices = get_cutoff_indices(ts_data_one_location, input_sequence_length, step_size)

        # Create features and targets arrays
        n_examples = len(indices)
        x = np.ndarray(shape=(n_examples, input_sequence_length), dtype=np.float32)
        y = np.ndarray(shape=(n_examples), dtype=np.float32)
        pickup_hours = []

        for i, idx in enumerate(indices):
            x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
            y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
            pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

        # Create features and targets dataframes
        features_one_location = pd.DataFrame(x, columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_sequence_length))])
        features_one_location['pickup_hour'] = pickup_hours
        features_one_location['pickup_location_id'] = location_id

        targets_one_location = pd.DataFrame(y, columns=[f'target_rides_next_hour'])

        features = pd.concat([features, features_one_location])
        targets = pd.concat([targets, targets_one_location])

    features.reset_index(drop=True, inplace=True)
    targets.reset_index(drop=True, inplace=True)

    return features, targets['target_rides_next_hour']

In [11]:
import warnings

warnings.filterwarnings('ignore', category=DeprecationWarning)

In [12]:
features, targets = transform_timeseries_data_into_features_target(ts_data, 
                                                                   input_sequence_length=24*7*1,        # 1 week of historical data
                                                                   step_size=24)

100%|██████████| 261/261 [00:03<00:00, 69.69it/s]


In [13]:
print(f'features shape: {features.shape}')
print(f'targets shape: {targets.shape}')

features shape: (6264, 170)
targets shape: (6264,)
