In [4]:
### We now want to transform time series data into the required features, so that we are able to get the features and targets for each sample in the dataframe

In [28]:
import pandas as pd
from typing import Optional, List
import numpy as np

In [3]:
ts_data = pd.read_parquet('../data/transformed/ts_data_2022_01.parquet')
ts_data.head(5)

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,11,4
1,2022-01-01 01:00:00,15,4
2,2022-01-01 02:00:00,26,4
3,2022-01-01 03:00:00,8,4
4,2022-01-01 04:00:00,9,4


In [8]:
ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == 43,:].reset_index(drop=True)
ts_data_one_location.head(10)

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,97,43
1,2022-01-01 01:00:00,60,43
2,2022-01-01 02:00:00,22,43
3,2022-01-01 03:00:00,8,43
4,2022-01-01 04:00:00,6,43
5,2022-01-01 05:00:00,5,43
6,2022-01-01 06:00:00,3,43
7,2022-01-01 07:00:00,10,43
8,2022-01-01 08:00:00,7,43
9,2022-01-01 09:00:00,19,43


In [14]:
def get_cutoff_indices(data: pd.DataFrame, n_features: int, step_size: int) -> list:
    """
    
    """
    stop_position = len(data) -1 

    ## start the first sub-sequence at index position 0
    subseq_first_idx = 0
    subseq_mid_idx =  n_features
    subseq_last_idx = n_features + 1 
    indices = []

    while subseq_last_idx <= stop_position:
        indices.append((subseq_first_idx, subseq_mid_idx,subseq_last_idx))

        subseq_first_idx += step_size
        subseq_mid_idx += step_size
        subseq_last_idx += step_size
    return indices



In [24]:
n_features = 24
step_size = 1
indices = get_cutoff_indices(ts_data_one_location, n_features, step_size)
indices[0:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]

In [27]:
len(ts_data_one_location) - len(indices)

25

In [30]:
n_examples = len(indices)
x = np.ndarray(shape = (n_examples, n_features),dtype=np.float32)
y = np.ndarray(shape = (n_examples), dtype = np.float32)
pickup_hours = []

for i, idx in enumerate(indices):
    x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
    pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])


  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values


In [31]:
print(f'Shape of x: {x.shape}')
print(f'Shape of y : {y.shape}')
print(f'{pickup_hours[:5]=}')

Shape of x: (719, 24)
Shape of y : (719,)
pickup_hours[:5]=[Timestamp('2022-01-02 00:00:00'), Timestamp('2022-01-02 01:00:00'), Timestamp('2022-01-02 02:00:00'), Timestamp('2022-01-02 03:00:00'), Timestamp('2022-01-02 04:00:00')]


In [32]:
features_one_location = pd.DataFrame(x, columns =[f'rides_previous_{i+1}' for i in reversed(range(n_features))])
features_one_location

Unnamed: 0,rides_previous_24,rides_previous_23,rides_previous_22,rides_previous_21,rides_previous_20,rides_previous_19,rides_previous_18,rides_previous_17,rides_previous_16,rides_previous_15,...,rides_previous_10,rides_previous_9,rides_previous_8,rides_previous_7,rides_previous_6,rides_previous_5,rides_previous_4,rides_previous_3,rides_previous_2,rides_previous_1
0,97.0,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,...,70.0,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0
1,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,...,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0
2,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,...,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0
3,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,...,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0
4,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,77.0,...,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,52.0,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,...,78.0,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0
715,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,...,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0
716,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,...,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0
717,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,9.0,...,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0,73.0


In [35]:
targets_one_location = pd.DataFrame(y, columns = ['target_rides_next_hour'])
targets_one_location

Unnamed: 0,target_rides_next_hour
0,3.0
1,1.0
2,1.0
3,0.0
4,0.0
...,...
714,66.0
715,61.0
716,73.0
717,33.0
