In [1]:
import pandas as pd

ts_data = pd.read_parquet('../data/transformed/ts_data_2024_01.parquet')
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2024-01-01 00:00:00,25,4
1,2024-01-01 01:00:00,29,4
2,2024-01-01 02:00:00,34,4
3,2024-01-01 03:00:00,31,4
4,2024-01-01 04:00:00,32,4
...,...,...,...
193435,2024-01-31 19:00:00,0,245
193436,2024-01-31 20:00:00,0,245
193437,2024-01-31 21:00:00,0,245
193438,2024-01-31 22:00:00,0,245


In [4]:
def get_cutoff_indices(
    data: pd.DataFrame,
    n_features: int,
    step_size: int
    ) -> list:

        stop_position = len(data) - 1
        
        # Start the first sub-sequence at index position 0
        subseq_first_idx = 0
        subseq_mid_idx = n_features
        subseq_last_idx = n_features + 1
        indices = []
        
        while subseq_last_idx <= stop_position:
            indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx))
            
            subseq_first_idx += step_size
            subseq_mid_idx += step_size
            subseq_last_idx += step_size

        return indices

In [2]:
import numpy as np
from tqdm import tqdm

def transform_ts_data_into_features_and_target(
    ts_data: pd.DataFrame,
    input_seq_len: int,
    step_size: int
) -> pd.DataFrame:
    """
    Slices and transposes data from time-series format into a (features, target)
    format that we can use to train Supervised ML models
    """
    assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'}

    location_ids = ts_data['pickup_location_id'].unique()
    features = pd.DataFrame()
    targets = pd.DataFrame()
    
    for location_id in tqdm(location_ids):
        
        # keep only ts data for this `location_id`
        ts_data_one_location = ts_data.loc[
            ts_data.pickup_location_id == location_id, 
            ['pickup_hour', 'rides']
        ]

        # pre-compute cutoff indices to split dataframe rows
        indices = get_cutoff_indices(
            ts_data_one_location,
            input_seq_len,
            step_size
        )

        # slice and transpose data into numpy arrays for features and targets
        n_examples = len(indices)
        x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32)
        y = np.ndarray(shape=(n_examples), dtype=np.float32)
        pickup_hours = []
        for i, idx in enumerate(indices):
            x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
            y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
            pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

        # numpy -> pandas
        features_one_location = pd.DataFrame(
            x,
            columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))]
        )
        features_one_location['pickup_hour'] = pickup_hours
        features_one_location['pickup_location_id'] = location_id

        # numpy -> pandas
        targets_one_location = pd.DataFrame(y, columns=[f'target_rides_next_hour'])

        # concatenate results
        features = pd.concat([features, features_one_location])
        targets = pd.concat([targets, targets_one_location])

    features.reset_index(inplace=True, drop=True)
    targets.reset_index(inplace=True, drop=True)

    return features, targets['target_rides_next_hour']

In [5]:
features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24*7*1, # one week of history
    step_size=24,
)

print(f'{features.shape=}')
print(f'{targets.shape=}')

  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
  y[i] = t

features.shape=(6240, 170)
targets.shape=(6240,)



