# Transform Time Series Data Into Features And Targets For Supervised Machine Learning
- slice and slide
- tabular datasets has n+1 columns first n columns are features and last column is the target

In [2]:
import pandas as pd

ts_data = pd.read_parquet("../data/transformed/ts_data_2024_01.parquet")

ts_data.head(26)

Unnamed: 0,pickup_hour,rides_count,pickup_location_id
0,2024-01-01 00:00:00,25,4
1,2024-01-01 01:00:00,29,4
2,2024-01-01 02:00:00,34,4
3,2024-01-01 03:00:00,31,4
4,2024-01-01 04:00:00,32,4
5,2024-01-01 05:00:00,8,4
6,2024-01-01 06:00:00,6,4
7,2024-01-01 07:00:00,4,4
8,2024-01-01 08:00:00,0,4
9,2024-01-01 09:00:00,1,4


This line of code performs several operations in sequence:
### 1. Filtering Data
  ts_data["pickup_location_id"] == 43 creates a boolean mask that is True for all rows where the pickup location ID equals 43.
### 2. Row Selection
  .loc[...] uses this boolean mask to select only those rows where the condition is True.
### 3. Column Selection
  The : after the comma in .loc[mask, :] means "select all columns" from the filtered rows.
### 4. Index Reset
  .reset_index(drop=True) creates a brand new sequential index (starting from 0) for the filtered DataFrame. The drop=True parameter discards the original index values instead of keeping them as a column.


In [13]:
ts_data_one_location = ts_data.loc[ts_data["pickup_location_id"] == 43,:].reset_index(drop=True)
# print(ts_data_one_location.index)
ts_data_one_location.head(25)



Unnamed: 0,pickup_hour,rides_count,pickup_location_id
0,2024-01-01 00:00:00,162,43
1,2024-01-01 01:00:00,89,43
2,2024-01-01 02:00:00,38,43
3,2024-01-01 03:00:00,14,43
4,2024-01-01 04:00:00,5,43
5,2024-01-01 05:00:00,3,43
6,2024-01-01 06:00:00,5,43
7,2024-01-01 07:00:00,12,43
8,2024-01-01 08:00:00,10,43
9,2024-01-01 09:00:00,15,43


## get_cutoff_indices
This function generates sliding window indices for time series data, which is essential for preparing features and targets for machine learning models. It's particularly useful for forecasting tasks where you use past data to predict future values.


In [16]:
def get_cutoff_indices(
        data: pd.DataFrame,
        n_features: int,
        step_size: int,
)->list:
    stop_position = len(data) - 1
    indices = []

    # start the first subsequence at index 0
    subsequence_start_index = 0
    subsequence_mid_index = n_features 
    subsequence_end_index = n_features + 1

    while subsequence_end_index <= stop_position:
        indices.append((subsequence_start_index, subsequence_mid_index, subsequence_end_index))
        subsequence_start_index += step_size
        subsequence_mid_index += step_size
        subsequence_end_index += step_size

    return indices


In [18]:
n_features = 24
step_size = 1

indices = get_cutoff_indices(ts_data_one_location, n_features, step_size)
indices[:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]

This code transforms time series data into a format suitable for supervised machine learning by creating feature-target pairs. Let's analyze it line by line:

In [20]:
import numpy as np

n_examples = len(indices)
x = np.ndarray(shape=(n_examples, n_features), dtype=np.float32)
y = np.ndarray(shape=(n_examples), dtype=np.float32)
pickup_hours = []
# looping through the indices to create features and targets
for i,idx in enumerate(indices):
    x[i,:] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides_count'].values # features
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values # target
    pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values # target


In [21]:
print(f'{x.shape=}')
print(f"{x=}")
print(f"{pickup_hours=}")

x.shape=(719, 24)
x=array([[162.,  89.,  38., ...,  28.,  13.,   5.],
       [ 89.,  38.,  14., ...,  13.,   5.,   3.],
       [ 38.,  14.,   5., ...,   5.,   3.,   0.],
       ...,
       [ 93.,  55.,  38., ..., 107., 120.,  81.],
       [ 55.,  38.,  12., ..., 120.,  81.,  52.],
       [ 38.,  12.,   3., ...,  81.,  52.,  54.]],
      shape=(719, 24), dtype=float32)
pickup_hours=[Timestamp('2024-01-02 00:00:00'), Timestamp('2024-01-02 01:00:00'), Timestamp('2024-01-02 02:00:00'), Timestamp('2024-01-02 03:00:00'), Timestamp('2024-01-02 04:00:00'), Timestamp('2024-01-02 05:00:00'), Timestamp('2024-01-02 06:00:00'), Timestamp('2024-01-02 07:00:00'), Timestamp('2024-01-02 08:00:00'), Timestamp('2024-01-02 09:00:00'), Timestamp('2024-01-02 10:00:00'), Timestamp('2024-01-02 11:00:00'), Timestamp('2024-01-02 12:00:00'), Timestamp('2024-01-02 13:00:00'), Timestamp('2024-01-02 14:00:00'), Timestamp('2024-01-02 15:00:00'), Timestamp('2024-01-02 16:00:00'), Timestamp('2024-01-02 17:00:00'), Tim

In [22]:
features_one_place = pd.DataFrame(
    x,
    columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))],
)
features_one_place




Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,162.0,89.0,38.0,14.0,5.0,3.0,5.0,12.0,10.0,15.0,...,108.0,125.0,84.0,50.0,40.0,40.0,36.0,28.0,13.0,5.0
1,89.0,38.0,14.0,5.0,3.0,5.0,12.0,10.0,15.0,28.0,...,125.0,84.0,50.0,40.0,40.0,36.0,28.0,13.0,5.0,3.0
2,38.0,14.0,5.0,3.0,5.0,12.0,10.0,15.0,28.0,55.0,...,84.0,50.0,40.0,40.0,36.0,28.0,13.0,5.0,3.0,0.0
3,14.0,5.0,3.0,5.0,12.0,10.0,15.0,28.0,55.0,49.0,...,50.0,40.0,40.0,36.0,28.0,13.0,5.0,3.0,0.0,0.0
4,5.0,3.0,5.0,12.0,10.0,15.0,28.0,55.0,49.0,74.0,...,40.0,40.0,36.0,28.0,13.0,5.0,3.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,134.0,92.0,93.0,55.0,38.0,12.0,3.0,0.0,0.0,1.0,...,52.0,84.0,89.0,91.0,103.0,120.0,75.0,126.0,96.0,107.0
715,92.0,93.0,55.0,38.0,12.0,3.0,0.0,0.0,1.0,1.0,...,84.0,89.0,91.0,103.0,120.0,75.0,126.0,96.0,107.0,120.0
716,93.0,55.0,38.0,12.0,3.0,0.0,0.0,1.0,1.0,8.0,...,89.0,91.0,103.0,120.0,75.0,126.0,96.0,107.0,120.0,81.0
717,55.0,38.0,12.0,3.0,0.0,0.0,1.0,1.0,8.0,11.0,...,91.0,103.0,120.0,75.0,126.0,96.0,107.0,120.0,81.0,52.0


In [23]:
target_one_place = pd.DataFrame(
    y,
    columns=['target_rides_next_hour'],
)
target_one_place




Unnamed: 0,target_rides_next_hour
0,3.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
714,120.0
715,81.0
716,52.0
717,54.0


Timeline (hours): 0 1 2 3 ... 167 168 ... 191 192 ... 215 216 ...
                  |<---- Week 1 ---->|↓    |     ↓    |     ↓
                                     |     |     |     |
Prediction 1:     [======FEATURES======] → TARGET
                                     |
                  |   |<---- Week 2 ---->|↓    |     
                  |                      |     |     
Prediction 2:     |   [======FEATURES======] → TARGET
                  |                      |
                  |   |   |<---- Week 3 ---->|↓
                  |   |                      |
Prediction 3:     |   |   [======FEATURES======] → TARGET

In [32]:
from tqdm import tqdm

def transform_ts_data_into_features_and_targets(
        ts_data: pd.DataFrame,
        input_seq_len: int,
        step_size: int,
)->pd.DataFrame:
    assert set(ts_data.columns) == {'pickup_hour', 'rides_count', 'pickup_location_id'}

    location_ids = ts_data['pickup_location_id'].unique()
    features = pd.DataFrame()
    targets = pd.DataFrame()

    for location_id in tqdm(location_ids):
        ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == location_id, ['pickup_hour', 'rides_count']]   

        indices = get_cutoff_indices(ts_data_one_location, input_seq_len, step_size)

        # creating features and targets
        n_examples = len(indices)
        x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32)
        y = np.ndarray(shape=(n_examples), dtype=np.float32)
        pickup_hours = []

        for i, idx in enumerate(indices):
            x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides_count'].values
            y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
            pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

        # numpy array to pandas dataframe
        features_one_location = pd.DataFrame(
            x,
            columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))],
        )
        features_one_location['pickup_hour'] = pickup_hours
        features_one_location['pickup_location_id'] = location_id

        # numpy array to pandas dataframe
        target_one_location = pd.DataFrame(
            y,
            columns=['target_rides_next_hour'],
        )

        # concatenating features and targets
        features = pd.concat([features, features_one_location])
        targets = pd.concat([targets, target_one_location])
    
    features = features.reset_index(drop=True)
    targets = targets.reset_index(drop=True)

    return features, targets['target_rides_next_hour']

In [33]:
features, targets = transform_ts_data_into_features_and_targets(
    ts_data, 
    input_seq_len=24*7*1,  # 1 week
    step_size=24
)

print(f'{features.shape=}')
print(f'{targets.shape=}')




  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides_count']

features.shape=(6240, 170)
targets.shape=(6240,)
