In [1]:
import pandas as pd
import numpy as np

def apply_rolling_window(X,
                    initial_time_step,
                    max_time_step,
                    window_size,
                    target_idx):
    """
    Build rolling windows over a time series for sequence prediction tasks.

    Window ``i`` (for ``i = 0 .. max_time_step``) covers rows
    ``initial_time_step + i`` through ``initial_time_step + i + window_size - 1`` of ``X``.
    Its target is the value of column ``target_idx`` on the row immediately after the
    window, i.e. row ``initial_time_step + i + window_size``.

    Parameters:
    - X (numpy.ndarray): The input dataset, where rows are time steps and columns are features.
    - initial_time_step (int): Row at which the first window starts (must be >= 0).
    - max_time_step (int): Offset of the last window relative to 'initial_time_step';
      'max_time_step + 1' windows are produced. Must be >= initial_time_step.
    - window_size (int): Number of consecutive rows in each window.
    - target_idx (int): Column index of the target variable (0 <= target_idx < X.shape[1]).

    Returns:
    - X_temp (numpy.ndarray): Windows of feature data, shape
      (max_time_step + 1, window_size, X.shape[1]).
    - y_temp (numpy.ndarray): Target value for each window, shape (max_time_step + 1,).

    Raises:
    - AssertionError: If an argument violates the constraints above, if the last
      target row lies beyond the end of 'X', or if any target value is NaN.
    - IndexError: If a window reaches past the last row of 'X'.

    Note:
    - NaN values inside the feature windows are not checked; only the targets are.

    Example usage:
    >>> X_temp, y_temp = apply_rolling_window(X, initial_time_step=0, max_time_step=10, window_size=3, target_idx=0)
    """
    assert 0 <= target_idx < X.shape[1]
    assert initial_time_step >= 0
    assert max_time_step >= initial_time_step

    start = initial_time_step
    n_windows = max_time_step + 1

    # Broadcasting a row of in-window offsets against a column of window offsets
    # yields an (n_windows, window_size) matrix of row indices into X.
    sub_windows = (
        start
        + np.arange(window_size)[np.newaxis, :]
        + np.arange(n_windows)[:, np.newaxis]
    )
    X_temp = X[sub_windows]

    # The targets must be shifted by `start` as well, otherwise windows and targets
    # fall out of alignment whenever initial_time_step is non-zero.
    first_target = start + window_size
    y_temp = X[first_target:first_target + n_windows, target_idx]

    # Slicing truncates silently, so make sure every window actually got a target.
    assert len(y_temp) == len(X_temp), (
        "not enough rows in X to provide a target for every window"
    )
    assert not np.isnan(y_temp).any()

    return X_temp, y_temp

In [2]:
import pandas as pd
import numpy as np

# Toy multivariate time series: nine consecutive daily observations of two features.
data = pd.DataFrame(
    {
        'feature1': list(range(1, 10)),
        'feature2': list(range(10, 100, 10)),
    },
    index=pd.date_range(start='2023-01-01', periods=9, freq='D'),
)

print(data)

            feature1  feature2
2023-01-01         1        10
2023-01-02         2        20
2023-01-03         3        30
2023-01-04         4        40
2023-01-05         5        50
2023-01-06         6        60
2023-01-07         7        70
2023-01-08         8        80
2023-01-09         9        90


In [3]:
# Each window needs one following row to act as its target, so the last usable
# window offset is len(data) - window_size - 1.
window_size = 3
apply_rolling_window(
    X=data.to_numpy(),
    initial_time_step=0,
    max_time_step=data.shape[0] - window_size - 1,
    window_size=window_size,
    target_idx=1,
)

(array([[[ 1, 10],
         [ 2, 20],
         [ 3, 30]],
 
        [[ 2, 20],
         [ 3, 30],
         [ 4, 40]],
 
        [[ 3, 30],
         [ 4, 40],
         [ 5, 50]],
 
        [[ 4, 40],
         [ 5, 50],
         [ 6, 60]],
 
        [[ 5, 50],
         [ 6, 60],
         [ 7, 70]],
 
        [[ 6, 60],
         [ 7, 70],
         [ 8, 80]]]),
 array([40, 50, 60, 70, 80, 90]))