In [19]:
import numpy as np
import pandas as pd

# одновимірне вікно

In [20]:
def get_labelled_windows(x, horizon=1):
    """
    Split each row of a windowed array into its inputs and its labels.

    The last ``horizon`` entries along axis 1 become the labels and everything
    before them becomes the window.

    Args:
        x: Array-like with at least two dimensions; axis 1 holds the
            window steps followed by the label steps.
        horizon: Number of trailing steps per row to return as labels.
            ``0`` yields full-width windows and zero-width labels.

    Returns:
        Tuple ``(windows, labels)`` of arrays sliced from ``x`` along axis 1.

    Raises:
        ValueError: If ``horizon`` is negative or larger than the row width.
        IndexError: If ``x`` has fewer than two dimensions.
    """
    x = np.asarray(x)
    width = x.shape[1]
    if not 0 <= horizon <= width:
        raise ValueError(
            f"horizon must be between 0 and the row width ({width}), got {horizon}"
        )
    # Slice with a non-negative index: `-horizon` collapses to 0 when
    # horizon == 0, which would return empty windows and full-width labels.
    split = width - horizon
    return x[:, :split], x[:, split:]

def make_windows(x, window_size=7, horizon=1):
    """
    Turn a sequence into overlapping windows and their labels.

    Every run of ``window_size + horizon`` consecutive entries of ``x``
    (stepping by one) is split into a window of ``window_size`` entries and
    the ``horizon`` entries that follow it.

    Args:
        x: Array-like indexed along its first axis (ndarray, list,
            pandas Series, ...).
        window_size: Number of entries in each window.
        horizon: Number of entries after each window to use as labels.

    Returns:
        Tuple ``(windows, labels)``. When ``x`` has fewer than
        ``window_size + horizon`` entries, both arrays have zero rows.

    Raises:
        ValueError: If ``window_size`` is negative or ``horizon`` is below 1.
    """
    if window_size < 0 or horizon < 1:
        raise ValueError(
            "window_size must be >= 0 and horizon must be >= 1, "
            f"got window_size={window_size}, horizon={horizon}"
        )
    # The fancy indexing below needs an ndarray; lists and pandas Series
    # do not accept a 2D integer index.
    x = np.asarray(x)
    span = window_size + horizon
    # Broadcast a column of window start positions against a row of
    # in-window offsets to get one row of indexes per window.
    starts = np.arange(len(x) - (span - 1))[:, np.newaxis]
    offsets = np.arange(span)[np.newaxis, :]
    windowed_array = x[starts + offsets]
    windows, labels = get_labelled_windows(windowed_array, horizon=horizon)
    return windows, labels

def make_train_test_splits(windows, labels, test_split=0.2):
    """
    Split matching windows and labels into train and test parts.

    The split keeps the original order: the leading
    ``int(len(windows) * (1 - test_split))`` pairs form the train set and
    the remaining pairs form the test set. Nothing is shuffled.

    Args:
        windows: Sliceable sequence of windows.
        labels: Sliceable sequence of labels, one per window.
        test_split: Fraction of the pairs (between 0 and 1) kept for testing.

    Returns:
        Tuple ``(train_windows, test_windows, train_labels, test_labels)``.

    Raises:
        ValueError: If ``windows`` and ``labels`` differ in length, or
            ``test_split`` is outside ``[0, 1]``.
    """
    if len(windows) != len(labels):
        raise ValueError(
            "windows and labels must have the same length, "
            f"got {len(windows)} and {len(labels)}"
        )
    if not 0 <= test_split <= 1:
        # Outside this range the split index falls outside the data and the
        # slices below would silently return the wrong rows.
        raise ValueError(f"test_split must be between 0 and 1, got {test_split}")
    split_size = int(len(windows) * (1 - test_split))
    return (
        windows[:split_size],
        windows[split_size:],
        labels[:split_size],
        labels[split_size:],
    )

In [21]:
# Fix the seed so the synthetic series, and every output derived from it,
# is the same on each run of the notebook.
np.random.seed(42)

time = np.arange(1000)
# Linear trend plus Gaussian noise (standard deviation 2).
df = pd.DataFrame({"time": time, "value": time + np.random.normal(0, 2, size=len(time))})
df = df.round(2)

df

Unnamed: 0,time,value
0,0,0.54
1,1,2.21
2,2,-0.23
3,3,5.36
4,4,3.62
...,...,...
995,995,997.32
996,996,997.24
997,997,1000.10
998,998,997.05


In [22]:
# Settings for the univariate example.
WINDOW_SIZE = 5    # entries per input window
HORIZON = 2        # entries after each window used as labels
TEST_SPLIT = 0.2   # fraction of window/label pairs held out for testing

# Cut the series into window/label pairs, then split them in order.
windows, labels = make_windows(
    df["value"].values,
    window_size=WINDOW_SIZE,
    horizon=HORIZON,
)
(
    train_windows,
    test_windows,
    train_labels,
    test_labels,
) = make_train_test_splits(windows, labels, test_split=TEST_SPLIT)

In [23]:
# Show the first training pairs, one window and its labels per row.
pd.DataFrame(
    {
        "windows": list(map(list, train_windows)),
        "labels": list(map(list, train_labels)),
    }
).head()

Unnamed: 0,windows,labels
0,"[0.54, 2.21, -0.23, 5.36, 3.62]","[2.3, 6.02]"
1,"[2.21, -0.23, 5.36, 3.62, 2.3]","[6.02, 6.87]"
2,"[-0.23, 5.36, 3.62, 2.3, 6.02]","[6.87, 7.97]"
3,"[5.36, 3.62, 2.3, 6.02, 6.87]","[7.97, 8.53]"
4,"[3.62, 2.3, 6.02, 6.87, 7.97]","[8.53, 9.66]"


# багатовимірне вікно

In [29]:
# Fix the seed so the synthetic data, and every output derived from it,
# is the same on each run of the notebook.
np.random.seed(42)

time = np.arange(1000)
# value_x: linear trend plus Gaussian noise (standard deviation 2).
# value_y: sine wave plus Gaussian noise (standard deviation 0.1).
df = pd.DataFrame({"time": time, 
                   "value_x": time + np.random.normal(0, 2, size=len(time)),
                   "value_y": np.sin(0.1 * time) + np.random.normal(0, 0.1, size=len(time))
                   })
df = df.round(2)

df

Unnamed: 0,time,value_x,value_y
0,0,1.99,-0.15
1,1,-0.81,0.05
2,2,2.94,0.27
3,3,2.78,0.35
4,4,3.61,0.46
...,...,...,...
995,995,996.48,-0.99
996,996,995.98,-0.75
997,997,994.84,-0.69
998,998,997.17,-0.49


In [30]:
HORIZON = 1
WINDOW_SIZE = 3

# Lag features: value_x_k holds value_x from k rows earlier, k = 1..WINDOW_SIZE.
df = df.assign(
    **{
        f"value_x_{lag}": df["value_x"].shift(periods=lag)
        for lag in range(1, WINDOW_SIZE + 1)
    }
)

# Shifting leaves NaN in the leading rows; drop every row that has a missing
# value and renumber the index from zero.
df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,time,value_x,value_y,value_x_1,value_x_2,value_x_3
0,3,2.78,0.35,2.94,-0.81,1.99
1,4,3.61,0.46,2.78,2.94,-0.81
2,5,5.44,0.36,3.61,2.78,2.94
3,6,6.51,0.64,5.44,3.61,2.78
4,7,11.23,0.73,6.51,5.44,3.61
...,...,...,...,...,...,...
992,995,996.48,-0.99,995.88,993.23,991.11
993,996,995.98,-0.75,996.48,995.88,993.23
994,997,994.84,-0.69,995.98,996.48,995.88
995,998,997.17,-0.49,994.84,995.98,996.48


In [31]:
# Target: the current value_x. Features: every remaining column except time.
labels = df["value_x"].values
windows = df.drop(columns=["time", "value_x"]).values
(
    train_windows,
    test_windows,
    train_labels,
    test_labels,
) = make_train_test_splits(windows, labels, test_split=TEST_SPLIT)

In [34]:
# Sizes of the four split parts, in the order: train windows, train labels,
# test windows, test labels.
tuple(len(part) for part in (train_windows, train_labels, test_windows, test_labels))

(797, 797, 200, 200)

In [37]:
# Show the first training pairs: a feature row and its single target value.
pd.DataFrame(
    {
        "windows": list(map(list, train_windows)),
        "labels": list(train_labels),
    }
).head()

Unnamed: 0,windows,labels
0,"[0.35, 2.94, -0.81, 1.99]",2.78
1,"[0.46, 2.78, 2.94, -0.81]",3.61
2,"[0.36, 3.61, 2.78, 2.94]",5.44
3,"[0.64, 5.44, 3.61, 2.78]",6.51
4,"[0.73, 6.51, 5.44, 3.61]",11.23


# агрегація

In [41]:
# Fix the seed so the synthetic data, and every output derived from it,
# is the same on each run of the notebook.
np.random.seed(42)

# Define the time axis in this cell rather than relying on an array left
# over from an earlier cell, and size every column from it.
time = np.arange(1000)
# user_id: random integers from 1 to 4 (the upper bound 5 is exclusive).
df = pd.DataFrame({"user_id": np.random.randint(1, 5, size=len(time)),
                   "value_x": time + np.random.normal(0, 2, size=len(time)),
                   "value_y": np.sin(0.1 * time) + np.random.normal(0, 0.1, size=len(time))
                   })
df = df.round(2)

df

Unnamed: 0,user_id,value_x,value_y
0,3,0.12,-0.07
1,3,-0.67,0.16
2,4,1.95,0.26
3,3,0.68,0.36
4,1,3.20,0.50
...,...,...,...
995,3,998.94,-0.89
996,2,995.42,-0.78
997,1,999.50,-0.83
998,4,1001.63,-0.61


In [42]:
# Collect each user's value_x and value_y entries into one list per user.
(
    df.groupby("user_id")
    .agg(dict.fromkeys(("value_x", "value_y"), list))
    .reset_index()
    .head()
)

Unnamed: 0,user_id,value_x,value_y
0,1,"[3.2, 8.69, 9.62, 14.69, 12.19, 18.7, 32.54, 3...","[0.5, 0.62, 0.86, 0.75, 1.08, 0.84, -0.14, -0...."
1,2,"[8.24, 5.21, 16.67, 21.09, 23.91, 26.26, 33.3,...","[0.66, 0.73, 1.06, 0.97, 0.55, 0.22, -0.26, -0..."
2,3,"[0.12, -0.67, 0.68, 11.36, 12.23, 16.87, 17.57...","[-0.07, 0.16, 0.36, 0.66, 0.75, 1.06, 0.98, 0...."
3,4,"[1.95, 2.1, 13.07, 14.23, 23.38, 28.73, 27.91,...","[0.26, 0.25, 0.95, 0.91, 0.58, 0.33, 0.15, 0.0..."
