In [2]:
import os
import datetime

import pandas as pd

## Load the Data

In [25]:
# Input: the merged dataset lives in the "ep" subfolder of the processed-data directory.
data_input_dir = "../data/processed"
ep_input_dir = os.path.join(data_input_dir, "ep")
input_file = os.path.join(ep_input_dir, "merged_data.parquet")

# Output: directory that receives the split files. It is created up front;
# exist_ok=True makes re-running this cell harmless.
splits_output_dir = "../data/processed/splits"
os.makedirs(name=splits_output_dir, exist_ok=True)

In [4]:
# Read the merged dataset into memory and report its (rows, columns) shape.
ep_df = pd.read_parquet(input_file)
row_count, column_count = ep_df.shape
print((row_count, column_count))

(14196903, 7)


In [6]:
# Preview the first five rows to sanity-check the columns and their values.
ep_df.head(n=5)

Unnamed: 0,ride_name,timestamp,wait_time,closed,temperature,rain,wind
0,alpine express enzian,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
1,poppy towers,2017-05-23 09:00:00,0.0,False,18.7,0.0,1.1
2,silver star,2017-05-23 09:00:00,0.0,False,18.7,0.0,1.1
3,swiss bob run,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
4,tirol log flume,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1


## Create the Splits

We will split the data into training, validation and test sets using the following methodology:
- 2017 - 2022: Training
- 2023: Validation
- 2024: Test

With the row counts shown below, this yields roughly a 74/13/13 split, which approximates the commonly used 80/10/10 split.

In [8]:
# Boundaries of the chronological split: each split starts at its boundary
# (inclusive) and ends right before the next one (exclusive).
validation_start = datetime.datetime(2023, 1, 1)
test_start = datetime.datetime(2024, 1, 1)

timestamps = ep_df["timestamp"]

# Pick the index labels directly with boolean masks. Filtering the whole frame
# first (ep_df[mask].index) would build three large temporary copies of the
# data only to throw everything but the index away.
train_mask = (timestamps < validation_start).to_numpy()
validation_mask = ((timestamps >= validation_start) & (timestamps < test_start)).to_numpy()
test_mask = (timestamps >= test_start).to_numpy()

train_indices = ep_df.index[train_mask]
validation_indices = ep_df.index[validation_mask]
test_indices = ep_df.index[test_mask]

# The three ranges are disjoint by construction, so together they must account
# for every row. A shortfall means some rows matched no range, which happens
# when a timestamp is missing (comparisons against NaT are always False).
assert len(train_indices) + len(validation_indices) + len(test_indices) == len(ep_df), (
    "Some rows were not assigned to any split; check for missing timestamps."
)

Check the shapes of the splits to ensure they contain the correct number of rows.

In [10]:
# One shape tuple per split, in train / validation / test order.
tuple(indices.shape for indices in (train_indices, validation_indices, test_indices))

((10485477,), (1852375,), (1859051,))

We print the first and last few indices of each split to ensure the data is correctly split.

In [16]:
# For each split (train, validation, test) show its first and last five index labels.
for split_indices in (train_indices, validation_indices, test_indices):
    leading, trailing = split_indices[:5], split_indices[-5:]
    print(leading, trailing)

Index([0, 1, 2, 3, 4], dtype='int64') Index([10485472, 10485473, 10485474, 10485475, 10485476], dtype='int64')
Index([10485477, 10485478, 10485479, 10485480, 10485481], dtype='int64') Index([12337847, 12337848, 12337849, 12337850, 12337851], dtype='int64')
Index([12337852, 12337853, 12337854, 12337855, 12337856], dtype='int64') Index([14196898, 14196899, 14196900, 14196901, 14196902], dtype='int64')


Also print the first and last few timestamps of each split to ensure the data doesn't overlap between splits.

In [22]:
# Show the timestamps at both ends of every split so the boundaries between
# consecutive splits can be compared by eye.
titled_splits = (
    ("Train set:", train_indices),
    ("Validation set:", validation_indices),
    ("Test set:", test_indices),
)
for position, (title, split_indices) in enumerate(titled_splits):
    # A blank line separates consecutive splits (none before the first one).
    if position > 0:
        print()
    print(title)
    print(ep_df.loc[split_indices[:5], ["timestamp"]])
    print(ep_df.loc[split_indices[-5:], ["timestamp"]])

Train set:
            timestamp
0 2017-05-23 09:00:00
1 2017-05-23 09:00:00
2 2017-05-23 09:00:00
3 2017-05-23 09:00:00
4 2017-05-23 09:00:00
                   timestamp
10485472 2022-12-31 21:00:00
10485473 2022-12-31 21:00:00
10485474 2022-12-31 21:00:00
10485475 2022-12-31 21:00:00
10485476 2022-12-31 21:00:00

Validation set:
                   timestamp
10485477 2023-01-01 09:00:00
10485478 2023-01-01 09:00:00
10485479 2023-01-01 09:00:00
10485480 2023-01-01 09:00:00
10485481 2023-01-01 09:00:00
                   timestamp
12337847 2023-12-31 21:00:00
12337848 2023-12-31 21:00:00
12337849 2023-12-31 21:00:00
12337850 2023-12-31 21:00:00
12337851 2023-12-31 21:00:00

Test set:
                   timestamp
12337852 2024-01-01 09:00:00
12337853 2024-01-01 09:00:00
12337854 2024-01-01 09:00:00
12337855 2024-01-01 09:00:00
12337856 2024-01-01 09:00:00
                   timestamp
14196898 2024-12-31 21:00:00
14196899 2024-12-31 21:00:00
14196900 2024-12-31 21:00:00
14196901 2024-12-

Finally, we save the index labels of each split to parquet files for further use.

In [26]:
# Wrap each index in a DataFrame so it can be written with to_parquet
# (despite the `_series` suffix, these objects are DataFrames).
# NOTE(review): the column is not given an explicit name, so it carries
# pandas' default label; confirm downstream readers expect that.
train_indices_series, validation_indices_series, test_indices_series = map(
    pd.DataFrame, (train_indices, validation_indices, test_indices)
)

# One output file per split inside the splits directory.
train_output_file = os.path.join(splits_output_dir, "train_indices.parquet")
validation_output_file = os.path.join(splits_output_dir, "validation_indices.parquet")
test_output_file = os.path.join(splits_output_dir, "test_indices.parquet")

# index=False: only the column holding the split's index labels is stored,
# not the wrapper frame's own index.
frames_and_destinations = (
    (train_indices_series, train_output_file),
    (validation_indices_series, validation_output_file),
    (test_indices_series, test_output_file),
)
for frame, destination in frames_and_destinations:
    frame.to_parquet(destination, index=False)