In [1]:
import sys
from pathlib import Path

# Make the parent of the notebook's working directory importable so the
# project-local packages imported below (`datasets`, `endata`) can be resolved.
script_dir = Path().resolve()
root_dir = script_dir.parent
# Guard the append so re-running this cell in the same kernel does not pile up
# duplicate entries on sys.path.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

import pandas as pd
import numpy as np

from datasets.pecanstreet import PecanStreetDataset
from datasets.openpower import OpenPowerDataset
from datasets.timeseries_dataset import TimeSeriesDataset
from endata.trainer import Trainer

2024-12-06 07:23:40.211203: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-06 07:23:40.225172: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-06 07:23:40.229490: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-06 07:23:40.240016: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_

## Training a model from scratch ##

To train your own model from scratch, the `Trainer` class provides a simple implementation. Simply define a custom dataset and a `Trainer` object, and call the `Trainer`'s `fit()` method.

## Writing a custom dataset ##

When creating a custom time series dataset class for use with EnData, the class must inherit from the provided `TimeSeriesDataset` base class. The `TimeSeriesDataset` class provides a robust and modular framework for handling wide-format time series data. Custom implementations only need to define the `_preprocess_data` method, which is an abstract method in the base class. This method should ensure that the data is available in a clean wide-format data frame that has the structure outlined below.

### Responsibilities of `_preprocess_data`

- Preprocess raw input data into a DataFrame that satisfies the expected structure.
- Ensure time series columns contain arrays of the correct sequence length (`seq_len`).
- Add any additional columns, such as entity identifiers or conditioning variables.

### Benefits of the Base Class

- **Normalization and Scaling:** Automatically handles standardization and min-max scaling.
- **Conditioning Variables:** Provides support for encoding and managing conditioning variables.
- **Time Series Merging and Splitting:** Facilitates operations to merge multiple time series columns into a single multidimensional array and split them back when needed.
- **Data Transformation:** Includes functions for inverse transformations to revert normalized data to its original scale.

---

### Expected Input DataFrame Structure

The input to the `TimeSeriesDataset` class must adhere to the following structure:

| **Column Name**       | **Description**                                                                                     |
|------------------------|-----------------------------------------------------------------------------------------------------|
| `timeseries_col1`      | A column containing arrays of length `seq_len` (after preprocessing) representing the first dimension of the time series. |
| `timeseries_col2`      | A column containing arrays of length `seq_len` (after preprocessing) representing the second dimension of the time series.|
| `entity_column`        | A column containing unique identifiers for each entity (e.g., user, household, or device ID).       |
| `conditioning_var1`    | An (optional) static or numeric conditioning variable (e.g., categorical or continuous feature).                |
| `conditioning_var2`    | Further (optional) static or numeric conditioning variables.                                                    |

- The `time_series_column_names` parameter specifies which columns are part of the time series.
- The `entity_column_name` parameter identifies the column containing unique entity IDs.
- The `conditioning_var_column_names` parameter defines additional conditioning variables.

---

In [2]:
class CustomTimeSeriesDataset(TimeSeriesDataset):
    """
    A custom TimeSeriesDataset implementation for handling toy data.

    Input data structure:
    - time_series_col1, time_series_col2: Time series data, one list or NumPy
      array per row, each holding at least seq_len values.
    - entity_id: Unique identifier for each entity.
    - conditioning_var: Categorical or numeric conditioning variable.
    """
    def __init__(
        self,
        data: pd.DataFrame,
        seq_len: int = 16,
        normalize: bool = True,
        scale: bool = True,
    ):
        """
        Forwards the toy-data column layout and the given options to the base class.

        Args:
            data (pd.DataFrame): The raw input data (see the class docstring).
            seq_len (int): Number of time steps kept per time series row.
            normalize (bool): Passed through to the base class.
            scale (bool): Passed through to the base class.
        """
        entity_column_name = "entity_id"
        time_series_column_names = ["time_series_col1", "time_series_col2"]
        conditioning_var_column_names = ["conditioning_var"]

        super().__init__(
            data=data,
            entity_column_name=entity_column_name,
            time_series_column_names=time_series_column_names,
            conditioning_var_column_names=conditioning_var_column_names,
            seq_len=seq_len,
            normalize=normalize,
            scale=scale,
        )

    def _preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocesses the raw input data to ensure it conforms to the expected format.

        - Ensures all required columns are present.
        - Converts list entries of the time series columns to column vectors
          of shape (n, 1); NumPy array entries are copied as they are.
        - Truncates every time series entry to its first seq_len elements.

        Args:
            data (pd.DataFrame): The raw input data. It is not modified.

        Returns:
            pd.DataFrame: A preprocessed copy of the data.

        Raises:
            ValueError: If a required column is missing, if a time series entry
                is neither a list nor a NumPy array, or if an entry holds fewer
                than seq_len elements.
        """
        required_columns = ["entity_id", "time_series_col1", "time_series_col2", "conditioning_var"]
        for col in required_columns:
            if col not in data.columns:
                raise ValueError(f"Missing required column: {col}")

        # Work on a copy so the caller's DataFrame keeps its raw values and the
        # dataset can be rebuilt from the same frame.
        data = data.copy()

        def _to_sequence(value, col):
            # Lists become (n, 1) column vectors; arrays are copied unchanged.
            if isinstance(value, list):
                sequence = np.array(value).reshape(-1, 1)
            elif isinstance(value, np.ndarray):
                sequence = np.array(value)
            else:
                # Raise instead of storing the exception object in the frame.
                raise ValueError(f"Invalid data in {col}")
            # 0-d arrays have no length, so report them as invalid data.
            if sequence.ndim == 0:
                raise ValueError(f"Invalid data in {col}")
            if len(sequence) < self.seq_len:
                raise ValueError(f"Sequence too short in {col}")
            return sequence[:self.seq_len]

        for col in ["time_series_col1", "time_series_col2"]:
            data[col] = data[col].apply(lambda value, col=col: _to_sequence(value, col))
        return data

Now that we have defined our dataset class, let's create some artificial timeseries columns and conditioning variables which will comprise our dataset:

In [9]:
# Toy dataset dimensions, named so they can be tuned in one place.
NUM_ENTITIES = 100
SEQ_LEN = 16  # equals the default seq_len of CustomTimeSeriesDataset

# A dedicated, seeded generator makes the toy data identical on every
# "Restart Kernel -> Run All".
rng = np.random.default_rng(42)

data = pd.DataFrame({
    "entity_id": [f"entity_{i}" for i in range(NUM_ENTITIES)],
    "time_series_col1": [rng.random(SEQ_LEN).tolist() for _ in range(NUM_ENTITIES)],
    "time_series_col2": [rng.random(SEQ_LEN).tolist() for _ in range(NUM_ENTITIES)],
    "conditioning_var": rng.choice(["a", "b", "c"], size=NUM_ENTITIES).tolist(),
})

# Wrap the toy frame in the custom dataset class, using its default
# seq_len, normalize and scale settings.
custom_dataset = CustomTimeSeriesDataset(data)
# Show the dataset's `data` attribute as the cell output.
custom_dataset.data

Unnamed: 0,index,entity_id,conditioning_var,timeseries
0,0,entity_0,0,"[[0.9260902409329174, 0.06733527088211624], [0..."
1,1,entity_1,0,"[[0.380586763778633, 0.9862005547191708], [0.4..."
2,2,entity_2,1,"[[0.17112909695039222, 0.8879684137033145], [0..."
3,3,entity_3,1,"[[0.07133078623947496, 0.18731916651554545], [..."
4,4,entity_4,2,"[[0.37708240761630424, 0.36632049150486373], [..."
...,...,...,...,...
95,95,entity_95,2,"[[0.8441465706548078, 0.7879977379401687], [0...."
96,96,entity_96,2,"[[0.5815471394060646, 0.7404401475196428], [0...."
97,97,entity_97,2,"[[0.4270044026340649, 0.0861661115561432], [0...."
98,98,entity_98,1,"[[0.04971922238613394, 0.7569102697182223], [0..."


We will now create a `Trainer` object by passing the name of the desired model and the dataset object. To start training, simply call `Trainer.fit()`.

In [10]:
# Create a Trainer for the "diffusion_ts" model on the custom dataset.
trainer = Trainer(model_name="diffusion_ts", dataset=custom_dataset)
# Start training.
trainer.fit()

Training: 100%|██████████| 1000/1000 [00:57<00:00, 17.49it/s]


Training complete


Once training is complete, we can create a data generator object that has access to the trained model and dataset information. To generate data, there is no need to load in a trained model. Simply define the conditioning variables, and call the `DataGenerator`'s `generate()` method.

In [12]:
# Obtain a data generator from the trainer.
data_generator = trainer.get_data_generator()
# Look up the conditioning variable codes and display them as the cell output.
conditioning_var_codes = data_generator.get_conditioning_var_codes()
conditioning_var_codes

{'conditioning_var': {0: 'a', 1: 'b', 2: 'c'}}

In [13]:
# Select code 0 for `conditioning_var`; the previous cell displays which
# original value each code stands for.
conditioning_vars = {
    "conditioning_var": 0
}
data_generator.set_model_conditioning_vars(conditioning_vars)
# Request 100 generated samples and display the result as the cell output.
generated_df = data_generator.generate(num_samples=100)
generated_df

sampling loop time step: 100%|██████████| 1000/1000 [00:16<00:00, 62.15it/s]
