# Playground notebook to test implementations

In [1]:
from collections.abc import Generator
from pathlib import Path
from typing import Any

import datasets
import pandas as pd
from datasets import Features, Sequence, Value

In [2]:
# Load the wide-format demo series straight from the public gist.
# The first CSV column becomes the index and is parsed as datetimes.
url_wide = (
    "https://gist.githubusercontent.com"
    "/rsnirwan/c8c8654a98350fadd229b00167174ec4"
    "/raw/a42101c7786d4bc7695228a0f2c8cea41340e18f"
    "/ts_wide.csv"
)
df = pd.read_csv(filepath_or_buffer=url_wide, parse_dates=True, index_col=0)

In [3]:
# Report the frame's dimensions as a (rows, columns) tuple.
df.shape

(240, 10)

In [4]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
2021-01-01 00:00:00,-1.3378,0.1268,-0.3645,-1.0864,-2.3803,-0.2447,2.2647,-0.7917,0.7071,1.3763
2021-01-01 01:00:00,-1.6111,0.0926,-0.1364,-1.1613,-2.1421,-0.3477,2.4262,-0.9609,0.6413,1.275
2021-01-01 02:00:00,-1.9259,-0.142,0.1063,-1.0405,-2.1426,-0.3271,2.4434,-0.9034,0.4323,0.6767
2021-01-01 03:00:00,-1.9184,-0.493,0.6269,-0.8531,-1.706,-0.3088,2.4307,-0.9602,0.3193,0.515
2021-01-01 04:00:00,-1.9168,-0.5057,0.9419,-0.7666,-1.4287,-0.4284,2.3258,-1.2504,0.366,0.1708


In [5]:
# Read the ETTh1 CSV and index it by its "date" column.
# NOTE: this rebinds `df`, replacing the frame loaded earlier in the notebook.
path = "src/samay/models/moment/data/ETTh1.csv"
df = pd.read_csv(path).set_index("date")

# Show the inferred sampling frequency next to the first row.
(pd.infer_freq(df.index), df.head(n=1))

('H',
                       HUFL   HULL   MUFL   MULL   LUFL  LULL      OT
 date                                                                
 2016-07-01 00:00:00  5.827  2.009  1.599  0.462  4.203  1.34  30.531)

In [6]:
def multivar_example_gen_func() -> Generator[dict[str, Any], None, None]:
    """Yield one record that packs the module-level ``df`` as a single series.

    The record has four keys:

    * ``target``  - the frame's values transposed to shape (var, time)
    * ``start``   - the first entry of the frame's index
    * ``freq``    - the frequency string inferred from the index
    * ``item_id`` - the fixed identifier ``"item_0"``

    The function takes no arguments; it reads ``df`` from the enclosing
    module namespace each time the generator is advanced.
    """
    values_var_by_time = df.to_numpy().T  # array of shape (var, time)
    first_stamp = df.index[0]
    inferred_freq = pd.infer_freq(df.index)

    record: dict[str, Any] = {
        "target": values_var_by_time,
        "start": first_stamp,
        "freq": inferred_freq,
        "item_id": "item_0",
    }
    yield record

In [7]:
# Schema of a single record produced by the generator.
features = Features(
    {
        # Multivariate time series are saved as (var, time): the outer
        # sequence has one entry per column of `df`, the inner one holds
        # the float32 values of that variate.
        "target": Sequence(
            feature=Sequence(feature=Value(dtype="float32")),
            length=len(df.columns),
        ),
        "start": Value(dtype="timestamp[s]"),
        "freq": Value(dtype="string"),
        "item_id": Value(dtype="string"),
    }
)

In [8]:
# Materialise the generator's records into a Hugging Face dataset that
# follows the explicit `features` schema.
hf_dataset = datasets.Dataset.from_generator(
    generator=multivar_example_gen_func,
    features=features,
)

In [9]:
# List every attribute of the dataset whose name does not start with "_".
list(filter(lambda attr: not attr.startswith("_"), dir(hf_dataset)))

['add_column',
 'add_elasticsearch_index',
 'add_faiss_index',
 'add_faiss_index_from_external_arrays',
 'add_item',
 'align_labels_with_mapping',
 'builder_name',
 'cache_files',
 'cast',
 'cast_column',
 'citation',
 'class_encode_column',
 'cleanup_cache_files',
 'column_names',
 'config_name',
 'data',
 'dataset_size',
 'description',
 'download_checksums',
 'download_size',
 'drop_index',
 'export',
 'features',
 'filter',
 'flatten',
 'flatten_indices',
 'format',
 'formatted_as',
 'from_buffer',
 'from_csv',
 'from_dict',
 'from_file',
 'from_generator',
 'from_json',
 'from_list',
 'from_pandas',
 'from_parquet',
 'from_spark',
 'from_sql',
 'from_text',
 'get_index',
 'get_nearest_examples',
 'get_nearest_examples_batch',
 'homepage',
 'info',
 'is_index_initialized',
 'iter',
 'license',
 'list_indexes',
 'load_elasticsearch_index',
 'load_faiss_index',
 'load_from_disk',
 'map',
 'num_columns',
 'num_rows',
 'prepare_for_task',
 'push_to_hub',
 'remove_columns',
 'rename_col

In [10]:
# Inspect the schema attached to the dataset.
hf_dataset.features

{'target': Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=7, id=None),
 'start': Value(dtype='timestamp[s]', id=None),
 'freq': Value(dtype='string', id=None),
 'item_id': Value(dtype='string', id=None)}

In [11]:
# Show the field names of the first record.
hf_dataset[0].keys()

dict_keys(['target', 'start', 'freq', 'item_id'])

In [12]:
# Convert the dataset to pandas, then show the shape of the first row's
# target and the shape of the first element inside that target.
df1 = hf_dataset.to_pandas()
(df1.loc[0, "target"].shape, df1.loc[0, "target"][0].shape)

((7,), (17420,))

In [13]:
# Print the built-in help text for the dataset's `to_pandas` method
# (interactive inspection only; nothing is assigned).
help(hf_dataset.to_pandas)

Help on method to_pandas in module datasets.arrow_dataset:

to_pandas(batch_size: Optional[int] = None, batched: bool = False) -> Union[pandas.core.frame.DataFrame, Iterator[pandas.core.frame.DataFrame]] method of datasets.arrow_dataset.Dataset instance
    Returns the dataset as a `pandas.DataFrame`. Can also return a generator for large datasets.

    Args:
        batched (`bool`):
            Set to `True` to return a generator that yields the dataset as batches
            of `batch_size` rows. Defaults to `False` (returns the whole datasets once).
        batch_size (`int`, *optional*):
            The size (number of rows) of the batches if `batched` is `True`.
            Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.

    Returns:
        `pandas.DataFrame` or `Iterator[pandas.DataFrame]`

    Example:

    ```py
    >>> ds.to_pandas()
    ```



In [14]:
# Display the frame produced by `hf_dataset.to_pandas()`.
df1

Unnamed: 0,target,start,freq,item_id
0,"[[5.827, 5.693, 5.157, 5.09, 5.358, 5.626, 7.1...",2016-07-01,H,item_0


In [15]:
# External imports
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import random

# Make the local "src" directory importable, resolved against the current
# working directory. It is prepended only if it is not already on sys.path.
src_path = os.path.abspath("src")
already_importable = src_path in sys.path
if not already_importable:
    sys.path.insert(0, src_path)

# Local imports
from samay.dataset import MoiraiDataset
from samay.model import MoiraiTSModel

# Load the pretrained model.
# `repo` is the checkpoint identifier handed to the model wrapper; `config`
# carries the window sizes and model variant used by the cells below.
repo = "Salesforce/moirai-moe-1.0-R-small"
config = {
    "context_len": 128,
    "horizon_len": 64,
    "num_layers": 100,
    "model_type": "moirai-moe",
    "model_size": "small",
}

# Fix: the keyword was misspelled as `repor`, so the checkpoint id above was
# not being passed under the intended name.
# NOTE(review): assumes the constructor parameter is named `repo` — confirm
# against samay.model.MoiraiTSModel.
moirai_model = MoiraiTSModel(repo=repo, config=config)

In [16]:
# Config for the electric transformer temperature dataset:
# dataset name, CSV location, timestamp column and sampling frequency.
data_config = dict(
    name="ett",
    path="src/samay/models/moment/data/ETTh1.csv",
    date_col="date",
    freq="h",
)

# Re-read the raw CSV named in `data_config` (this rebinds `df`) and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer=data_config["path"])
df.head(5)

Unnamed: 0,date,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT
0,2016-07-01 00:00:00,5.827,2.009,1.599,0.462,4.203,1.34,30.531
1,2016-07-01 01:00:00,5.693,2.076,1.492,0.426,4.142,1.371,27.787001
2,2016-07-01 02:00:00,5.157,1.741,1.279,0.355,3.777,1.218,27.787001
3,2016-07-01 03:00:00,5.09,1.942,1.279,0.391,3.807,1.279,25.044001
4,2016-07-01 04:00:00,5.358,1.942,1.492,0.462,3.868,1.279,21.948


In [17]:
# Both datasets are built from the same source and window settings, so the
# shared keyword arguments are collected once.
moirai_dataset_kwargs = {
    "name": data_config["name"],
    "path": data_config["path"],
    "datetime_col": data_config["date_col"],
    "freq": data_config["freq"],
    "context_len": config["context_len"],
    "horizon_len": config["horizon_len"],
    "normalize": False,
}

# No `mode` is passed here, so the class's own default applies.
train = MoiraiDataset(**moirai_dataset_kwargs)

# Same settings, with the test mode requested explicitly.
test = MoiraiDataset(mode="test", **moirai_dataset_kwargs)

[13936, 13936, 17419]
[13936, 13936, 17419]


In [18]:
# Show the shapes of the `dataset` attribute held by the train and test objects.
train.dataset.shape, test.dataset.shape

(torch.Size([13936, 7]), torch.Size([3484, 7]))