In [4]:
import logging
import os
import polars as pl
import sys
import warnings

if os.path.abspath(".") not in sys.path:
    sys.path.append(os.path.abspath("."))

from utils import plot_dataset_dimensions

logging.basicConfig(level=logging.INFO)
warnings.filterwarnings("ignore")

In [5]:
# Load Synthetic Train and Test Data
dataset_dir = "../../datasets/synthetic"
train_path = os.path.join(dataset_dir, "train", "train.csv")
test_path = os.path.join(dataset_dir, "test", "test.csv")

if not os.path.exists(train_path) or not os.path.exists(test_path):
    raise FileNotFoundError(f"Dataset files not found in {dataset_dir}. Please run the generation script first.")

# Read CSVs
train_df = pl.read_csv(train_path)
test_df = pl.read_csv(test_path)

value_cols = [col for col in train_df.columns if col.startswith("value_")]
dims = sorted([int(col.split("_")[1]) for col in value_cols])

print(f"Train Data: {train_df.shape}")
print(f"Test Data:  {test_df.shape}")
print(f"Dimensions: {len(dims)} ({dims})")

# Preview
train_df.head()

Train Data: (2500, 12)
Test Data:  (2500, 12)
Dimensions: 0 ([])


timestamp,value-0,value-1,value-2,value-3,value-4,value-5,value-6,value-7,value-8,value-9,label
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
"""2023-01-01 00:00:00""",0.024916,0.018588,0.0311,-0.277102,0.177439,0.133843,-0.17277,0.160014,-0.06329,0.644817,0
"""2023-01-01 00:01:00""",0.19746,0.045966,0.069702,-0.284231,0.437768,-0.065421,-0.362746,-0.167485,-0.073569,0.969488,0
"""2023-01-01 00:02:00""",-0.013867,0.13261,-0.067291,0.020803,0.212118,0.159324,0.033736,0.122705,-0.087473,-0.895343,0
"""2023-01-01 00:03:00""",-0.08659,0.09073,0.577023,-0.088909,0.24116,0.018905,-0.316584,-0.111371,0.420482,-0.413905,0
"""2023-01-01 00:04:00""",0.113878,0.176096,-0.763239,0.12437,0.224121,-0.045989,-0.100916,-0.064357,0.072791,0.759664,0


In [6]:
# Visualize Train and Test Data using shared utility
dataset_name = "synthetic"

# 1. Plot Training Data
plot_dataset_dimensions(
    train_df, 
    dataset_name=dataset_name, 
    split="train"
)

# 2. Plot Test Data
plot_dataset_dimensions(
    test_df, 
    dataset_name=dataset_name, 
    split="test"
)

Plotting 10 dimensions for synthetic (train)...
✓ Saved plots for 10 dimensions (split into 1 segments each) to ../../figures/datasets/synthetic/train
Plotting 10 dimensions for synthetic (test)...
✓ Saved plots for 10 dimensions (split into 1 segments each) to ../../figures/datasets/synthetic/test
