In [7]:
import logging
import os
import polars as pl
import sys
import warnings

# Put the current working directory on the import path (presumably so the
# `utils` import that follows resolves). The membership test keeps repeated
# runs of this cell from appending duplicate entries.
cwd_abs = os.path.abspath(".")
if cwd_abs not in sys.path:
    sys.path.append(cwd_abs)

from utils import plot_dataset_dimensions

logging.basicConfig(level=logging.INFO)
warnings.filterwarnings("ignore")

In [8]:
# Locate the SMD train/test CSV files relative to the working directory.
dataset_dir = "../../datasets/SMD"
train_path, test_path = (
    os.path.join(dataset_dir, split, f"{split}.csv") for split in ("train", "test")
)

# Fail fast with an actionable message when either split is missing.
if not all(map(os.path.exists, (train_path, test_path))):
    raise FileNotFoundError(
        f"Dataset files not found in {dataset_dir}. Please run the generation script first."
    )

# Load each split into its own polars DataFrame.
train_df = pl.read_csv(train_path)
test_df = pl.read_csv(test_path)

# Feature columns are the ones named "value_<index>"; they are discovered from
# the training frame only, and the integer suffixes are collected in ascending
# order.
value_cols = [name for name in train_df.columns if name.startswith("value_")]
dims = sorted(int(name.split("_")[1]) for name in value_cols)

# Summarise the shape of each split and the detected dimensions.
print(
    f"Train Data: {train_df.shape}",
    f"Test Data:  {test_df.shape}",
    f"Dimensions: {len(dims)} ({dims})",
    sep="\n",
)

# Keep the preview as the cell's final expression so it is rendered as output.
train_df.head()

Train Data: (708405, 39)
Test Data:  (708420, 40)
Dimensions: 38 ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37])


timestamp,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8,value_9,value_10,value_11,value_12,value_13,value_14,value_15,value_16,value_17,value_18,value_19,value_20,value_21,value_22,value_23,value_24,value_25,value_26,value_27,value_28,value_29,value_30,value_31,value_32,value_33,value_34,value_35,value_36,value_37
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,0.010101,0.001642,0.001996,0.00263,0.0,0.987293,0.059893,0.0,0.0,0.0,0.0,0.057778,0.0,0.070713,0.008347,0.029484,0.0,0.0,0.01258,0.014019,0.044014,0.040679,0.092219,0.037405,0.07541,0.037391,0.0,0.07323,0.0,0.2,0.100698,0.0,0.0,0.030769,0.141966,0.141966,0.0,0.0
1,0.010101,0.000626,0.001553,0.002391,0.0,0.834117,0.060428,0.0,0.0,0.000678,0.000783,0.118519,0.0,0.098998,0.026232,0.03317,0.0,0.0,0.013249,0.013439,0.041019,0.034086,0.074136,0.095503,0.04918,0.095863,0.0,0.062944,0.0,0.2,0.082419,0.051948,0.0,0.035165,0.149766,0.149766,0.0,0.0
2,0.010101,0.000235,0.001109,0.002152,0.0,0.681114,0.061497,0.0,0.0,0.001055,0.001566,0.078519,0.0,0.079552,0.015898,0.03317,0.0,0.0,0.0115,0.012811,0.039636,0.035986,0.071275,0.035814,0.062295,0.0358,0.0,0.064903,0.0,0.2,0.088069,0.0,0.0,0.030769,0.135725,0.135725,0.0,0.0
3,0.010101,0.000938,0.001109,0.002152,0.0,0.681462,0.060963,0.0,0.0,0.0,0.0,0.044444,0.0,0.053035,0.006094,0.014742,0.0,0.0,0.012179,0.013922,0.042171,0.038221,0.057892,0.042181,0.065574,0.042164,0.0,0.069312,0.0,0.2,0.093054,0.012987,0.0,0.035165,0.148206,0.148206,0.0,0.0
4,0.010101,0.001173,0.00122,0.002152,0.0,0.681462,0.060428,0.0,0.0,0.0,0.0,0.016296,0.0,0.06482,0.002782,0.034398,0.0,0.0,0.013434,0.015238,0.047817,0.044256,0.083058,0.044568,0.078689,0.043357,0.0,0.080088,0.0,0.2,0.109339,0.025974,0.0,0.030769,0.154446,0.154446,0.0,0.0


In [9]:
# Plot every dimension of each split with the shared plotting utility,
# training split first and test split second.
dataset_name = "SMD"

for split_name, split_df in (("train", train_df), ("test", test_df)):
    plot_dataset_dimensions(split_df, dataset_name=dataset_name, split=split_name)

Plotting 38 dimensions for SMD (train)...


✓ Saved plots for 38 dimensions (split into 142 segments each) to ../../figures/datasets/SMD/train
Plotting 38 dimensions for SMD (test)...
✓ Saved plots for 38 dimensions (split into 142 segments each) to ../../figures/datasets/SMD/test
