In [1]:
import logging
import os
import polars as pl
import sys
import warnings

# Resolve the current working directory once and register it on the module
# search path (only if it is not already listed), so modules located there
# can be imported by the cells below.
_cwd = os.path.abspath(".")
if _cwd not in sys.path:
    sys.path.append(_cwd)

from utils import plot_dataset_dimensions

# Global notebook configuration (the two calls are independent of each other).
# - Install a blanket "ignore" filter so that no warnings are shown from here on.
# - Ask for INFO-level logging on the root logger; note that basicConfig does
#   nothing if the root logger already has handlers configured.
warnings.filterwarnings(action="ignore")
logging.basicConfig(level=logging.INFO)

In [2]:
# Load the PSM train and test splits from CSV.
# NOTE(review): the error text below points at a "generation script" — confirm
# that this is the right remediation hint for the PSM dataset.
dataset_dir = "../../datasets/PSM"
train_path = os.path.join(dataset_dir, "train", "train.csv")
test_path = os.path.join(dataset_dir, "test", "test.csv")

# Fail fast, before any parsing, if either split is missing on disk.
if not all(os.path.exists(csv_path) for csv_path in (train_path, test_path)):
    raise FileNotFoundError(f"Dataset files not found in {dataset_dir}. Please run the generation script first.")

# Parse each split with polars.
train_df = pl.read_csv(train_path)
test_df = pl.read_csv(test_path)

# Columns whose names start with "value_" are treated as the data dimensions;
# the integer after the first underscore is collected in ascending order.
value_cols = [name for name in train_df.columns if name.startswith("value_")]
dims = sorted(int(name.split("_")[1]) for name in value_cols)

# Summarise the shapes of both splits and the dimension indices found.
print(f"Train Data: {train_df.shape}")
print(f"Test Data:  {test_df.shape}")
print(f"Dimensions: {len(dims)} ({dims})")

# Last expression of the cell: shows the first rows of the training split.
train_df.head()

Train Data: (132481, 26)
Test Data:  (87841, 27)
Dimensions: 25 ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])


timestamp,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8,value_9,value_10,value_11,value_12,value_13,value_14,value_15,value_16,value_17,value_18,value_19,value_20,value_21,value_22,value_23,value_24
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,0.732689,0.761748,0.606848,0.488746,0.42431,0.403609,0.519318,0.398792,0.451453,0.447077,0.463336,0.487324,0.151929,0.138458,0.201467,0.318797,0.451856,0.5715,0.469717,0.609883,0.008432,0.0,0.481838,0.006536,0.138249
1,0.732799,0.761855,0.607133,0.488781,0.432008,0.410256,0.511364,0.402568,0.455657,0.449474,0.459267,0.494656,0.151487,0.138011,0.20211,0.321463,0.456123,0.562226,0.466533,0.629812,0.008432,0.0,0.477218,0.006536,0.115207
2,0.732938,0.761594,0.606895,0.488791,0.418858,0.407724,0.488636,0.396526,0.456104,0.451282,0.471587,0.490333,0.15367,0.140763,0.203354,0.347219,0.456692,0.572002,0.487845,0.643598,0.006745,0.0,0.492623,0.008715,0.092166
3,0.732893,0.761656,0.606478,0.488802,0.417896,0.404242,0.5,0.405589,0.46002,0.456628,0.47691,0.480858,0.153426,0.141215,0.201345,0.361904,0.460532,0.563354,0.479512,0.64469,0.008432,0.0,0.457064,0.008715,0.142857
4,0.732788,0.761573,0.606777,0.4888,0.421103,0.407407,0.511364,0.399547,0.458507,0.454611,0.451032,0.458795,0.153335,0.139718,0.203097,0.359767,0.458825,0.563354,0.448298,0.629948,0.006745,0.0,0.472223,0.006536,0.170507


In [3]:
# Plot every dimension of both splits with the shared plotting utility.
dataset_name = "PSM"

# Same call for each split, training data first and test data second;
# only the frame and the split label differ between the two invocations.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    plot_dataset_dimensions(
        split_df,
        dataset_name=dataset_name,
        split=split_name,
    )

Plotting 25 dimensions for PSM (train)...
✓ Saved plots for 25 dimensions (split into 14 segments each) to ../../figures/datasets/PSM/train
Plotting 25 dimensions for PSM (test)...
✓ Saved plots for 25 dimensions (split into 9 segments each) to ../../figures/datasets/PSM/test
