In [9]:
import polars as pl
import os
import numpy as np

# SMD dataset paths (the raw .npy arrays are expected under <dataset_dir>/raw)
dataset_dir = "../../datasets/SMD"
train_path = os.path.join(dataset_dir, "raw", "SMD_train.npy")
test_path = os.path.join(dataset_dir, "raw", "SMD_test.npy")
test_label_path = os.path.join(dataset_dir, "raw", "SMD_test_label.npy")

# Fail fast with a clear message if any required input is missing, instead of
# erroring later when a cell tries to load it. The label file is checked as
# well, because it is loaded together with the test array further down.
for required_path in (dataset_dir, test_path, train_path, test_label_path):
    if not os.path.exists(required_path):
        raise FileNotFoundError(f"{required_path} not exist")

In [10]:
# Load the raw training array; axis 0 is used as rows, axis 1 as feature columns.
train_arr = np.load(train_path)
print(f"train shape: {train_arr.shape}")

# Generate column names: value_0, value_1, ...
col_names = [f"value_{i}" for i in range(train_arr.shape[1])]

# Create Train DataFrame. orient="row" pins each array row to one DataFrame row
# instead of letting polars infer the orientation from the schema length.
train_df = pl.DataFrame(train_arr, schema=col_names, orient="row")
# Add a 0-based row counter named "timestamp" and move it to the first column
train_df = train_df.with_columns(
    pl.int_range(0, pl.len(), dtype=pl.Int64).alias("timestamp").cast(pl.UInt64)
).select(["timestamp"] + col_names)

# Write the converted training set to <dataset_dir>/train/train.csv
train_output_dir = os.path.join(dataset_dir, "train")
os.makedirs(train_output_dir, exist_ok=True)
train_df.write_csv(os.path.join(train_output_dir, "train.csv"))

train_df.head()

train shape: (708405, 38)


timestamp,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8,value_9,value_10,value_11,value_12,value_13,value_14,value_15,value_16,value_17,value_18,value_19,value_20,value_21,value_22,value_23,value_24,value_25,value_26,value_27,value_28,value_29,value_30,value_31,value_32,value_33,value_34,value_35,value_36,value_37
u64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0,0.010101,0.001642,0.001996,0.00263,0.0,0.987293,0.059893,0.0,0.0,0.0,0.0,0.057778,0.0,0.070713,0.008347,0.029484,0.0,0.0,0.01258,0.014019,0.044014,0.040679,0.092219,0.037405,0.07541,0.037391,0.0,0.07323,0.0,0.2,0.100698,0.0,0.0,0.030769,0.141966,0.141966,0.0,0.0
1,0.010101,0.000626,0.001553,0.002391,0.0,0.834117,0.060428,0.0,0.0,0.000678,0.000783,0.118519,0.0,0.098998,0.026232,0.03317,0.0,0.0,0.013249,0.013439,0.041019,0.034086,0.074136,0.095503,0.04918,0.095863,0.0,0.062944,0.0,0.2,0.082419,0.051948,0.0,0.035165,0.149766,0.149766,0.0,0.0
2,0.010101,0.000235,0.001109,0.002152,0.0,0.681114,0.061497,0.0,0.0,0.001055,0.001566,0.078519,0.0,0.079552,0.015898,0.03317,0.0,0.0,0.0115,0.012811,0.039636,0.035986,0.071275,0.035814,0.062295,0.0358,0.0,0.064903,0.0,0.2,0.088069,0.0,0.0,0.030769,0.135725,0.135725,0.0,0.0
3,0.010101,0.000938,0.001109,0.002152,0.0,0.681462,0.060963,0.0,0.0,0.0,0.0,0.044444,0.0,0.053035,0.006094,0.014742,0.0,0.0,0.012179,0.013922,0.042171,0.038221,0.057892,0.042181,0.065574,0.042164,0.0,0.069312,0.0,0.2,0.093054,0.012987,0.0,0.035165,0.148206,0.148206,0.0,0.0
4,0.010101,0.001173,0.00122,0.002152,0.0,0.681462,0.060428,0.0,0.0,0.0,0.0,0.016296,0.0,0.06482,0.002782,0.034398,0.0,0.0,0.013434,0.015238,0.047817,0.044256,0.083058,0.044568,0.078689,0.043357,0.0,0.080088,0.0,0.2,0.109339,0.025974,0.0,0.030769,0.154446,0.154446,0.0,0.0


In [11]:
# Process Test Data
test_arr = np.load(test_path)
test_labels = np.load(test_label_path)
print(f"test shape: {test_arr.shape}")
print(f"test label shape: {test_labels.shape}")

# Every test row needs exactly one label; check before building the frame so a
# mismatched pair of files produces a readable error.
if test_labels.shape[0] != test_arr.shape[0]:
    raise ValueError(
        f"label count {test_labels.shape[0]} does not match "
        f"test row count {test_arr.shape[0]}"
    )

# Create Test DataFrame. orient="row" pins each array row to one DataFrame row
# instead of letting polars infer the orientation from the schema length.
test_df = pl.DataFrame(test_arr, schema=col_names, orient="row")

# Add timestamp (first) and label (last)
test_df = test_df.with_columns(
    [
        pl.int_range(0, pl.len(), dtype=pl.Int64).alias("timestamp").cast(pl.UInt64),
        pl.Series("label", test_labels).cast(pl.Int8),
    ]
).select(["timestamp"] + col_names + ["label"])

# Use a dedicated name for the test output directory so the training output
# directory variable is not overwritten with a path that points at "test".
test_output_dir = os.path.join(dataset_dir, "test")
os.makedirs(test_output_dir, exist_ok=True)
test_df.write_csv(os.path.join(test_output_dir, "test.csv"))
test_df.head()

test shape: (708420, 38)
test label shape: (708420,)


timestamp,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8,value_9,value_10,value_11,value_12,value_13,value_14,value_15,value_16,value_17,value_18,value_19,value_20,value_21,value_22,value_23,value_24,value_25,value_26,value_27,value_28,value_29,value_30,value_31,value_32,value_33,value_34,value_35,value_36,value_37,label
u64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8
0,0.010101,0.001251,0.001553,0.002391,0.0,0.628024,0.031016,0.0,0.0,0.0,0.0,0.066667,0.0,0.092516,0.009274,0.034398,0.0,0.0,0.00792,0.009805,0.024427,0.018328,0.010148,0.032232,0.009836,0.032617,0.0,0.035023,0.0,0.2,0.039216,0.012987,0.0,0.030769,0.120125,0.121685,0.0,0.0,0
1,0.010101,0.00172,0.001664,0.00263,0.0,0.628895,0.033155,0.0,0.0,0.0,0.0,0.195556,0.0,0.080731,0.023052,0.03317,0.0,0.0,0.0079,0.010288,0.025233,0.019781,0.009739,0.030641,0.009836,0.031424,0.0,0.036493,0.0,0.2,0.042539,0.025974,0.0,0.026374,0.121685,0.121685,0.0,0.0,0
2,0.010101,0.001486,0.001553,0.00263,0.0,0.629765,0.035294,0.0,0.0,0.0,0.0,0.066667,0.0,0.076606,0.010466,0.019656,0.0,0.0,0.007694,0.009913,0.025925,0.020563,0.013451,0.029845,0.016393,0.029833,0.0,0.038697,0.0,0.2,0.045862,0.025974,0.0,0.026374,0.121685,0.121685,0.0,0.0,0
3,0.020202,0.001642,0.001664,0.00263,0.0,0.630287,0.035829,0.0,0.0,0.0,0.0,0.04,0.0,0.083677,0.020933,0.036855,0.0,0.0,0.058353,0.058551,0.140915,0.132432,0.168738,0.027457,0.259016,0.027446,0.0,0.212589,0.0,0.2,0.293785,0.051948,0.0,0.043956,0.24493,0.24493,0.0,0.0,0
4,0.010101,0.004848,0.003882,0.004304,0.0,0.630635,0.036364,0.0,0.564103,0.0,0.0,0.093333,0.055556,0.756629,0.567435,0.711302,0.0,0.0,0.007128,0.010155,0.02604,0.022575,0.073523,0.030243,0.02623,0.030628,0.0,0.040411,0.0,0.2,0.051844,0.012987,0.0,0.032967,0.123245,0.123245,0.0,0.0,0
