In [None]:
import polars as pl
import os

# SWAT dataset locations: the dataset root plus the raw train/test CSV files
dataset_dir = "../../datasets/SWAT"
raw_dir = os.path.join(dataset_dir, "raw")
train_path = os.path.join(raw_dir, "swat_train2.csv")
test_path = os.path.join(raw_dir, "swat2.csv")

# Fail fast when an input is missing (checked in order: root, test file, train file)
for _required_path in (dataset_dir, test_path, train_path):
    if not os.path.exists(_required_path):
        raise FileNotFoundError(f"{_required_path} not exist")

In [None]:
def process_columns(df: pl.DataFrame) -> pl.DataFrame:
    """
    Standardize column names and add a sequential timestamp column.

    - "Normal/Attack" is renamed to "label" and cast to UInt8.
    - Every other column is renamed to "value_<k>", where k counts the
      non-label columns in their original order (0, 1, 2, ...), so the
      numbering stays contiguous wherever the label column sits.
    - A "timestamp" column holding the row index (0 .. height - 1, UInt64)
      is added and moved to the first position.

    Args:
        df: Frame with a "Normal/Attack" column whose values can be cast to
            UInt8, plus any number of other columns.

    Returns:
        A new frame ordered as: timestamp, then the renamed columns in their
        original order.

    Raises:
        ValueError: If the "Normal/Attack" column is not present.
    """
    label_source = "Normal/Attack"
    if label_source not in df.columns:
        # Fail with an actionable message instead of an opaque rename error
        raise ValueError(
            f"Expected a '{label_source}' column, got columns: {df.columns}"
        )

    # Number only the non-label columns so the value_<k> indices have no gap
    feature_cols = [col for col in df.columns if col != label_source]
    rename_map = {col: f"value_{idx}" for idx, col in enumerate(feature_cols)}
    rename_map[label_source] = "label"
    df = df.rename(rename_map)

    # Add timestamp column with sequential values
    df = df.with_columns(pl.Series("timestamp", range(df.height)).cast(pl.UInt64))
    df = df.with_columns(pl.col("label").cast(pl.UInt8))

    # Move timestamp to first column, keeping the remaining order unchanged
    remaining_cols = [col for col in df.columns if col != "timestamp"]
    return df.select(["timestamp"] + remaining_cols)

In [None]:
# Load the raw training CSV and standardize its columns
train_df = process_columns(pl.read_csv(train_path))

# Write the processed frame to <dataset_dir>/train/train.csv
train_output_dir = os.path.join(dataset_dir, "train")
os.makedirs(train_output_dir, exist_ok=True)
train_csv_path = os.path.join(train_output_dir, "train.csv")
train_df.write_csv(train_csv_path)

# Short sanity report, then preview the first rows
print(
    f"Train data shape: {train_df.shape}",
    f"Columns: {train_df.columns}",
    f"Timestamp dtype: {train_df['timestamp'].dtype}",
    sep="\n",
)
train_df.head()

Train data shape: (495000, 53)
Columns: ['timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4', 'value_5', 'value_6', 'value_7', 'value_8', 'value_9', 'value_10', 'value_11', 'value_12', 'value_13', 'value_14', 'value_15', 'value_16', 'value_17', 'value_18', 'value_19', 'value_20', 'value_21', 'value_22', 'value_23', 'value_24', 'value_25', 'value_26', 'value_27', 'value_28', 'value_29', 'value_30', 'value_31', 'value_32', 'value_33', 'value_34', 'value_35', 'value_36', 'value_37', 'value_38', 'value_39', 'value_40', 'value_41', 'value_42', 'value_43', 'value_44', 'value_45', 'value_46', 'value_47', 'value_48', 'value_49', 'value_50', 'label']
Timestamp dtype: UInt64


timestamp,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8,value_9,value_10,value_11,value_12,value_13,value_14,value_15,value_16,value_17,value_18,value_19,value_20,value_21,value_22,value_23,value_24,value_25,value_26,value_27,value_28,value_29,value_30,value_31,value_32,value_33,value_34,value_35,value_36,value_37,value_38,value_39,value_40,value_41,value_42,value_43,value_44,value_45,value_46,value_47,value_48,value_49,value_50,label
u64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u8
0,0.0,124.3135,1.0,1.0,1.0,251.9226,8.313446,312.7916,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.560983,0.000256,138.5061,1.0,1.0,1.0,1.0,1.0,1.0,0.0,169.2387,0.0,133.8503,1.0,1.0,1.0,1.0,1.0,7.44636,175.4166,260.7024,123.3145,0.001538,0.001409,0.001664,0.0,1.0,1.0,9.100231,0.0,3.3485,0.000256,1.0,1.0,1.0,0
1,0.0,124.392,1.0,1.0,1.0,251.9226,8.313446,312.7916,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.560983,0.000256,138.7465,1.0,1.0,1.0,1.0,1.0,1.0,0.0,169.2387,0.0,134.0041,1.0,1.0,1.0,1.0,1.0,7.44636,175.4166,260.7024,123.3145,0.001538,0.001409,0.001664,0.0,1.0,1.0,9.100231,0.0,3.3485,0.000256,1.0,1.0,1.0,0
2,0.0,124.4705,1.0,1.0,1.0,251.9226,8.313446,312.7916,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.560983,0.000256,138.6263,1.0,1.0,1.0,1.0,1.0,1.0,0.0,169.2387,0.0,134.0041,1.0,1.0,1.0,1.0,1.0,7.44636,175.4166,260.7024,123.3145,0.001538,0.001409,0.001664,0.0,1.0,1.0,9.100231,0.0,3.3485,0.000256,1.0,1.0,1.0,0
3,0.0,124.6668,1.0,1.0,1.0,251.9226,8.313446,312.7916,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.560983,0.000256,138.7064,1.0,1.0,1.0,1.0,1.0,1.0,0.0,169.2387,0.0,134.1195,1.0,1.0,1.0,1.0,1.0,7.44636,175.4166,260.7024,123.3145,0.001538,0.001409,0.001664,0.0,1.0,1.0,9.100231,0.0,3.3485,0.000256,1.0,1.0,1.0,0
4,0.0,124.5098,1.0,1.0,1.0,251.9226,8.313446,312.7916,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.560983,0.000256,138.9067,1.0,1.0,1.0,1.0,1.0,1.0,0.0,169.2387,0.0,134.1964,1.0,1.0,1.0,1.0,1.0,7.44636,175.4166,260.7024,123.3145,0.001538,0.001409,0.001664,0.0,1.0,1.0,9.100231,0.0,3.3485,0.000256,1.0,1.0,1.0,0


In [4]:
# Load the raw test CSV (features and labels) and standardize its columns
test_df = process_columns(pl.read_csv(test_path))

# Write the processed frame to <dataset_dir>/test/test.csv
test_output_dir = os.path.join(dataset_dir, "test")
os.makedirs(test_output_dir, exist_ok=True)
test_csv_path = os.path.join(test_output_dir, "test.csv")
test_df.write_csv(test_csv_path)

# Short sanity report, then preview the first rows
print(
    f"Test data shape: {test_df.shape}",
    f"Columns: {test_df.columns}",
    f"Last column: {test_df.columns[-1]}",
    sep="\n",
)
test_df.head()

Test data shape: (449919, 53)
Columns: ['timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4', 'value_5', 'value_6', 'value_7', 'value_8', 'value_9', 'value_10', 'value_11', 'value_12', 'value_13', 'value_14', 'value_15', 'value_16', 'value_17', 'value_18', 'value_19', 'value_20', 'value_21', 'value_22', 'value_23', 'value_24', 'value_25', 'value_26', 'value_27', 'value_28', 'value_29', 'value_30', 'value_31', 'value_32', 'value_33', 'value_34', 'value_35', 'value_36', 'value_37', 'value_38', 'value_39', 'value_40', 'value_41', 'value_42', 'value_43', 'value_44', 'value_45', 'value_46', 'value_47', 'value_48', 'value_49', 'value_50', 'label']
Last column: label


timestamp,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8,value_9,value_10,value_11,value_12,value_13,value_14,value_15,value_16,value_17,value_18,value_19,value_20,value_21,value_22,value_23,value_24,value_25,value_26,value_27,value_28,value_29,value_30,value_31,value_32,value_33,value_34,value_35,value_36,value_37,value_38,value_39,value_40,value_41,value_42,value_43,value_44,value_45,value_46,value_47,value_48,value_49,value_50,label
u64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u8
0,2.427057,522.8467,2.0,2.0,1.0,262.0161,8.396437,328.6337,2.445391,2.0,1.0,1.0,2.0,1.0,2.0,1.0,19.74838,2.206835,956.1651,1.0,2.0,1.0,1.0,1.0,2.0,148.808,156.0882,1.713517,942.0662,1.0,2.0,1.0,1.0,2.0,7.878621,145.1166,264.5475,12.03538,1.723789,1.279621,0.7352687,0.3077859,2.0,1.0,250.8652,1.649953,189.5988,0.000128,1.0,1.0,1.0,0
1,2.446274,522.886,2.0,2.0,1.0,262.0161,8.396437,328.6337,2.445391,2.0,1.0,1.0,2.0,1.0,2.0,1.0,19.74838,2.208244,956.1651,1.0,2.0,1.0,1.0,1.0,2.0,148.808,156.0882,1.715952,942.0277,1.0,2.0,1.0,1.0,2.0,7.878621,145.1166,264.5475,12.03538,1.723789,1.297554,0.7352687,0.3077859,2.0,1.0,250.8652,1.649953,189.6789,0.000128,1.0,1.0,1.0,0
2,2.489191,522.8467,2.0,2.0,1.0,262.0161,8.394514,328.6337,2.442316,2.0,1.0,1.0,2.0,1.0,2.0,1.0,19.69076,2.208628,956.4855,1.0,2.0,1.0,1.0,1.0,2.0,148.808,156.0882,1.715952,941.8739,1.0,2.0,1.0,1.0,2.0,7.878621,145.1166,264.5475,12.03538,1.723404,1.293967,0.7352687,0.3086186,2.0,1.0,250.8812,1.649953,189.6789,0.000128,1.0,1.0,1.0,0
3,2.53435,522.9645,2.0,2.0,1.0,262.0161,8.394514,328.6337,2.442316,2.0,1.0,1.0,2.0,1.0,2.0,1.0,19.69076,2.208628,956.806,1.0,2.0,1.0,1.0,1.0,2.0,148.808,156.0882,1.71467,941.797,1.0,2.0,1.0,1.0,2.0,7.878621,145.0141,264.5475,12.03538,1.723404,1.281158,0.7352687,0.3086186,2.0,1.0,250.8812,1.649953,189.6148,0.000128,1.0,1.0,1.0,0
4,2.56926,523.4748,2.0,2.0,1.0,262.0161,8.394514,328.6337,2.443085,2.0,1.0,1.0,2.0,1.0,2.0,1.0,19.69076,2.208628,957.0864,1.0,2.0,1.0,1.0,1.0,2.0,148.808,156.0882,1.71467,942.22,1.0,2.0,1.0,1.0,2.0,7.878621,144.8859,264.5475,12.03538,1.723404,1.281158,0.7352687,0.3086186,2.0,1.0,250.8812,1.649953,189.5027,0.000128,1.0,1.0,1.0,0
