In [1]:
import polars as pl
import os

# Tencent dataset layout: raw CSV exports are read from <dataset_dir>/raw and
# the per-file splits are written to <dataset_dir>/train and <dataset_dir>/test.
dataset_dir = "../../datasets/Tencent"
raw_dir = os.path.join(dataset_dir, "raw")
train_dir = os.path.join(dataset_dir, "train")
test_dir = os.path.join(dataset_dir, "test")

# Validate the input location BEFORE creating anything: if dataset_dir is
# wrong, we want to fail without leaving an empty train/test tree behind.
if not os.path.exists(raw_dir):
    raise FileNotFoundError(f"{raw_dir} not exist")

# Create train and test directories if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Raw files to process. The 1-based position in this list becomes the output
# file name (1.csv, 2.csv, ...) in both the train and test directories.
files = ["20251115_metrics_1.csv", "20251115_metrics_2.csv", "20251115_metrics_3.csv"]

# Feature columns to keep; every other metric column in the raw files is
# dropped. The order here is the column order of the written CSVs.
target_features = [
    "system_usage:0",
    "page_faults",
    "numa_miss",
    "sectors_read:sda1",
    "rx_bytes:lo",
]

# Convert each raw metrics file into a cleaned train/test pair:
#   timestamp (epoch seconds, UInt64), selected feature columns (Float64),
#   and label (UInt8) when the raw file has one.
# `i` is the 1-based position in `files` and is used as the output file name,
# so a missing raw file leaves a gap in the numbering instead of shifting it.
for i, file_name in enumerate(files, 1):
    file_path = os.path.join(raw_dir, file_name)
    if not os.path.exists(file_path):
        print(f"Warning: {file_path} does not exist")
        continue

    print(f"Processing {file_name}...")

    # infer_schema_length=0 disables type inference, so every column is read
    # as a string. This is required because some metric values carry a unit
    # suffix and would otherwise break numeric parsing.
    df = pl.read_csv(file_path, infer_schema_length=0)

    # 1. Handle Timestamp (First Column)
    # Raw format: 2025-11-15_00:00:15. The value is parsed as a naive datetime
    # and converted to whole seconds since the Unix epoch. dt.epoch("s") is
    # used instead of casting the datetime to an integer and dividing, so the
    # result does not depend on the datetime's internal resolution and never
    # passes through a float.
    timestamp_col = df.columns[0]
    df = df.with_columns(
        pl.col(timestamp_col)
        .str.strptime(pl.Datetime, format="%Y-%m-%d_%H:%M:%S")
        .dt.epoch(time_unit="s")
        .cast(pl.UInt64)
        .alias("timestamp")
    )

    # If the first column was not already called "timestamp", the parsed
    # values were added as a new column; drop the original string column.
    if timestamp_col != "timestamp":
        df = df.drop(timestamp_col)

    # 2. Handle Label
    # Missing labels are filled with "0" before casting the column to UInt8.
    has_label = "label" in df.columns
    if has_label:
        df = df.with_columns(pl.col("label").fill_null("0").cast(pl.UInt8))

    # 3. Filter and Handle Numeric Columns
    # Keep only the target features that exist in this file, in target order.
    feature_cols = [c for c in target_features if c in df.columns]

    # Report (but tolerate) target features that this file does not contain.
    missing_features = set(target_features) - set(feature_cols)
    if missing_features:
        print(f"Warning: Missing features in {file_name}: {missing_features}")

    # Strip a trailing 'k' or 'M' and cast to float, WITHOUT multiplying by
    # the unit. strict=False turns anything still unparseable into null
    # instead of raising.
    # NOTE(review): because the suffix is dropped without scaling, "5k" and
    # "5M" both become 5.0 -- confirm this is intended for these features.
    df = df.with_columns(
        [
            pl.col(col)
            .str.replace(r"[kM]$", "")
            .cast(pl.Float64, strict=False)
            .alias(col)
            for col in feature_cols
        ]
    )

    # Reorder columns: timestamp, selected features..., label
    final_cols = ["timestamp"] + feature_cols + (["label"] if has_label else [])
    df = df.select(final_cols)

    # Chronological split with no shuffling: first half -> train, second half
    # -> test. With an odd row count the extra row goes to the test set.
    n_rows = len(df)
    split_idx = int(n_rows * 0.5)

    train_df = df.slice(0, split_idx)
    test_df = df.slice(split_idx, n_rows - split_idx)

    # Save files
    train_save_path = os.path.join(train_dir, f"{i}.csv")
    test_save_path = os.path.join(test_dir, f"{i}.csv")

    train_df.write_csv(train_save_path)
    test_df.write_csv(test_save_path)

    print(f"Saved {i}.csv to train and test directories")
    print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
    print(f"Train head:\n{train_df.head()}")

Processing 20251115_metrics_1.csv...
Saved 1.csv to train and test directories
Train shape: (4320, 7), Test shape: (4320, 7)
Train head:
shape: (5, 7)
┌────────────┬────────────────┬─────────────┬───────────┬───────────────────┬─────────────┬───────┐
│ timestamp  ┆ system_usage:0 ┆ page_faults ┆ numa_miss ┆ sectors_read:sda1 ┆ rx_bytes:lo ┆ label │
│ ---        ┆ ---            ┆ ---         ┆ ---       ┆ ---               ┆ ---         ┆ ---   │
│ u64        ┆ f64            ┆ f64         ┆ f64       ┆ f64               ┆ f64         ┆ u8    │
╞════════════╪════════════════╪═════════════╪═══════════╪═══════════════════╪═════════════╪═══════╡
│ 1763164805 ┆ 0.0            ┆ 3.2553727e7 ┆ 0.0       ┆ 1891.0            ┆ 80.0        ┆ 0     │
│ 1763164815 ┆ 0.0            ┆ 3.2553731e7 ┆ 0.0       ┆ 1891.0            ┆ 80.0        ┆ 0     │
│ 1763164825 ┆ 0.0            ┆ 3.2553731e7 ┆ 0.0       ┆ 1891.0            ┆ 80.0        ┆ 0     │
│ 1763164835 ┆ 0.0            ┆ 3.2553736e7 ┆ 0.0