In [2]:
import polars as pl
import numpy as np
# Seed NumPy's legacy global RNG so the np.random.* draws used later in this
# notebook (row sampling and noise generation) are repeatable on a fresh run.
np.random.seed(42)

In [3]:
# Build the query lazily: scan the training CSV, keep only rows where
# `meter` equals 0, and replace the string `timestamp` column with a parsed
# datetime column. Nothing is read until `.collect()` runs.
timestamp_parsed = pl.col("timestamp").str.to_datetime("%Y-%m-%d %H:%M:%S")
meter_zero_rows = pl.scan_csv("data/train.csv").filter(pl.col("meter") == 0)
df = meter_zero_rows.with_columns(timestamp_parsed.alias("timestamp")).collect()
print(df)

shape: (12_060_910, 4)
┌─────────────┬───────┬─────────────────────┬───────────────┐
│ building_id ┆ meter ┆ timestamp           ┆ meter_reading │
│ ---         ┆ ---   ┆ ---                 ┆ ---           │
│ i64         ┆ i64   ┆ datetime[μs]        ┆ f64           │
╞═════════════╪═══════╪═════════════════════╪═══════════════╡
│ 0           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           │
│ 1           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           │
│ 2           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           │
│ 3           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           │
│ 4           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           │
│ …           ┆ …     ┆ …                   ┆ …             │
│ 1444        ┆ 0     ┆ 2016-12-31 23:00:00 ┆ 8.75          │
│ 1445        ┆ 0     ┆ 2016-12-31 23:00:00 ┆ 4.825         │
│ 1446        ┆ 0     ┆ 2016-12-31 23:00:00 ┆ 0.0           │
│ 1447        ┆ 0     ┆ 2016-12-31 23:00:00 ┆ 159.575       │
│ 1448        ┆ 0     ┆ 2016-12-31 23:00:00 ┆ 2

In [4]:
# --- Synthetic attack injection ---------------------------------------------
# A random fraction `p` of the rows gets uniform noise from [-a, a) added to
# `meter_reading`. The perturbed value is stored in `modified_meter`, and the
# `attack` column flags perturbed rows with 1 (0 otherwise).
p = 0.3  # fraction of rows to perturb
a = 100  # half-width of the uniform noise interval
num_rows = len(df)
num_to_modify = int(p * num_rows)

# Row positions to perturb, sampled without replacement from NumPy's global RNG.
attacked_rows = np.random.choice(num_rows, num_to_modify, replace=False)
# Set form retained so `modify_value` (and any later cell relying on it) keeps working.
indices_to_modify = set(attacked_rows)


def modify_value(x, idx):
    """Scalar reference for the perturbation applied to a single row.

    Not called by the vectorised code below; kept so existing callers of this
    name still work.

    Args:
        x: The original meter reading.
        idx: Positional row index of the reading.

    Returns:
        A ``(value, flag)`` tuple: ``(x + noise, 1)`` with noise drawn
        uniformly from ``[-a, a)`` when ``idx`` is in ``indices_to_modify``,
        otherwise ``(x, 0)``.
    """
    if idx in indices_to_modify:
        return x + (np.random.rand() - 0.5) * 2 * a, 1
    return x, 0


# Vectorised replacement for calling `modify_value` once per row through
# `map_elements`: that approach runs a Python call and a set lookup for every
# row and needs a temporary index column plus a list column to carry the
# result, which is very slow on a frame of this size.
attack_mask = np.zeros(num_rows, dtype=bool)
attack_mask[attacked_rows] = True

# Boolean-mask assignment fills the attacked positions in ascending row order,
# so the k-th attacked row receives the k-th draw from the global RNG -- the
# same order a sequential row-by-row pass would consume the stream in.
noise = np.zeros(num_rows, dtype=np.float64)
noise[attack_mask] = (np.random.rand(num_to_modify) - 0.5) * 2 * a

# Rows that are not attacked get +0.0, which leaves their reading unchanged.
# A null reading stays null instead of raising inside a Python callback.
df = df.with_columns(
    (pl.col("meter_reading") + pl.lit(pl.Series("noise", noise))).alias("modified_meter"),
    pl.Series("attack", attack_mask.astype(np.int64)),
)
print(df)

shape: (12_060_910, 6)
┌─────────────┬───────┬─────────────────────┬───────────────┬────────────────┬────────┐
│ building_id ┆ meter ┆ timestamp           ┆ meter_reading ┆ modified_meter ┆ attack │
│ ---         ┆ ---   ┆ ---                 ┆ ---           ┆ ---            ┆ ---    │
│ i64         ┆ i64   ┆ datetime[μs]        ┆ f64           ┆ f64            ┆ i64    │
╞═════════════╪═══════╪═════════════════════╪═══════════════╪════════════════╪════════╡
│ 0           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           ┆ -39.967204     ┆ 1      │
│ 1           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           ┆ 0.0            ┆ 0      │
│ 2           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           ┆ -8.945045      ┆ 1      │
│ 3           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           ┆ -98.928972     ┆ 1      │
│ 4           ┆ 0     ┆ 2016-01-01 00:00:00 ┆ 0.0           ┆ 0.0            ┆ 0      │
│ …           ┆ …     ┆ …                   ┆ …             ┆ …              ┆ …      │
│ 1444   

In [5]:
# Persist the frame, including the `modified_meter` and `attack` columns
# added above, as a CSV next to the input data.
attacked_csv_path = "data/train_with_attack.csv"
df.write_csv(attacked_csv_path)