# In [None]:
import argparse
import numpy as np
import pandas as pd
import os
from datetime import datetime, timedelta

# In [None]:
def generate_csv(
    out_path: str = "data/sample_data.csv",
    n: int = 1_000_000,
    groups: int = 50,
    seed: int = 42,
    chunk_size: int = 100_000,
    start_ts: str = "2020-01-01T00:00:00",
    freq_seconds: int = 1,
    user_id_max_multiplier: int = 2,
    compress: bool = False
) -> None:
    """Write a synthetic dataset of ``n`` rows to ``out_path`` as CSV, chunk by chunk.

    Each row has four columns:

    * ``user_id`` -- uniform integer in ``[0, n * user_id_max_multiplier)``
    * ``group``   -- uniform integer in ``[0, groups)``
    * ``value``   -- float32 draw from normal(100, 20) plus ``0.5 * group``
    * ``ts``      -- ``start_ts + row_index * freq_seconds`` seconds

    Args:
        out_path: Destination file. Its parent directory is created if missing.
        n: Total number of rows. ``0`` produces a header-only file.
        groups: Number of distinct group ids.
        seed: Seed for ``numpy.random.default_rng``.
        chunk_size: Maximum number of rows generated and written per iteration.
        start_ts: Timestamp of the first row, parsed with ``pandas.to_datetime``.
        freq_seconds: Seconds between consecutive rows (``0`` repeats ``start_ts``).
        user_id_max_multiplier: Scales the exclusive upper bound of ``user_id``.
        compress: When true, the file is written with gzip compression.

    Raises:
        ValueError: If ``n`` is negative or ``chunk_size`` is smaller than 1.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if chunk_size < 1:
        # A zero chunk would never make progress and spin the loop forever.
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    rng = np.random.default_rng(seed)

    # ensure output folder exists
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

    compression = "gzip" if compress else None
    columns = ["user_id", "group", "value", "ts"]

    # Timestamps are deterministic: start + row_index * freq_seconds.
    start = pd.to_datetime(start_ts)

    # Keep int32 whenever the bound fits (so existing seeds reproduce the same
    # ids) and widen to int64 only where int32 would be out of range.
    user_id_upper = n * user_id_max_multiplier
    if user_id_upper - 1 <= np.iinfo(np.int32).max:
        user_id_dtype = np.int32
    else:
        user_id_dtype = np.int64

    print(f"Generating {n} rows to '{out_path}' (groups={groups}, chunk={chunk_size}, compress={compress})")

    if n == 0:
        # Still emit a header so downstream readers find a well-formed file.
        pd.DataFrame(columns=columns).to_csv(out_path, index=False, compression=compression)

    offset = 0
    batch_idx = 0

    while offset < n:
        cur_chunk = min(chunk_size, n - offset)

        # NOTE: the order of the three rng draws fixes the output for a given seed.
        group = rng.integers(0, groups, size=cur_chunk, dtype=np.int32)
        # value distribution: base normal plus small group-dependent shift
        value = rng.normal(loc=100.0, scale=20.0, size=cur_chunk).astype(np.float32) + (group * 0.5).astype(np.float32)
        user_id = rng.integers(0, user_id_upper, size=cur_chunk, dtype=user_id_dtype)

        # Second offsets for the global row indices covered by this chunk.
        # Multiplying the index (rather than stepping an arange by
        # freq_seconds) also works when freq_seconds is 0.
        secs = (offset + np.arange(cur_chunk, dtype=np.int64)) * freq_seconds
        ts = (start + pd.to_timedelta(secs, unit="s")).astype("datetime64[ns]")

        df_chunk = pd.DataFrame({
            "user_id": user_id,
            "group": group,
            "value": value,
            "ts": ts
        })

        # First chunk creates/truncates the file and writes the header;
        # later chunks are appended without one.
        is_first = batch_idx == 0
        df_chunk.to_csv(
            out_path,
            index=False,
            header=is_first,
            mode="w" if is_first else "a",
            compression=compression,
        )

        offset += cur_chunk
        batch_idx += 1
        rows_remaining = n - offset

        print(f"  wrote chunk {batch_idx} ({cur_chunk} rows), {rows_remaining} rows remaining")

    print(f"Done. Wrote {n} rows to {out_path} (compression={compression})")

def parse_args():
    """Parse the generator's command-line flags from ``sys.argv``.

    Returns:
        argparse.Namespace with attributes ``out``, ``n``, ``groups``, ``seed``,
        ``chunk``, ``start_ts``, ``freq_seconds``, ``user_id_mult`` and
        ``compress``.
    """
    parser = argparse.ArgumentParser(description="Generate synthetic dataset for approx-query-engine")

    # (flag, keyword arguments for add_argument), registered in display order.
    option_table = (
        ("--out", dict(default="data/sample_data.csv", help="Output CSV path (use .gz if compress)")),
        ("--n", dict(type=int, default=1_000_000, help="Total number of rows to generate")),
        ("--groups", dict(type=int, default=50, help="Number of distinct groups")),
        ("--seed", dict(type=int, default=42, help="Random seed")),
        ("--chunk", dict(type=int, default=100_000, help="Chunk size for writing (rows per write)")),
        ("--start-ts", dict(default="2020-01-01T00:00:00", help="Start timestamp (ISO format)")),
        ("--freq-seconds", dict(type=int, default=1, help="Timestamp frequency in seconds")),
        ("--user-id-mult", dict(type=int, default=2, help="Multiplier for user_id max (n * mult)")),
        ("--compress", dict(action="store_true", help="Write gzip compressed CSV")),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: translate the parsed CLI flags into generate_csv's
    # keyword arguments, then run the generator once.
    cli = parse_args()
    generator_kwargs = {
        "out_path": cli.out,
        "n": cli.n,
        "groups": cli.groups,
        "seed": cli.seed,
        "chunk_size": cli.chunk,
        "start_ts": cli.start_ts,
        "freq_seconds": cli.freq_seconds,
        "user_id_max_multiplier": cli.user_id_mult,
        "compress": cli.compress,
    }
    generate_csv(**generator_kwargs)