In [1]:
#!/usr/bin/env python
"""
csv2_gaf_rgb.py – Convert CIC-IDS-2017 flow rows into 2-channel GAF images (GADF+GASF)
and save as RGB PNG (R=GADF, G=GASF, B=zero-pad).

Author: you / 2025
"""

import argparse
import os
import glob
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from pyts.image import GramianAngularField
import cv2
from tqdm import tqdm


def load_cic_csv(dir_path: str) -> pd.DataFrame:
    """Concatenate all CSV files in ``dir_path`` into one DataFrame.

    Files are read in sorted path order so that the row order of the result
    (and anything derived from the row index downstream) is reproducible;
    ``glob`` on its own returns matches in arbitrary order.

    Args:
        dir_path: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        One DataFrame holding the rows of every file, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``dir_path`` contains no ``*.csv`` file.
    """
    files = sorted(glob.glob(os.path.join(dir_path, "*.csv")))
    if not files:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not mention the directory at all.
        raise ValueError(f"No CSV files found in '{dir_path}'.")
    frames = [pd.read_csv(f, low_memory=False) for f in files]
    return pd.concat(frames, ignore_index=True)


def encode_gaf_pair(signal: np.ndarray, size: int):
    """Encode one 1-D signal as a (GADF, GASF) pair, each of shape (size, size).

    A separate non-overlapping ``GramianAngularField`` transformer is fitted
    for each method on the signal presented as a single-sample batch.
    """
    fields = []
    for method in ("difference", "summation"):
        transformer = GramianAngularField(
            method=method, image_size=size, overlapping=False
        )
        # The transformer works on batches; feed one row, take its one image.
        batch = signal.reshape(1, -1)
        fields.append(transformer.fit_transform(batch)[0])
    gadf, gasf = fields
    return gadf, gasf


def vector_from_row(row: pd.Series, feature_order: list[str]) -> np.ndarray:
    """Pick ``feature_order`` entries from ``row`` and return them as float32.

    The values are taken in the order given by ``feature_order`` and copied
    into a new 1-D array.
    """
    selected = row[feature_order]
    return np.array(selected.values, dtype=np.float32)


def _gaf_to_uint8(field: np.ndarray) -> np.ndarray:
    """Map GAF values from [-1, 1] to rounded 8-bit intensities in [0, 255]."""
    # Clip before casting: a value marginally outside [-1, 1] (float error)
    # would otherwise wrap around when converted to uint8.
    scaled = (np.clip(field, -1.0, 1.0) + 1.0) * 127.5
    # Round to nearest instead of truncating so the quantisation is unbiased.
    return np.rint(scaled).astype(np.uint8)


def save_rgb(gadf: np.ndarray, gasf: np.ndarray, save_path: Path):
    """Write a GADF/GASF pair to ``save_path`` as a 3-channel 8-bit image.

    Channel layout of the saved file: R = GADF, G = GASF, B = zeros
    (the blue channel is free for another feature group).

    Args:
        gadf: 2-D array with values expected in [-1, 1].
        gasf: 2-D array with values expected in [-1, 1], same shape as ``gadf``.
        save_path: Destination file; the extension selects the image format.

    Raises:
        OSError: If OpenCV reports that the image could not be written.
    """
    gadf_u8 = _gaf_to_uint8(gadf)
    gasf_u8 = _gaf_to_uint8(gasf)
    blue = np.zeros_like(gadf_u8)
    # cv2.imwrite interprets 3-channel arrays as B, G, R, so the planes are
    # stacked in that order to obtain R=GADF, G=GASF, B=0 in the file.
    bgr = cv2.merge([blue, gasf_u8, gadf_u8])
    # imwrite signals failure through its return value, not an exception.
    if not cv2.imwrite(str(save_path), bgr):
        raise OSError(f"Failed to write image to '{save_path}'.")


def main(args):
    """Convert every usable flow row into a GAF image and write a label index.

    Reads ``args.csv_dir``, ``args.out_dir``, ``args.features`` and
    ``args.img_size``. For each row whose selected features are all finite,
    one ``flow_<index>.png`` is written to ``args.out_dir``; the mapping from
    image name to the row's label is saved as ``labels.csv`` next to them.

    Raises:
        ValueError: If a requested feature or the label column is missing,
            if ``args.img_size`` exceeds the number of features, or if no
            row remains after discarding NaN/infinite feature values.
    """
    df = load_cic_csv(args.csv_dir)
    print(f"Loaded {len(df):,} rows.")

    # make sure chosen features exist
    for f in args.features:
        if f not in df.columns:
            raise ValueError(f"Feature '{f}' not found in CSV header.")

    # Resolve the label column before any heavy work. Prefer the exact name;
    # otherwise accept a header that only differs by surrounding whitespace
    # (some CIC-IDS-2017 CSV releases pad their header names).
    label_col = "Label"
    if label_col not in df.columns:
        padded = [c for c in df.columns if str(c).strip() == "Label"]
        if not padded:
            raise ValueError("Column 'Label' not found in CSV header.")
        label_col = padded[0]

    # The GAF image cannot be larger than the series it is built from; fail
    # here rather than inside the per-row loop.
    if args.img_size > len(args.features):
        raise ValueError(
            f"--img_size ({args.img_size}) must not exceed the number of "
            f"features ({len(args.features)})."
        )

    # Drop rows with NaN *or* +/-inf in the chosen features: infinities are
    # not removed by dropna and make the scaler reject the input. Only the
    # columns used below are kept, so row iteration does not drag along
    # every other CSV column.
    keep = list(dict.fromkeys([*args.features, label_col]))
    finite = df[args.features].replace([np.inf, -np.inf], np.nan)
    usable = finite.notna().all(axis=1)
    df = df.loc[usable, keep].reset_index(drop=True)
    if df.empty:
        raise ValueError("No rows left after dropping NaN/inf feature values.")

    # ---- scale each feature to [-1, 1] over the whole dataset ----
    scaler = MinMaxScaler(feature_range=(-1, 1))
    df[args.features] = scaler.fit_transform(df[args.features])

    # prepare output dirs
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    label_map = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        vec = vector_from_row(row, args.features)
        gadf, gasf = encode_gaf_pair(vec, args.img_size)
        # idx is the position in the filtered, re-indexed frame.
        img_name = f"flow_{idx:07d}.png"
        save_rgb(gadf, gasf, out_dir / img_name)
        label_map.append({"image": img_name, "label": row[label_col]})

    pd.DataFrame(label_map).to_csv(out_dir / "labels.csv", index=False)
    print("Done:", len(label_map), "images saved to", out_dir)


if __name__ == "__main__":
    # Command-line entry point. parse_args() reads sys.argv, so this is meant
    # to be run as a script; inside a Jupyter kernel the kernel's own argv is
    # parsed instead and the required options are reported as missing.
    parser = argparse.ArgumentParser()
    parser.add_argument("--csv_dir", required=True,
                        help="Directory containing CIC-IDS-2017 CSV(s)")
    parser.add_argument("--out_dir", required=True,
                        help="Where to save images + labels.csv")
    parser.add_argument("--features", nargs="+", required=True,
                        help="Ordered list of numeric feature columns to concatenate")
    # The series is aggregated down to img_size points, so img_size may not
    # be larger than the number of features; it does not have to divide it.
    parser.add_argument("--img_size", type=int, default=32,
                        help="Square image size (power of two works best); "
                             "must not exceed len(features).")
    args = parser.parse_args()
    main(args)


usage: ipykernel_launcher.py [-h] --csv_dir CSV_DIR --out_dir OUT_DIR
                             --features FEATURES [FEATURES ...]
                             [--img_size IMG_SIZE]
ipykernel_launcher.py: error: the following arguments are required: --csv_dir, --out_dir, --features


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
