# In [None]:
import sys
from pathlib import Path

# Absolute location of the repository checkout. This is the single place to
# edit if your repo name or path differs.
REPO_ROOT = Path(r"C:\ML-Malware").resolve()

# Fail fast when the configured path is missing.
if not REPO_ROOT.exists():
    raise FileNotFoundError(f"REPO_ROOT does not exist: {REPO_ROOT}")

# Put the repo at the front of the import search path, without adding a
# duplicate entry when it is already listed.
repo_root_entry = str(REPO_ROOT)
if repo_root_entry not in sys.path:
    sys.path.insert(0, repo_root_entry)

print("REPO_ROOT:", REPO_ROOT)

# Import the project's data module and print the file it was loaded from,
# so it is visible which copy of the package got picked up.
import malrob.data

print(malrob.data.__file__)

# Show every attribute of the module whose name contains "ember".
print([attr for attr in dir(malrob.data) if "ember" in attr])

# The two dataset builders used by the rest of this script.
from malrob.data import (
    make_ember_families_flat,
    make_ember_flat,
)

# EMBER_DIR is handed to the builders as the source directory; SAVE_DIR is
# where they are told to save, and is created up front (parents included).
EMBER_DIR = REPO_ROOT.joinpath("data", "ember2018")
SAVE_DIR = REPO_ROOT.joinpath("datasets", "ember2018")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Single control value for dataset size:
#   total rows = 2 * target_per_class
# With 6000 per class that is 12k rows total, which makes the dataset tag "12k".
target_per_class = 6_000
seed_x = 42  # seed passed to both dataset builders


# Build the binary and the family datasets from the same EMBER input file.
# Both builders must receive identical input/size/feature/seed settings, so
# the shared arguments are declared once and unpacked into each call; this
# keeps the two datasets from silently drifting apart when a value is edited.
ember_common_kwargs = dict(
    ember_dir=EMBER_DIR,
    input_filename="test_features.jsonl",
    target_per_class=target_per_class,
    save_dir=SAVE_DIR,
    feature_mode="full",  # IMPORTANT: matches your 287/288 cols
    prefix="ember_full",  # file-name prefix for both outputs
    seed=seed_x,
)

# FULL PE features (hist bins + byteentropy grid)
# outputs ember_full_(x)k_flat.csv
binary_csv = make_ember_flat(**ember_common_kwargs)

# Same settings plus the family label key.
# NOTE(review): "avclass" is presumably the label field read from each record;
# confirm against make_ember_families_flat in malrob.data.
# outputs ember_full_(x)k_families_flat.csv
family_csv = make_ember_families_flat(
    family_key="avclass",
    **ember_common_kwargs,
)

print("Binary dataset saved to:", binary_csv)
print("Family dataset saved to:", family_csv)
# Plain string: the message has no placeholders, so no f-prefix is needed.
print("[INFO] NOTEBOOK 01 COMPLETED")
