# In [None]:
import sys
from pathlib import Path

# Absolute location of the project checkout; the dataset and model paths used
# later in this notebook are built from it.
REPO_ROOT = Path(r"C:\ML-Malware").resolve()

# Put the checkout at the front of the import search path, but only once, so
# re-running the cell does not stack duplicate entries.
_repo_root_entry = str(REPO_ROOT)
if _repo_root_entry not in sys.path:
    sys.path.insert(0, _repo_root_entry)

import malrob.adversarial_binary as adv

# ----------------------------
# 1) Dynamic knobs (edit TARGET_PER_CLASS only; the rest is derived from it)
# ----------------------------
# Samples per class. The tag is the doubled count in thousands (integer
# division), e.g. 6000 -> "12k", 18000 -> "36k".
TARGET_PER_CLASS = 6000
DATASET_TAG = f"{(TARGET_PER_CLASS * 2)//1000}k"

# Model directory for this dataset size, e.g. C:\ML-Malware\models_full_12k
MODELS_DIR = REPO_ROOT.joinpath(f"models_full_{DATASET_TAG}")

# Families CSV matching the same dataset tag.
DATA_CSV_FAMILIES = REPO_ROOT.joinpath(
    "datasets",
    "ember2018",
    f"ember_full_{DATASET_TAG}_families_flat.csv",
)

# Where Notebook 03 should write outputs (both nested under MODELS_DIR).
RESULTS_DIR = MODELS_DIR.joinpath("results_adv")
ARTIFACTS_DIR = MODELS_DIR.joinpath("artifacts_adv")

# Create both output folders, including missing parents; an already existing
# folder is not an error.
for _out_dir in (RESULTS_DIR, ARTIFACTS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 2) Locked experiment grid (per your thesis design)
# ----------------------------
# One shared epsilon grid; the PGD step-count and alpha tables are keyed by
# exactly these epsilon values, in the same order.
_eps_grid = [0.01, 0.05, 0.10]
cfg = adv.AdversarialConfig(
    eps_list=_eps_grid,
    pgd_steps=dict(zip(_eps_grid, (20, 30, 40))),
    pgd_alpha=dict(zip(_eps_grid, (0.002, 0.005, 0.01))),
    random_state=42,
)

# Echo which module file was imported and every resolved path, so a wrong
# checkout or dataset tag is visible before any heavy work starts.
print("Loaded adversarial module from:", adv.__file__)
for _label, _location in (
    ("DATA_CSV_FAMILIES:", DATA_CSV_FAMILIES),
    ("MODELS_DIR:", MODELS_DIR),
    ("RESULTS_DIR:", RESULTS_DIR),
    ("ARTIFACTS_DIR:", ARTIFACTS_DIR),
):
    print(_label, _location)

# ----------------------------
# 3) Load artifacts (RF, NN, scaler) - dynamic + correct signature
# ----------------------------
# Derive input_dim from the dataset itself (robust for 12k/36k/100k/etc.)
import pandas as pd

# Only the column names are needed to count features, so read just the header
# row (nrows=0) instead of parsing and holding the whole CSV in memory.
df_tmp = pd.read_csv(DATA_CSV_FAMILIES, nrows=0)
# Every column other than "label" and "family" counts towards the input width;
# drop() raises KeyError if either of those columns is missing from the CSV.
input_dim = df_tmp.drop(columns=["label", "family"]).shape[1]

# Artifact file names follow the same dataset tag as the models directory.
scaler_path    = MODELS_DIR / f"scaler_full_{DATASET_TAG}.joblib"
nn_weights_path = MODELS_DIR / f"nn_binary_{DATASET_TAG}.pt"
rf_model_path   = MODELS_DIR / f"rf_binary_{DATASET_TAG}.joblib"

print("SCALER:", scaler_path )
print("NN:", nn_weights_path)
print("RF:", rf_model_path)

# Explicit existence checks so a missing file fails fast with a clear message
# (raises FileNotFoundError naming the first missing artifact).
for p in [scaler_path, nn_weights_path, rf_model_path]:
    if not p.exists():
        raise FileNotFoundError(f"Missing artifact: {p}")

# Load the scaler, the NN (built for input_dim features) and the RF on CPU.
scaler, nn_model, rf = adv.load_binary_artifacts(
    scaler_path=scaler_path,
    nn_weights_path=nn_weights_path,
    rf_model_path=rf_model_path,
    input_dim=input_dim,
    device="cpu",
)


# ----------------------------
# 4) Compute clean thresholds on full test set
#    (also persists clean_thresholds.json)
# ----------------------------
# The seed is taken from cfg so this step uses the same random_state as the
# experiment grid.
_threshold_outputs = adv.compute_clean_thresholds(
    data_csv=DATA_CSV_FAMILIES,
    scaler=scaler,
    nn_model=nn_model,
    rf=rf,
    artifacts_dir=ARTIFACTS_DIR,
    random_state=cfg.random_state,
)
# Expected result: (NN threshold, RF threshold, data split).
clean_thr_nn, clean_thr_rf, split = _threshold_outputs

# NOTE: `split` is assumed to carry X_train/X_test etc. -- confirm against
# compute_clean_thresholds. If the function returns something else, adjust the
# `split=` arguments passed to the attack runners below accordingly.

# ----------------------------
# 5) Run FGSM + PGD (NN->RF transfer), save summaries
# ----------------------------
# Both attack runners receive exactly the same keyword arguments, so they are
# assembled once and unpacked into each call.
_attack_inputs = dict(
    cfg=cfg,
    split=split,
    scaler=scaler,
    nn_model=nn_model,
    rf=rf,
    clean_thr_nn=clean_thr_nn,
    clean_thr_rf=clean_thr_rf,
    results_dir=RESULTS_DIR,
    artifacts_dir=ARTIFACTS_DIR,
)

# FGSM first, then PGD -- same order as before.
fgsm_df = adv.run_fgsm_nn_to_rf(**_attack_inputs)
pgd_df = adv.run_pgd_nn_to_rf(**_attack_inputs)


# ----------------------------
# 6) Merge + plot
# ----------------------------
# Hand both per-attack result frames to the merge/plot helper, which is pointed
# at RESULTS_DIR.
adv.merge_and_plot_targeted_evasion_summary(
    fgsm_df=fgsm_df,
    pgd_df=pgd_df,
    results_dir=RESULTS_DIR,
)

# Tell the user where to look; these are the expected per-attack summary
# locations (the files themselves are produced by the calls above, not here).
_fgsm_summary_csv = RESULTS_DIR / "summary_fgsm.csv"
_pgd_summary_csv = RESULTS_DIR / "summary_pgd.csv"
print("Summaries:", _fgsm_summary_csv, "and", _pgd_summary_csv)
print("Merged CSV/plot should be under:", RESULTS_DIR)

print("[DONE] NOTEBOOK 03 COMPLETED")
