In [14]:
import pandas as pd, numpy as np, re
from pathlib import Path

# Locations of the input CSV files.
train_path = Path("./Dataset/_outputs_train_val_test/train.csv")
val_path = Path("./Dataset/_outputs_train_val_test/val.csv")
test_path = Path("./Dataset/_outputs_train_val_test/test.csv")
obf_path = Path("./Dataset/test_obfuscated_safe_multi.csv")

# The train, test and obfuscated files are read unconditionally;
# the validation file is read only when it exists (val_df is None otherwise).
train_df, test_df, obf_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, obf_path)
)
val_df = None
if val_path.exists():
    val_df = pd.read_csv(val_path)

# Shapes and column names of the loaded frames, shown as the cell output.
(
    train_df.shape,
    test_df.shape,
    obf_df.shape,
    train_df.columns.tolist(),
    obf_df.columns.tolist(),
)



((13938, 2),
 (3872, 2),
 (6800, 6),
 ['payload', 'label'],
 ['sample_id',
  'variant_id',
  'technique',
  'payload_original',
  'payload_obfuscated',
  'label'])

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def normalize_text(s: str) -> str:
    """Return ``s`` lower-cased with all whitespace collapsed to single spaces.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Line breaks are turned into spaces, every run of whitespace
    becomes one space, and leading/trailing whitespace is removed.
    """
    lowered = str(s).lower()
    for line_break in ("\n", "\r"):
        lowered = lowered.replace(line_break, " ")
    return re.sub(r"\s+", " ", lowered).strip()

# Normalize the payload text in place (train/test splits and the obfuscated variants)
for df in (train_df, test_df):
    df["payload"] = df["payload"].map(normalize_text)
obf_df["payload_obfuscated"] = obf_df["payload_obfuscated"].map(normalize_text)

# Integer label vectors
y_train = train_df["label"].astype(int).to_numpy()
y_obf = obf_df["label"].astype(int).to_numpy()

# Bag-of-words: the vocabulary is fitted on the training payloads only (paper-style);
# the obfuscated payloads are merely projected onto it
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X_train = vectorizer.fit_transform(train_df["payload"])
X_obf_full = vectorizer.transform(obf_df["payload_obfuscated"])

# chi2 feature selection (top-k features) feeding the tree-based models
TOP_K = 500
selector = SelectKBest(score_func=chi2, k=min(TOP_K, X_train.shape[1]))
selector.fit(X_train, y_train)
X_train_k = selector.transform(X_train)
X_obf_k = selector.transform(X_obf_full)

# Model zoo: display name -> (feature space, estimator).
# "full" means the whole BoW matrix, "k" means the chi2-selected columns.
RANDOM_STATE = 42
models = {
    "Logistic Regression (BoW full)": (
        "full",
        LogisticRegression(
            max_iter=2000,
            n_jobs=-1,
            class_weight="balanced",
            random_state=RANDOM_STATE,
        ),
    ),
    "Linear SVM (BoW full)": (
        "full",
        LinearSVC(class_weight="balanced", random_state=RANDOM_STATE),
    ),
    f"Decision Tree (chi2 k={selector.k})": (
        "k",
        DecisionTreeClassifier(class_weight="balanced", random_state=RANDOM_STATE),
    ),
    f"Random Forest (chi2 k={selector.k})": (
        "k",
        RandomForestClassifier(
            n_estimators=300,
            n_jobs=-1,
            class_weight="balanced_subsample",
            random_state=RANDOM_STATE,
        ),
    ),
}

# Train every estimator on the matrix that matches its feature space
fitted = {}
for name, (space, clf) in models.items():
    clf.fit(X_train if space == "full" else X_train_k, y_train)
    fitted[name] = (space, clf)

def metrics(y_true, y_pred):
    """Score predictions against ground truth.

    Returns a dict with the keys ``acc``, ``prec``, ``rec`` and ``f1``
    (in that order). Precision, recall and F1 are computed with
    ``zero_division=0``.
    """
    scores = {"acc": accuracy_score(y_true, y_pred)}
    zero_safe_scorers = (
        ("prec", precision_score),
        ("rec", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in zero_safe_scorers:
        scores[key] = scorer(y_true, y_pred, zero_division=0)
    return scores

# 1) Mean metrics across variants (every row of obf_df is scored)
pred_cache = {}  # model name -> per-row predictions, reused by the later cells
for name, (space, clf) in fitted.items():
    pred_cache[name] = clf.predict(X_obf_full if space == "full" else X_obf_k)

mean_rows = [
    {"model": model_name, **metrics(y_obf, row_preds)}
    for model_name, row_preds in pred_cache.items()
]

# Best F1 first
df_mean = (
    pd.DataFrame(mean_rows)
    .sort_values("f1", ascending=False)
    .reset_index(drop=True)
)
df_mean



Unnamed: 0,model,acc,prec,rec,f1
0,Linear SVM (BoW full),0.613235,0.996618,0.40255,0.573467
1,Logistic Regression (BoW full),0.611618,0.996597,0.400046,0.570918
2,Random Forest (chi2 k=500),0.611029,0.994901,0.399818,0.570408
3,Decision Tree (chi2 k=500),0.604559,0.989649,0.391849,0.561409


In [16]:
# 2) Worst-case robustness (per original sample_id)
# A sample only counts as correctly classified if every one of its rows is:
#   - malicious (label=1): worst case = min prediction (any missed variant -> 0)
#   - benign    (label=0): worst case = max prediction (any false alarm   -> 1)
# Taking the min for benign samples would be the *best* case whenever a benign
# sample has more than one row; with exactly one row per benign sample both
# aggregations give the same value.
worst_rows = []
for name, y_pred in pred_cache.items():
    tmp = obf_df[["sample_id", "label"]].copy()
    tmp["pred"] = y_pred.astype(int)

    # One row per sample_id with both prediction extremes
    per_sample = tmp.groupby("sample_id", as_index=False).agg(
        y_true=("label", "max"),   # should be 0 for benign, 1 for malicious
        pred_min=("pred", "min"),
        pred_max=("pred", "max"),
        n=("pred", "size"),
    )
    # Pick the extreme that is worst for the sample's true label
    per_sample["y_pred"] = np.where(
        per_sample["y_true"] == 1, per_sample["pred_min"], per_sample["pred_max"]
    )
    grp = per_sample[["sample_id", "y_true", "y_pred", "n"]]

    m = metrics(grp["y_true"].to_numpy(), grp["y_pred"].to_numpy())
    worst_rows.append({"model": name, **m, "samples": len(grp), "avg_variants": grp["n"].mean()})

df_worst = pd.DataFrame(worst_rows).sort_values("f1", ascending=False).reset_index(drop=True)
df_worst



Unnamed: 0,model,acc,prec,rec,f1,samples,avg_variants
0,Linear SVM (BoW full),0.630682,0.869565,0.027322,0.05298,3872,1.756198
1,Random Forest (chi2 k=500),0.629132,0.804348,0.025273,0.049007,3872,1.756198
2,Logistic Regression (BoW full),0.628874,0.846154,0.022541,0.043912,3872,1.756198
3,Decision Tree (chi2 k=500),0.625775,0.647059,0.022541,0.043564,3872,1.756198


In [17]:
# 3) Per-technique breakdown:
# For each technique t != 'none', the evaluation set is every benign row plus
# the malicious rows produced by technique t
benign_mask = (obf_df["label"] == 0)
benign_idx = np.where(benign_mask.to_numpy())[0]

techniques = sorted(t for t in obf_df["technique"].unique().tolist() if t != "none")

# Row positions of the malicious samples of each technique; these do not
# depend on the model, so they are computed once up front
is_malicious = (obf_df["label"] == 1)
mal_idx_by_technique = {
    t: np.where((is_malicious & (obf_df["technique"] == t)).to_numpy())[0]
    for t in techniques
}

tech_rows = []
for name in fitted:
    y_pred_all = pred_cache[name].astype(int)

    for t, mal_only_idx in mal_idx_by_technique.items():
        idx = np.concatenate([benign_idx, mal_only_idx])
        m = metrics(y_obf[idx], y_pred_all[idx])

        # Recall restricted to the malicious rows (equals m["rec"], kept explicit)
        rec_mal = recall_score(
            y_obf[mal_only_idx], y_pred_all[mal_only_idx], zero_division=0
        )

        row = {
            "model": name,
            "technique": t,
            "n_benign": len(benign_idx),
            "n_malicious": len(mal_only_idx),
            "acc": m["acc"],
            "prec": m["prec"],
            "rec": m["rec"],      # recall over the full set (benign rows have label 0, so it matches mal_recall)
            "f1": m["f1"],
            "mal_recall": rec_mal,
        }
        tech_rows.append(row)

df_tech = pd.DataFrame(tech_rows)

# "Which technique reduces recall the most": per model, the row with the
# lowest mal_recall (ties broken by the lowest f1)
ranked = df_tech.sort_values(["model","mal_recall","f1"], ascending=[True, True, True])
first_per_model = ranked.groupby("model", as_index=False).first()
df_worst_tech = first_per_model[["model","technique","mal_recall","f1","n_malicious"]].rename(
    columns={"technique":"worst_technique"}
)

# Pivot of mal_recall (technique x model) for easy viewing
pivot_recall = df_tech.pivot_table(index="technique", columns="model", values="mal_recall", aggfunc="mean")
df_worst_tech, pivot_recall.head(10)



(                            model worst_technique  mal_recall   f1  \
 0      Decision Tree (chi2 k=500)          base64         0.0  0.0   
 1           Linear SVM (BoW full)          base64         0.0  0.0   
 2  Logistic Regression (BoW full)          base64         0.0  0.0   
 3      Random Forest (chi2 k=500)          base64         0.0  0.0   
 
    n_malicious  
 0          814  
 1          814  
 2          814  
 3          814  ,
 model                Decision Tree (chi2 k=500)  Linear SVM (BoW full)  \
 technique                                                                
 base64                                 0.000000               0.000000   
 comment_marker                         0.893246               0.978214   
 double_urlencode                       0.012526               0.016701   
 html_entities                          0.993912               0.995434   
 html_then_urlencode                    0.012048               0.012048   
 urlencode                 

In [18]:
# Format for presentation: rounded (4 decimals) copies of the result tables.
# .assign() returns a new frame, so the unrounded originals stay untouched.
df_mean_fmt = df_mean.assign(**{c: df_mean[c].round(4) for c in ["acc","prec","rec","f1"]})
df_worst_fmt = df_worst.assign(**{c: df_worst[c].round(4) for c in ["acc","prec","rec","f1","avg_variants"]})
df_tech_fmt = df_tech.assign(**{c: df_tech[c].round(4) for c in ["acc","prec","rec","f1","mal_recall"]})
df_worst_tech_fmt = df_worst_tech.assign(
    mal_recall=df_worst_tech["mal_recall"].round(4),
    f1=df_worst_tech["f1"].round(4),
)
pivot_recall_fmt = pivot_recall.round(4)

# Print each title on its own line; passing title and frame to one print()
# with the default separator glues the title onto the frame's header row.
report_sections = [
    ("Mean metrics over all obfuscated variants (expanded test)", df_mean_fmt),
    ("Worst-case robustness (per original sample_id; malicious must pass all variants)", df_worst_fmt),
    ("Per-technique breakdown (benign + malicious of that technique)", df_tech_fmt.sort_values(["model","mal_recall"])),
    ("Worst technique per model (min malicious recall)", df_worst_tech_fmt),
    ("Malicious recall pivot by technique (higher is better)", pivot_recall_fmt),
]
for section_title, section_table in report_sections:
    print(section_title, section_table, sep="\n", end="\n\n")

out_mean_path = Path("./Dataset/obfuscated_predicted_output/obf_eval_mean.csv")
out_worst_path = Path("./Dataset/obfuscated_predicted_output/obf_eval_worst.csv")
out_tech_path = Path("./Dataset/obfuscated_predicted_output/obf_eval_per_technique.csv")

# Make sure the output directory exists on a fresh checkout.
out_mean_path.parent.mkdir(parents=True, exist_ok=True)

# Let pandas write the files itself: it controls newline handling, whereas
# Path.write_text() on a to_csv() string goes through text-mode newline
# translation (which can double the carriage returns on Windows).
df_mean_fmt.to_csv(out_mean_path, index=False, encoding="utf-8")
df_worst_fmt.to_csv(out_worst_path, index=False, encoding="utf-8")
df_tech_fmt.to_csv(out_tech_path, index=False, encoding="utf-8")
(str(out_mean_path), str(out_worst_path), str(out_tech_path))



Mean metrics over all obfuscated variants (expanded test)                             model     acc    prec     rec      f1
0           Linear SVM (BoW full)  0.6132  0.9966  0.4026  0.5735
1  Logistic Regression (BoW full)  0.6116  0.9966  0.4000  0.5709
2      Random Forest (chi2 k=500)  0.6110  0.9949  0.3998  0.5704
3      Decision Tree (chi2 k=500)  0.6046  0.9896  0.3918  0.5614
Worst-case robustness (per original sample_id; malicious must pass all variants)                             model     acc    prec     rec      f1  samples  \
0           Linear SVM (BoW full)  0.6307  0.8696  0.0273  0.0530     3872   
1      Random Forest (chi2 k=500)  0.6291  0.8043  0.0253  0.0490     3872   
2  Logistic Regression (BoW full)  0.6289  0.8462  0.0225  0.0439     3872   
3      Decision Tree (chi2 k=500)  0.6258  0.6471  0.0225  0.0436     3872   

   avg_variants  
0        1.7562  
1        1.7562  
2        1.7562  
3        1.7562  
Per-technique breakdown (benign + malicious of tha

('Dataset\\obfuscated_predicted_output\\obf_eval_mean.csv',
 'Dataset\\obfuscated_predicted_output\\obf_eval_worst.csv',
 'Dataset\\obfuscated_predicted_output\\obf_eval_per_technique.csv')

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

def plot_grouped_metrics(df, metric_cols, metric_labels, title, out_path,
                         y_min=None, y_max=1.0, y_ticks=None, decimals=4):
    """Save a grouped bar chart: one group per model, one bar per metric.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``"model"`` column plus every column in ``metric_cols``.
    metric_cols : list of str
        Columns to plot, in bar order within each group.
    metric_labels : list of str
        Legend labels, matched by position to ``metric_cols``.
    title : str
        Axes title.
    out_path : str or Path
        Destination handed to ``fig.savefig``. For str/Path destinations the
        parent directory is created when it does not exist yet.
    y_min : float or None
        Lower y-limit. When None it is derived from the smallest plotted value
        minus 15% padding, floored at 0 (0 when there is no data).
    y_max : float
        Upper y-limit.
    y_ticks : sequence of float or None
        Explicit tick positions; tick labels are rendered with three decimals.
    decimals : int
        Number of decimals in the value label drawn above each bar.

    Returns
    -------
    The ``out_path`` argument, unchanged.
    """
    models = df["model"].tolist()
    data = df[metric_cols].to_numpy(dtype=float)

    # Derive the lower y-limit from the data unless the caller fixed it
    if y_min is None:
        if data.size:
            vmin = float(np.min(data))
            pad = max(1e-4, (y_max - vmin) * 0.15)
            y_min = max(0.0, vmin - pad)
        else:
            # Nothing to plot: np.min would raise on an empty array
            y_min = 0.0

    n_series = len(metric_labels)
    x = np.arange(len(models))
    # 0.18 is the original bar width for four metrics; shrink it only when more
    # series would otherwise make neighbouring groups overlap
    width = min(0.18, 0.8 / max(n_series, 1))

    fig, ax = plt.subplots(figsize=(12, 6))
    containers = []
    for i, lab in enumerate(metric_labels):
        vals = data[:, i]
        # Centre the group on the tick for any number of series
        # (for four series this is the original (i - 1.5) * width)
        offset = (i - (n_series - 1) / 2) * width
        bars = ax.bar(x + offset, vals, width, label=lab)
        containers.append((bars, vals))

    ax.set_xticks(x)
    ax.set_xticklabels([m.upper() for m in models], fontsize=9)
    ax.set_ylabel("Score")
    ax.set_title(title)
    ax.set_ylim(y_min, y_max)
    if y_ticks is not None:
        ax.set_yticks(y_ticks)
        ax.set_yticklabels([f"{t:.3f}" for t in y_ticks])
    ax.grid(axis="y", alpha=0.35)
    ax.legend(loc="upper center", ncol=4, bbox_to_anchor=(0.5, 1.18))

    # Numeric value label above every bar
    fmt = "{:." + str(decimals) + "f}"
    for bars, vals in containers:
        for b, v in zip(bars, vals):
            ax.text(b.get_x() + b.get_width()/2, v, fmt.format(v),
                    ha="center", va="bottom", fontsize=9, rotation=0)

    # savefig does not create missing directories
    if isinstance(out_path, (str, Path)):
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    fig.tight_layout()
    fig.savefig(out_path, dpi=200, bbox_inches="tight")
    plt.close(fig)  # release the figure; only the saved file is needed
    return out_path

# Metrics like Section 3.1
metric_cols = ["acc","prec","rec","f1"]
metric_labels = ["Accuracy","Precision","Recall","F1"]

# Axis settings shared by both figures: show the full range because recall is much lower
shared_axis_kwargs = dict(
    y_min=0.0,
    y_max=1.0,
    y_ticks=[0.0,0.2,0.4,0.6,0.8,1.0],
    decimals=4,
)

# Figure 1: metrics averaged over all obfuscated rows
out1 = plot_grouped_metrics(
    df_mean,
    metric_cols,
    metric_labels,
    title="Obfuscated XSS (Mean over variants) — Metrics by Model",
    out_path=Path("./Dataset/obfuscated_predicted_output/obf_mean_metrics.png"),
    **shared_axis_kwargs,
)

# Figure 2: worst-case metrics per original sample
out2 = plot_grouped_metrics(
    df_worst,
    metric_cols,
    metric_labels,
    title="Obfuscated XSS (Worst-case robustness) — Metrics by Model",
    out_path=Path("./Dataset/obfuscated_predicted_output/obf_worst_metrics.png"),
    **shared_axis_kwargs,
)

# Output paths plus a check that both files were written
(str(out1), str(out2), Path(out1).exists(), Path(out2).exists())



('Dataset\\obfuscated_predicted_output\\obf_mean_metrics.png',
 'Dataset\\obfuscated_predicted_output\\obf_worst_metrics.png',
 True,
 True)