In [1]:
import pandas as pd, numpy as np, random, base64, urllib.parse, re
from pathlib import Path

# Load the user's test set from CSV into a DataFrame.
# The generation loop further down reads the "payload" and "label" columns.
# NOTE(review): this is an absolute, machine-specific Windows path — confirm
# it exists (or repoint it) before running the cell on another machine.
test_path = Path(r"D:\UIT Document\UIT subjects\IE105 - Nhập môn đảm bảo và an toàn thông tin\LLM-for-XSS\Dataset\_outputs_train_val_test\test.csv")
df = pd.read_csv(test_path)

# ---- Safe "obfuscation-like" transforms (string-only, no executable wrappers) ----
def b64_encode_only(s: str) -> str:
    """Return the Base64 text of *s* after encoding it as UTF-8.

    Characters that cannot be encoded as UTF-8 are dropped instead of
    raising, because the encode step uses ``errors="ignore"``.
    """
    utf8_bytes = s.encode("utf-8", errors="ignore")
    b64_bytes = base64.b64encode(utf8_bytes)
    return b64_bytes.decode("ascii")

def url_encode_only(s: str) -> str:
    """Percent-encode *s*, escaping reserved characters including ``/``."""
    # An empty ``safe`` set makes quote() escape "/" as well, which it would
    # otherwise leave untouched.
    escaped = urllib.parse.quote(s, safe="")
    return escaped

# Character -> HTML entity table used by html_entity_encode().
_HTML_ENTITY_TABLE = str.maketrans({
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#x27;",
})


def html_entity_encode(s: str) -> str:
    """Replace ``&``, ``<``, ``>``, ``"`` and ``'`` in *s* with HTML entities.

    Each input character is mapped exactly once, so an ``&`` already present
    in *s* becomes ``&amp;`` and the entities produced here are not escaped
    a second time.
    """
    return s.translate(_HTML_ENTITY_TABLE)

def whitespace_noise(s: str, p: float = 0.15) -> str:
    """Randomly insert one or two spaces after punctuation characters in *s*.

    Args:
        s: Text to perturb.
        p: Probability of inserting spaces after each eligible character.

    Returns:
        A copy of *s* with runs of 1-2 spaces added after some of the
        characters ``< > = ' " ( ) ; / + - * , :``. Draws come from the
        module-level ``random`` generator, so seed it for reproducibility.
    """
    eligible = "<>='\"();/+-*,:"
    pieces = []
    for symbol in s:
        pieces.append(symbol)
        if symbol not in eligible:
            continue
        # Only eligible characters consume a random draw.
        if random.random() < p:
            pieces.append(" " * random.randint(1, 2))
    return "".join(pieces)

def comment_marker(s: str, p: float = 0.03) -> str:
    """Randomly insert the literal marker ``/*OBF*/`` after letters in *s*.

    Args:
        s: Text to perturb.
        p: Probability of inserting the marker after each alphabetic
            character.

    Returns:
        A copy of *s* with markers added. Draws come from the module-level
        ``random`` generator, so seed it for reproducibility.
    """
    pieces = []
    for symbol in s:
        pieces.append(symbol)
        if not symbol.isalpha():
            continue
        # Only alphabetic characters consume a random draw.
        if random.random() < p:
            pieces.append("/*OBF*/")
    return "".join(pieces)

def double_urlencode(s: str) -> str:
    """Percent-encode *s* twice using ``url_encode_only``.

    The second pass escapes the ``%`` signs produced by the first one,
    e.g. ``<`` -> ``%3C`` -> ``%253C``.
    """
    first_pass = url_encode_only(s)
    second_pass = url_encode_only(first_pass)
    return second_pass

def html_then_urlencode(s: str) -> str:
    """HTML-entity-encode *s*, then percent-encode the result."""
    entity_encoded = html_entity_encode(s)
    return url_encode_only(entity_encoded)

# Registry of obfuscation techniques: technique name -> str-to-str transform.
TRANSFORMS = {
    "urlencode": url_encode_only,
    "double_urlencode": double_urlencode,
    "base64": b64_encode_only,
    "html_entities": html_entity_encode,
    "html_then_urlencode": html_then_urlencode,
    "whitespace_noise": whitespace_noise,
    "comment_marker": comment_marker,
}

# Relative sampling weight per technique. You can tweak these; they do not
# have to sum to exactly 1.0 because they are renormalized below.
probs = {
    "urlencode": 0.20,
    "double_urlencode": 0.10,
    "base64": 0.20,
    "html_entities": 0.15,
    "html_then_urlencode": 0.10,
    "whitespace_noise": 0.15,
    "comment_marker": 0.10,
}

# Validate probabilities: every technique needs exactly one weight, and the
# weights must form a usable distribution once normalized.
names = list(TRANSFORMS.keys())
missing_weights = [n for n in names if n not in probs]
unknown_weights = [n for n in probs if n not in TRANSFORMS]
if missing_weights or unknown_weights:
    raise ValueError(
        "probs keys must match TRANSFORMS keys; "
        f"missing={missing_weights}, unknown={unknown_weights}"
    )

# pvals is aligned with `names` (same order as TRANSFORMS).
pvals = np.array([probs[n] for n in names], dtype=float)
if not np.all(np.isfinite(pvals)) or np.any(pvals < 0) or pvals.sum() <= 0:
    raise ValueError(
        "probs values must be finite, non-negative, and not all zero"
    )
pvals = pvals / pvals.sum()

def weighted_sample_without_replacement(k: int) -> list:
    """Draw up to *k* distinct technique names, weighted by ``pvals``.

    Names are drawn one at a time with ``np.random.choice``; after each draw
    the picked name leaves the pool and the remaining weights are
    renormalized. *k* is capped at the number of available techniques.

    Returns:
        The picked names in draw order.
    """
    # Weight of each name; setdefault keeps the first occurrence, matching
    # a positional lookup by names.index().
    weight_of = {}
    for technique, weight in zip(names, pvals):
        weight_of.setdefault(technique, weight)

    pool = list(names)
    picks = []
    n_draws = min(k, len(names))
    for _ in range(n_draws):
        remaining = np.array([weight_of[t] for t in pool], dtype=float)
        remaining = remaining / remaining.sum()
        selected = np.random.choice(pool, p=remaining)
        picks.append(selected)
        pool.remove(selected)
    return picks

# ---- Generate multi-variant obfuscated test set ----
# Rows whose label is not 1 are copied through unchanged as variant 0.
# Each row with label == 1 yields up to N_VARIANTS_PER_MAL rows, one per
# distinct technique drawn by weighted_sample_without_replacement().
SEED = 42
N_VARIANTS_PER_MAL = 3  # improvement: multiple variants per malicious sample

# Seed both generators: the noise transforms draw from `random`, the
# technique sampler draws from `np.random`.
random.seed(SEED)
np.random.seed(SEED)

# Fixed output schema, so the frame has these columns even when `rows` is
# empty and the statistics below do not fail on a missing column.
OUTPUT_COLUMNS = [
    "sample_id",
    "variant_id",
    "technique",
    "payload_original",
    "payload_obfuscated",
    "label",
]

rows = []
# Walk the two needed columns in lockstep instead of materialising a Series
# per row with iterrows(); sample_id is still the original index label.
for idx, raw_payload, raw_label in zip(df.index, df["payload"], df["label"]):
    payload = str(raw_payload)
    label = int(raw_label)
    sample_id = idx

    if label != 1:
        rows.append({
            "sample_id": sample_id,
            "variant_id": 0,
            "technique": "none",
            "payload_original": payload,
            "payload_obfuscated": payload,
            "label": label,
        })
        continue

    # Diverse techniques per payload: sample without replacement
    techniques = weighted_sample_without_replacement(N_VARIANTS_PER_MAL)

    for vid, tech in enumerate(techniques, start=1):
        # np.random.choice yields NumPy string scalars; store plain str.
        tech = str(tech)
        # Every transform is called with the payload only, so the optional
        # probability parameters keep their defaults.
        obf = TRANSFORMS[tech](payload)
        rows.append({
            "sample_id": sample_id,
            "variant_id": vid,
            "technique": tech,
            "payload_original": payload,
            "payload_obfuscated": obf,
            "label": label,
        })

df_obf_multi = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)

out_path = Path("./Dataset/obfuscated_test_xss.csv")
# to_csv does not create missing directories.
out_path.parent.mkdir(parents=True, exist_ok=True)
df_obf_multi.to_csv(out_path, index=False)

# Show quick stats
tech_counts = df_obf_multi["technique"].value_counts()
mal_count = int((df["label"] == 1).sum())
ben_count = int((df["label"] == 0).sum())
summary = {
    "original_total": len(df),
    "original_benign": ben_count,
    "original_malicious": mal_count,
    "N_variants_per_malicious": N_VARIANTS_PER_MAL,
    "output_rows_total": len(df_obf_multi),
    "output_rows_benign": int((df_obf_multi["label"] == 0).sum()),
    "output_rows_malicious_variants": int((df_obf_multi["label"] == 1).sum()),
}
# Last expression of the notebook cell: displayed as the cell output.
out_path, summary, tech_counts.head(20)



(WindowsPath('Dataset/obfuscated_test_xss.csv'),
 {'original_total': 3872,
  'original_benign': 2408,
  'original_malicious': 1464,
  'N_variants_per_malicious': 3,
  'output_rows_total': 6800,
  'output_rows_benign': 2408,
  'output_rows_malicious_variants': 4392},
 technique
 none                   2408
 urlencode               841
 base64                  814
 html_entities           657
 whitespace_noise        644
 html_then_urlencode     498
 double_urlencode        479
 comment_marker          459
 Name: count, dtype: int64)