# 3.4 — Complexity of (LLM-style) Obfuscated XSS (Reproduction + Improvements)

Notebook này tái hiện ý tưởng **mục 3.4** trong paper: dùng **Shannon entropy** để đo “độ phức tạp/khó đoán” của payload obfuscated,
và **cải tiến** bằng cách bổ sung:
- **Token-level entropy**
- **Compressibility ratio (gzip)**
- Các proxy cấu trúc đơn giản: length, non-alnum ratio, unique-char ratio
- **Heuristic validity** (balanced brackets/quotes) và báo cáo riêng trên subset “valid-like”
- **Bootstrap CI** cho chênh lệch entropy (độ tin cậy thống kê)
- **Sensitivity analysis**: baseline tool có/không Base64 (vì Base64 làm entropy tăng mạnh)

> Lưu ý: dataset “LLM-style” ở đây là **string-transform composition** để mô phỏng output đa dạng; không tạo payload khai thác chạy được.


In [9]:
import pandas as pd, numpy as np, re, math, gzip, io, random, base64, urllib.parse
from pathlib import Path
import matplotlib.pyplot as plt

# Load available datasets
test_csv = Path("./Dataset/_outputs_train_val_test/test.csv")
tool_single_csv = Path("./Dataset/test_obfuscated_safe.csv")  # single-variant tool-like (safe)
tool_multi_csv = Path("./Dataset/test_obfuscated_safe_multi.csv")  # multi-variant tool-like (safe)

df_test = pd.read_csv(test_csv)
df_tool_single = pd.read_csv(tool_single_csv) if tool_single_csv.exists() else None
df_tool_multi = pd.read_csv(tool_multi_csv)

df_test.shape, (df_tool_single.shape if df_tool_single is not None else None), df_tool_multi.shape



((3872, 2), (3872, 4), (6800, 6))

In [11]:
# ----------------------------
# Safe LLM-style obfuscation generator (string-only)
# ----------------------------
def normalize_text(s: str) -> str:
    s = str(s).replace("\n"," ").replace("\r"," ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

def b64_encode_only(s: str) -> str:
    return base64.b64encode(s.encode("utf-8", errors="ignore")).decode("ascii")

def url_encode_only(s: str) -> str:
    return urllib.parse.quote(s, safe="")

def html_entity_encode(s: str) -> str:
    return (s.replace("&","&amp;")
             .replace("<","&lt;")
             .replace(">","&gt;")
             .replace('"',"&quot;")
             .replace("'","&#x27;"))

def whitespace_noise(s: str, p: float = 0.15) -> str:
    out=[]
    for ch in s:
        out.append(ch)
        if ch in ['<','>','=',"'",'"','(',')',';','/','+','-','*',',',':'] and random.random()<p:
            out.append(" " * random.randint(1,2))
    return "".join(out)

def comment_marker(s: str, p: float = 0.03) -> str:
    out=[]
    for ch in s:
        out.append(ch)
        if ch.isalpha() and random.random()<p:
            out.append("/*OBF*/")
    return "".join(out)

def reverse_chunks(s: str) -> str:
    s = str(s)
    if len(s) < 12:
        return s[::-1]
    k = max(3, min(8, len(s)//10))
    chunks = [s[i:i+k] for i in range(0, len(s), k)]
    return "".join(chunks[::-1])

def double_urlencode(s: str) -> str:
    return url_encode_only(url_encode_only(s))

def html_then_urlencode(s: str) -> str:
    return url_encode_only(html_entity_encode(s))

TRANSFORMS = {
    "urlencode": url_encode_only,
    "double_urlencode": double_urlencode,
    "base64": b64_encode_only,
    "html_entities": html_entity_encode,
    "html_then_urlencode": html_then_urlencode,
    "whitespace_noise": whitespace_noise,
    "comment_marker": comment_marker,
    "reverse_chunks": reverse_chunks,
}

# Probabilities for a "tool baseline" (single transform) and for "LLM-style" (composed)
P_TOOL = {
    "urlencode": 0.25,
    "base64": 0.20,
    "html_entities": 0.20,
    "whitespace_noise": 0.20,
    "comment_marker": 0.15,
    "double_urlencode": 0.00,
    "html_then_urlencode": 0.00,
    "reverse_chunks": 0.00,
}

P_LLM = {
    "urlencode": 0.18,
    "double_urlencode": 0.10,
    "base64": 0.18,
    "html_entities": 0.12,
    "html_then_urlencode": 0.10,
    "whitespace_noise": 0.15,
    "comment_marker": 0.10,
    "reverse_chunks": 0.07,
}

def _sample_weighted(dist: dict) -> str:
    names = [k for k,v in dist.items() if v>0]
    weights = np.array([dist[k] for k in names], dtype=float)
    weights = weights / weights.sum()
    return np.random.choice(names, p=weights)

def _sample_k_without_replacement(dist: dict, k: int) -> list:
    names = [k for k,v in dist.items() if v>0]
    w = np.array([dist[k] for k in names], dtype=float)
    w = w / w.sum()
    k = min(k, len(names))
    chosen=[]
    available = names.copy()
    avail_w = w.copy()
    for _ in range(k):
        avail_w = avail_w / avail_w.sum()
        pick = np.random.choice(available, p=avail_w)
        chosen.append(pick)
        j = available.index(pick)
        available.pop(j)
        avail_w = np.delete(avail_w, j)
    return chosen

def llm_style_obfuscate(s: str, temperature: float = 1.0) -> tuple[str, str]:
    """
    temperature >1: more aggressive composition/noise
    Returns: (obf_string, recipe)
    """
    s = normalize_text(s)

    # decide number of transformations: 1..3
    # higher temperature -> higher chance of 2-3 transforms
    r = np.random.random()
    if temperature <= 1.0:
        k = 1 if r < 0.65 else (2 if r < 0.95 else 3)
    else:
        k = 1 if r < 0.35 else (2 if r < 0.80 else 3)

    techs = _sample_k_without_replacement(P_LLM, k)

    out = s
    # apply in sampled order
    for t in techs:
        fn = TRANSFORMS[t]
        out = fn(out)

    # extra "diversity noise" when temperature high (still string-only)
    if temperature > 1.2:
        # sprinkle whitespace + markers lightly
        out = whitespace_noise(out, p=0.08)
        if np.random.random() < 0.35:
            out = comment_marker(out, p=0.015)

    return out, "+".join(techs)

# Generate two LLM-style datasets (temp 1.0 and 1.5) with 1 variant per malicious
def build_llm_style_dataset(df_base: pd.DataFrame, temperature: float, seed: int = 42) -> pd.DataFrame:
    random.seed(seed)
    np.random.seed(seed)
    rows=[]
    for i, r in df_base.iterrows():
        payload = str(r["payload"])
        label = int(r["label"])
        if label != 1:
            rows.append({"sample_id": i, "label": label, "payload_original": payload,
                         "payload_obfuscated": payload, "generator": f"llm_style_t{temperature}", "recipe":"none"})
        else:
            obf, recipe = llm_style_obfuscate(payload, temperature=temperature)
            rows.append({"sample_id": i, "label": label, "payload_original": payload,
                         "payload_obfuscated": obf, "generator": f"llm_style_t{temperature}", "recipe": recipe})
    return pd.DataFrame(rows)

df_llm_t10 = build_llm_style_dataset(df_test, temperature=1.0, seed=42)
df_llm_t15 = build_llm_style_dataset(df_test, temperature=1.5, seed=42)

# baseline tool-like dataset (use existing single dataset if available; otherwise generate)
if df_tool_single is not None:
    df_tool = df_tool_single.rename(columns={"payload_obfuscated":"payload_obfuscated"})
    # align columns
    df_tool = pd.DataFrame({
        "sample_id": np.arange(len(df_tool)),
        "label": df_tool["label"].astype(int),
        "payload_original": df_tool["payload_original"].astype(str),
        "payload_obfuscated": df_tool["payload_obfuscated"].astype(str),
        "generator": "tool_baseline",
        "recipe": df_tool["technique"].astype(str),
    })
else:
    # fallback: generate single-transform baseline from test
    random.seed(42); np.random.seed(42)
    rows=[]
    for i, r in df_test.iterrows():
        payload=str(r["payload"]); label=int(r["label"])
        if label!=1:
            rows.append({"sample_id":i,"label":label,"payload_original":payload,
                         "payload_obfuscated":payload,"generator":"tool_baseline","recipe":"none"})
        else:
            t = _sample_weighted(P_TOOL)
            obf = TRANSFORMS[t](payload)
            rows.append({"sample_id":i,"label":label,"payload_original":payload,
                         "payload_obfuscated":obf,"generator":"tool_baseline","recipe":t})
    df_tool = pd.DataFrame(rows)

# Save these datasets for reference
out_tool = Path("./report_3_4/obf_tool_baseline_for_3_4.csv")
out_t10 = Path("./report_3_4/obf_llm_style_t1_0_for_3_4.csv")
out_t15 = Path("./report_3_4/obf_llm_style_t1_5_for_3_4.csv")

df_tool.to_csv(out_tool, index=False)
df_llm_t10.to_csv(out_t10, index=False)
df_llm_t15.to_csv(out_t15, index=False)

(out_tool, out_t10, out_t15, df_tool.shape, df_llm_t10.shape, df_llm_t15.shape, df_tool["label"].value_counts().to_dict())



(WindowsPath('report_3_4/obf_tool_baseline_for_3_4.csv'),
 WindowsPath('report_3_4/obf_llm_style_t1_0_for_3_4.csv'),
 WindowsPath('report_3_4/obf_llm_style_t1_5_for_3_4.csv'),
 (3872, 6),
 (3872, 6),
 (3872, 6),
 {0: 2408, 1: 1464})

In [12]:
# ----------------------------
# Complexity metrics (3.4 reproduction + improvements)
# ----------------------------
def shannon_entropy_char(s: str) -> float:
    s = s if s is not None else ""
    if len(s) == 0:
        return 0.0
    # counts over characters
    counts = {}
    for ch in s:
        counts[ch] = counts.get(ch, 0) + 1
    n = len(s)
    ent = 0.0
    for c in counts.values():
        p = c / n
        ent -= p * math.log2(p)
    return ent  # bits per char

def shannon_entropy_tokens(s: str) -> float:
    toks = re.findall(r"(?u)\b\w+\b", s.lower() if s else "")
    if len(toks) <= 1:
        return 0.0
    counts = {}
    for t in toks:
        counts[t] = counts.get(t, 0) + 1
    n = len(toks)
    ent = 0.0
    for c in counts.values():
        p = c / n
        ent -= p * math.log2(p)
    return ent  # bits per token

def gzip_ratio(s: str) -> float:
    b = (s or "").encode("utf-8", errors="ignore")
    if len(b) == 0:
        return 0.0
    out = io.BytesIO()
    with gzip.GzipFile(fileobj=out, mode="wb") as f:
        f.write(b)
    comp_len = len(out.getvalue())
    return comp_len / max(1, len(b))

def unique_char_ratio(s: str) -> float:
    s = s or ""
    return len(set(s)) / max(1, len(s))

def non_alnum_ratio(s: str) -> float:
    s = s or ""
    non = sum(1 for ch in s if not ch.isalnum())
    return non / max(1, len(s))

def balanced_simple(s: str) -> int:
    """
    Heuristic syntactic plausibility:
    - parentheses/brackets/braces balanced
    - single/double quotes counts even
    Returns 1 if passes, else 0
    """
    s = s or ""
    pairs = {"(":")","[":"]","{":"}"}
    stack=[]
    for ch in s:
        if ch in pairs:
            stack.append(pairs[ch])
        elif ch in pairs.values():
            if not stack or stack.pop() != ch:
                return 0
    if stack:
        return 0
    # quotes even (very rough)
    if s.count("'") % 2 != 0:
        return 0
    if s.count('"') % 2 != 0:
        return 0
    return 1

def compute_metrics(df: pd.DataFrame, group_name: str) -> pd.DataFrame:
    rows=[]
    for _, r in df.iterrows():
        obf = str(r["payload_obfuscated"])
        rows.append({
            "sample_id": int(r["sample_id"]),
            "label": int(r["label"]),
            "group": group_name,
            "length": len(obf),
            "H_char": shannon_entropy_char(obf),
            "H_tok": shannon_entropy_tokens(obf),
            "gzip_ratio": gzip_ratio(obf),
            "uniq_char_ratio": unique_char_ratio(obf),
            "non_alnum_ratio": non_alnum_ratio(obf),
            "balanced": balanced_simple(obf),
            "recipe": r.get("recipe",""),
        })
    return pd.DataFrame(rows)

m_tool = compute_metrics(df_tool, "tool_baseline")
m_t10 = compute_metrics(df_llm_t10, "llm_style_t1.0")
m_t15 = compute_metrics(df_llm_t15, "llm_style_t1.5")

metrics_all = pd.concat([m_tool, m_t10, m_t15], ignore_index=True)

# Focus on malicious only like paper's "obfuscated XSS"
metrics_mal = metrics_all[metrics_all["label"] == 1].copy()

metrics_all.shape, metrics_mal.shape, metrics_mal["group"].value_counts().to_dict()



((11616, 11),
 (4392, 11),
 {'tool_baseline': 1464, 'llm_style_t1.0': 1464, 'llm_style_t1.5': 1464})

In [5]:
# ----------------------------
# 3) Complexity metrics
# ----------------------------
def shannon_entropy_char(s: str) -> float:
    s = s if s is not None else ""
    if len(s) == 0:
        return 0.0
    counts = {}
    for ch in s:
        counts[ch] = counts.get(ch, 0) + 1
    n = len(s)
    ent = 0.0
    for c in counts.values():
        p = c / n
        ent -= p * math.log2(p)
    return ent

def shannon_entropy_tokens(s: str) -> float:
    toks = re.findall(r"(?u)\b\w+\b", s.lower() if s else "")
    if len(toks) <= 1:
        return 0.0
    counts = {}
    for t in toks:
        counts[t] = counts.get(t, 0) + 1
    n = len(toks)
    ent = 0.0
    for c in counts.values():
        p = c / n
        ent -= p * math.log2(p)
    return ent

def gzip_ratio(s: str) -> float:
    b = (s or "").encode("utf-8", errors="ignore")
    if len(b) == 0:
        return 0.0
    out = io.BytesIO()
    with gzip.GzipFile(fileobj=out, mode="wb") as f:
        f.write(b)
    comp_len = len(out.getvalue())
    return comp_len / max(1, len(b))

def unique_char_ratio(s: str) -> float:
    s = s or ""
    return len(set(s)) / max(1, len(s))

def non_alnum_ratio(s: str) -> float:
    s = s or ""
    non = sum(1 for ch in s if not ch.isalnum())
    return non / max(1, len(s))

def balanced_simple(s: str) -> int:
    s = s or ""
    pairs = {"(":")","[":"]","{":"}"}
    stack=[]
    for ch in s:
        if ch in pairs:
            stack.append(pairs[ch])
        elif ch in pairs.values():
            if not stack or stack.pop() != ch:
                return 0
    if stack:
        return 0
    if s.count("'") % 2 != 0:
        return 0
    if s.count('"') % 2 != 0:
        return 0
    return 1

def compute_metrics(df: pd.DataFrame) -> pd.DataFrame:
    rows=[]
    for _, r in df.iterrows():
        obf = str(r["payload_obfuscated"])
        rows.append({
            "sample_id": int(r["sample_id"]),
            "label": int(r["label"]),
            "group": r["group"],
            "length": len(obf),
            "H_char": shannon_entropy_char(obf),
            "H_tok": shannon_entropy_tokens(obf),
            "gzip_ratio": gzip_ratio(obf),
            "uniq_char_ratio": unique_char_ratio(obf),
            "non_alnum_ratio": non_alnum_ratio(obf),
            "balanced": balanced_simple(obf),
            "recipe": r.get("recipe",""),
        })
    return pd.DataFrame(rows)

m_tool = compute_metrics(df_tool)
m_t10 = compute_metrics(df_llm_t10)
m_t15 = compute_metrics(df_llm_t15)

metrics_all = pd.concat([m_tool, m_t10, m_t15], ignore_index=True)
metrics_mal = metrics_all[metrics_all["label"]==1].copy()

metrics_all.shape, metrics_mal.shape


((14544, 11), (7320, 11))

In [13]:
# Summary statistics + bootstrap CI for mean differences
def bootstrap_ci_diff(a: np.ndarray, b: np.ndarray, n_boot=5000, seed=42):
    rng = np.random.default_rng(seed)
    a = np.asarray(a); b = np.asarray(b)
    diffs = np.empty(n_boot)
    for i in range(n_boot):
        sa = rng.choice(a, size=len(a), replace=True)
        sb = rng.choice(b, size=len(b), replace=True)
        diffs[i] = sb.mean() - sa.mean()  # b - a
    lo, hi = np.percentile(diffs, [2.5, 97.5])
    return float(diffs.mean()), float(lo), float(hi)

def group_summary(df: pd.DataFrame) -> pd.DataFrame:
    agg = df.groupby("group").agg(
        n=("H_char","size"),
        H_char_mean=("H_char","mean"),
        H_char_med=("H_char","median"),
        H_tok_mean=("H_tok","mean"),
        gzip_mean=("gzip_ratio","mean"),
        len_mean=("length","mean"),
        uniq_mean=("uniq_char_ratio","mean"),
        nonalnum_mean=("non_alnum_ratio","mean"),
        balanced_rate=("balanced","mean"),
    ).reset_index()
    return agg

sum_mal = group_summary(metrics_mal)
sum_mal_rounded = sum_mal.copy()
for c in sum_mal_rounded.columns:
    if c != "group":
        sum_mal_rounded[c] = sum_mal_rounded[c].astype(float).round(4)
sum_mal_rounded



Unnamed: 0,group,n,H_char_mean,H_char_med,H_tok_mean,gzip_mean,len_mean,uniq_mean,nonalnum_mean,balanced_rate
0,llm_style_t1.0,1464.0,4.4186,4.3027,2.7472,1.0602,103.7425,0.3516,0.1618,0.9665
1,llm_style_t1.5,1464.0,4.5064,4.3875,2.7345,1.0239,117.7322,0.3432,0.153,0.9693
2,tool_baseline,1464.0,4.4384,4.3303,2.6276,1.1601,85.1066,0.3926,0.1992,0.9966


In [14]:
# Sensitivity analysis: tool baseline without base64 (to show how metric depends on obfuscator mix)
P_TOOL_NO_B64 = P_TOOL.copy()
P_TOOL_NO_B64["base64"] = 0.0
# renormalize by sampling function _sample_weighted uses weights >0, so ok.

def build_tool_baseline(df_base: pd.DataFrame, dist: dict, seed: int = 42, name: str = "tool_baseline") -> pd.DataFrame:
    random.seed(seed); np.random.seed(seed)
    rows=[]
    for i, r in df_base.iterrows():
        payload=str(r["payload"]); label=int(r["label"])
        if label!=1:
            rows.append({"sample_id": i, "label": label, "payload_original": payload,
                         "payload_obfuscated": payload, "generator": name, "recipe":"none"})
        else:
            t = _sample_weighted(dist)
            obf = TRANSFORMS[t](payload)
            rows.append({"sample_id": i, "label": label, "payload_original": payload,
                         "payload_obfuscated": obf, "generator": name, "recipe":t})
    return pd.DataFrame(rows)

df_tool_no_b64 = build_tool_baseline(df_test, P_TOOL_NO_B64, seed=42, name="tool_no_base64")
m_tool_no_b64 = compute_metrics(df_tool_no_b64, "tool_no_base64")
metrics_mal2 = pd.concat([metrics_mal, m_tool_no_b64[m_tool_no_b64["label"]==1]], ignore_index=True)

sum_mal2 = group_summary(metrics_mal2)
sum_mal2_rounded = sum_mal2.copy()
for c in sum_mal2_rounded.columns:
    if c != "group":
        sum_mal2_rounded[c] = sum_mal2_rounded[c].astype(float).round(4)
sum_mal2_rounded



Unnamed: 0,group,n,H_char_mean,H_char_med,H_tok_mean,gzip_mean,len_mean,uniq_mean,nonalnum_mean,balanced_rate
0,llm_style_t1.0,1464.0,4.4186,4.3027,2.7472,1.0602,103.7425,0.3516,0.1618,0.9665
1,llm_style_t1.5,1464.0,4.5064,4.3875,2.7345,1.0239,117.7322,0.3432,0.153,0.9693
2,tool_baseline,1464.0,4.4384,4.3303,2.6276,1.1601,85.1066,0.3926,0.1992,0.9966
3,tool_no_base64,1464.0,4.2842,4.2736,3.184,1.1431,85.1414,0.3611,0.239,0.9952


In [15]:
def percent_increase(a, b):
    return (b - a) / a * 100.0 if a != 0 else np.nan

def get_group(df, g):
    return df[df["group"]==g]

# Compare LLM t1.5 vs tool_baseline and vs tool_no_base64
for baseline in ["tool_baseline", "tool_no_base64"]:
    a = get_group(metrics_mal2, baseline)["H_char"].to_numpy()
    b = get_group(metrics_mal2, "llm_style_t1.5")["H_char"].to_numpy()
    mean_diff, lo, hi = bootstrap_ci_diff(a, b, n_boot=3000, seed=42)
    inc = percent_increase(a.mean(), b.mean())
    print(baseline, "mean(H_char)=", a.mean(), "LLM=", b.mean(),
          "increase%=", inc, "boot_diff=", mean_diff, "CI=[", lo, ",", hi, "]")



tool_baseline mean(H_char)= 4.438386753753551 LLM= 4.50638427984729 increase%= 1.5320324673426378 boot_diff= 0.0678928031992642 CI=[ 0.03501689992342396 , 0.10063142921668404 ]
tool_no_base64 mean(H_char)= 4.284196985406204 LLM= 4.50638427984729 increase%= 5.186206311193206 boot_diff= 0.22201066850471632 CI=[ 0.1941160638047772 , 0.24988429714072766 ]


In [16]:
# Create a concise comparison table (malicious only)
comparisons = []
pairs = [
    ("tool_baseline", "llm_style_t1.0"),
    ("tool_baseline", "llm_style_t1.5"),
    ("tool_no_base64", "llm_style_t1.5"),
]
for base, llm in pairs:
    a = get_group(metrics_mal2, base)["H_char"].to_numpy()
    b = get_group(metrics_mal2, llm)["H_char"].to_numpy()
    mean_diff, lo, hi = bootstrap_ci_diff(a, b, n_boot=5000, seed=42)
    comparisons.append({
        "baseline": base,
        "candidate": llm,
        "mean_H_char_baseline": a.mean(),
        "mean_H_char_candidate": b.mean(),
        "percent_increase": percent_increase(a.mean(), b.mean()),
        "boot_mean_diff": mean_diff,
        "boot_CI_low": lo,
        "boot_CI_high": hi,
    })

df_cmp = pd.DataFrame(comparisons)
df_cmp_rounded = df_cmp.copy()
for c in df_cmp_rounded.columns:
    if c not in ["baseline","candidate"]:
        df_cmp_rounded[c] = df_cmp_rounded[c].astype(float).round(4)
df_cmp_rounded



Unnamed: 0,baseline,candidate,mean_H_char_baseline,mean_H_char_candidate,percent_increase,boot_mean_diff,boot_CI_low,boot_CI_high
0,tool_baseline,llm_style_t1.0,4.4384,4.4186,-0.4463,-0.0201,-0.0503,0.0102
1,tool_baseline,llm_style_t1.5,4.4384,4.5064,1.532,0.0679,0.035,0.1009
2,tool_no_base64,llm_style_t1.5,4.2842,4.5064,5.1862,0.2221,0.1946,0.2495


In [21]:
from pathlib import Path

plots = {}

# Prepare data for plots (malicious only)
order = ["tool_baseline", "tool_no_base64", "llm_style_t1.0", "llm_style_t1.5"]
plot_df = metrics_mal2[metrics_mal2["group"].isin(order)].copy()

def make_boxplot(metric, ylabel, title, out_name):
    data = [plot_df[plot_df["group"]==g][metric].to_numpy() for g in order]
    fig, ax = plt.subplots(figsize=(10,4.5))
    ax.boxplot(data, labels=[g.replace("_"," ").upper() for g in order], showfliers=False)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(axis="y", alpha=0.35)
    fig.tight_layout()
    out_path = Path("") / out_name
    fig.savefig(out_path, dpi=200, bbox_inches="tight")
    plt.close(fig)
    plots[out_name] = out_path

make_boxplot("H_char", "Entropy (bits/char)", "3.4 Complexity — Character-level Shannon Entropy (malicious only)", "./report_3_4/3_4_entropy_char_boxplot.png")
make_boxplot("H_tok", "Entropy (bits/token)", "3.4 Complexity — Token-level Shannon Entropy (malicious only)", "./report_3_4/3_4_entropy_token_boxplot.png")
make_boxplot("gzip_ratio", "gzip(compressed)/raw", "3.4 Complexity — Compressibility Ratio (malicious only)", "./report_3_4/3_4_gzip_ratio_boxplot.png")

# CDF plot for H_char
fig, ax = plt.subplots(figsize=(9,4.5))
for g in order:
    vals = np.sort(plot_df[plot_df["group"]==g]["H_char"].to_numpy())
    y = np.arange(1, len(vals)+1) / len(vals)
    ax.plot(vals, y, label=g.replace("_"," ").upper())
ax.set_xlabel("Entropy (bits/char)")
ax.set_ylabel("CDF")
ax.set_title("3.4 Complexity — CDF of Character Entropy (malicious only)")
ax.grid(alpha=0.35)
ax.legend()
fig.tight_layout()
cdf_path = Path("./report_3_4/3_4_entropy_char_cdf.png")
fig.savefig(cdf_path, dpi=200, bbox_inches="tight")
plt.close(fig)
plots["3_4_entropy_char_cdf.png"] = cdf_path

# Scatter: length vs H_char (sample a subset for readability)
fig, ax = plt.subplots(figsize=(9,4.5))
for g in order:
    sub = plot_df[plot_df["group"]==g].sample(n=min(600, (plot_df["group"]==g).sum()), random_state=42)
    ax.scatter(sub["length"], sub["H_char"], s=8, alpha=0.35, label=g.replace("_"," ").upper())
ax.set_xlabel("Length")
ax.set_ylabel("Entropy (bits/char)")
ax.set_title("3.4 Complexity — Length vs Character Entropy (malicious only)")
ax.grid(alpha=0.35)
ax.legend()
fig.tight_layout()
scatter_path = Path("./report_3_4/3_4_length_vs_entropy_scatter.png")
fig.savefig(scatter_path, dpi=200, bbox_inches="tight")
plt.close(fig)
plots["3_4_length_vs_entropy_scatter.png"] = scatter_path

list(plots.keys())



  ax.boxplot(data, labels=[g.replace("_"," ").upper() for g in order], showfliers=False)
  ax.boxplot(data, labels=[g.replace("_"," ").upper() for g in order], showfliers=False)
  ax.boxplot(data, labels=[g.replace("_"," ").upper() for g in order], showfliers=False)


['./report_3_4/3_4_entropy_char_boxplot.png',
 './report_3_4/3_4_entropy_token_boxplot.png',
 './report_3_4/3_4_gzip_ratio_boxplot.png',
 '3_4_entropy_char_cdf.png',
 '3_4_length_vs_entropy_scatter.png']