In [5]:
from urllib.parse import unquote

import numpy as np
import pandas as pd

In [6]:
def explain_row(r):
    """Build a human-readable explanation for one flagged request.

    Three independent heuristics are evaluated in order: suspicious URL
    patterns, automated-tool User-Agents, and uncommon HTTP methods. Every
    rule that fires contributes a reason. ``impact`` and ``suggested_action``
    are taken from the *last* rule that fired, so the method rule overrides
    the User-Agent rule, which overrides the URL rule.

    Parameters
    ----------
    r : mapping-like
        One request record (e.g. a ``pandas.Series`` row or a ``dict``).
        Must provide the keys ``'URL'``, ``'User-Agent'`` and ``'Method'``.
        ``'anomaly_score'`` is optional and is read with ``r.get``; a missing
        score is treated as 0.

    Returns
    -------
    dict
        Keys ``summary``, ``reasons`` (reasons joined with ``" | "``),
        ``impact``, ``suggested_action`` and ``confidence`` (``"High"`` when
        ``anomaly_score`` is strictly greater than 0.8, else ``"Medium"``).

    Raises
    ------
    KeyError
        If ``'URL'``, ``'User-Agent'`` or ``'Method'`` is missing from ``r``.
    """
    reasons = []
    impact = "Potential reconnaissance activity."
    action = "Monitor traffic."

    url = str(r['URL']).lower()
    ua = str(r['User-Agent']).lower()
    method = str(r['Method'])

    # Percent-decoded view of the URL, used for pattern matching only.
    # Payloads are frequently URL-encoded ("..%2F", "etc%2Fpasswd") and would
    # otherwise slip past the literal substring checks. Lower-case again
    # because decoding can yield upper-case characters (e.g. "%45" -> "E").
    url_decoded = unquote(url).lower()

    # Rule 1: Suspicious URL patterns.
    # The raw URL is still checked so that anything matched before the
    # decoding step was introduced keeps matching.
    if any(p in url or p in url_decoded for p in ["../", "etc/passwd", "cgi-bin"]):
        reasons.append("Suspicious path traversal or legacy endpoint access")
        impact = "Attacker may be probing for sensitive files or old scripts."
        action = "Block this IP and patch vulnerable endpoints."

    # Rule 2: Automated tools (substring match on the lower-cased User-Agent)
    if any(bot in ua for bot in ["curl", "wget", "sqlmap", "bot"]):
        reasons.append("Automated scanner detected")
        impact = "Likely reconnaissance by attacker’s script."
        action = "Add this IP to denylist."

    # Rule 3: HTTP method abuse (exact, case-sensitive comparison)
    if method not in ["GET", "POST", "HEAD"]:
        reasons.append(f"Unusual HTTP method: {method}")
        impact = "Potential exploitation attempt using uncommon HTTP verb."
        action = "Inspect this traffic manually."

    # No rule fired: fall back to a generic reason; impact/action keep their
    # defaults from the top of the function.
    if not reasons:
        reasons.append("Unusual behavior compared to baseline")

    return {
        # The summary shows the lower-cased, still-encoded URL as received.
        "summary": f"Suspicious request → {url} via {method}",
        "reasons": " | ".join(reasons),
        "impact": impact,
        "suggested_action": action,
        "confidence": "High" if r.get("anomaly_score", 0) > 0.8 else "Medium"
    }


In [7]:
# Load the per-request predictions produced upstream.
df = pd.read_csv("csic2010_predictions.csv")

# Keep only the rows whose `predicted` value equals 1.
is_anomaly = df["predicted"].eq(1)
df_anomalies = df.loc[is_anomaly]

# One explanation dict per flagged row, expanded into one column per key.
explanations = df_anomalies.apply(
    explain_row,
    axis="columns",
    result_type="expand",
)

# Place the explanation columns next to the original features and persist.
df_out = pd.concat([df_anomalies, explanations], axis="columns")
df_out.to_csv("csic2010_with_explanations.csv", index=False)

# Preview of the loaded input frame (not the explained output).
print(df.head())
print("💾 Explanations saved to csic2010_with_explanations.csv")

   Unnamed: 0  Method  User-Agent  Pragma  Cache-Control  Accept  \
0           1       0           0       0              0       1   
1           1       0           0       0              0       1   
2           0       0           0       0              0       1   
3           1       0           0       0              0       1   
4           0       0           0       0              0       1   

   Accept-encoding  Accept-charset  language  host  ...  content    URL  \
0                0               0         0     0  ...    12091   8354   
1                0               0         0     0  ...    12091   3352   
2                0               0         0     0  ...    12091  12981   
3                0               0         0     0  ...    12091   1703   
4                0               0         0     0  ...    12091   6219   

   url_length  num_question_marks  num_equals  num_percent  num_slash  \
0          71                   1           1            0         