In [None]:
# Cell 1 — imports & paths
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
from pathlib import Path
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Filesystem layout. DATA_DIR may be overridden through the environment;
# processed artifacts are written to a "processed" folder beneath it.
DATA_DIR = Path(os.environ.get("DATA_DIR", "/workspace/data"))
OUT_DIR = DATA_DIR.joinpath("processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Log file to analyse. The SYSLOG environment variable wins over the default
# location under DATA_DIR; the value is passed through Path and back to str.
SYSLOG = str(
    Path(
        os.environ.get(
            "SYSLOG",
            str(DATA_DIR.joinpath("attack_data/datasets/attack_techniques/T1003.003/atomic_red_team/windows-sysmon.log")),
        )
    )
)

# Echo the resolved configuration so the run is self-describing.
print("Libraries loaded.")
print("SYSLOG:", SYSLOG)
print("OUT_DIR:", OUT_DIR)


In [None]:
# --- STREAM PARSER FOR WINDOWS EVENT LOGS ---
# Handles files containing multiple <Event>...</Event> blocks without a single root element.
# Reads line by line, extracts each <Event> block, parses it, and yields dicts.

import xml.etree.ElementTree as ET

def parse_windows_events_stream(path, max_events=None):
    """Stream Windows event records out of a log file that has no single XML root.

    The file is read line by line as UTF-8 (undecodable bytes are dropped).
    Every ``<Event ...>`` ... ``</Event>`` span is cut out, parsed on its own and
    converted to a flat dict by ``_extract_event_from_element``. A span may sit
    on one line or stretch over many, several spans may share a line, and text
    outside the spans (an XML declaration, an ``<Events>`` wrapper, log
    prefixes) is ignored.

    Parameters
    ----------
    path : str or path-like
        Location of the log file.
    max_events : int, optional
        Stop after this many successfully parsed events. ``None`` or ``0``
        means no limit.

    Yields
    ------
    dict
        One dict per well-formed event.

    Warns
    -----
    RuntimeWarning
        Once, after iteration stops, if any span could not be parsed as XML.
        Such spans are skipped rather than aborting the whole run.
    """
    import warnings  # local import: keeps this cell independent of the imports cell

    emitted = 0
    skipped = 0
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        for fragment in _iter_event_xml(handle):
            try:
                elem = ET.fromstring(fragment)
            except ET.ParseError:
                # Best effort: one damaged record must not lose the rest.
                skipped += 1
                continue
            yield _extract_event_from_element(elem)
            emitted += 1
            if max_events and emitted >= max_events:
                break

    if skipped:
        warnings.warn(
            f"{path}: skipped {skipped} malformed <Event> block(s)",
            RuntimeWarning,
            stacklevel=2,
        )


def _find_event_open(text):
    """Return the index of the first real ``<Event`` opening tag in `text`, or -1.

    The tag name must be followed by whitespace or ``>`` so that ``<EventID>``,
    ``<EventData>`` and ``<Events>`` are not mistaken for a record start.
    """
    marker = "<Event"
    pos = text.find(marker)
    while pos != -1:
        follower = text[pos + len(marker):pos + len(marker) + 1]
        if follower == ">" or follower.isspace():
            return pos
        pos = text.find(marker, pos + 1)
    return -1


def _iter_event_xml(lines):
    """Yield the raw text of each ``<Event>...</Event>`` span found in `lines`.

    `lines` is any iterable of strings (for example an open text file). A span
    that is still open when the input ends is yielded as-is, so the caller's
    XML parse rejects it and it gets counted as malformed.
    """
    close_tag = "</Event>"
    pieces = []      # chunks of the span currently being collected
    inside = False   # True between an opening tag and its closing tag

    for line in lines:
        rest = line
        while rest:
            if not inside:
                start = _find_event_open(rest)
                if start == -1:
                    break  # only filler left on this line
                rest = rest[start:]
                pieces = []
                inside = True

            end = rest.find(close_tag)
            if end == -1:
                pieces.append(rest)  # span continues on the next line
                break

            cut = end + len(close_tag)
            pieces.append(rest[:cut])
            rest = rest[cut:]        # keep scanning: more events may follow
            inside = False
            yield "".join(pieces)
            pieces = []

    if inside:
        yield "".join(pieces)


def _extract_event_from_element(elem):
    """Flatten one parsed ``<Event>`` element into a dict of strings.

    From ``System`` it takes the ``SystemTime`` attribute of ``TimeCreated``
    (key ``'TimeCreated'``) and the text of ``EventID`` and ``Computer``.
    Every ``Data`` child of ``EventData`` is added under its stripped ``Name``
    attribute; a missing or blank name becomes ``'Data'``. A key is never
    overwritten: when a name is already taken, the later value is stored under
    ``<name>_2``, ``<name>_3``, ... so repeated or unnamed ``Data`` elements
    are all kept. Element lookups ignore the XML namespace.
    """
    record = {}

    system = elem.find('.//{*}System')
    if system is not None:
        created = system.find('.//{*}TimeCreated')
        if created is not None and 'SystemTime' in created.attrib:
            record['TimeCreated'] = created.attrib['SystemTime']
        for tag in ('EventID', 'Computer'):
            node = system.find('.//{*}' + tag)
            if node is not None:
                record[tag] = node.text

    event_data = elem.find('.//{*}EventData')
    if event_data is not None:
        for data in event_data.findall('.//{*}Data'):
            base = (data.attrib.get('Name') or '').strip() or 'Data'
            key = base
            serial = 1
            while key in record:
                serial += 1
                key = f"{base}_{serial}"
            record[key] = data.text

    return record


In [None]:
# Collect at most 2000 parsed events, printing a progress line every 500.
rows = []
for seen, ev in enumerate(parse_windows_events_stream(SYSLOG, max_events=2000), start=1):
    rows.append(ev)
    if seen % 500 == 0:
        print(f"Parsed {seen} events")

# One row per event; columns are the union of the keys found in the event dicts.
df = pd.DataFrame(rows)
print("Shape:", df.shape)
df.head(10)


In [None]:
import re

def build_basic_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive baseline detection features from a frame of Windows event records.

    The input is left untouched; a new frame is returned. Columns that are
    absent are treated as empty strings, and the process / parent-process
    paths are taken from the first candidate column that exists.

    Added or replaced columns:
      CommandLine    command line as text ('' when missing)
      cmd_len        length of CommandLine
      has_powershell 1 if CommandLine contains 'powershell' (any case)
      has_base64     1 if CommandLine has a run of 50+ base64-alphabet chars
      ProcessPath    process image path from the first available column
      is_temp_path   1 if ProcessPath contains \\Temp\\, %TEMP% or /tmp/
      ParentProcess  lower-cased parent process path
      parent_is_cmd  1 if ParentProcess ends in cmd.exe / powershell(_ise).exe / pwsh.exe
      TimeCreated    parsed datetime (NaT when unparseable)
      hour           hour of TimeCreated, -1 when unknown
      EventID        integer event id, -1 when not numeric
    """
    row_index = df.index

    def blank() -> pd.Series:
        # All-empty text column aligned with the input rows.
        return pd.Series([''] * len(row_index), index=row_index, dtype='object')

    def text_column(name: str) -> pd.Series:
        # Named column as strings, with missing values turned into ''.
        if name not in df.columns:
            return blank()
        return df[name].fillna('').astype(str)

    def first_present(candidates) -> pd.Series:
        # Text of the first candidate column that exists, else an empty column.
        chosen = next((c for c in candidates if c in df.columns), None)
        return text_column(chosen) if chosen else blank()

    command = text_column('CommandLine')
    base64_run = re.compile(r'[A-Za-z0-9+/=]{50,}')

    process = first_present(['NewProcessName', 'Image', 'TargetImage', 'ProcessName'])
    parent = first_present(['ParentProcessName', 'ParentImage', 'ParentProcess']).str.lower()

    created = pd.to_datetime(text_column('TimeCreated'), errors='coerce')
    event_id = pd.to_numeric(text_column('EventID'), errors='coerce')

    # assign() keeps existing columns in place and appends new ones in the
    # order given here, so the keyword order below fixes the output layout.
    return df.assign(
        CommandLine=command,
        cmd_len=command.str.len(),
        has_powershell=command.str.contains('powershell', case=False, na=False).astype(int),
        has_base64=command.apply(lambda text: int(base64_run.search(text) is not None)),
        ProcessPath=process,
        is_temp_path=process.str.contains(
            r'\\Temp\\|%TEMP%|/tmp/', case=False, regex=True, na=False
        ).astype(int),
        ParentProcess=parent,
        parent_is_cmd=parent.str.contains(
            r'(?:^|\\)(?:cmd\.exe|powershell(?:_ise)?\.exe|pwsh\.exe)$', regex=True, na=False
        ).astype(int),
        TimeCreated=created,
        hour=created.dt.hour.fillna(-1).astype(int),
        EventID=event_id.fillna(-1).astype(int),
    )

# Feature table derived from the parsed events; preview the first ten rows.
df_feat = df.pipe(build_basic_features)
df_feat.iloc[:10]


In [None]:
# Columns fed to the detector, in training order; gaps are filled with 0.
FEATURES = ['cmd_len','has_powershell','has_base64','is_temp_path','parent_is_cmd','hour']
X = df_feat.loc[:, FEATURES].fillna(0)

# Standardise the features before fitting.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Isolation Forest with a fixed seed so repeated runs give the same result.
model = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)
model.fit(X_scaled)

# Store the raw score, plus a 0/1 flag that is 1 where predict() returned -1.
df_feat['anomaly_score'] = model.decision_function(X_scaled)
df_feat['anomaly_flag'] = (model.predict(X_scaled) == -1).astype('int64')

print("Training complete.")
df_feat.loc[:, ['CommandLine','cmd_len','anomaly_score','anomaly_flag']].head(10)


In [None]:
# Figure 1: distribution of command-line lengths.
fig_hist, ax_hist = plt.subplots(figsize=(10,4))
sns.histplot(df_feat['cmd_len'], bins=40, ax=ax_hist)
ax_hist.set_title("Command Line Length Distribution")
ax_hist.set_xlabel("cmd_len")
ax_hist.set_ylabel("count")
plt.show()

# Figure 2: every event over time, with flagged rows drawn larger and in red.
fig_time, ax_time = plt.subplots(figsize=(12,4))
anoms = df_feat.loc[df_feat['anomaly_flag'] == 1]
ax_time.scatter(df_feat['TimeCreated'], df_feat['cmd_len'], s=6, label='all')
ax_time.scatter(anoms['TimeCreated'], anoms['cmd_len'], color='red', s=20, label='anomaly')
ax_time.legend()
ax_time.set_title('Anomalies over time (cmd_len)')
ax_time.set_ylabel("cmd_len")
ax_time.set_xlabel("Time Created")
plt.show()


In [None]:
# Write the full feature table, including anomaly score and flag, as CSV.
out_path = OUT_DIR.joinpath("sysmon_with_anomaly_scores.csv")
df_feat.to_csv(out_path, index=False)
print("Saved:", out_path)


In [None]:
# Export only the flagged rows (timestamp + command line). The content is CSV
# even though the file name ends in .txt.
out = OUT_DIR.joinpath("suspicious_commands.txt")
df_feat.loc[df_feat['anomaly_flag'].eq(1), ['TimeCreated','CommandLine']].to_csv(out, index=False)
print("Saved:", out)
