# Constants

In [1]:
import os
import pandas as pd
import gc

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Account name taken from the USER environment variable; a placeholder is
# used when the variable is not set.
_USER = os.environ.get("USER", "unknown_user")
print("Running as user: {}".format(_USER))

Running as user: ahmed.bargady


In [32]:
# Root directory that holds the raw datasets (one .csv file per dataset).
# NOTE(review): only the home-directory segment follows _USER; the
# "ahmed.bargady" segment further down is fixed — confirm this is intended
# when the notebook is run by another user.
DATA_DIR = (
    f"/home/{_USER}/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk"
    "/IDS/ahmed.bargady/data/github/logs-ad-ultimate"
    "/logadu-package/dataset/data"
)

_CSV_EXT = ".csv"

# os.listdir() returns entries in arbitrary order, so sort them to make the
# listing (and anything derived from it) reproducible between runs.
log_files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith(_CSV_EXT))
# Dataset name = file name with the .csv extension stripped.
data_names = [f[: -len(_CSV_EXT)] for f in log_files]
for _name in data_names:
    print(_name)

LINUX24
WIN25CH


# DATASETS

## 1. Dataframes

In [36]:
# Dataset identifiers; they are also the stems of the CSV file names.
L = "LINUX24"
W = "WIN25CH"

# Raw CSV paths, built from the identifiers above.
l_file = os.path.join(DATA_DIR, f"{L}.csv")
w_file = os.path.join(DATA_DIR, f"{W}.csv")

# Load both raw datasets (Linux first, then Windows).
dfl, dfw = (pd.read_csv(csv_path) for csv_path in (l_file, w_file))

# (rows, columns) of each raw frame
dfl.shape, dfw.shape

((121590, 8), (9192, 8))

In [34]:
# Preview the first five rows of the LINUX24 frame.
dfl.head(n=5)

Unnamed: 0,timestamp,location,src_id,content,mitre_id,mitre_tactic,mitre_technique,label
0,2023-10-01 00:49:18.889,rootcheck,1696121000.0,Trojaned version of file '/bin/diff' detected....,,,,0
1,2023-10-01 00:49:18.917,rootcheck,1696121000.0,Trojaned version of file '/usr/bin/diff' detec...,,,,0
2,2023-10-01 00:49:30.034,wazuh-monitord,1696121000.0,ossec: Manager started.,,,,0
3,2023-10-01 00:49:35.008,/var/log/dpkg.log,1696121000.0,<TIMESTAMP> install filebeat:amd64 <none> 7.10.2,,,,0
4,2023-10-01 00:49:37.011,/var/log/dpkg.log,1696121000.0,<TIMESTAMP> status half-configured filebeat:am...,,,,0


In [35]:
# Preview the first five rows of the WIN25CH frame.
dfw.head(n=5)

Unnamed: 0,timestamp,location,src_id,content,mitre_id,mitre_tactic,mitre_technique,label
0,2024-10-28 08:51:44.254,rootcheck,1730106000.0,Trojaned version of file '/bin/diff' detected....,,,,0
1,2024-10-28 08:51:44.266,rootcheck,1730106000.0,Trojaned version of file '/usr/bin/diff' detec...,,,,0
2,2024-10-28 08:51:46.784,rootcheck,1730106000.0,File '<FILE_PATH>/<HEX_DATA>' is owned by root...,,,,0
3,2024-10-28 08:51:50.951,/var/log/dpkg.log,1730106000.0,<TIMESTAMP> status half-configured filebeat:am...,,,,0
4,2024-10-28 08:51:50.951,/var/log/dpkg.log,1730106000.0,<TIMESTAMP> install filebeat:amd64 <none> 7.10.2,,,,0


In [37]:
# Drop the raw frames, then run a garbage-collection pass
# (the cell displays the number returned by gc.collect()).
del dfl
del dfw
gc.collect()

258

## 2. Parsed Content

### 2.1. Spell

A concise description of the **Spell parser**:

**What Spell Takes:**
1. **Input**: Raw log files (unstructured text)
2. **Parameters**:
   - `log_format`: Defines the structure of logs (e.g., `<Date> <Level> <Content>`)
   - `tau` (0.4-0.6): Similarity threshold (lower = more aggressive grouping)
   - `rex`: Regex rules to mask variables (e.g., IPs, IDs → `<*>`)

**What Spell Produces:**
1. **Output**:
   - **Structured logs**:  
     - `EventId` (hash of the template)  
     - `EventTemplate` (log pattern with `<*>` wildcards)  
     - `ParameterList` (extracted variables if `keep_para=True`)  
   - **Templates file**: Frequency count of each log pattern.

2. **How It Works**:
   - Uses **Longest Common Subsequence (LCS)** to match logs to templates.
   - Builds a **prefix tree** for efficient searching.
   - Dynamically updates templates as new logs arrive.

**Key Advantage**:
- **Balanced accuracy/speed**: Good for semi-structured logs (e.g., HDFS, Apache).  
- **Interpretable**: Templates clearly show static vs. variable parts.  

**Example**:
**Input Log**:  
`"2023-01-01 ERROR Failed to connect to 192.168.1.1"`  

**Output Template**:  
`"<*> ERROR Failed to connect to <*>"`  

**Parameters Extracted**:  
`["2023-01-01", "192.168.1.1"]`  

```
M. Du, F. Li, Spell: Streaming parsing of system event logs, in: Proceedings of the 2016 IEEE 16th International
Conference on Data Mining (ICDM), IEEE, Barcelona, Spain, 2016, pp. 859–864. doi:10.1109/ICDM.2016.0102.
```

In [38]:
# Spell results are stored in the "spell" sub-directory of the data root.
SPELL_DIR = os.path.join(DATA_DIR, "spell")

# Structured parser output for each dataset.
sl = pd.read_csv(os.path.join(SPELL_DIR, f"{L}_structured.csv"))
sw = pd.read_csv(os.path.join(SPELL_DIR, f"{W}_structured.csv"))

# Template tables for each dataset.
tl = pd.read_csv(os.path.join(SPELL_DIR, f"{L}_templates.csv"))
tw = pd.read_csv(os.path.join(SPELL_DIR, f"{W}_templates.csv"))

# (rows, columns) of the four frames
sl.shape, sw.shape, tl.shape, tw.shape

((121590, 4), (9192, 4), (1305, 3), (143, 3))

In [39]:
# First five structured rows for LINUX24.
sl.head(n=5)

Unnamed: 0,LineId,Content,EventId,EventTemplate
0,1,Trojaned version of file '/bin/diff' detected....,52586b5f,Trojaned version of file bin diff detected ...
1,2,Trojaned version of file '/usr/bin/diff' detec...,52586b5f,Trojaned version of file bin diff detected ...
2,3,ossec: Manager started.,40bd922a,ossec Manager started
3,4,<TIMESTAMP> install filebeat:amd64 <none> 7.10.2,ac2c3f4e,<TIMESTAMP> install filebeat amd64 <none> 7 10 2
4,5,<TIMESTAMP> status half-configured filebeat:am...,2f12da89,<*> <*> amd64 7 10 2


In [40]:
# First five structured rows for WIN25CH.
sw.head(n=5)

Unnamed: 0,LineId,Content,EventId,EventTemplate
0,1,Trojaned version of file '/bin/diff' detected....,52586b5f,Trojaned version of file bin diff detected ...
1,2,Trojaned version of file '/usr/bin/diff' detec...,52586b5f,Trojaned version of file bin diff detected ...
2,3,File '<FILE_PATH>/<HEX_DATA>' is owned by root...,76111487,File <FILE PATH> <*>
3,4,<TIMESTAMP> status half-configured filebeat:am...,4e38040f,<*> <*> <*> 10 2
4,5,<TIMESTAMP> install filebeat:amd64 <none> 7.10.2,4e38040f,<*> <*> <*> 10 2


In [42]:
# LINUX24 templates ordered by descending occurrence count; show the top five.
(
    tl
    .sort_values(by="Occurrences", ascending=False)
    .head(n=5)
)

Unnamed: 0,EventId,EventTemplate,Occurrences
634,11856dcb,type SYSCALL msg audit <LONG NUM> <*> <*> <ME...,20203
389,ff3a15a6,type SYSCALL msg audit <LONG NUM> <*> <*> <ME...,14414
76,b68e870c,<IP ADDR> <*> Oct 2023 <*> <*> <*> 0500 GET ...,13612
277,ea9e74f9,<IP ADDR> 20 Oct 2023 23 <*> <*> 0500 <*> <*>...,6480
271,f1964136,<IP ADDR> 20 Oct 2023 23 <*> <*> 0500 GET <*...,5647


In [43]:
# WIN25CH templates ordered by descending occurrence count; show the top five.
(
    tw
    .sort_values(by="Occurrences", ascending=False)
    .head(n=5)
)

Unnamed: 0,EventId,EventTemplate,Occurrences
21,64c455e3,Registry Value x32 HKEY LOCAL <*>,3134
25,b2f5b94e,<*> <*> HKEY LOCAL <*> <*> <*> <*> <*> <*> <*...,1785
20,1a5be16d,Registry Value x32 HKEY LOCAL <*> <*> <*> <*...,1541
19,8582fe8d,Registry Key x32 HKEY LOCAL <*> modified Mo...,857
26,2bb5a355,<*> x32 HKEY LOCAL <*> <*> <*> <*> <*> <*> <*...,744


In [44]:
# Drop the four frames loaded for this parser, then run a
# garbage-collection pass (its return value is the cell output).
del sl
del sw
del tl
del tw
gc.collect()

100

### 2.2. Drain

**What it does**:  
- **Input**: Raw logs  
- **Key Features**:  
  - Uses a **fixed-depth tree** to group logs by token length → fast clustering.  
  - Replaces variables with `<*>` based on **similarity threshold (`st`)**.  
  - Configurable `depth` (tree levels) and `max_children` (node splits).  

**Output**:  
- **Structured logs** with:  
  - `EventTemplate` (e.g., `"Error connecting to <*>"`).  
  - Extracted parameters (optional).  

**Best for**:  
- **High-speed parsing** of structured logs (e.g., network logs, HDFS).  
- When you need **deterministic results** (unlike Spell’s LCS).  


In [45]:
# Drain results are stored in the "drain" sub-directory of the data root.
DRAIN_DIR = os.path.join(DATA_DIR, "drain")

# Structured parser output for each dataset.
sl = pd.read_csv(os.path.join(DRAIN_DIR, f"{L}_structured.csv"))
sw = pd.read_csv(os.path.join(DRAIN_DIR, f"{W}_structured.csv"))

# Template tables for each dataset.
tl = pd.read_csv(os.path.join(DRAIN_DIR, f"{L}_templates.csv"))
tw = pd.read_csv(os.path.join(DRAIN_DIR, f"{W}_templates.csv"))

# (rows, columns) of the four frames
sl.shape, sw.shape, tl.shape, tw.shape

((121590, 4), (9192, 4), (315, 3), (88, 3))

In [46]:
# First five structured rows for LINUX24.
sl.head(n=5)

Unnamed: 0,LineId,Content,EventId,EventTemplate
0,1,Trojaned version of file '/bin/diff' detected....,331a4559,Trojaned version of file <*> detected. Signatu...
1,2,Trojaned version of file '/usr/bin/diff' detec...,331a4559,Trojaned version of file <*> detected. Signatu...
2,3,ossec: Manager started.,e1a1f4c3,ossec: Manager started.
3,4,<TIMESTAMP> install filebeat:amd64 <none> 7.10.2,60b2d672,<TIMESTAMP> install <*> <none> <*>
4,5,<TIMESTAMP> status half-configured filebeat:am...,7bb16bb4,<TIMESTAMP> status <*> filebeat:amd64 7.10.2


In [47]:
# First five structured rows for WIN25CH.
sw.head(n=5)

Unnamed: 0,LineId,Content,EventId,EventTemplate
0,1,Trojaned version of file '/bin/diff' detected....,331a4559,Trojaned version of file <*> detected. Signatu...
1,2,Trojaned version of file '/usr/bin/diff' detec...,331a4559,Trojaned version of file <*> detected. Signatu...
2,3,File '<FILE_PATH>/<HEX_DATA>' is owned by root...,37c728b8,File '<FILE_PATH>/<HEX_DATA>' is owned by root...
3,4,<TIMESTAMP> status half-configured filebeat:am...,7bb16bb4,<TIMESTAMP> status <*> filebeat:amd64 7.10.2
4,5,<TIMESTAMP> install filebeat:amd64 <none> 7.10.2,60b2d672,<TIMESTAMP> install <*> <none> <*>


In [48]:
# LINUX24 templates ordered by descending occurrence count; show the top five.
(
    tl
    .sort_values(by="Occurrences", ascending=False)
    .head(n=5)
)

Unnamed: 0,EventId,EventTemplate,Occurrences
160,bcb077d0,type=SYSCALL <*> <MEMORY_ADDR> syscall=59 succ...,34743
29,3e799d06,"<IP_ADDR> - - <*> +0500] ""GET <*> HTTP/1.1"" 30...",14069
114,db7e8ee6,<IP_ADDR> - - <*> +0500] <*> <*> <*> <*> <*> <...,12451
163,f112642a,type=SYSCALL <*> <MEMORY_ADDR> syscall=59 succ...,11342
159,2fb53026,type=SYSCALL <*> <MEMORY_ADDR> syscall=59 succ...,8533


In [49]:
# WIN25CH templates ordered by descending occurrence count; show the top five.
(
    tw
    .sort_values(by="Occurrences", ascending=False)
    .head(n=5)
)

Unnamed: 0,EventId,EventTemplate,Occurrences
30,543d1aa3,Registry <*> '[x32] <*> <*> Mode: scheduled,4930
26,1f5d426b,Registry Value '[x32] <*> modified Mode: sched...,1364
27,9fe42205,Registry Key <*> <*> modified Mode: scheduled ...,627
35,0de3dab3,Registry <*> <*> <*> <*> added Mode: scheduled,390
77,2c06ea44,Registry Value <*> <*> deleted Mode: scheduled,231


In [50]:
# Drop the four frames loaded for this parser, then run a
# garbage-collection pass (its return value is the cell output).
del sl
del sw
del tl
del tw
gc.collect()

160

### 2.3. FT-Tree

_FT-Tree (Frequent Template Tree) Parser_

**What it does**:  
- **Input**: Tokenized logs (words).  
- **Key Features**:  
  - Builds a **prefix tree** weighted by token frequency.  
  - **Prunes branches** with low frequency (`leaf_num` parameter).  
  - Focuses on **common patterns** (ignores rare variants).  

**Output**:  
- **Frequent log templates** (e.g., `"Connection <*> failed after <*> retries"`).  

**Best for**:  
- **Large-scale logs** where rare patterns are noise.  
- Scenarios needing **pattern frequency analysis** (e.g., anomaly detection).  

In [51]:
# FT-Tree results are stored in the "ft_tree" sub-directory of the data root.
FTT_DIR = os.path.join(DATA_DIR, "ft_tree")


def _read_lines(path):
    """Return all lines of the text file at `path`, newline terminators kept."""
    # Decode explicitly as UTF-8 instead of relying on the locale default:
    # the template files contain non-ASCII characters (e.g. "®").
    with open(path, "r", encoding="utf-8") as fh:
        return fh.readlines()


# fre data
fl = _read_lines(os.path.join(FTT_DIR, L + ".fre"))
fw = _read_lines(os.path.join(FTT_DIR, W + ".fre"))

# templates data
tl = _read_lines(os.path.join(FTT_DIR, L + ".template"))
tw = _read_lines(os.path.join(FTT_DIR, W + ".template"))

# number of lines read from each file
len(fl), len(fw), len(tl), len(tw)

(114849, 6655, 1462, 237)

In [52]:
# First five lines of the LINUX24 .fre file.
fl[0:5]

['<MEMORY_ADDR>\n',
 '<PROCESS_ID>\n',
 'type=PATH\n',
 'cap_fver=0\n',
 'cap_frootid=0\n']

In [53]:
# First five lines of the WIN25CH .fre file.
fw[0:5]

["'<HEX_DATA>'\n", 'Mode:\n', 'scheduled\n', 'Registry\n', "'[x32]\n"]

In [54]:
# First five lines of the LINUX24 .template file.
tl[0:5]

[' <MEMORY_ADDR> <PROCESS_ID> fsuid=0 euid=0 FSUID="root" type=SYSCALL suid=0 SUID="root" EUID="root" ARCH=x86_64 subj=unconfined key="audit-wazuh-c" syscall=59 SYSCALL=execve UID="root" success=yes exit=0 type=EXECVE items=2 GID="root" sgid=0 fsgid=0 egid=0 SGID="root" FSGID="root" EGID="root" a3=8 auid=1000 AUID="sohaib" ses=2 tty=pts2 exe="/usr/bin/kmod" comm="modprobe" a0="modprobe" a1="--all" a5="--show-depends" a4="--quiet" a3="--ignore-install" a2="--set-version=6.2.0-39-generic" a9="atusb" a99="ec_bhf" a98="ks8851_spi" a97="ks8842" a96="ks8851_par" a95="ks8851_common" a94="ksz884x" a93="bna" a92="8139cp" a91="8139too" a90="r8169" a8="fakelb" a89="atp" a88="ethoc" a87="hinic" a86="fealnx" a85="rmnet" a84="qcom-emac" a83="uli526x" a82="dmfe" a81="de2104x" a80="xircom_cb" a7="nlmon" a79="winbond-840" a78="tulip" a77="tsnep" a76="xilinx_emac" a75="ll_temac" a74="xilinx_emaclite" a73="enic" a72="samsung-sxgbe" a71="geneve" a70="slip" a6="gtp" a69="wireguard" a68="nxp-tja11xx" a67="d

In [55]:
# First five lines of the WIN25CH .template file.
tw[0:5]

[' Windows® Operating Windows System\\r\\nCompany: System","company":"Microsoft Microsoft® Microsoft Create:\\r\\nRuleName: Corporation\\r\\nOriginalFileName: <TIMESTAMP>\\r\\nProcessGuid: <MEMORY_ADDR>\\r\\nTerminalSessionId: (WinBuild.160101.0800)\\r\\nDescription: Command 1\\r\\nIntegrityLevel: (WinBuild.160101.0800)","description":"Windows SHA1=<HEX_DATA>,MD5=<HEX_DATA>,SHA256=<HEX_DATA>,IMPHASH=<HEX_DATA>\\r\\nParentProcessGuid: DESKTOP-QGB4EQH\\\\mari\\r\\nLogonGuid: C:\\\\Windows\\\\system32\\\\\\r\\nUser: 10.0.17134.1 Processor\\r\\nProduct: Processor","product":"Microsoft® Cmd.Exe\\r\\nCommandLine: High\\r\\nHashes: + ($env:TEMP \\r\\nCurrentDirectory: -C Stop-Process Bypass = -PassThru -ID -FilePath -ExecutionPolicy (Start-Process $myT1036_003 technique_id=T1059.003,technique_name=Windows powershell.exe \\\\\\\\\\\\\\"\\\\\\\\svchost.exe\\\\\\\\\\\\\\"); \\\\\\\\\\\\\\"\\\\\\\\svchost.exe\\\\\\\\\\\\\\")).Id; \\\\\\\\\\\\\\"$env:ComSpec\\\\\\\\\\\\\\" \\\\\\"copy \\\\\\"\\\\s

In [56]:
# Drop the four line lists read for FT-Tree, then run a
# garbage-collection pass (its return value is the cell output).
del fl
del fw
del tl
del tw
gc.collect()

460

## 2. Sequence Construction