In [1]:
from contextlib import contextmanager
import math
import os, os.path
import random
import sys
from tempfile import TemporaryDirectory

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import matplotlib.pyplot as mp
import pandas as pd

# Import DatasetLANL, retrying once after adding the parent directory to sys.path
# in case the package cannot be imported on the first attempt.
for _ in range(2):
    try:
        from eai_graph_tools.datasets.lanl import DatasetLANL
        break
    except ImportError:
        # Make the parent of the current working directory importable, then retry.
        sys.path.append(os.path.realpath(".."))
else:
    # The else branch runs only when the loop never hit `break`,
    # i.e. both import attempts failed.
    raise RuntimeError("Could not locate the repo's code.")

In [2]:
%matplotlib inline

In [3]:
%load_ext autoreload
%autoreload

Adjust the number of workers here to ensure you respect the limitations of your machine.

In [4]:
cluster = LocalCluster(n_workers=3, threads_per_worker=4, memory_limit="4.5GB")
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:38311  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 3  Cores: 12  Memory: 13.50 GB


Adjust the following path to direct the `DatasetLANL` instance to where you have Parquet stores of the dataset.

In [5]:
PATH_RAW = os.path.expanduser("~/los_alamos")
lanl = DatasetLANL(PATH_RAW)

# Context

Unit tests for graph creation require a small subset of the Los Alamos (LANL) dataset. However, a purely random sample of the events would yield, for any meaningful grouping of the events, groups that each contain a single event. Such a sample is not a suitable dataset for this purpose.

We will thus make up a dataset with the same structure as the LANL dataset, but with a small number of records, and a small number of entities.

## Time

In [6]:
WIDTH_INTV_TIME = 100
NUM_INTV = 5
LIMIT_TIME = WIDTH_INTV_TIME * NUM_INTV

## Domain, users, computers, processes

In [7]:
NUM_DOMAINS = 1
DOMAINS = [f"DOM{n + 1}" for n in range(NUM_DOMAINS)]

NUM_USERS = 3
USERS = [f"USER{n + 1}@{dom}" for n in range(NUM_USERS) for dom in DOMAINS]

NUM_COMPUTERS = 5
COMPUTERS = [f"C{n + 1}" for n in range(NUM_COMPUTERS)]

NUM_PROCESSES = 7
PROCESSES = [f"P{n + 1}" for n in range(NUM_PROCESSES)]

## Analyzing categorical and numerical values for proportion mapping

In [8]:
def df_category(ddf, col):
    """Count the occurrences of each value of column ``col``.

    ``ddf`` is expected to be a lazily evaluated frame (``.compute()`` is called
    on the grouped result) whose index is named ``time``: after the index is
    reset, the per-group count of that ``time`` column becomes ``count``.

    Returns a frame indexed by the values of ``col``, with a single ``count``
    column, sorted from the most to the least frequent value.
    """
    per_value = ddf[col].reset_index().groupby(col).count().compute()
    counts = per_value.rename(columns={"time": "count"})
    return counts.sort_values(by="count", ascending=False)

def show_important_values(df, threshold=0.95):
    """Display and return the most frequent rows of a count table.

    ``df`` must have a ``count`` column. A row is kept when the combined count
    of all rows ranked above it does not exceed ``threshold`` times the total,
    so the row that crosses the threshold is itself included.

    The kept rows are copied, given a ``propn`` column (their share of the
    *kept* counts, so it sums to 1), displayed, and returned. The fraction of
    the overall total they capture is printed.
    """
    counts = df["count"]
    total = counts.sum()
    # Running sum over ascending counts: subtracting it from the total leaves,
    # for each row, the mass held by the rows that come after it in that order.
    mass_above = total - counts.sort_values().cumsum()
    selection = df[mass_above <= threshold * total].copy()
    kept = selection["count"].sum()
    captured = kept / total
    selection["propn"] = selection["count"] / kept
    display(selection)
    print(f"Captured: {100*captured:.02f}%")
    return selection

In [9]:
def df_integer(ddf, col):
    """Count the occurrences of each value of column ``col``, with proportions.

    ``ddf`` is expected to be a lazily evaluated frame (``.compute()`` is called
    on the grouped result) whose index is named ``time``.

    Returns a frame indexed by the values of ``col`` with two columns,
    ``count`` and ``propn`` (the count divided by the sum of all counts),
    sorted from the most to the least frequent value.
    """
    counted = ddf[col].reset_index().groupby(col).count().compute()
    table = pd.DataFrame(counted).rename(columns={"time": "count"})
    overall = table["count"].sum()
    table["propn"] = table["count"] / overall
    return table.sort_values(by="count", ascending=False)

## auth

In [10]:
lanl.ELEMENT["auth"]

{'columns': [('time', 'int64'),
  ('userdomain_source', 'category'),
  ('userdomain_destination', 'category'),
  ('computer_source', 'category'),
  ('computer_destination', 'category'),
  ('type_auth', 'category'),
  ('type_logon', 'category'),
  ('orientation_auth', 'category'),
  ('outcome_auth', 'category')],
 'labeled_by': [('redteam', 'is_attack', 'bool')]}

In [11]:
df_auth = lanl.element("auth")

### Type of authentication

In [12]:
df_type_auth = df_category(df_auth, "type_auth")

In [13]:
_ = show_important_values(df_type_auth)

  


Unnamed: 0_level_0,count,propn
type_auth,Unnamed: 1_level_1,Unnamed: 2_level_1
?,583283266,0.582635
Kerberos,362786806,0.362384
Negotiate,55042176,0.054981


Captured: 95.23%


### Type of logon

In [14]:
df_type_logon = df_category(df_auth, "type_logon")

In [15]:
_ = show_important_values(df_type_logon, 0.975)

  


Unnamed: 0_level_0,count,propn
type_logon,Unnamed: 1_level_1,Unnamed: 2_level_1
Network,845601123,0.81173
?,147235515,0.141338
Service,48889990,0.046932


Captured: 99.09%


### Authentication orientation

In [16]:
df_orientation_auth = df_category(df_auth, "orientation_auth")

In [17]:
_ = show_important_values(df_orientation_auth, 1.0)

  


Unnamed: 0_level_0,count,propn
orientation_auth,Unnamed: 1_level_1,Unnamed: 2_level_1
LogOn,468112703,0.445268
LogOff,436047754,0.414768
TGS,98992723,0.094162
TGT,38387292,0.036514
AuthMap,9570749,0.009104
ScreenLock,99551,9.5e-05
ScreenUnlock,94595,9e-05


Captured: 100.00%


### Authentication outcome

In [18]:
df_outcome_auth = df_category(df_auth, "outcome_auth")

In [19]:
_ = show_important_values(df_outcome_auth)

  


Unnamed: 0_level_0,count,propn
outcome_auth,Unnamed: 1_level_1,Unnamed: 2_level_1
Success,1038465969,1.0


Captured: 98.78%


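At the default 95% threshold, only `Success` is retained (it accounts for 98.78% of the events), so this column carries essentially no randomness.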

## dns

In [20]:
DatasetLANL.ELEMENT["dns"]

{'columns': [('time', 'int64'),
  ('computer_source', 'category'),
  ('computer_destination', 'category')]}

No weird categorical column to take care of.

## flows

In [21]:
DatasetLANL.ELEMENT["flows"]

{'columns': [('time', 'int64'),
  ('duration', 'int64'),
  ('computer_source', 'category'),
  ('port_source', 'category'),
  ('computer_destination', 'category'),
  ('port_destination', 'category'),
  ('protocol', 'category'),
  ('num_packets', 'int32'),
  ('num_bytes', 'int64')]}

In [22]:
df_flows = lanl.element("flows")

### Ports

In [23]:
df_port_source = df_category(df_flows, "port_source")
df_port_destination = df_category(df_flows, "port_destination")

In [24]:
df_src_eph_vs_num = show_important_values(df_port_source)

  


Unnamed: 0_level_0,count,propn
port_source,Unnamed: 1_level_1,Unnamed: 2_level_1
445,15594632,0.126294
389,5169049,0.041862
80,3958274,0.032056
88,2989200,0.024208
137,2548210,0.020637
139,1834928,0.014860
135,1469556,0.011901
N2,1340311,0.010855
N1,1218477,0.009868
443,967850,0.007838


Captured: 95.00%


In [25]:
df_dest_eph_vs_num = show_important_values(df_port_destination)

  


Unnamed: 0_level_0,count,propn
port_destination,Unnamed: 1_level_1,Unnamed: 2_level_1
445,25912435,0.209852
80,4811435,0.038966
389,3951978,0.032005
137,2562961,0.020756
139,2123849,0.017200
88,2019214,0.016353
N2,1281429,0.010378
N1,1091415,0.008839
135,1002218,0.008116
443,898289,0.007275


Captured: 95.00%


In [26]:
df_src_eph_vs_num.groupby(lambda x: -1 if x.startswith("N") else int(x)) \
    .sum() \
    .sort_values("propn", ascending=False)

Unnamed: 0,count,propn
-1,85952443,0.696089
445,15594632,0.126294
389,5169049,0.041862
80,3958274,0.032056
88,2989200,0.024208
137,2548210,0.020637
139,1834928,0.014860
135,1469556,0.011901
443,967850,0.007838
1433,741880,0.006008


In [27]:
df_dest_eph_vs_num.groupby(lambda x: -1 if x.startswith("N") else int(x)) \
    .sum() \
    .sort_values("propn", ascending=False)

Unnamed: 0,count,propn
-1,77184915,0.625084
445,25912435,0.209852
80,4811435,0.038966
389,3951978,0.032005
137,2562961,0.020756
139,2123849,0.017200
88,2019214,0.016353
135,1002218,0.008116
443,898289,0.007275
22,652179,0.005282


The examination of the source and destination port distribution suggests that the flows of this dataset are unidirectional: a TCP communication would thus consist, for instance, of multiple flows. For the purpose of putting together a dataset excerpt, which would merely be used for unit testing, this is not terribly important.

### Protocol

In [28]:
df_protocol = df_category(df_flows, "protocol")

In [29]:
_ = show_important_values(df_protocol, 1.0)

  


Unnamed: 0_level_0,count,propn
protocol,Unnamed: 1_level_1,Unnamed: 2_level_1
6,116550579,0.896699
17,10104702,0.077742
1,3321625,0.025555
41,506,4e-06


Captured: 100.00%


### Duration

In [30]:
df_duration = df_integer(df_flows, "duration")

In [31]:
df_duration

Unnamed: 0_level_0,count,propn
duration,Unnamed: 1_level_1,Unnamed: 2_level_1
0,84217053,6.479361e-01
60,6610771,5.086092e-02
1,3737267,2.875320e-02
10,3198539,2.460842e-02
11,2710803,2.085595e-02
6,2155929,1.658695e-02
12,1830268,1.408143e-02
61,1681783,1.293904e-02
15,1636460,1.259034e-02
13,1232388,9.481555e-03


In [32]:
df_duration.sort_index().groupby(lambda x: int(x/5)).sum()

Unnamed: 0,count,propn
0,89876828,0.6914804
1,5333526,0.04103425
2,10106851,0.07775852
3,2880869,0.02216438
4,898418,0.006912109
5,805279,0.00619553
6,3107382,0.02390709
7,2955888,0.02274155
8,580677,0.004467522
9,540443,0.004157976


### num_packets

In [33]:
df_num_packets = df_integer(df_flows, "num_packets")
df_num_packets

Unnamed: 0_level_0,count,propn
num_packets,Unnamed: 1_level_1,Unnamed: 2_level_1
1,47486949,3.653477e-01
6,11364340,8.743319e-02
4,11300023,8.693836e-02
5,7234553,5.566008e-02
2,7054442,5.427437e-02
3,6053497,4.657345e-02
7,5085182,3.912358e-02
9,5039281,3.877044e-02
8,3309676,2.546347e-02
10,2360654,1.816203e-02


In [34]:
df_num_packets.sort_index().groupby(lambda x: int(x/10)).sum()

Unnamed: 0,count,propn
0,103927943,7.995846e-01
1,11389267,8.762497e-02
2,2305136,1.773490e-02
3,2514452,1.934530e-02
4,830649,6.390718e-03
5,2105116,1.619601e-02
6,485231,3.733195e-03
7,335679,2.582595e-03
8,309041,2.377652e-03
9,214429,1.649740e-03


### num_bytes

In [35]:
df_num_bytes = df_integer(df_flows, "num_bytes")
df_num_bytes

Unnamed: 0_level_0,count,propn
num_bytes,Unnamed: 1_level_1,Unnamed: 2_level_1
46,28658997,2.204921e-01
52,4919819,3.785134e-02
60,3968397,3.053144e-02
138,2842471,2.186896e-02
193,1813700,1.395396e-02
92,1792301,1.378933e-02
120,1662008,1.278690e-02
278,1361070,1.047159e-02
372,1258112,9.679466e-03
81064,1241499,9.551652e-03


In [36]:
def mapping(n):
    """Return the label of the smallest size bucket that contains ``n``.

    Each bucket is identified by an inclusive upper bound that is a power of
    two (``n <= 2**power``). Values larger than ``2**32`` are labeled ``"inf"``.
    """
    upper_bounds = (
        (10, "KB"),
        (16, "64K"),
        (20, "MB"),
        (22, "4M"),
        (27, "128M"),
        (30, "GB"),
        (32, "4G"),
    )
    return next(
        (label for power, label in upper_bounds if n <= (1 << power)),
        "inf",
    )
df_num_bytes.sort_index().groupby(mapping).sum().sort_values("count", ascending=False)

Unnamed: 0,count,propn
KB,82604397,0.635529
64K,42155827,0.324332
MB,4400058,0.033852
128M,441922,0.0034
4M,352725,0.002714
GB,19517,0.00015
4G,2966,2.3e-05


## proc

In [37]:
DatasetLANL.ELEMENT["proc"]

{'columns': [('time', 'int64'),
  ('userdomain_source', 'category'),
  ('computer_destination', 'category'),
  ('name_process', 'category'),
  ('status_process', 'category')]}

In [38]:
df_proc = lanl.element("proc")

### status_process

In [39]:
df_status_process = df_category(df_proc, "status_process")

In [40]:
_ = show_important_values(df_status_process)

  


Unnamed: 0_level_0,count,propn
status_process,Unnamed: 1_level_1,Unnamed: 2_level_1
Start,335842571,0.788279
End,90202525,0.211721


Captured: 100.00%


# Generating a LANL-like small dataset

## How many samples of each element?

In [41]:
num_rows = dict((name, len(lanl.element(name))) for name in lanl.name_elements())
num_rows

{'auth': 1051305367, 'proc': 426045096, 'flows': 129977412, 'dns': 40821591}

In [42]:
total_events = sum(num_rows.values())

In [43]:
num_samples = 40
dict((e, int(num_samples * n / total_events)) for e,n in num_rows.items())

{'auth': 25, 'proc': 10, 'flows': 3, 'dns': 0}

In [44]:
# Hand-picked number of rows to generate for each element. "auth" receives
# whatever remains of num_samples once the other elements are accounted for.
num_sp_elem = {
    "proc": 10,
    "flows": 5,
    "dns": 3
}
num_sp_elem["auth"] = num_samples - sum(num_sp_elem.values())
num_sp_elem

{'proc': 10, 'flows': 5, 'dns': 3, 'auth': 22}

## Sampling normal columns

In [45]:
def sampler_list(ls):
    """Build a sampler that draws values from ``ls`` with ``random.choice``.

    The returned callable takes ``n`` and returns a list of ``n`` values, each
    drawn independently (with replacement) from ``ls``.
    """
    def draw(n):
        picks = []
        for _ in range(n):
            picks.append(random.choice(ls))
        return picks

    return draw

In [46]:
def sampler_df(df):
    """Build a sampler that draws index values of ``df`` with given weights.

    The weights are taken from the ``propn`` column when present; otherwise
    they are computed from the ``count`` column, normalized by its sum.

    The returned callable takes ``n`` and returns a list of ``n`` index values
    drawn with replacement through ``random.choices``.

    Raises:
        RuntimeError: if ``df`` has neither a ``propn`` nor a ``count`` column.
    """
    if "propn" in df.columns:
        proportions = df["propn"]
    elif "count" in df.columns:
        proportions = df["count"] / df["count"].sum()
    else:
        raise RuntimeError("Neither propn nor count")
    weights = list(proportions)

    def draw(n):
        # The population is read from df at call time, as in a closure over df.
        return random.choices(list(df.index), weights=weights, k=n)

    return draw

In [47]:
def sampler_interval(min_intv, max_intv):
    """Build a sampler of integers between ``min_intv`` and ``max_intv``.

    Values are drawn with ``random.randint``, so both bounds are inclusive.
    The returned callable takes ``n`` and returns a list of ``n`` integers.
    """
    def draw(n):
        values = []
        for _ in range(n):
            values.append(random.randint(min_intv, max_intv))
        return values

    return draw

In [48]:
def sampler_distribution(ls, weights):
    """Build a sampler that draws values from ``ls`` according to ``weights``.

    The returned callable takes ``n`` and returns a flat list of ``n`` values
    drawn with replacement, like the other ``sampler_*`` helpers.

    ``random.choices`` always returns a list, so calling it once per draw
    without ``k`` produced a list of one-element lists. Asking for ``k=n`` in
    a single call yields the flat list, with one random number per value as
    before.
    """
    def draw(n):
        return random.choices(ls, weights=weights, k=n)

    return draw

In [49]:
# Maps each LANL column name to a sampler: a callable that takes n and returns
# a list of n values for that column.
sampler_columns = {
    # NOTE(review): sampler_interval relies on random.randint, which includes both
    # bounds, so a sampled time may equal WIDTH_INTV_TIME * NUM_INTV (the value of
    # LIMIT_TIME) -- confirm that this upper bound is meant to be reachable.
    "time": sampler_interval(0, WIDTH_INTV_TIME * NUM_INTV),
    # Entities are drawn from the small made-up populations defined earlier.
    "userdomain_source": sampler_list(USERS),
    "userdomain_destination": sampler_list(USERS),
    "computer_source": sampler_list(COMPUTERS),
    "computer_destination": sampler_list(COMPUTERS),
    # The samplers below are weighted by the full count tables computed with
    # df_category / df_integer, not by the thresholded selections displayed by
    # show_important_values.
    "type_auth": sampler_df(df_type_auth),
    "type_logon": sampler_df(df_type_logon),
    "orientation_auth": sampler_df(df_orientation_auth),
    "outcome_auth": sampler_df(df_outcome_auth),
    "name_process": sampler_list(PROCESSES),
    "status_process": sampler_df(df_status_process),
    "duration": sampler_df(df_duration),
    "port_source": sampler_df(df_port_source),
    "port_destination": sampler_df(df_port_destination),
    "protocol": sampler_df(df_protocol),
    "num_packets": sampler_df(df_num_packets),
    "num_bytes": sampler_df(df_num_bytes)
}

In [50]:
def sample_element(name, n):
    """Generate ``n`` made-up rows for the dataset element called ``name``.

    Every column listed for that element in ``DatasetLANL.ELEMENT`` is filled
    by its entry in ``sampler_columns``, in the listed order. The result is a
    frame indexed by ``time`` and sorted on that index.
    """
    sampled = {}
    for col, _ in DatasetLANL.ELEMENT[name]["columns"]:
        sampled[col] = sampler_columns[col](n)
    return pd.DataFrame(sampled).set_index("time").sort_index()

In [51]:
def join_lanl_samples(*p):
    """Return a path under the LANL test data directory.

    The path components ``p`` are appended to ``../tests/test_data/lanl``,
    which is relative to the current working directory.
    """
    base = ("..", "tests", "test_data", "lanl")
    return os.path.join(*base, *p)

In [52]:
# Seed Python's `random` module: every sampler draws from it, so the generated
# rows are the same on each run.
random.seed("12349876")
df_samples = {}
# Elements are generated in the insertion order of num_sp_elem. With a fixed seed,
# that order determines which draws each element receives.
for name, n in num_sp_elem.items():
    df_samples[name] = sample_element(name, n)
    display(df_samples[name])
    # Written without a header row; the "time" index is emitted as the first field.
    df_samples[name].to_csv(join_lanl_samples(name + ".txt"), header=False)

Unnamed: 0_level_0,userdomain_source,computer_destination,name_process,status_process
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
143,USER3@DOM1,C3,P3,Start
168,USER2@DOM1,C2,P4,Start
199,USER2@DOM1,C4,P5,Start
199,USER3@DOM1,C3,P6,End
220,USER1@DOM1,C1,P3,Start
312,USER1@DOM1,C3,P7,Start
334,USER1@DOM1,C3,P1,Start
344,USER2@DOM1,C1,P3,Start
421,USER1@DOM1,C4,P7,Start
481,USER1@DOM1,C4,P3,Start


Unnamed: 0_level_0,duration,computer_source,port_source,computer_destination,port_destination,protocol,num_packets,num_bytes
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
29,0,C4,N113,C5,N8579,6,9,2694
195,65,C2,N131,C3,80,17,1,663
250,0,C5,N32369,C1,N386,6,1,46
281,0,C5,N12,C2,N4,6,155,416288
295,1,C5,445,C4,N63,6,3,190


Unnamed: 0_level_0,computer_source,computer_destination
time,Unnamed: 1_level_1,Unnamed: 2_level_1
305,C1,C3
461,C3,C2
495,C1,C5


Unnamed: 0_level_0,userdomain_source,userdomain_destination,computer_source,computer_destination,type_auth,type_logon,orientation_auth,outcome_auth
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
27,USER1@DOM1,USER1@DOM1,C1,C1,Negotiate,Network,TGS,Success
53,USER3@DOM1,USER1@DOM1,C3,C5,Kerberos,Network,LogOn,Success
57,USER2@DOM1,USER2@DOM1,C1,C5,?,Network,LogOn,Success
77,USER3@DOM1,USER2@DOM1,C3,C2,Kerberos,Network,LogOn,Success
132,USER3@DOM1,USER2@DOM1,C2,C1,?,Network,LogOn,Success
139,USER3@DOM1,USER3@DOM1,C5,C1,?,Network,LogOff,Success
178,USER2@DOM1,USER3@DOM1,C5,C1,?,?,LogOff,Success
191,USER2@DOM1,USER2@DOM1,C1,C5,?,Network,LogOn,Success
193,USER2@DOM1,USER2@DOM1,C4,C2,?,Network,TGS,Success
220,USER1@DOM1,USER3@DOM1,C2,C5,?,Network,TGT,Success


## Sampling label columns

In [53]:
NUM_AUTH_RECORDS_ATTACK = 3

In [54]:
# Pick the red-team (attack) events among the sampled LogOn authentications and
# keep only the columns used to identify them.
# - The number of events comes from NUM_AUTH_RECORDS_ATTACK rather than a literal.
# - DataFrame.sample does not use Python's `random` module, so random.seed() has no
#   effect on it; an explicit random_state makes the selection reproducible.
df_redteam = df_samples["auth"][df_samples["auth"].orientation_auth == "LogOn"] \
    .sample(n=NUM_AUTH_RECORDS_ATTACK, random_state=12349876) \
    .sort_index() \
    .drop(["userdomain_destination", "type_auth", "type_logon", "orientation_auth", "outcome_auth"], axis="columns")
df_redteam

Unnamed: 0_level_0,userdomain_source,computer_source,computer_destination
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
278,USER3@DOM1,C2,C1
326,USER3@DOM1,C4,C1
329,USER2@DOM1,C4,C2


In [55]:
df_redteam.to_csv(join_lanl_samples("redteam.txt"), header=False)

In [56]:
df_redteam

Unnamed: 0_level_0,userdomain_source,computer_source,computer_destination
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
278,USER3@DOM1,C2,C1
326,USER3@DOM1,C4,C1
329,USER2@DOM1,C4,C2


In [57]:
df_samples["auth"]

Unnamed: 0_level_0,userdomain_source,userdomain_destination,computer_source,computer_destination,type_auth,type_logon,orientation_auth,outcome_auth
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
27,USER1@DOM1,USER1@DOM1,C1,C1,Negotiate,Network,TGS,Success
53,USER3@DOM1,USER1@DOM1,C3,C5,Kerberos,Network,LogOn,Success
57,USER2@DOM1,USER2@DOM1,C1,C5,?,Network,LogOn,Success
77,USER3@DOM1,USER2@DOM1,C3,C2,Kerberos,Network,LogOn,Success
132,USER3@DOM1,USER2@DOM1,C2,C1,?,Network,LogOn,Success
139,USER3@DOM1,USER3@DOM1,C5,C1,?,Network,LogOff,Success
178,USER2@DOM1,USER3@DOM1,C5,C1,?,?,LogOff,Success
191,USER2@DOM1,USER2@DOM1,C1,C5,?,Network,LogOn,Success
193,USER2@DOM1,USER2@DOM1,C4,C2,?,Network,TGS,Success
220,USER1@DOM1,USER3@DOM1,C2,C5,?,Network,TGT,Success


In [58]:
# Label each sampled authentication event: a left merge against the red-team
# events (on time plus every red-team column) tells, through the merge indicator,
# which rows have a red-team counterpart; those get is_attack set to True.
df_auth_labeled = (
    df_samples["auth"]
    .merge(
        df_redteam,
        how="left",
        on=list(df_redteam.reset_index().columns),
        indicator=True,
    )
    .assign(is_attack=lambda merged: merged["_merge"] != "left_only")
    .drop(columns="_merge")
)
df_auth_labeled

Unnamed: 0_level_0,userdomain_source,userdomain_destination,computer_source,computer_destination,type_auth,type_logon,orientation_auth,outcome_auth,is_attack
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27,USER1@DOM1,USER1@DOM1,C1,C1,Negotiate,Network,TGS,Success,False
53,USER3@DOM1,USER1@DOM1,C3,C5,Kerberos,Network,LogOn,Success,False
57,USER2@DOM1,USER2@DOM1,C1,C5,?,Network,LogOn,Success,False
77,USER3@DOM1,USER2@DOM1,C3,C2,Kerberos,Network,LogOn,Success,False
132,USER3@DOM1,USER2@DOM1,C2,C1,?,Network,LogOn,Success,False
139,USER3@DOM1,USER3@DOM1,C5,C1,?,Network,LogOff,Success,False
178,USER2@DOM1,USER3@DOM1,C5,C1,?,?,LogOff,Success,False
191,USER2@DOM1,USER2@DOM1,C1,C5,?,Network,LogOn,Success,False
193,USER2@DOM1,USER2@DOM1,C4,C2,?,Network,TGS,Success,False
220,USER1@DOM1,USER3@DOM1,C2,C5,?,Network,TGT,Success,False


In [59]:
df_auth_labeled.to_csv(join_lanl_samples("auth.txt.labeled"), header=False)

# Clean-up

In [60]:
client.close()
cluster.close()