# BETH Dataset Playground

In [1]:
import pandas as pd
from glob import glob
import networkx as nx
import ipaddress

# Initial Explore
## Load Data

In [3]:
dns_data_files = glob("*-dns.csv")
dns_data_files

['labelled_2021may-ip-10-100-1-105-dns.csv',
 'labelled_2021may-ip-10-100-1-186-dns.csv',
 'labelled_2021may-ip-10-100-1-26-dns.csv',
 'labelled_2021may-ip-10-100-1-4-dns.csv',
 'labelled_2021may-ip-10-100-1-95-dns.csv',
 'labelled_2021may-ubuntu-dns.csv']

In [2]:
feb_process_data_files = glob("*_data.csv")
feb_process_data_files

['labelled_testing_data.csv',
 'labelled_training_data.csv',
 'labelled_validation_data.csv']

### Loading DNS Data

In [7]:
dns_all = []
for data in dns_data_files:
    print(data)
    df = pd.read_csv(data,parse_dates=True, squeeze=True)
    dns_all.append(df)
dns_df = pd.concat(dns_all, ignore_index=True)

labelled_2021may-ip-10-100-1-105-dns.csv
labelled_2021may-ip-10-100-1-186-dns.csv
labelled_2021may-ip-10-100-1-26-dns.csv
labelled_2021may-ip-10-100-1-4-dns.csv
labelled_2021may-ip-10-100-1-95-dns.csv
labelled_2021may-ubuntu-dns.csv


In [8]:
dns_df.head(20)

Unnamed: 0,Timestamp,SourceIP,DestinationIP,DnsQuery,DnsAnswer,DnsAnswerTTL,DnsQueryNames,DnsQueryClass,DnsQueryType,NumberOfAnswers,DnsResponseCode,DnsOpCode,SensorId,sus,evil
0,2021-05-16T17:13:14Z,10.100.1.95,10.100.0.2,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['A'],0,0,0,ip-10-100-1-95,0,0
1,2021-05-16T17:13:14Z,10.100.0.2,10.100.1.95,ssm.us-east-2.amazonaws.com,['52.95.19.240'],['17'],ssm.us-east-2.amazonaws.com,['IN'],['A'],1,0,0,ip-10-100-1-95,0,0
2,2021-05-16T17:13:14Z,10.100.1.95,10.100.0.2,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['AAAA'],0,0,0,ip-10-100-1-95,0,0
3,2021-05-16T17:13:14Z,10.100.0.2,10.100.1.95,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['AAAA'],0,0,0,ip-10-100-1-95,0,0
4,2021-05-16T17:13:16Z,10.100.1.186,10.100.0.2,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['A'],0,0,0,ip-10-100-1-186,0,0
5,2021-05-16T17:13:16Z,10.100.0.2,10.100.1.186,ssm.us-east-2.amazonaws.com,['52.95.21.209'],['41'],ssm.us-east-2.amazonaws.com,['IN'],['A'],1,0,0,ip-10-100-1-186,0,0
6,2021-05-16T17:13:16Z,10.100.1.186,10.100.0.2,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['AAAA'],0,0,0,ip-10-100-1-186,0,0
7,2021-05-16T17:13:17Z,10.100.0.2,10.100.1.186,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['AAAA'],0,0,0,ip-10-100-1-186,0,0
8,2021-05-16T17:13:17Z,10.100.1.105,10.100.0.2,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['A'],0,0,0,ip-10-100-1-105,0,0
9,2021-05-16T17:13:17Z,10.100.1.105,10.100.0.2,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['AAAA'],0,0,0,ip-10-100-1-105,0,0


In [9]:
dns_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1614 entries, 0 to 1613
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Timestamp        1614 non-null   object
 1   SourceIP         1614 non-null   object
 2   DestinationIP    1614 non-null   object
 3   DnsQuery         1614 non-null   object
 4   DnsAnswer        390 non-null    object
 5   DnsAnswerTTL     390 non-null    object
 6   DnsQueryNames    1614 non-null   object
 7   DnsQueryClass    1614 non-null   object
 8   DnsQueryType     1614 non-null   object
 9   NumberOfAnswers  1614 non-null   int64 
 10  DnsResponseCode  1614 non-null   int64 
 11  DnsOpCode        1614 non-null   int64 
 12  SensorId         1614 non-null   object
 13  sus              1614 non-null   int64 
 14  evil             1614 non-null   int64 
dtypes: int64(5), object(10)
memory usage: 189.3+ KB


In [10]:
dns_df.value_counts('Timestamp')

Timestamp
2021-05-16T21:38:54Z    144
2021-05-16T17:43:20Z     72
2021-05-16T19:23:20Z     48
2021-05-16T20:43:20Z     48
2021-05-16T17:33:20Z     48
                       ... 
2021-05-16T17:28:27Z      6
2021-05-16T19:11:16Z      6
2021-05-16T19:09:16Z      6
2021-05-16T18:37:51Z      6
2021-05-16T18:43:14Z      6
Length: 61, dtype: int64

In [11]:
duplicateRows = dns_df[dns_df.duplicated()]
print("Duplicate Rows except first occurrence based on all columns are :")
print(len(duplicateRows))

Duplicate Rows except first occurrence based on all columns are :
1355


In [12]:
dns_df = dns_df.drop_duplicates()
dns_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 259 entries, 0 to 268
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Timestamp        259 non-null    object
 1   SourceIP         259 non-null    object
 2   DestinationIP    259 non-null    object
 3   DnsQuery         259 non-null    object
 4   DnsAnswer        65 non-null     object
 5   DnsAnswerTTL     65 non-null     object
 6   DnsQueryNames    259 non-null    object
 7   DnsQueryClass    259 non-null    object
 8   DnsQueryType     259 non-null    object
 9   NumberOfAnswers  259 non-null    int64 
 10  DnsResponseCode  259 non-null    int64 
 11  DnsOpCode        259 non-null    int64 
 12  SensorId         259 non-null    object
 13  sus              259 non-null    int64 
 14  evil             259 non-null    int64 
dtypes: int64(5), object(10)
memory usage: 32.4+ KB


### Load Process Data

In [11]:
df_process_feb = []
for data in feb_process_data_files:
    df = pd.read_csv(data,parse_dates=True, squeeze=True)
    df_process_feb.append(df)
feb_process_df = pd.concat(df_process_feb, ignore_index=True)

In [None]:
feb_process_df

Unnamed: 0,timestamp,processId,threadId,parentProcessId,userId,mountNamespace,processName,hostName,eventId,eventName,stackAddresses,argsNum,returnValue,args,sus,evil
0,129.050634,382,382,1,101,4026532232,systemd-resolve,ip-10-100-1-217,41,socket,"[140159195621643, 140159192455417, 94656731598...",3,15,"[{'name': 'domain', 'type': 'int', 'value': 'A...",0,0
1,129.051238,379,379,1,100,4026532231,systemd-network,ip-10-100-1-217,41,socket,"[139853228042507, 93935071185801, 93935080775184]",3,15,"[{'name': 'domain', 'type': 'int', 'value': 'A...",0,0
2,129.051434,1,1,0,0,4026531840,systemd,ip-10-100-1-217,1005,security_file_open,"[140362867191588, 8103505641674583858]",4,0,"[{'name': 'pathname', 'type': 'const char*', '...",0,0
3,129.051481,1,1,0,0,4026531840,systemd,ip-10-100-1-217,257,openat,[],4,17,"[{'name': 'dirfd', 'type': 'int', 'value': -10...",0,0
4,129.051522,1,1,0,0,4026531840,systemd,ip-10-100-1-217,5,fstat,[140362867189385],2,0,"[{'name': 'fd', 'type': 'int', 'value': 17}, {...",0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141073,778.146676,1,1,0,0,4026531840,systemd,ip-10-100-1-169,3,close,[],1,0,"[{'name': 'fd', 'type': 'int', 'value': 39}]",0,0
1141074,778.146736,1,1,0,0,4026531840,systemd,ip-10-100-1-169,6,lstat,[],2,0,"[{'name': 'pathname', 'type': 'const char*', '...",0,0
1141075,778.146762,1,1,0,0,4026531840,systemd,ip-10-100-1-169,1005,security_file_open,[],4,0,"[{'name': 'pathname', 'type': 'const char*', '...",0,0
1141076,778.146790,1,1,0,0,4026531840,systemd,ip-10-100-1-169,257,openat,[],4,39,"[{'name': 'dirfd', 'type': 'int', 'value': -10...",0,0


In [15]:
feb_process_df.describe()

Unnamed: 0,timestamp,processId,threadId,parentProcessId,userId,mountNamespace,eventId,argsNum,returnValue,sus,evil
count,1141078.0,1141078.0,1141078.0,1141078.0,1141078.0,1141078.0,1141078.0,1141078.0,1141078.0,1141078.0,1141078.0
mean,1367.449,6909.07,6913.038,2467.229,143.7311,4026532000.0,237.2977,2.671557,3.018248,0.1520615,0.1388441
std,1154.433,1816.699,1807.393,2862.64,350.0947,172.6697,354.8319,1.250393,322.3468,0.3590806,0.345784
min,124.4392,1.0,1.0,0.0,0.0,4026532000.0,2.0,0.0,-115.0,0.0,0.0
25%,461.2974,7301.0,7301.0,187.0,0.0,4026532000.0,4.0,1.0,0.0,0.0,0.0
50%,903.3516,7366.0,7366.0,1385.0,0.0,4026532000.0,42.0,3.0,0.0,0.0,0.0
75%,2327.305,7461.0,7461.0,4489.0,0.0,4026532000.0,257.0,4.0,0.0,0.0,0.0
max,3954.588,8619.0,8619.0,7672.0,1001.0,4026532000.0,1010.0,5.0,32768.0,1.0,1.0


In [16]:
feb_process_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1141078 entries, 0 to 1141077
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   timestamp        1141078 non-null  float64
 1   processId        1141078 non-null  int64  
 2   threadId         1141078 non-null  int64  
 3   parentProcessId  1141078 non-null  int64  
 4   userId           1141078 non-null  int64  
 5   mountNamespace   1141078 non-null  int64  
 6   processName      1141078 non-null  object 
 7   hostName         1141078 non-null  object 
 8   eventId          1141078 non-null  int64  
 9   eventName        1141078 non-null  object 
 10  stackAddresses   1141078 non-null  object 
 11  argsNum          1141078 non-null  int64  
 12  returnValue      1141078 non-null  int64  
 13  args             1141078 non-null  object 
 14  sus              1141078 non-null  int64  
 15  evil             1141078 non-null  int64  
dtypes: float64(1), int

In [18]:
duplicateRows = feb_process_df[process_df.duplicated()]
print("Duplicate Rows except first occurrence based on all columns are :")
print(len(duplicateRows))

Duplicate Rows except first occurrence based on all columns are :
0


In [18]:
duplicatedTuples = feb_process_df.groupby(['processId','parentProcessId', 'hostName', 'eventName']).size().reset_index(name='Count').sort_values('Count')
duplicatedTuples

Unnamed: 0,processId,parentProcessId,hostName,eventName,Count
16707,8619,188,ubuntu,sched_process_exit,1
7796,7374,7099,ip-10-100-1-57,execve,1
15222,7596,7595,ip-10-100-1-57,openat,1
7791,7374,7099,ip-10-100-1-57,clone,1
7785,7374,4489,ip-10-100-1-34,security_bprm_check,1
...,...,...,...,...,...
354,159,1,ip-10-100-1-57,security_file_open,2314
347,159,1,ip-10-100-1-57,close,2318
14631,7555,7548,ip-10-100-1-217,socket,2400
351,159,1,ip-10-100-1-57,openat,3635


#### Subset Data From Host 10-100-1-186

In [118]:
one86_process_df = pd.read_csv("labelled_2021may-ip-10-100-1-186.csv")

In [119]:
##Separate process nodes from event nodes
one86_procs = one86_process_df[['processId', 'parentProcessId', 'processName', 'userId','timestamp', 'sus', 'evil']].rename(columns={'processName':'name'})
one86_events = one86_process_df[['eventId', 'processId', 'eventName', 'timestamp', 'returnValue','sus', 'evil']].rename(columns={'processId': 'parentProcessId', 'eventName':'name'})


In [120]:
##Remove duplicates
one86_procs = one86_procs.drop_duplicates(keep='first')
one86_events = one86_events.drop_duplicates(keep='first')


In [121]:
##concatenate parentProcessId to eventId to timestamp to differentiate
one86_events['eventId'] =  one86_events['parentProcessId'].astype(str) + '_' + one86_events['eventId'].astype(str) + '_' + one86_events['timestamp'].astype(str)

In [122]:
one86_events

Unnamed: 0,eventId,parentProcessId,name,timestamp,returnValue,sus,evil
0,383_41_124.95282,383,socket,124.952820,15,0,0
1,380_41_124.953139,380,socket,124.953139,15,0,0
2,1_1005_124.953424,1,security_file_open,124.953424,0,0,0
3,1_257_124.953464,1,openat,124.953464,17,0,0
4,1_5_124.953494,1,fstat,124.953494,0,0,0
...,...,...,...,...,...,...,...
713862,159_1005_16026.611442,159,security_file_open,16026.611442,0,0,0
713863,159_257_16026.611475,159,openat,16026.611475,34,0,0
713864,159_5_16026.611515,159,fstat,16026.611515,0,0,0
713865,159_257_16026.611582,159,openat,16026.611582,-2,0,0


In [123]:
# one86_procs.value_counts('processId')
one86_procs[one86_procs['processId'] == 7324]


Unnamed: 0,processId,parentProcessId,name,userId,timestamp,sus,evil
2799,7324,1,(time-dir),0,143.928992,0,0
2800,7324,1,(time-dir),0,143.929058,0,0
2801,7324,1,(time-dir),0,143.929081,0,0
2802,7324,1,(time-dir),0,143.929102,0,0
2803,7324,1,(time-dir),0,143.929122,0,0
...,...,...,...,...,...,...,...
3333,7324,1,systemd-user-ru,0,143.978131,0,0
3334,7324,1,systemd-user-ru,0,143.978151,0,0
3335,7324,1,systemd-user-ru,0,143.978171,0,0
3336,7324,1,systemd-user-ru,0,143.978194,0,0


In [124]:
one86_events.value_counts('eventId')
# one86_events[one86_events['eventId'] == '']

eventId
1113_1006_133.660853     1
8262_3_10682.368496      1
8262_3_10682.367765      1
8262_3_10682.367855      1
8262_3_10682.367935      1
                        ..
7637_1005_4517.396042    1
7637_1005_4517.39713     1
7637_1005_4517.401773    1
7637_1005_4517.402481    1
93_1010_709.656816       1
Length: 713867, dtype: int64

In [125]:
##Propagate sus/evil, Remove conflicting names, keep first process timestamp
one86_procs = one86_procs.groupby('processId').agg({'parentProcessId':'min', 'name':'first', 'userId':'max', 'timestamp':'min', 'sus':'max', 'evil': 'max'})
one86_events = one86_events.groupby('eventId').agg({'parentProcessId':'min', 'name':'first', 'timestamp':'first', 'returnValue':'max', 'sus': 'max', 'evil': 'max'})

In [126]:
##One-hot return value - negative: 1-0/0: 0-0/positive: 0-1
one86_events['returnNegative'] = one86_events['returnValue'] < 0
one86_events['returnPositive'] = one86_events['returnValue'] > 0

##Binarize userId - 0-1000: 0, 1000+: 1
one86_procs['userType'] = one86_procs['userId'] >= 1000


In [127]:
##Feature for type of node
one86_procs['hostType'] = 0
one86_procs['processType'] = 1
one86_events['hostType'] = 0
one86_events['processType'] = 0


In [117]:
one86_procs

Unnamed: 0_level_0,parentProcessId,name,userId,timestamp,sus,evil,userType,hostType,processType
processId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,systemd,0,124.953424,0,0,False,0,1
5,2,kworker/dying,0,709.656880,0,0,False,0,1
7,2,kworker/dying,0,345.124745,0,0,False,0,1
8,2,kworker/dying,0,2823.192768,0,0,False,0,1
78,2,kworker/dying,0,318.744790,0,0,False,0,1
...,...,...,...,...,...,...,...,...,...
8903,7096,sshd,0,15997.377169,0,0,False,0,1
8904,8903,sshd,109,15997.698131,0,0,False,0,1
8906,1280,amazon-ssm-agen,0,16012.896073,0,0,False,0,1
8908,7096,sshd,0,16015.479898,0,0,False,0,1


## Graphs

### Process Graph

#### Subset Data Graph

In [28]:
subsetProcessGraph = nx.from_pandas_edgelist(one86_procs, source="parentProcessId", target="processId", create_using=nx.MultiDiGraph())

In [29]:
print("Graph analysis\n")
print("Multi-edge directed Graph\n")
print("Number of nodes: %s\t" % len(subsetProcessGraph.nodes))
print("Number of edges: %s\n" % len(subsetProcessGraph.edges))
print("Graph density: %s\n" % nx.density(subsetProcessGraph))
print("Graph is directed: %s\n" % subsetProcessGraph.is_directed())
print("Graph is weighted: %s\n" % nx.is_weighted(subsetProcessGraph))

Graph analysis

Multi-edge directed Graph

Number of nodes: 1513	
Number of edges: 20635

Graph density: 0.009020149882674668

Graph is directed: True

Graph is weighted: False



### DNS Graph

# To DGI Format

### DNS Data to DGI format

### Process Data to DGI format