# Beacon Detection using RAPIDS
----


In [1]:
# Import needed libraries
import time

# RAPIDS
import cudf
import cugraph

# general stuff
from collections import OrderedDict
import os.path
import socket
import struct


  return f(*args, **kwds)


## Let's download data and define columns
Downloading could takes a while.  Check to see if the data has already been downloaded to save time

In [2]:
if ( os.path.isfile('UNSW-NB15_1.csv') == False ) :
    !wget -N https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/UNSW-NB15_1.csv

if ( os.path.isfile('UNSW-NB15_2.csv') == False ) :
    !wget -N https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/UNSW-NB15_2.csv
        
if ( os.path.isfile('UNSW-NB15_3.csv') == False ) :
    !wget -N https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/UNSW-NB15_3.csv
        
if ( os.path.isfile('UNSW-NB15_4.csv') == False ) :
    !wget -N https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/UNSW-NB15_4.csv

if ( os.path.isfile('UNSW-NB15_GT.csv') == False ) :        
    !wget -N https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/NUSW-NB15_GT.csv

--2020-05-07 17:04:52--  https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/NUSW-NB15_GT.csv
Resolving www.unsw.adfa.edu.au (www.unsw.adfa.edu.au)... 202.58.60.197
Connecting to www.unsw.adfa.edu.au (www.unsw.adfa.edu.au)|202.58.60.197|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘NUSW-NB15_GT.csv’ not modified on server. Omitting download.



In [3]:
datafiles = [
    'UNSW-NB15_1.csv',
    'UNSW-NB15_2.csv',
    'UNSW-NB15_3.csv',
    'UNSW-NB15_4.csv'
]

In [4]:
# define column names and data types (there are several ways this could be done)
col_dtypes = OrderedDict([
    ('srcip', 'str'),
    ('sport', 'int32'),
    ('dstip', 'str'),
    ('dsport', 'int64'),
    ('proto', 'str'),
    ('state', 'str'),
    ('dur', 'float64'),
    ('sbytes', 'int64'),
    ('dbytes', 'int64'),
    ('sttl', 'int64'),
    ('dttl', 'int64'),
    ('sloss', 'int64'),
    ('dloss', 'int64'),
    ('service', 'str'),
    ('Sload', 'float64'),
    ('Dload', 'float64'),
    ('Spkts', 'int64'),
    ('Dpkts', 'int64'),
    ('swin', 'int64'),
    ('dwin', 'int64'),
    ('stcpb', 'int64'),
    ('dtcpb', 'int64'),
    ('smeansz', 'int64'),
    ('dmeansz', 'int64'),
    ('trans_depth', 'int64'),
    ('res_bdy_len', 'int64'),
    ('Sjit', 'float64'),
    ('Djit', 'float64'),
    ('Stime', 'int64'),
    ('Ltime', 'int64'),
    ('Sintpkt', 'float64'),
    ('Dintpkt', 'float64'),
    ('tcprtt', 'float64'),
    ('synack', 'float64'),
    ('ackdat', 'float64'),
    ('is_sm_ips_ports', 'int8'),
    ('ct_state_ttl', 'int64'),
    ('ct_flw_http_mthd', 'int64'),
    ('is_ftp_login', 'int8'),
    ('ct_ftp_cmd', 'int64'),
    ('ct_srv_src', 'int64'),
    ('ct_srv_dst', 'int64'),
    ('ct_dst_ltm', 'int64'),
    ('ct_src_ ltm', 'int64'),
    ('ct_src_dport_ltm', 'int64'),
    ('ct_dst_sport_ltm', 'int64'),
    ('ct_dst_src_ltm', 'int64'),
    ('attack_cat', 'str'),
    ('Label', 'int8')
])

In [5]:
num_files = len(datafiles)

## Let's load all the data 

In [6]:
gdf_a = [None] * num_files

for i in range(num_files) :
    gdf_a[i] = cudf.read_csv( datafiles[i], names=list(col_dtypes.keys()), delimiter=',', dtype=list(col_dtypes.values()) )

In [7]:
# merge the data sets together
gdf = cudf.concat(gdf_a)

In [8]:
# cleanup to reclaim space
del gdf_a

### Now convert IP addresses to Integer

In [9]:
# This adds the converted rows back into the DataFrame
gdf['src'] = gdf['srcip'].str.ip2int()
gdf['dst'] = gdf['dstip'].str.ip2int()

### Quick peek at the first few row

In [10]:
gdf.head(5)

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label,src,dst
0,﻿59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,7,1,3,1,1,1,,0,1000734720,2511044102
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,4,2,3,1,1,2,,0,1000734720,2511044105
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,8,1,2,2,1,1,,0,1000734726,2511044103
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,9,1,1,1,1,1,,0,1000734725,2511044101
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,9,1,1,1,1,1,,0,1000734723,2511044096


# Look for Beacons
Beaconing is the process of a bot calling home.  Since that is an outbound characteristic, we are going to start by only looking at outgoing messages

In [11]:
# How many records are there?
len(gdf)

2546575

In [12]:
# Let's just use a subset of the data
d2 = cudf.DataFrame()
d2['srcip'] = gdf['srcip']
d2['src'] = gdf['src']
d2['dstip'] = gdf['dstip']
d2['dst'] = gdf['dst']
d2['stime'] = gdf['Stime']
d2.head(5)

Unnamed: 0,srcip,src,dstip,dst,stime
0,﻿59.166.0.0,1000734720,149.171.126.6,2511044102,1421927414
1,59.166.0.0,1000734720,149.171.126.9,2511044105,1421927414
2,59.166.0.6,1000734726,149.171.126.7,2511044103,1421927414
3,59.166.0.5,1000734725,149.171.126.5,2511044101,1421927414
4,59.166.0.3,1000734723,149.171.126.0,2511044096,1421927414


In [13]:
# Only care about outgoing links - since this is synthetic data, we know the output range
# on a real system this would be a little tougher since multiple internal IPs got get NATed to the same outgoing IP which would throw off timming
ip = struct.unpack("!I", socket.inet_aton('175.45.0.0'))[0]

In [14]:
# filter and sort the data
ddf = d2.query('dst > @ip').sort_values(['dstip', 'srcip','stime']).reset_index()

In [15]:
len(ddf)

219347

In [16]:
ddf.head(10)

Unnamed: 0,index,srcip,src,dstip,dst,stime
0,408157,10.40.85.1,170415361,175.45.176.0,2939006976,1424221780
1,605222,10.40.85.1,170415361,175.45.176.0,2939006976,1424228348
2,618428,10.40.85.1,170415361,175.45.176.0,2939006976,1424228827
3,326669,10.40.85.1,170415361,175.45.176.0,2939006976,1424239403
4,484607,10.40.85.1,170415361,175.45.176.0,2939006976,1424244631
5,524491,10.40.85.1,170415361,175.45.176.0,2939006976,1424245487
6,3818,10.40.85.1,170415361,175.45.176.0,2939006976,1424250167
7,65025,149.171.126.10,2511044106,175.45.176.0,2939006976,1421930004
8,159203,149.171.126.10,2511044106,175.45.176.0,2939006976,1424234823
9,439279,149.171.126.10,2511044106,175.45.176.0,2939006976,1424243135


In [17]:
# Compute the delta to to next instance
ddf['diff'] = ddf['stime'].diff()

In [18]:
# drop all negative times
ddf2 = ddf.query('diff > 0')
ddf2 = ddf2.drop(['src','dst', 'stime'])

In [37]:
x = ddf2.groupby(['dstip', 'srcip']).agg('var')

In [41]:
x

Unnamed: 0_level_0,Unnamed: 1_level_0,index,diff
dstip,srcip,Unnamed: 2_level_1,Unnamed: 3_level_1
175.45.176.0,10.40.85.1,54075976500.0,14176666.57
175.45.176.0,149.171.126.10,1565283542.0,3906185744.0
175.45.176.0,149.171.126.11,114766009500.0,2636060566000.0
175.45.176.0,149.171.126.12,962685985.9,326046.5336
175.45.176.0,149.171.126.13,47461805890.0,2967171.793
175.45.176.0,149.171.126.14,5637118683.0,5544332.022
175.45.176.0,149.171.126.17,18442.33333,1772241723000.0
175.45.176.0,149.171.126.18,1583145635.0,224561161800.0
175.45.176.0,149.171.126.19,,
175.45.176.1,10.40.85.1,12718097810.0,42038684.33


In [39]:
# the 'diff' field is now the variance
# a low variance means that there is little change in the time differences (a routine every x xeconds would have a variance of close to zero)
# one hour is 3600
# one day = 86,400
# Let's only keep variances under one day
y = x.query("diff < 86400")

In [40]:
y

Unnamed: 0_level_0,Unnamed: 1_level_0,index,diff
dstip,srcip,Unnamed: 2_level_1,Unnamed: 3_level_1
175.45.176.1,149.171.126.14,2995019000.0,50487.831009
192.168.241.243,192.168.241.243,683922600.0,24.451379


In [None]:
x