In [2]:
from dynamite_sdk.search import Search
from datetime import datetime, timedelta
from dynamite_sdk.anomaly_detection.gaussian_anomaly_detector import GaussianAnomalyDetector

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
%matplotlib inline

## Get the baseline features aggregated at 5-minute intervals

In [3]:
# Length of the look-back window, in minutes.
LOOKBACK_MINUTES = 500

# Read the clock once so the window is exactly LOOKBACK_MINUTES long;
# two separate now() calls would yield slightly different instants.
# NOTE(review): datetime.now() is naive local time, while the returned
# start_time/end_time values carry a +00:00 offset -- confirm how the SDK
# interprets naive datetimes.
end = datetime.now()
start = end - timedelta(minutes=LOOKBACK_MINUTES)

# Query the 'baselines' events for the window; results are requested as a DataFrame.
search = Search('baselines', as_dataframe=True)
search.execute_query(start, end)

# Preview the first two rows of the result.
search.results.head(2)

Unnamed: 0,elasticsearch_index,originating_agent_tag,forwarder_type,event_type,node_ip_address,node_hostname,address,start_time,end_time,interval_size,...,ssl_server_connection_count,ssl_producer_consumer_ratio_average,ssl_producer_consumer_ratio_max,ssl_producer_consumer_ratio_min,rdp_client_connection_count,rdp_server_connection_count,rdp_producer_consumer_ratio_average,rdp_producer_consumer_ratio_max,rdp_producer_consumer_ratio_min,external_ports
0,zeek-baselines-2019.11.22,honeypot01,zeek,netbase,192.168.53.158,ip-192-168-53-158.us-west-2.compute.internal,192.168.32.1,2019-11-22 16:30:55.467000+00:00,2019-11-22 16:35:59.089000+00:00,00:05:03.622000,...,0,0,0,0,0,0,0,0,0,0
1,zeek-baselines-2019.11.22,honeypot01,zeek,netbase,192.168.53.158,ip-192-168-53-158.us-west-2.compute.internal,192.168.53.158,2019-11-22 16:30:40.433000+00:00,2019-11-22 16:35:49.088000+00:00,00:05:08.655000,...,0,0,0,0,0,0,0,0,0,0



### Select the aggregated columns used to build the anomaly detection model

In [7]:
# Names of the aggregated baseline columns used as model features,
# listed one per line and grouped by name prefix.
feature_columns = [
    # host / port / client counts
    'internal_port_count',
    'internal_host_count',
    'external_port_count',
    'external_host_count',
    'internal_client_count',
    'external_client_count',
    # connection counts
    'connection_count',
    'originating_connection_count',
    'successful_originating_connection_count',
    'rejected_originating_connection_count',
    'originating_to_highport_count',
    'originating_to_lowport_count',
    'originating_to_service_count',
    'internal_originating_connection_count',
    'internal_originating_rejected_connection_count',
    'internal_to_highport_count',
    'internal_to_lowport_count',
    'internal_to_service_count',
    'internal_received_connection_count',
    # byte sums
    'internal_originating_bytes_sent_sum',
    'internal_originating_bytes_received_sum',
    'external_originating_bytes_sent_sum',
    'external_originating_bytes_received_sum',
    # packet counts
    'internal_originating_packets_sent_count',
    'internal_originating_packets_received_count',
    'external_originating_packets_sent_count',
    'external_originating_packets_received_count',
    # smb
    'smb_client_connection_count',
    'smb_server_connection_count',
    'smb_producer_consumer_ratio_average',
    'smb_producer_consumer_ratio_max',
    'smb_producer_consumer_ratio_min',
    # http
    'http_client_connection_count',
    'http_server_connection_count',
    'http_producer_consumer_ratio_average',
    'http_producer_consumer_ratio_max',
    'http_producer_consumer_ratio_min',
    # dns
    'dns_client_connection_count',
    'dns_server_connection_count',
    'dns_producer_consumer_ratio_average',
    'dns_producer_consumer_ratio_max',
    'dns_producer_consumer_ratio_min',
    # ssl
    'ssl_client_connection_count',
    'ssl_server_connection_count',
    'ssl_producer_consumer_ratio_average',
    'ssl_producer_consumer_ratio_max',
    'ssl_producer_consumer_ratio_min',
    # rdp
    'rdp_client_connection_count',
    'rdp_server_connection_count',
    'rdp_producer_consumer_ratio_average',
    'rdp_producer_consumer_ratio_max',
    'rdp_producer_consumer_ratio_min',
]
# Keep only the selected feature columns (all rows) from the query results.
features = search.results.loc[:, feature_columns]

In [9]:
# Drop columns that have fewer than two distinct values.
# A constant column does NOT add value to the model and gives invalid results.
# nunique() ignores NaN, so an entirely-NaN column counts 0 distinct values;
# requiring "> 1" removes those as well (the previous "!= 1" test kept them).
train = features.loc[:, features.nunique() > 1]

In [11]:
# Build the Gaussian anomaly detector and train it on the filtered features.
# NOTE(review): contamination is presumably the expected fraction of
# anomalous rows -- confirm against the SDK documentation.
detector_options = dict(log_transform=True, contamination=0.05)
model = GaussianAnomalyDetector(**detector_options)
model.fit(train)

GaussianAnomalyDetector(contamination=0.05, log_transform=True)

In [18]:
# Score the training rows, then ask the model for the reason behind each score.
# predict() is called before get_reason(), in the same order as before.
row_flags = model.predict(train)
row_reasons = model.get_reason()

# Pair each prediction with its reason and tabulate the pairs.
zipped_list = [(flag, why) for flag, why in zip(row_flags, row_reasons)]
predictions = pd.DataFrame(zipped_list, columns=['prediction', 'reason'])

In [19]:
# Display the anomalies: rows whose prediction equals 1.
predictions.loc[predictions['prediction'].eq(1)]

Unnamed: 0,prediction,reason
10,1,dns_client_connection_count
26,1,dns_client_connection_count
33,1,http_producer_consumer_ratio_max
50,1,http_producer_consumer_ratio_min
79,1,internal_originating_connection_count
120,1,http_producer_consumer_ratio_max
140,1,http_producer_consumer_ratio_max
