In [1]:
import math
import string
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
# These are 100 malware domains taken from the Bambenek DGA domain feed. 
# Read them inside a `with` block so the file handle is closed right away.
with open("dga_dns_traffic.csv") as dga_file:
    raw_dga_domains = dga_file.readlines()

In [3]:
# Drop trailing whitespace (including the newline) from every line of the DGA feed.
dga_domains = [line.rstrip() for line in raw_dga_domains]

In [4]:
# Visual inspection of the domain shows that it appears very random. 
dga_domains[0]

'dhwcrsensinaix.com'

In [5]:
# These are 100 popular domains taken from Cisco Umbrella.
# Umbrella was chosen over otherwise better lists like Tranco because it includes subdomains. 
# Read them inside a `with` block so the file handle is closed right away.
with open("top-100.csv") as whitelist_file:
    raw_whitelisted_domains = whitelist_file.readlines()

In [6]:
# Remove trailing whitespace (including the newline) from each line of the allowlist file.
whitelisted_domains = [line.rstrip() for line in raw_whitelisted_domains]

In [7]:
# We can see that the top domain appears less random and is human recognizable. 
whitelisted_domains[0]

'netflix.com'

In [8]:
# This is a short, sweet, but probably not correct approximate entropy calculation. 
def goofy_entropy(input_string):
    """Return the fraction of characters in ``input_string`` that are distinct.

    This is a crude randomness score: 1.0 when every character is unique,
    and smaller the more the string repeats characters.

    Args:
        input_string: Text to score (e.g. a domain name).

    Returns:
        0 for an empty string (the same convention serious_entropy uses,
        instead of raising ZeroDivisionError), otherwise
        ``len(set(input_string)) / len(input_string)``.
    """
    if not input_string:
        return 0
    return len(set(input_string)) / len(input_string)

In [9]:
## This is a confusing but probably correct approximate entropy calculation. 
## Original code by Ero Carrera
## http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html
def serious_entropy(input_string):
    """Return the Shannon entropy of ``input_string`` in bits per character.

    Every distinct character of the input contributes to the result, not
    only the characters listed in ``string.printable``, so text containing
    other characters (e.g. non-ASCII) is no longer under-scored.

    Args:
        input_string: Text to measure (e.g. a domain name).

    Returns:
        0 for an empty string; otherwise a float that is 0.0 when all
        characters are identical and grows with how evenly the characters
        are distributed.
    """
    if not input_string:
        return 0

    total = len(input_string)
    entropy = 0.0
    # Counter visits each distinct character once, in first-seen order, so the
    # input is scanned a single time and the summation order is deterministic.
    for occurrences in Counter(input_string).values():
        frequency = occurrences / total
        entropy -= frequency * math.log2(frequency)

    return entropy

In [10]:
# Just a little Python trick to let us easily try both calculations. 
# featurize_domain calls whatever `entropy` is bound to, so rebinding this
# name to goofy_entropy switches the feature without touching other cells.
entropy = serious_entropy

In [11]:
# Based on human visual inspection, malware domains often appear random. 
# A human being can differentiate malicious domains based on them "looking random", so approximate entropy makes sense as a feature. 
# Other types of malicious activity can be detected by the domain length, so we choose that as a second feature. 
def featurize_domain(input_string):
    """Turn a domain name into the two-element feature tuple used by the model.

    Args:
        input_string: The domain name to featurize.

    Returns:
        A tuple ``(entropy(input_string), len(input_string))``, where
        ``entropy`` is the module-level alias for the chosen entropy function.
    """
    return entropy(input_string), len(input_string)

In [12]:
labels = []

In [13]:
# One (entropy, length) feature tuple per DGA domain.
dga_features = [featurize_domain(name) for name in dga_domains]
# A label of '0' means malicious; add one label per DGA sample.
labels.extend([0] * len(dga_domains))

In [14]:
# One (entropy, length) feature tuple per allowlisted domain.
whitelist_features = [featurize_domain(name) for name in whitelisted_domains]
# A label of '1' means benign; add one label per allowlisted sample.
labels.extend([1] * len(whitelisted_domains))

In [15]:
# Stack the two feature lists: DGA samples first, then the benign ones.
features = dga_features + whitelist_features

# Rebuild the labels from the feature lists (0 = malicious, 1 = benign) so they
# always line up with `features`, even if the cells that append to `labels`
# were re-run or executed out of order.
labels = [0] * len(dga_features) + [1] * len(whitelist_features)

In [16]:
# Random forests were chosen as they are the most common classification algorithm. 
# Random forests are robust, do not require data normalization, and are an all-around solid choice. 
model = RandomForestClassifier(random_state = 42)

In [17]:
model = model.fit(features, labels)



In [18]:
# These domains are a collection of legitimate domains mixed in with malicious ones. 
# The 'malicious' domains were generated using https://github.com/alphasoc/flightsim.
# The legitimate domains are from Cisco Umbrella (but not overlapping with the training set). 
# Read them inside a `with` block so the file handle is closed right away.
with open("unknown_dns_traffic.csv") as unknown_file:
    raw_unknown_domains = unknown_file.readlines()

In [19]:
# Remove trailing whitespace (including the newline) from each line of the unknown traffic file.
unknown_domains = [line.rstrip() for line in raw_unknown_domains]

In [20]:
# One (entropy, length) feature tuple per unknown domain, in the same order as unknown_domains.
unknown_features = [featurize_domain(name) for name in unknown_domains]

In [21]:
predictions = model.predict(unknown_features).tolist()

In [22]:
# Print only the domains predicted as class 0, the label that was assigned
# to the DGA (malicious) samples when the training labels were built.
for prediction, domain in zip(predictions, unknown_domains):
    if prediction == 0:
        print(domain)
        
# We can see here that the model finds a lot of DGA domains and even DNS tunneling. 
# However, it also misclassifies a lot of legitimate domains. 
# To improve this, a reasonable next step would be to identify more features to differentiate the two classes better.
# Other reasonable improvements include training on more data and tweaking the algorithm's optional parameters. 

ggjmlrl.top
ggjmlrl.xyz
blniypq.top
blniypq.xyz
entlvvc.top
entlvvc.xyz
entlvvc.com
qzahkrf.xyz
mhdqtpw.top
mhdqtpw.com
tryuevg.top
tryuevg.xyz
ekgmjxo.top
ekgmjxo.xyz
ekgmjxo.com
vgmwcnj.com
mnleyhd.xyz
mnleyhd.com
tpzutbq.xyz
tpzutbq.com
jkcubbtzxtgkejptevbhxojsudcdtt.sandbox.alphasoc.xyz
vqamvawedpytmrzdpkjuqkoeqedgis.sandbox.alphasoc.xyz
gzphmlemmjbycxgtjzdjwylqogouki.sandbox.alphasoc.xyz
omnebeglqedjhyulgyzqlftyvlcben.sandbox.alphasoc.xyz
mudqxltcsznhyzqmzrexblxslyvulx.sandbox.alphasoc.xyz
xyjqjdtpgyqtaepskjadayhhgxjila.sandbox.alphasoc.xyz
jzfiimqmrgplwkmmxjohkvlbrcwzow.sandbox.alphasoc.xyz
xjswzjbtrrxjohmivtgdkimddyvdsm.sandbox.alphasoc.xyz
quslgbkllpfawelrqxwhbtlfufvwfa.sandbox.alphasoc.xyz
vnlpmljcxagjstxpjjgrpvzzeekyqg.sandbox.alphasoc.xyz
nflximg.com
time-ios.apple.com
cdn-0.nflximg.com
init-p01st.push.apple.com
windows.com
fe.apple-dns.net
vortex-win.data.microsoft.com
rubiconproject.com
a.akamaiedge.net
s0.2mdn.net
aaplimg.com
ib.adnxs.com
dns-test1.hola.org
cdn.onenote.ne