In [None]:
!curl -L -o dns-tunneling-queries-classification.zip\
  https://www.kaggle.com/api/v1/datasets/download/saurabhshahane/dns-tunneling-queries-classification

In [None]:
!unzip dns-tunneling-queries-classification.zip

In [52]:
PRINTABLE = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"

assert len(PRINTABLE) == 94
print("Length:", len(PRINTABLE))

char_to_idx = {c: i for i, c in enumerate(PRINTABLE)}
char_to_idx

Length: 94


{'0': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9,
 'a': 10,
 'b': 11,
 'c': 12,
 'd': 13,
 'e': 14,
 'f': 15,
 'g': 16,
 'h': 17,
 'i': 18,
 'j': 19,
 'k': 20,
 'l': 21,
 'm': 22,
 'n': 23,
 'o': 24,
 'p': 25,
 'q': 26,
 'r': 27,
 's': 28,
 't': 29,
 'u': 30,
 'v': 31,
 'w': 32,
 'x': 33,
 'y': 34,
 'z': 35,
 'A': 36,
 'B': 37,
 'C': 38,
 'D': 39,
 'E': 40,
 'F': 41,
 'G': 42,
 'H': 43,
 'I': 44,
 'J': 45,
 'K': 46,
 'L': 47,
 'M': 48,
 'N': 49,
 'O': 50,
 'P': 51,
 'Q': 52,
 'R': 53,
 'S': 54,
 'T': 55,
 'U': 56,
 'V': 57,
 'W': 58,
 'X': 59,
 'Y': 60,
 'Z': 61,
 '!': 62,
 '"': 63,
 '#': 64,
 '$': 65,
 '%': 66,
 '&': 67,
 "'": 68,
 '(': 69,
 ')': 70,
 '*': 71,
 '+': 72,
 ',': 73,
 '-': 74,
 '.': 75,
 '/': 76,
 ':': 77,
 ';': 78,
 '<': 79,
 '=': 80,
 '>': 81,
 '?': 82,
 '@': 83,
 '[': 84,
 '\\': 85,
 ']': 86,
 '^': 87,
 '_': 88,
 '`': 89,
 '{': 90,
 '|': 91,
 '}': 92,
 '~': 93}

In [53]:
import pandas as pd

df = pd.read_csv("training.csv", header=None, names=["label", "domain"])
df.columns

Index(['label', 'domain'], dtype='object')

In [54]:
print("labels with 1 (exfil)", (df['label'] == 1).sum())

labels with 1 (exfil) 12000


In [55]:
print("labels with 1 (benign)", (df['label'] == 0).sum())

labels with 1 (benign) 3000


In [56]:
import numpy as np

def domain_to_features(domain: str) -> np.ndarray:
  domain = domain.lower()
  counts = np.zeros(len(PRINTABLE), dtype=np.float32)
  for c in domain:
    if c in char_to_idx:
      counts[char_to_idx[c]] += 1

  length = len(domain)
  if length == 0:
    entropy = 0.0
  else:
    probs = counts / length
    entropy = -np.sum(probs * np.log2(probs + 1e-12)) # Note to self, the addition here avoids log2(0) which is undefined

  entropy_avg = entropy
  len_avg = float(length)

  return np.concatenate([counts, [length, entropy, entropy_avg, len_avg]])

In [57]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Note this is a recreation based on the information i could gather from this blog post: https://www.splunk.com/en_us/blog/security/machine-learning-in-security-detect-dns-data-exfiltration-using-deep-learning.html

class SplunkDNSExfilModel(nn.Module):
  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = F.dropout(x, p=0.5, training=self.training)
    x = F.relu(self.fc2(x))
    x = F.dropout(x, p=0.5, training=self.training)
    x = self.fc3(x)
    return x.squeeze(1) # remove a dim since it returns shape (256, 1), will now return (256)

  def __init__(self):
    super().__init__()
    self.fc1 = nn.Linear(98, 256)
    self.fc2 = nn.Linear(256, 256)
    self.fc3 = nn.Linear(256, 1)


In [58]:
X = df['domain'].values
y = df['label'].values
X, y

(array(['q+Z8AnwaBA.hidemyself.org.', 'q+Z8A3wbBA.hidemyself.org.',
        'q+Z8BHwcBA.hidemyself.org.', ..., 'r17801.tunnel.tuns.org.',
        'dIUEABDEGSBAAAQAGT7HQUAAAAMFAAAAC37CAAFUWW5XBOHRPM4PIAEAAHT6M2A.AAAEAQQCWZBIH6BEGV33RX3C4CKGEOPSNLKQNAMMKV4EPKKFENMLTM2O6BLOES2.DXNVSRGEDZRWYPR3UFL5HUBRVY5OYVOM4VA2N3LZ44KDGBXB4TMKEY767FHCKFL.MPBXZUCIGNPOCNEHCZVK62CHBS6KYI4J4TR2.tunnel.tuns.org.',
        'r17802.tunnel.tuns.org.'], shape=(15000,), dtype=object),
 array([1, 1, 1, ..., 1, 1, 1], shape=(15000,)))

In [59]:
from torch.utils.data import Dataset, DataLoader
class DNSDataset(Dataset):
  def __init__(self, domains, labels):
    self.features = torch.from_numpy(np.stack([domain_to_features(domain) for domain in domains])).float()
    self.labels = torch.from_numpy(labels.astype(np.float32))

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    return self.features[idx], self.labels[idx]

In [60]:
train_dataset = DNSDataset(X, y)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [61]:
df = pd.read_csv("validating.csv", header=None, names=["label", "domain"])
X_validation = df['domain'].values
y_validation = df['label'].values
X, y

(array(['q+Z8AnwaBA.hidemyself.org.', 'q+Z8A3wbBA.hidemyself.org.',
        'q+Z8BHwcBA.hidemyself.org.', ..., 'r17801.tunnel.tuns.org.',
        'dIUEABDEGSBAAAQAGT7HQUAAAAMFAAAAC37CAAFUWW5XBOHRPM4PIAEAAHT6M2A.AAAEAQQCWZBIH6BEGV33RX3C4CKGEOPSNLKQNAMMKV4EPKKFENMLTM2O6BLOES2.DXNVSRGEDZRWYPR3UFL5HUBRVY5OYVOM4VA2N3LZ44KDGBXB4TMKEY767FHCKFL.MPBXZUCIGNPOCNEHCZVK62CHBS6KYI4J4TR2.tunnel.tuns.org.',
        'r17802.tunnel.tuns.org.'], shape=(15000,), dtype=object),
 array([1, 1, 1, ..., 1, 1, 1], shape=(15000,)))

In [62]:
print("labels with 1 (exfil)", (df['label'] == 1).sum())
print("labels with 0 (benign)", (df['label'] == 0).sum())

labels with 1 (exfil) 4000
labels with 0 (benign) 1000


In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_validation, y_validation, test_size=0.5, random_state=42)

In [64]:
validation_dataset = DNSDataset(X_train, y_train)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=True)

test_dataset = DNSDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [65]:
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([0.25]))

In [66]:
model = SplunkDNSExfilModel()

In [67]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [68]:
from sklearn.metrics import roc_auc_score

epochs = 4
best_val_auc = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
  for xb, yb in train_loader:
    model.train()
    xb, yb = xb.to(device), yb.to(device)
    pred = model(xb)
    loss = criterion(pred, yb)
    print(loss)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  model.eval()
  with torch.no_grad():
    val_preds = torch.sigmoid(model(validation_dataset.features.to(device))).cpu().numpy().flatten()
    val_auc = roc_auc_score(validation_dataset.labels, val_preds)
    print(val_auc)

    if val_auc > best_val_auc:
      best_val_auc = val_auc
      torch.save(model.state_dict(), "splunk_dns_exfil_mlp_best.pth")

    if (epoch+1) % 10 == 0:
          print(f"Epoch {epoch+1:02d} | Val AUC: {val_auc:.6f}")

tensor(0.4961, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2312, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.3289, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2380, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.3716, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.4163, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.3690, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1870, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.4025, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2309, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2406, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2465, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2956, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2209, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1560, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2450, grad_fn=<BinaryCrossEntro

In [None]:
model.load_state_dict(torch.load("splunk_dns_exfil_mlp_best.pth"))
model.eval()
with torch.no_grad():
    test_probs = torch.sigmoid(model(test_dataset.features.to(device))).cpu().numpy().flatten()
    test_preds = (test_probs >= 0.5)

    from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
    print("AUC:", roc_auc_score(y_test, test_probs))
    print(confusion_matrix(y_test, test_preds))
    print(classification_report(y_test, test_preds))

In [None]:
df

In [None]:
# base32 encoded secret nz2jmrnfo7tiajjA5dgbBwn4up3YWY7vY2tKb5FK2LuYRex8q

unseen_test_domain = "nz2jmrnfo7tiajjA5dgbBwn4up3YWY7vY2tKb5FK2LuYRex8q.evilbadbadattacker.com"
feature_vector = domain_to_features(unseen_test_domain)

model.eval()

with torch.no_grad():
  x = torch.from_numpy(feature_vector).float().unsqueeze(0).to(device)
  logit = model(x)
  prob = torch.sigmoid(logit).item()
  print(prob)

In [None]:
unseen_test_domain = "hideinfo.org"
feature_vector = domain_to_features(unseen_test_domain)

model.eval()

with torch.no_grad():
  x = torch.from_numpy(feature_vector).float().unsqueeze(0).to(device)
  logit = model(x)
  prob = torch.sigmoid(logit).item()
  print(prob)