In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import torch
import dgl
from glob import glob
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F

### Load data

In [178]:
dns_data = glob("./data/*-dns.csv")
df_all = []
for data in dns_data[3:4]:
    print(data)
    df = pd.read_csv(data,parse_dates=True, squeeze=True)
    df_all.append(df)
df_final = pd.concat(df_all, ignore_index=True)

./data/labelled_2021may-ip-10-100-1-26-dns.csv


In [179]:
df_final.head()

Unnamed: 0,Timestamp,SourceIP,DestinationIP,DnsQuery,DnsAnswer,DnsAnswerTTL,DnsQueryNames,DnsQueryClass,DnsQueryType,NumberOfAnswers,DnsResponseCode,DnsOpCode,SensorId,sus,evil
0,2021-05-16T17:13:14Z,10.100.1.95,10.100.0.2,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['A'],0,0,0,ip-10-100-1-95,0,0
1,2021-05-16T17:13:14Z,10.100.0.2,10.100.1.95,ssm.us-east-2.amazonaws.com,['52.95.19.240'],['17'],ssm.us-east-2.amazonaws.com,['IN'],['A'],1,0,0,ip-10-100-1-95,0,0
2,2021-05-16T17:13:14Z,10.100.1.95,10.100.0.2,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['AAAA'],0,0,0,ip-10-100-1-95,0,0
3,2021-05-16T17:13:14Z,10.100.0.2,10.100.1.95,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['AAAA'],0,0,0,ip-10-100-1-95,0,0
4,2021-05-16T17:13:16Z,10.100.1.186,10.100.0.2,ssm.us-east-2.amazonaws.com,,,ssm.us-east-2.amazonaws.com,['IN'],['A'],0,0,0,ip-10-100-1-186,0,0


#### Data Preprocessing

Drop irrelavant rows and columns

In [180]:
duplicateRows = df_final[df_final.duplicated()]
print("Duplicate Rows except first occurrence based on all columns are :")
print(len(duplicateRows))
df_final = df_final.drop_duplicates()

Duplicate Rows except first occurrence based on all columns are :
10


In [181]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 259 entries, 0 to 268
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Timestamp        259 non-null    object
 1   SourceIP         259 non-null    object
 2   DestinationIP    259 non-null    object
 3   DnsQuery         259 non-null    object
 4   DnsAnswer        65 non-null     object
 5   DnsAnswerTTL     65 non-null     object
 6   DnsQueryNames    259 non-null    object
 7   DnsQueryClass    259 non-null    object
 8   DnsQueryType     259 non-null    object
 9   NumberOfAnswers  259 non-null    int64 
 10  DnsResponseCode  259 non-null    int64 
 11  DnsOpCode        259 non-null    int64 
 12  SensorId         259 non-null    object
 13  sus              259 non-null    int64 
 14  evil             259 non-null    int64 
dtypes: int64(5), object(10)
memory usage: 32.4+ KB


In [182]:
# dropping DNSAnswer and DnsAnswerTTL as these features have too many missing value
# we will also drop the timestamp
df_final = df_final.drop(columns=["Timestamp","DnsAnswer","DnsAnswerTTL"])

In [183]:
# Check if DnsQuery and DnsQueryName are same
df_final['DnsQuery'].equals(df_final['DnsQueryNames'])


True

In [184]:
# Drop as They are same
df_final = df_final.drop(columns=["DnsQueryNames"])

In [185]:
df_final.head()

Unnamed: 0,SourceIP,DestinationIP,DnsQuery,DnsQueryClass,DnsQueryType,NumberOfAnswers,DnsResponseCode,DnsOpCode,SensorId,sus,evil
0,10.100.1.95,10.100.0.2,ssm.us-east-2.amazonaws.com,['IN'],['A'],0,0,0,ip-10-100-1-95,0,0
1,10.100.0.2,10.100.1.95,ssm.us-east-2.amazonaws.com,['IN'],['A'],1,0,0,ip-10-100-1-95,0,0
2,10.100.1.95,10.100.0.2,ssm.us-east-2.amazonaws.com,['IN'],['AAAA'],0,0,0,ip-10-100-1-95,0,0
3,10.100.0.2,10.100.1.95,ssm.us-east-2.amazonaws.com,['IN'],['AAAA'],0,0,0,ip-10-100-1-95,0,0
4,10.100.1.186,10.100.0.2,ssm.us-east-2.amazonaws.com,['IN'],['A'],0,0,0,ip-10-100-1-186,0,0


In [193]:
# Checking unique values in categorical features
print(f"Unique DnsQuery = {len(df_final.DnsQuery.unique())}")
print(f"Unique DnsQueryClass = {len(df_final.DnsQueryClass.unique())}")
print(f"Unique DnsQueryType = {len(df_final.DnsQueryType.unique())}")
print(f"Unique SensorId = {len(df_final.SensorId.unique())}")



Unique DnsQuery = 16
Unique DnsQueryClass = 2
Unique DnsQueryType = 6
Unique SensorId = 5


##### Separating different aspects of data

In [216]:
nodes1 = df_final.iloc[:,:2]
nodes2 = df_final.iloc[:,1:2]

features = df_final.iloc[:,2:-1]
labels = df_final.iloc[:,-1:]

In [218]:
nodes1

Unnamed: 0,SourceIP
0,10.100.1.95
1,10.100.0.2
2,10.100.1.95
3,10.100.0.2
4,10.100.1.186
...,...
264,10.100.0.2
265,10.100.1.4
266,10.100.1.4
267,10.100.0.2


In [197]:
labels

Unnamed: 0,evil
0,0
1,0
2,0
3,0
4,0
...,...
264,0
265,0
266,0
267,0


#### Category Encoding
Note: Ignoring that the catboost encoder should be used with training data

In [158]:
import category_encoders as ce


In [203]:
# Define catboost encoder
cbe_encoder = ce.cat_boost.CatBoostEncoder()
  
# Fit encoder and transform the features
cbe_encoder.fit(features['DnsQuery'], labels)
# train_cbe = cbe_encoder.transform(train)


CatBoostEncoder(cols=['DnsQuery'])

In [210]:
dnsQ_trans = cbe_encoder.transform(features['DnsQuery'])
features['DnsQuery'] =dnsQ_trans

In [211]:
features.head()

Unnamed: 0,DnsQuery,DnsQueryClass,DnsQueryType,NumberOfAnswers,DnsResponseCode,DnsOpCode,SensorId,sus
0,7e-05,['IN'],['A'],0,0,0,ip-10-100-1-95,0
1,7e-05,['IN'],['A'],1,0,0,ip-10-100-1-95,0
2,7e-05,['IN'],['AAAA'],0,0,0,ip-10-100-1-95,0
3,7e-05,['IN'],['AAAA'],0,0,0,ip-10-100-1-95,0
4,7e-05,['IN'],['A'],0,0,0,ip-10-100-1-186,0


##### One-hot encoding

In [214]:
features_final = pd.get_dummies(features, columns=["DnsQueryClass","DnsQueryType","SensorId"], drop_first=True, sparse=True)


In [215]:
features_final.head()

Unnamed: 0,DnsQuery,NumberOfAnswers,DnsResponseCode,DnsOpCode,sus,DnsQueryClass_['IN'],DnsQueryType_['AAAA'],DnsQueryType_['PTR'],DnsQueryType_['SRV'],DnsQueryType_['TXT'],DnsQueryType_['Unknown'],SensorId_ip-10-100-1-186,SensorId_ip-10-100-1-26,SensorId_ip-10-100-1-4,SensorId_ip-10-100-1-95
0,7e-05,0,0,0,0,1,0,0,0,0,0,0,0,0,1
1,7e-05,1,0,0,0,1,0,0,0,0,0,0,0,0,1
2,7e-05,0,0,0,0,1,1,0,0,0,0,0,0,0,1
3,7e-05,0,0,0,0,1,1,0,0,0,0,0,0,0,1
4,7e-05,0,0,0,0,1,0,0,0,0,0,1,0,0,0


##### Generating Graph From Networkx

In [170]:
G = nx.from_pandas_edgelist(df_final, source="SourceIP", target="DestinationIP", edge_attr=True, create_using=nx.MultiDiGraph())


In [222]:
G.nodes()

NodeView(('10.100.1.95', '10.100.0.2', '10.100.1.186', '10.100.1.105', '10.100.1.26', '10.100.1.4', '141.212.123.189', '141.212.123.193', '74.120.14.31', '209.17.97.122', '209.17.96.42', '209.17.97.58', '185.232.64.121', '192.35.168.27', '129.250.206.86'))

In [223]:
len(G.edges())

259

In [224]:
g = dgl.from_networkx(G)


In [229]:
g.nodes()
g.edges()

(tensor([ 5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
          5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  2,  2,
          2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
          3,  3,  3,  3,  3,

#### Edge Classification

In [243]:
feat_arr = features_final.to_numpy() # This should not be 'object'
feat = torch.from_numpy(np.array(feat_arr[:, :], dtype=np.float))

In [247]:
labl = torch.from_numpy(labels.evil.to_numpy())


In [275]:
# torch.randn(15)

In [282]:
g.edata['feature'] = feat
g.ndata['feature'] = torch.randn(15)

g.edata['label'] = labl
# synthetic train-validation-test splits
g.edata['train_mask'] = torch.zeros(259, dtype=torch.bool).bernoulli(0.6)

#### Model

In [283]:
import dgl.function as fn


class MLPPredictor(nn.Module):
    def __init__(self, in_features, out_classes):
        super().__init__()
        self.W = nn.Linear(in_features * 2, out_classes)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        score = self.W(torch.cat([h_u, h_v], 1))
        return {'score': score}

    def forward(self, graph, h):
        # h contains the node representations computed from the GNN defined
        # in the node classification section (Section 5.1).
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']

In [284]:



class SAGE(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats):
        super().__init__()
        self.conv1 = dglnn.SAGEConv(
            in_feats=in_feats, out_feats=hid_feats, aggregator_type='mean')
        self.conv2 = dglnn.SAGEConv(
            in_feats=hid_feats, out_feats=out_feats, aggregator_type='mean')

    def forward(self, graph, inputs):
        # inputs are features of nodes
        h = self.conv1(graph, inputs)
        h = F.relu(h)
        h = self.conv2(graph, h)
        return h

In [285]:
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_classes):
        super().__init__()
        self.sage = SAGE(in_features, hidden_features, out_classes)
        self.pred = MLPPredictor(in_features,out_classes)
    def forward(self, g, x):
        h = self.sage(g, x)
        return self.pred(g, h)

In [287]:
node_features = g.ndata['feature']
edge_label = g.edata['label']
train_mask = g.edata['train_mask']
model = Model(15, 20, 2)
opt = torch.optim.Adam(model.parameters())
for epoch in range(10):
    pred = model(g, node_features)
    loss = ((pred[train_mask] - edge_label[train_mask]) ** 2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    print(loss.item())

DGLError: Expect number of features to match number of nodes (len(u)). Got 2 and 15 instead.