### Neural Networks and Sequential Modeling of Traceroutes

Treating the traceroute as a sequence of steps towards a destination could reveal more nuanced patterns. Accuracy with these models can be as high as 80% but the hypothesis that additional information may be encoded in the order was disproven as the LSTM had no additional value when compared with the standard neural nets.  Therefore, the only thing that matters is which subnets are used, not necessarily in which order.



In [1]:
import json
import pprint
import os
import pickle
import random
import sys

import matplotlib.pyplot as plt
import socket, struct
import multiprocessing as mp

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers, backend, models

from sklearn import metrics
from sklearn import datasets, cluster
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13837181204682714592
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11284293223
locality {
  bus_id: 1
  links {
  }
}
incarnation: 302216122573034960
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0, compute capability: 3.7"
]


In [None]:
f= open('BigFtaglOrdered.pickle', 'rb') 
ftagl=pickle.load(f)
f= open('BigExpanded.pickle', 'rb') 
expanded_routes=pickle.load(f)

### Create data structures (skip if serializied)

In [None]:
X=np.zeros(shape=[len(expanded_routes),30,50])

In [None]:
## Transform raw route information into LSTM structure with dimensionality reduction
## Note: Feature Agglomeration was done on all days

row=0
numEmpty=0
df_rows=np.zeros(shape=(len(expanded_routes),2874))
for seq in expanded_routes:
    vec=np.zeros(shape=[30,2874])
    df_vec=np.zeros(shape=[1,2874])
    there=0
    for i in range(30):
        if seq.get(i):
            vec[i][seq[i]]=1
            df_vec[0][seq[i]]=1
            there+=1

            
    vec=ftagl.transform(vec)
    df_rows[row]=df_vec
    X[row]=vec
    row+=1

In [None]:
#ftagl.fit(df_rows)
#with open('BigFtaglOrdered.pickle', 'wb') as output:
#    pickle.dump(ftagl,output,pickle.HIGHEST_PROTOCOL)

In [None]:
df=pd.DataFrame(df_rows)
del df_rows
del expanded_routes
df=pd.concat([pd.read_parquet('BigExtractedFeatures.parquet'),df], axis=1)

In [None]:
df.columns=[str(x) for x in df.columns]
df.to_parquet('universe.parquet')

In [None]:
np.savez('X',X)

### Load serialized data structures

In [15]:
df=pd.read_parquet('universe.parquet')
df=df[df['NumHops']>2]
df=df.reset_index(drop=True)

## Keras Neural Net

In [None]:
# Dimensionality reduction if desired

# Fit new dim reduction matrix, not necessary if *ftagl*.pickle exists
#ftagl = cluster.FeatureAgglomeration(n_clusters=100)

reduced = ftagl.transform(df[df.columns.difference([header for header in df.columns if not (str(header).isdigit())])])

In [5]:
# Process  and balance data

#df=df.drop(columns=df.columns.difference([header for header in df.columns if not (str(header).isdigit())]))
#reduced=pd.DataFrame(reduced)
#df=pd.concat([df,reduced],axis=1)
#del reduced


bd=df[df['Benign'] == True]
even=len(bd)
md=df[df['Benign'] == False]
md.reset_index(drop=True)

bd=bd.sample(frac=1).reset_index(drop=True)
md=md.sample(frac=1).reset_index(drop=True)
md=md.loc[md.index < even]
ad=pd.concat([bd,md])

del bd
del md

In [6]:
print(len(ad[ad['Benign']==True]),len(ad[ad['Benign']==False]))

22255 22255


In [7]:
exclude=ad.columns.difference(['indicator','Benign','Dest','Route','index'])
ad=ad.sample(frac=1).reset_index(drop=True)

In [8]:
model = tf.keras.Sequential([
# Adds a densely-connected layer with 64 units to the model:
layers.Dense(64, activation='relu', input_dim=2879),

layers.Dense(64, activation='relu'),

layers.Dense(64, activation='relu'),
# Add a sigmoid layer for outputs:
layers.Dense(1, activation='sigmoid')])

In [9]:
model.compile(optimizer=tf.train.AdamOptimizer(0.001),
              loss='binary_crossentropy', 
              metrics=['accuracy'])

#cease=keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto', baseline=None)
history=model.fit(ad[exclude],ad['Benign'], epochs=5, verbose=1, batch_size=64,
          validation_split=.2, shuffle=True)


Train on 35608 samples, validate on 8902 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
print("Classification report for classifier %s\n"
      % (metrics.classification_report(ad['Benign'], model.predict(ad[exclude])>.5)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(ad['Benign'], model.predict(ad[exclude]) > .5,labels=[True,False]))

Classification report for classifier               precision    recall  f1-score   support

       False       0.81      0.86      0.83     22255
        True       0.85      0.79      0.82     22255

    accuracy                           0.83     44510
   macro avg       0.83      0.83      0.83     44510
weighted avg       0.83      0.83      0.83     44510


Confusion matrix:
[[17661  4594]
 [ 3128 19127]]


In [11]:
del ad

## Keras LSTM 

### Data Generator

In [None]:
class DataGenerator(keras.utils.Sequence):

    def __init__(self, data, labels, dimReduction, batch_size=64, dim=(30,50), n_classes=2, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()
        self.expanded_routes=data
        self.benign_label=labels
        self.index=0
        self.dimReduction=dimReduction

    def __len__(self):

        return int(np.floor(len(self.expanded_routes) / self.batch_size))

    def __getitem__(self,ignore):

        X=np.zeros(shape=[self.batch_size,self.dim[0],self.dim[1]])
        y=np.array(self.benign_label[self.index:self.index+self.batch_size])
        
        #row=0
        #while row < batch_size:
        #for s in range(self.index,self.index+self.batch_size):
           
        #    for i in range(30):
        #        vec=np.zeros(shape=[1,7566])
        #        if self.expanded_routes[s].get(i):
        #            vec[0][self.expanded_routes[s][i]]=1
       #        
       #         vec=self.dimReduction.transform(vec)
       #         X[row][i]=vec
       #         
       #     row+=1

       # self.index=self.index+self.batch_size
        return X, y

    def on_epoch_end(self):

        if self.shuffle == True:
            print('Shuffled')
        
        self.index=0

    def __data_generation(self,rand=None):

        # Initialization
        X=np.random.rand(self.batch_size,30,50)
        y = np.round(np.random.rand((self.batch_size)))

        return X, y

In [18]:
df=pd.read_parquet('universe.parquet')
keep=df['NumHops']>2

In [19]:
## Define empty training vars to be populated later

X=np.load('X.npz')['arr_0']
X=X[keep]
Y=np.array(df['Benign'])
Y=Y[keep]
X_e=np.array(df[['Timeouts','AveragePing','NumHops','Tail Timeouts','Reached']])
X_e=X_e[keep]
df=df[keep]
df=df.reset_index(drop=True)
print(X.shape, Y.shape, X_e.shape, df.shape)

(515253, 30, 50) (515253,) (515253, 5)


In [26]:
## Map benign and malicious entries for balancing purposes

benignIndices=[]
maliciousIndices=[]
for x in range(0,len(df)):
    if df['Benign'][x] == True:
        benignIndices.append(x)
    else:
        maliciousIndices.append(x)

In [27]:
## Create training data from equal parts benign and malicious data, shuffle to avoid proximity effects

X_benign_e=X_e[benignIndices,:]
X_benign=X[benignIndices,:]
Y_benign=Y[benignIndices]
X_mal=X[maliciousIndices,:]
X_mal_e=X_e[maliciousIndices,:]
Y_mal=Y[maliciousIndices]
del X
del Y
np.random.seed(387562875)
np.random.shuffle(X_mal)
np.random.seed(387562875)
np.random.shuffle(X_mal_e)
np.random.seed(387562875)
np.random.shuffle(Y_mal)
X_mal_e=X_mal_e[0:len(X_benign),:]
X_mal=X_mal[0:len(X_benign),:]
Y_mal=Y_mal[0:len(X_benign)]
X_t_e=np.concatenate((X_benign_e,X_mal_e), axis=0)
X_t=np.concatenate((X_benign,X_mal), axis=0)
Y_t=np.concatenate((Y_benign,Y_mal), axis=0)
del X_benign
del X_benign_e
del X_mal
del X_mal_e
del Y_benign
del Y_mal
np.random.seed(387562875)
np.random.shuffle(X_t)
np.random.seed(387562875)
np.random.shuffle(X_t_e)
np.random.seed(387562875)
np.random.shuffle(Y_t)

In [28]:
model = tf.keras.Sequential([
# Adds an LSTM layer to intake the traceroute as a sequence of stops:
layers.LSTM(64, activation='relu', input_shape=(30,50), return_sequences=False),
# Add dense layers to further interpret results:
layers.Dense(64, activation='relu'),
    
layers.Dense(32, activation='relu'),
# Add a sigmoid output layer
layers.Dense(1, activation='sigmoid')])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 64)                29440     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 35,713
Trainable params: 35,713
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
model.compile(optimizer=tf.train.AdamOptimizer(0.001),
              loss='binary_crossentropy', 
              metrics=['accuracy'])

history=model.fit(X_t,Y_t, epochs=10, verbose=1, batch_size=64,
          validation_split=0.2, shuffle=True)

#training_generator = DataGenerator(expanded_routes,df['Benign'],ftagl,batch_size=2048)
#validation_generator=DataGenerator(expanded_routes[0],df['Benign'][0],ftagl)
#history=model.fit_generator(generator=training_generator,validation_data=validation_generator,use_multiprocessing=True,epochs=50)

Train on 35608 samples, validate on 8902 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#data = np.random.random((1000, 32))
#labels = random_one_hot_labels((1000, 10))

model.evaluate(X_t, Y_t, batch_size=32)

In [30]:
print("Classification report for classifier %s\n"
      % (metrics.classification_report(Y_t, model.predict(X_t)>.5)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(Y_t, model.predict(X_t) > .5,labels=[True,False]))

Classification report for classifier               precision    recall  f1-score   support

       False       0.71      0.81      0.76     22255
        True       0.78      0.67      0.72     22255

    accuracy                           0.74     44510
   macro avg       0.74      0.74      0.74     44510
weighted avg       0.74      0.74      0.74     44510


Confusion matrix:
[[14981  7274]
 [ 4288 17967]]


# Keras Hybrid LSTM

In [31]:
print(X_t_e.shape, X_t.shape, Y_t.shape)

(44510, 5) (44510, 30, 50) (44510,)


In [32]:

#Two input sequences, the first is standard statistics about the traceroute and the second the sequence of stops 
my_inputs = layers.Input(shape=(5,),dtype='float32')
ip_inputs = layers.Input(shape=(30,50),dtype='float32')

# LSTM layer
lstm_out = layers.LSTM(64, activation='relu')(ip_inputs)

# Merge outputs from the LSTM layer with the next set of inputs
x = keras.layers.concatenate([lstm_out, my_inputs])

# Add dense layers to combine LSTM information with other data
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(64, activation='relu')(x)

# Sigmoid output layer
predictions = layers.Dense(1, activation='sigmoid')(x)


model = models.Model(inputs=[my_inputs,ip_inputs], outputs=predictions)
model.compile(optimizer=tf.train.AdamOptimizer(0.001),
              loss='binary_crossentropy', 
              metrics=['accuracy'])

model.summary()
#board=keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=32, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None, embeddings_data=None)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 30, 50)       0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 64)           29440       input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 69)           0           lstm_1[0][0]                     
                                                                 input_1[0][0]                    
__________

In [33]:
model.fit([X_t_e,X_t], Y_t, epochs=20, verbose =1, batch_size=64,validation_split=0.2)

Train on 35608 samples, validate on 8902 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4eb183c908>

In [36]:
del X_t
del Y_t

## Autoencoder 

#### Reduce dimensionality of subnet data
#### Failed experiment: it appears the the data is too sparse for the sutoencoder to learn anything.  The output is always the same set of 4 or 5 entries predicted as ones regardless of the input

In [3]:
df=pd.read_parquet('universe.parquet')
df=df[df['NumHops']>2]
df=df[df.columns.difference([header for header in df.columns if not (str(header).isdigit())])]
df=np.array(df)

In [35]:
pd.Series(df.sum(axis=1)).describe()

count    464614.000000
mean          7.606760
std           1.894357
min           0.000000
25%           6.000000
50%           7.000000
75%           9.000000
max          16.000000
dtype: float64

In [5]:
encoding_dim = 256

# input vec
input_vec = layers.Input(shape=(2874,))

x=layers.Dense(512,activation='relu')(input_vec)

x=layers.Dense(512,activation='relu')(x)

# "encoded" is the encoded representation of the input
encoded = layers.Dense(encoding_dim, activation='relu')(input_vec)

x=layers.Dense(512,activation='relu',input_dim=(256,))(encoded)

x=layers.Dense(512,activation='relu',input_dim=(256,))(x)

# "decoded" is the reconstruction of the input
decoded = layers.Dense(2874, activation='sigmoid')(encoded)

# this model maps an input to its reconstruction
autoencoder = models.Model(input_vec, decoded)
autoencoder.summary()
#encoder = models.Model(input_vec, encoded)

# create a placeholder for an encoded (32-dimensional) input
#encoded_input = layers.Input(shape=(encoding_dim,))
# retrieve the last layer of the autoencoder model
#decoder_layer = autoencoder.layers[-1]
# create the decoder model
#decoder = models.Model(encoded_input, decoder_layer(encoded_input))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2874)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               736000    
_________________________________________________________________
dense_5 (Dense)              (None, 2874)              738618    
Total params: 1,474,618
Trainable params: 1,474,618
Non-trainable params: 0
_________________________________________________________________


In [6]:
autoencoder.compile(optimizer=tf.train.AdamOptimizer(1), loss='mse',metrics=['accuracy'])

autoencoder.fit(df,df,
                epochs=1,
                batch_size=4096,
                shuffle=True,
                validation_data=(df,df),
                verbose=1)

Train on 464614 samples, validate on 464614 samples
Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x7f56b880b400>

In [7]:
pred=autoencoder.predict(df)

In [10]:
pred=pd.DataFrame(pred).drop_duplicates()
pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2864,2865,2866,2867,2868,2869,2870,2871,2872,2873
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
