In [1]:
import sys
import binascii
import multiprocessing as mp
from itertools import chain
from scapy.all import *
sys.path.append('hed-dlg/')

import numpy as np
import random
from scipy.stats import itemfreq

import blocks
from blocks.bricks import Linear, Softmax, Softplus, NDimensionalSoftmax
from blocks.bricks.recurrent import GatedRecurrent, Fork, LSTM
from blocks.initialization import Constant, IsotropicGaussian, Identity, Uniform
from blocks.bricks.cost import BinaryCrossEntropy, CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.roles import PARAMETER
from blocks.graph import ComputationGraph

import theano
from theano import tensor as T

###These warnings do not impede progress
#WARNING: Failed to execute tcpdump. Check it is installed and in the PATH
#WARNING: No route found for IPv6 destination :: (no default route?)


Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 4007)


# Data

In [2]:
dataPath = '/data/bradsfirstpcaps.pcap'
pcaps = rdpcap(dataPath)
sessions = pcaps.sessions()

In [3]:
#turn the sessions into a dictionary key = session_number, val = list of packages in hex

i=0
hexSessions = {}

for k,v in sessions.items(): # v is the session
    #for attr, value in v.__dict__.iteritems(): THIS IS TO GET DICT OF VALUES
    #    print attr, value
    #if i == 2:
    #    break
    scpcaps = []    
    for p in v: #p is the individual packet in the session
        
        try:
            rawindex = len(p[Raw])
            payloadLens.append(rawindex)
            scpcaps.append(binascii.hexlify(str(p.original)[:-rawindex])) #turn it into hex
        except:
            scpcaps.append(binascii.hexlify(p.original))
        #for attr, value in p.payload.__dict__.iteritems():#this give the fields that are accessable
        #    print attr, value
        
        #print len(binascii.hexlify(p.original))
    hexSessions['session_' + str(i)] = scpcaps
    
    i+=1

In [4]:
#Making the hex dictionary
hexstring = '0,	1,	2,	3,	4,	5,	6,	7,	8,	9,	A,	B,	C,	D,	E,	F,	10,	11,	12,	13,	14,	15,	16,	17,	18,	19\
,	1A,	1B,	1C,	1D,	1E,	1F,	20,	21,	22,	23,	24,	25,	26,	27,	28,	29,	2A,	2B,	2C,	2D,	2E,	2F,	30,	31,	32,	33,	34,	35\
,	36,	37,	38,	39,	3A,	3B,	3C,	3D,	3E,	3F,	40,	41,	42,	43,	44,	45,	46,	47,	48,	49,	4A,	4B,	4C,	4D,	4E,	4F,	50,	51\
,	52,	53,	54,	55,	56,	57,	58,	59,	5A,	5B,	5C,	5D,	5E,	5F,	60,	61,	62,	63,	64,	65,	66,	67,	68,	69,	6A,	6B,	6C,	6D\
,	6E,	6F,	70,	71,	72,	73,	74,	75,	76,	77,	78,	79,	7A,	7B,	7C,	7D,	7E,	7F,	80,	81,	82,	83,	84,	85,	86,	87,	88,	89\
,	8A,	8B,	8C,	8D,	8E,	8F,	90,	91,	92,	93,	94,	95,	96,	97,	98,	99,	9A,	9B,	9C,	9D,	9E,	9F,	A0,	A1,	A2,	A3,	A4,	A5\
,	A6,	A7,	A8,	A9,	AA,	AB,	AC,	AD,	AE,	AF,	B0,	B1,	B2,	B3,	B4,	B5,	B6,	B7,	B8,	B9,	BA,	BB,	BC,	BD,	BE,	BF,	C0,	C1\
,	C2,	C3,	C4,	C5,	C6,	C7,	C8,	C9,	CA,	CB,	CC,	CD,	CE,	CF,	D0,	D1,	D2,	D3,	D4,	D5,	D6,	D7,	D8,	D9,	DA,	DB,	DC,	DD\
,	DE,	DF,	E0,	E1,	E2,	E3,	E4,	E5,	E6,	E7,	E8,	E9,	EA,	EB,	EC,	ED,	EE,	EF,	F0,	F1,	F2,	F3,	F4,	F5,	F6,	F7,	F8,	F9\
,	FA,	FB,	FC,	FD,	FE,	FF'.replace('\t', '')

hexList = hexstring.lower().split(',')
hexList.append('<EOP>') #End Of Packet token
hexDict = {}
    
for key, val in enumerate(hexList):
    if len(val) == 1:
        val = '0'+val
    hexDict[val] = key  #dictionary k=hex, v=int  

In [None]:
#Is padding on top (older timesteps/sessions) better than padding on bottom (recent timesteps)?

In [28]:
maxPackets = 10
#def hexOneHot(number):
#    zeroVec = np.zeros(257)
#    zeroVec[number] = 1.0
#    return zeroVec

def oneHot(index, granular = 'hex'):
    if granular == 'hex':
        vecLen = 257
    else:
        vecLen = 17
    
    zeroVec = np.zeros(vecLen)
    zeroVec[index] = 1.0
    
    return zeroVec

def oneHotSessions(sessionDict, maxPackets = maxPackets, packetTimeSteps = 256,
                   reverse = False, charLevel = False):
    """
    This takes a list of int tokens and onehot encodes them, pads sessions with zero tensors according to maxPackets
    and packet according to packetTimeSteps
    
    sessionDict = dict of lists of key = sessions and value = list of packets
    timeSteps = maximum len of packet. it will be padded with zero vectors is packet is too short.
    
    """
    
    listOsessions = []

    if charLevel:
        vecLen = 17
    else:
        vecLen = 257
    
    sessionKeys = sessionDict.keys()
    
    for session in sessionKeys:
        #sessionTensor = np.zeros((maxPackets, packetTimeSteps, vecLen))
        #lenSession = len(session)
        sessionCollect = []
        
        if len(sessionDict[session]) > maxPackets: #crop the number of sessions to maxPackets
            sessionList = sessionDict[session][:maxPackets]
        else:
            sessionList = sessionDict[session]
        
        for packet in sessionList:
            packet = [hexDict[packet[i:i+2]] for i in xrange(0,len(packet)-2+1,2)]
            
            if len(packet) >= packetTimeSteps: #crop packet to length packetTimeSteps
                packet = packet[:packetTimeSteps-1]
            
            packet = packet+[256] #add <EOP> end of packet token
        
            pacMat = np.array([oneHot(x) for x in packet]) #one hot encoding of packet into a matrix
            pacMatLen = len(pacMat)
        
            #padding packet
            if reverse:
                pacMat = pacMat[::-1]
            
            if pacMatLen < packetTimeSteps:
                #pad by stacking zeros on top of data so that earlier timesteps do not have information
                #padding the packet such that zeros are after the actual info for better translation
                pacMat = np.vstack( (pacMat, np.zeros((packetTimeSteps-pacMatLen,vecLen))) ) 

            if pacMatLen > packetTimeSteps:
                pacMat = pacMat[:packetTimeSteps, :]
                
            sessionCollect.append(pacMat)
        
        #padding session
        sessionCollect = np.asarray(sessionCollect)
        numPacketsInSession = np.asarray(sessionCollect).shape[0]
        if numPacketsInSession < maxPackets:
            #pad sessions to fit the 
            sessionCollect = np.vstack( (sessionCollect,np.zeros((maxPackets-numPacketsInSession, 
                                                                 packetTimeSteps, vecLen))) )
            
        listOsessions.append(sessionCollect)
        
    return listOsessions

In [29]:
sessions = oneHotSessions(hexSessions)

# Functions

In [7]:
def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def dropout(X, p=0.):
    if p != 0:
        retain_prob = 1 - p
        X = X / retain_prob * srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
    return X

# Gradient clipping
def clip_norm(g, c, n): 
    '''n is the norm, c is the threashold, and g is the gradient'''
    
    if c > 0: 
        g = T.switch(T.ge(n, c), g*c/n, g) 
    return g

def clip_norms(gs, c):
    norm = T.sqrt(sum([T.sum(g**2) for g in gs]))
    return [clip_norm(g, c, norm) for g in gs]

# Regularizers
def max_norm(p, maxnorm = 0.):
    if maxnorm > 0:
        norms = T.sqrt(T.sum(T.sqr(p), axis=0))
        desired = T.clip(norms, 0, maxnorm)
        p = p * (desired/ (1e-7 + norms))
    return p

def gradient_regularize(p, g, l1 = 0., l2 = 0.):
    g += p * l2
    g += T.sgn(p) * l1
    return g

def weight_regularize(p, maxnorm = 0.):
    p = max_norm(p, maxnorm)
    return p

def Adam(params, cost, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, l1 = 0., l2 = 0., maxnorm = 0., c = 8):
    
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, c)
    
    i = theano.shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr * (T.sqrt(fix2) / fix1)
    
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        g_t = gradient_regularize(p, g_t, l1=l1, l2=l2)
        p_t = p - (lr_t * g_t)
        p_t = weight_regularize(p_t, maxnorm=maxnorm)
        
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    
    updates.append((i, i_t))
    return updates

def RMSprop(cost, params, lr = 0.001, l1 = 0., l2 = 0., maxnorm = 0., rho=0.9, epsilon=1e-6, c = 8):
    
    grads = T.grad(cost, params)
    grads = clip_norms(grads, c)
    updates = []
    
    for p, g in zip(params, grads):
        g = gradient_regularize(p, g, l1 = l1, l2 = l2)
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        updates.append((acc, acc_new))
        
        updated_p = p - lr * (g / T.sqrt(acc_new + epsilon))
        updated_p = weight_regularize(updated_p, maxnorm = maxnorm)
        updates.append((p, updated_p))
    return updates

In [8]:
#makes output by shifting inputs down in time one step and then copying the last time step to the end.
def targetModifier(targetArray):
    newTarget = np.vstack((targetArray[1:, :], targetArray[-1,:]))
    return newTarget

def targetMaker(listOinputs):
    #TODO: do this with arrays
    outputs = []
    for inp in listOinputs:
        outputs.append(targetModifier(inp))
    outputs = np.asarray(outputs)
    
    return outputs

# Encoder RNN

In [9]:
dim = 257 #original data dimension/timesteps/columns
rnnType = 'gru' #gru or lstm
bidirectional = True
X = T.tensor3('inputs')
Xrev = T.matrix('reversed_inputs')
linewt_init = Uniform(width=0.08)
rnnwt_init = IsotropicGaussian(0.05)
rnnbias_init = Constant(0.0)

if rnnType == 'gru':
    rnn = GatedRecurrent(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnn = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

###ICLR suggestion -> don't use bias in RNNs
###RECURRENT LAYER

#To use or not to use that is the question
fork = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=dim, output_dims=[dim, dim * dimMultiplier], 
            weights_init = linewt_init, biases_init = rnnbias_init)
data1, data2 = fork.apply(X)

###for raw inputs
#data1 = X
#data2 = T.concatenate([X]*dimMultiplier, axis=2)

if rnnType == 'gru':
    hEnc = rnn.apply(data1, data2)[:,-1] #the [:,-1] gets the last hidden state for each obs in minibatch
                                         #i.e. the last state for each sentence
else:
    hinit, _ = rnn.apply(data2)
    hEnc = hinit[:,-1]

hEnc = T.reshape(hEnc,(maxPackets, 1, dim))
#get weights initialized. without weights are nans.
fork.initialize()
rnn.initialize()

In [10]:
#Encoder will return a maxPackets x packet length matrix
encoder = theano.function([X], hEnc, allow_input_downcast=True)

In [13]:
#test ENCODED PACKETS shape = (maxPackets, 1, dim)
encoder(sessions[1]).shape

(10, 1, 257)

# Context RNN

In [11]:

if rnnType == 'gru':
    rnnContext = GatedRecurrent(dim=dim, weights_init = rnnwt_init, 
                                biases_init = rnnbias_init, name = 'gruContext')
    dimMultiplier = 2
else:
    rnnContext = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, 
                      name = 'lstmContext')
    dimMultiplier = 4


###ICLR suggestion -> don't use bias in RNNs
#is encoding at each layer really the best way? or just feeding the raw through?
###RECURRENT LAYER
forkContext = Fork(output_names=['linearContext', 'gatesContext'],
            name='forkContext', input_dim=dim, output_dims=[dim, dim * dimMultiplier], 
            weights_init = linewt_init, biases_init = rnnbias_init)
data3, data4 = forkContext.apply(hEnc)

if rnnType == 'gru':
    hContext = rnnContext.apply(data3, data4)
else:
    hinitContext, _ = rnnContext.apply(data4)
    hContext = hinitContext

#THINK ABOUT ADDING L2 POOLING BEFORE CAT
if bidirectional:
    
    data3 = data3[::-1]
    data4 = data4[::-1]
    
    if rnnType == 'gru':
        rnnContextRev = GatedRecurrent(dim=dim, weights_init = rnnwt_init, 
                                       biases_init = rnnbias_init, name = 'gruContextRev')
        hContextRev = rnnContextRev.apply(data3, data4)
    else:
        rnnContextRev = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init,
                             name = 'lstmContextRev')
        hinitContext, _ = rnnContextRev.apply(data4)
        hContextRev = hinitContext
    
    
    hContext = T.concatenate((hContext, hContextRev), axis=2)
    rnnContextRev.initialize()
    
#get weights initialized. without weights are nans.
forkContext.initialize()
rnnContext.initialize()

In [12]:
#output shape = (maxPackets, 1, dim*2)
context = theano.function([X], hContext, allow_input_downcast=True)

In [14]:
context(sessions[1]).shape

(10, 1, 514)

# Decoder RNN

In [15]:
#does the fork encoding need to happen here?
#do we simply cat the hContext with the next words?

In [42]:
dimDec = dim*2

if rnnType == 'gru':
    rnnDec = GatedRecurrent(dim=dim, weights_init = rnnwt_init, 
                            biases_init = rnnbias_init, name = 'gruDecoder')
    dimMultiplier = 2
else:
    rnnDec = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstmDecoder')
    dimMultiplier = 4


forkDec = Fork(output_names=['linear', 'gates'],
            name='forkDec', input_dim=dimDec, output_dims=[dim, dim*dimMultiplier], 
            weights_init = linewt_init, biases_init = rnnbias_init)

forkFinal = Fork(output_names=['linear', 'gates'],
            name='forkDec', input_dim=dim, output_dims=[dim, dim*dimMultiplier], 
            weights_init = linewt_init, biases_init = rnnbias_init)

data5, data6 = forkDec.apply(hContext)#reduce dimension of bidirectLSTM

#decoding data needs to be one timestep (next packet in session) ahead, thus data1 we ignore the first packet
#and the last hidden state of the context RNN.
data7 = T.concatenate((data5[:-1,:,:], data1[1:, :-1, :]), axis=1) #data1 is the original embedding of X

#data8 = T.concatenate((data7, data5), axis = 2)
data8, data9 = forkFinal.apply(data7)


if rnnType == 'gru':
    hDec = rnnDec.apply(data8, data9) 
else:
    hinit, _ = rnnDec.apply(data9)
    hDec = hinit

#Smooth out the probabilities of hDec
softmax = NDimensionalSoftmax()
softout = softmax.apply(hDec, extra_ndim = 1)
    

precost = X[1:, :, :]*np.log(softout) + (1-X[1:, :, :])*np.log(1-softout)
cost = -T.mean(T.sum(T.sum(precost, axis = 2), axis = 1))
#cost = BinaryCrossEntropy().apply(X[1:, :, :], softout)

#get weights initialized
forkDec.initialize()
forkFinal.initialize()
rnnDec.initialize()

In [43]:
decoderTest = theano.function([X], cost, allow_input_downcast=True)

In [45]:
decoderTest(sessions[1])

array(3.7743141651153564, dtype=float32)

In [17]:
#output shape = (maxPackets, )
decoder = theano.function([X], hDec, allow_input_downcast=True)

In [18]:
decTest = decoder(sessions[1])

In [61]:
decTest

array([[[  2.84144655e-04,  -1.53979345e-04,  -1.43260710e-04, ...,
           5.78663048e-06,  -8.96845377e-05,   3.08299241e-05],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        ..., 
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00]],

       [[  6.37453981e-04,  -2.28210120e-04,  -1.84183169e-04, ...,
           6.43644235e-05,  -2.77327839e-04,   1.47678642e-04],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+0

In [None]:
pYx = 1/(1+T.exp(-hDec))
softmax = NDimensionalSoftmax()
softout = softmax.apply(pYx, extra_ndim = 1)

#test
decoderTest = theano.function([X], softout, allow_input_downcast=True)

In [46]:
#cost = BinaryCrossEntropy().apply(Y, softout)

#cg = ComputationGraph([cost])

In [49]:
costTest = theano.function([X], precost, allow_input_downcast=True)

In [50]:
testOut = costTest(sessions[1])

In [58]:
testOut

array([[[ -3.89871583e-03,  -3.89872422e-03,  -3.89882107e-03, ...,
          -3.89862969e-03,  -3.89850629e-03,  -3.89875122e-03],
        [ -3.89181473e-03,  -3.88616836e-03,  -3.89087247e-03, ...,
          -3.89794679e-03,  -3.91108310e-03,  -3.93225765e-03],
        [ -3.91480559e-03,  -3.91625846e-03,  -3.90912173e-03, ...,
          -3.90114449e-03,  -3.88275879e-03,  -3.90282343e-03],
        ..., 
        [ -3.89864040e-03,  -3.89864040e-03,  -3.89864040e-03, ...,
          -3.89864040e-03,  -3.89864040e-03,  -3.89864040e-03],
        [ -3.89864040e-03,  -3.89864040e-03,  -3.89864040e-03, ...,
          -3.89864040e-03,  -3.89864040e-03,  -3.89864040e-03],
        [ -3.89864040e-03,  -3.89864040e-03,  -3.89864040e-03, ...,
          -3.89864040e-03,  -3.89864040e-03,  -3.89864040e-03]],

       [[ -3.89882922e-03,  -3.89889674e-03,  -3.89896357e-03, ...,
          -3.89861921e-03,  -3.89827415e-03,  -3.89897381e-03],
        [ -3.88982450e-03,  -3.88449896e-03,  -3.88299325e-0

In [62]:
np.mean(np.sum(np.sum(testOut, axis = 2), axis =1))

-969.9986

In [None]:
learning_rate = 0.01
params = VariableFilter(roles = [PARAMETER])(cg.variables)
#updates = Adam(params, cost, learning_rate, c=10) #c is gradient clipping parameter
updates = RMSprop(cost, params, learning_rate, c=1)

In [None]:
gradients = T.grad(cost, params)
gradients = clip_norms(gradients, 1)
gradientFun = theano.function([X,Y], gradients, allow_input_downcast=True)
train = theano.function([X,Y], cost, updates = updates, allow_input_downcast=True)
predict = theano.function([X], softout, allow_input_downcast=True)

#test
inputs = np.asarray(normalizedData[:3])
outputs = targetMaker(inputs)

In [None]:
#shuffle data
random.shuffle(normalizedData)
trainPercent = 0.9
trainIndex = int(len(normalizedData)*trainPercent)

trainData = normalizedData[0:trainIndex]
testData = normalizedData[trainIndex:]

In [None]:
#TODO: make a training function
runname = 'firstRun'
epochCost = []
gradNorms = []

epochs = 200000
batch_size = 64
iteration = 0

for epoch in xrange(epochs):
    
    costCollect = []

    for start, end in zip(range(0, len(trainData),batch_size), range(batch_size, len(trainData), batch_size)):
        
        inputs = trainData[start:end]
        outputs = targetMaker(inputs)
        costfun = train(inputs, outputs)
        
        
        costCollect.append(costfun)
                
        iteration+=1
        
    ####SAVE COST TO FILE  
    if epoch%30 == 0:
        print(' ')
        print 'Epoch: ', epoch
        epochCost.append(np.mean(costCollect))
        print 'Epoch cost average: ', epochCost[-1]
        grads = gradientFun(inputs, outputs)
        for gra in grads:
            print '  gradient norms: ', np.linalg.norm(gra)
        
    
    np.savetxt(runname+"_COST.csv", epochCost, delimiter=",")


In [None]:
dataPath = '/data/bradspcaps.txt'
data = []
with open(dataPath, 'rb') as f:
    for line in f.readlines():
        data.append(line.split("'data': ")[-1].split(',')[0].replace("'", ""))

In [None]:
#Making the hex dictionary
hexstring = '0,	1,	2,	3,	4,	5,	6,	7,	8,	9,	A,	B,	C,	D,	E,	F,	10,	11,	12,	13,	14,	15,	16,	17,	18,	19\
,	1A,	1B,	1C,	1D,	1E,	1F,	20,	21,	22,	23,	24,	25,	26,	27,	28,	29,	2A,	2B,	2C,	2D,	2E,	2F,	30,	31,	32,	33,	34,	35\
,	36,	37,	38,	39,	3A,	3B,	3C,	3D,	3E,	3F,	40,	41,	42,	43,	44,	45,	46,	47,	48,	49,	4A,	4B,	4C,	4D,	4E,	4F,	50,	51\
,	52,	53,	54,	55,	56,	57,	58,	59,	5A,	5B,	5C,	5D,	5E,	5F,	60,	61,	62,	63,	64,	65,	66,	67,	68,	69,	6A,	6B,	6C,	6D\
,	6E,	6F,	70,	71,	72,	73,	74,	75,	76,	77,	78,	79,	7A,	7B,	7C,	7D,	7E,	7F,	80,	81,	82,	83,	84,	85,	86,	87,	88,	89\
,	8A,	8B,	8C,	8D,	8E,	8F,	90,	91,	92,	93,	94,	95,	96,	97,	98,	99,	9A,	9B,	9C,	9D,	9E,	9F,	A0,	A1,	A2,	A3,	A4,	A5\
,	A6,	A7,	A8,	A9,	AA,	AB,	AC,	AD,	AE,	AF,	B0,	B1,	B2,	B3,	B4,	B5,	B6,	B7,	B8,	B9,	BA,	BB,	BC,	BD,	BE,	BF,	C0,	C1\
,	C2,	C3,	C4,	C5,	C6,	C7,	C8,	C9,	CA,	CB,	CC,	CD,	CE,	CF,	D0,	D1,	D2,	D3,	D4,	D5,	D6,	D7,	D8,	D9,	DA,	DB,	DC,	DD\
,	DE,	DF,	E0,	E1,	E2,	E3,	E4,	E5,	E6,	E7,	E8,	E9,	EA,	EB,	EC,	ED,	EE,	EF,	F0,	F1,	F2,	F3,	F4,	F5,	F6,	F7,	F8,	F9\
,	FA,	FB,	FC,	FD,	FE,	FF'.replace('\t', '')

hexList = hexstring.lower().split(',')
hexList.append('EOP') #End Of Packet token
hexDict = {}
    
for key, val in enumerate(hexList):
    if len(val) == 1:
        val = '0'+val
    hexDict[val] = key    

#we add 256 on the end to signify the end of the packet ('EOP')
tokenizedHeader = [[hexDict[header[i:i+2]] for i in xrange(0,len(header)-2+1,2)]+[256] for header in data]


#list of arrays that represent a header with row = time 
oneHotHeaders = [np.asarray([oneHot(item) for item in header]) for header in tokenizedHeader]

normalizedData = normalizeArrays(oneHotHeaders, 253, reverse=False)

# Pretraining essentials

In [None]:
numTokens = 257
rnnType = 'gru'
X = T.tensor3('inputs')
Y = T.tensor3('outputs')
linewt_init = Uniform(width=0.02)
rnnwt_init = IsotropicGaussian(0.08)
rnnbias_init = Constant(0.0)

if rnnType == 'gru':
    rnnDec = GatedRecurrent(dim=numTokens, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnnDec = LSTM(dim=numTokens, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

###ICLR suggestion -> don't use bias in RNNs
###RECURRENT LAYER
forkDec = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=numTokens, output_dims=[numTokens, numTokens * dimMultiplier], 
            weights_init = linewt_init, biases_init = rnnbias_init)
data5, data6 = forkDec.apply(X)

if rnnType == 'gru':
    hDec = rnnDec.apply(data5, data6) 
else:
    hinit, _ = rnnDec.apply(data6)
    hDec = hinit

#CRITICAL: need to loop through the arrays. Do regular people update after every sequence? or minibatch of seqs?
    
pYx = 1/(1+T.exp(-hDec))
softmax = NDimensionalSoftmax()
softout = softmax.apply(pYx, extra_ndim = 1)

#get weights initialized
forkDec.initialize()
rnnDec.initialize()

#cost = BinaryCrossEntropy().apply(Y, softout)
precost = Y*np.log(softout) + (1-Y)*np.log(1-softout)
cost = -T.mean(T.sum(T.sum(precost[:,:-1,:], axis = 2), axis = 1))
cg = ComputationGraph([cost])

learning_rate = 0.01
params = VariableFilter(roles = [PARAMETER])(cg.variables)
#updates = Adam(params, cost, learning_rate, c=10) #c is gradient clipping parameter
updates = RMSprop(cost, params, learning_rate, c=1)

gradients = T.grad(cost, params)
gradients = clip_norms(gradients, 1)
gradientFun = theano.function([X,Y], gradients, allow_input_downcast=True)
train = theano.function([X,Y], cost, updates = updates, allow_input_downcast=True)
predict = theano.function([X], softout, allow_input_downcast=True)

In [None]:
random.shuffle(normalizedData)
trainPercent = 0.9
trainIndex = int(len(normalizedData)*trainPercent)

trainData = normalizedData[0:trainIndex]
testData = normalizedData[trainIndex:]

runname = 'firstRun'
epochCost = []
gradNorms = []

epochs = 200000
batch_size = 64
iteration = 0

for epoch in xrange(epochs):
    
    costCollect = []

    for start, end in zip(range(0, len(trainData),batch_size), range(batch_size, len(trainData), batch_size)):
        
        inputs = trainData[start:end]
        outputs = targetMaker(inputs)
        costfun = train(inputs, outputs)
        
        
        costCollect.append(costfun)
                
        iteration+=1
        
    ####SAVE COST TO FILE  
    if epoch%30 == 0:
        print(' ')
        print 'Epoch: ', epoch
        epochCost.append(np.mean(costCollect))
        print 'Epoch cost average: ', epochCost[-1]
        grads = gradientFun(inputs, outputs)
        for gra in grads:
            print '  gradient norms: ', np.linalg.norm(gra)
        
    
    np.savetxt(runname+"_COST.csv", epochCost, delimiter=",")

# Converting to CPU

In [None]:
#GPU TO CPU conversion
#Now get the weights from the test function. These weights will be numpy arrays
w1 = test.get_shared()[0].get_value()

#Here the weights are going to be set to the numpy arrays taken from the GPU predict function
input_linear.parameters[0].set_value(w1)

In [None]:
test.get_shared()[2].get_value().shape

In [None]:
chars = '1234567890abcdefghijklmnopqrstuvwxyz'
words = ['']

# Scratchpad

#we add 256 on the end to signify the end of the packet ('EOP')

maxPackets = 10 #limit the number of packets
tokSessions = []
oneHotSessions = []

for ses in hexSessions.keys():    
    tokPacket = []
    oneHotPacket = []
    for p in hexSessions[ses][:maxPackets]:
        tokP = [hexDict[p[i:i+2]] for i in xrange(0,len(p)-2+1,2)]+[256] #takes hexstring and tokenizes hex pairs
        tokPacket.append(tokP)
        oneHotPacket.append(oneHot(tokP))

    tokSessions.append(tokPacket)
    oneHotSessions.append(oneHotPacket)


###ALT RNN LAYER
def initialize(to_init):
    for bricks in to_init:
        bricks.weights_init = initialization.Uniform(width=0.08)
        bricks.biases_init = initialization.Constant(0)
        bricks.initialize()

def gru_layer(dim, h, n):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n), input_dim=dim, output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    return gru.apply(linear, gates)


def lstm_layer(dim, h, n):
    linear = Linear(input_dim=dim, output_dim=dim * 4, name='linear' + str(n))
    lstm = LSTM(dim=dim, name='lstm' + str(n))
    initialize([linear, lstm])
    return lstm.apply(linear.apply(h))
