In [1]:
import numpy as np # library that supports large and multi-dimensional arrays and matrices
import pandas as pd # library for data manipulation and analysis
import scipy.io # to read data from and write data to a variety of file formats
from scipy import sparse # 2D sparse matrix package
from sklearn.decomposition import PCA # linear dimensionality reduction
from sklearn.linear_model import Ridge # linear least squares with L2 regularization
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import math
from matplotlib.pyplot import *
import time

In [2]:
class Reservoir(object):
    """
    Build a reservoir and evaluate internal states
    
    Parameters:
        n_internal_units = processing units in the reservoir
        spectral_radius = largest eigenvalue of the reservoir matrix of connection weights
        leak = amount of leakage in the reservoir state update (optional)
        connectivity = percentage of nonzero connection weights (unused in circle reservoir)
        input_scaling = scaling of the input connection weights
        noise_level = deviation of the Gaussian noise injected in the state update
        circle = generate determinisitc reservoir with circle topology
    """
    
    def __init__(self, n_internal_units=100, spectral_radius=0.99, leak=None,
                 connectivity=0.3, input_scaling=0.2, noise_level=0.01, circle=False):
        
        # Initialize attributes
        self._n_internal_units = n_internal_units
        self._input_scaling = input_scaling
        self._noise_level = noise_level
        self._leak = leak

        # Input weights depend on input size: they are set when data is provided
        self._input_weights = None

        # Generate internal weights
        if circle:
            self._internal_weights = self._initialize_internal_weights_Circ(
                    n_internal_units,
                    spectral_radius)
        else:
            self._internal_weights = self._initialize_internal_weights(
                n_internal_units,
                connectivity,
                spectral_radius)


    def _initialize_internal_weights_Circ(self, n_internal_units, spectral_radius):
        np.random.seed(33)
        internal_weights = np.zeros((n_internal_units, n_internal_units))
        internal_weights[0,-1] = spectral_radius
        for i in range(n_internal_units-1):
            internal_weights[i+1,i] = spectral_radius
                
        return internal_weights
    
    
    def _initialize_internal_weights(self, n_internal_units,
                                     connectivity, spectral_radius):

        # Generate sparse, uniformly distributed weights.
        np.random.seed(33)
        internal_weights = sparse.rand(n_internal_units,
                                       n_internal_units,
                                       density=connectivity).todense()

        # Ensure that the nonzero values are uniformly distributed in [-0.5, 0.5]
        np.random.seed(33)
        internal_weights[np.where(internal_weights > 0)] -= 0.5
        
        # Adjust the spectral radius.
        E, _ = np.linalg.eig(internal_weights)
        e_max = np.max(np.abs(E))
        internal_weights /= np.abs(e_max)/spectral_radius       

        return internal_weights


    def _compute_state_matrix(self, X, n_drop=0):
        N, T, _ = X.shape
        previous_state = np.zeros((N, self._n_internal_units), dtype=float)

        # Storage
        state_matrix = np.empty((N, T - n_drop, self._n_internal_units), dtype=float)
        for t in range(T):
            current_input = X[:, t, :]

            # Calculate state
            state_before_tanh = self._internal_weights.dot(previous_state.T) + self._input_weights.dot(current_input.T)

            # Add noise
            state_before_tanh += np.random.rand(self._n_internal_units, N)*self._noise_level

            # Apply nonlinearity and leakage (optional)
            if self._leak is None:
                previous_state = np.tanh(state_before_tanh).T
            else:
                previous_state = (1.0 - self._leak)*previous_state + np.tanh(state_before_tanh).T

            # Store everything after the dropout period
            if (t > n_drop - 1):
                state_matrix[:, t - n_drop, :] = previous_state

        return state_matrix


    def get_states(self, X, n_drop=0, bidir=True):
        np.random.seed(33)
        N, T, V = X.shape
        np.random.seed(33)
        if self._input_weights is None:
            self._input_weights = (2.0*np.random.binomial(1, 0.5 , [self._n_internal_units, V]) - 1.0)*self._input_scaling

        # compute sequence of reservoir states
        states = self._compute_state_matrix(X, n_drop)
    
        # reservoir states on time reversed input
        if bidir is True:
            X_r = X[:, ::-1, :]
            states_r = self._compute_state_matrix(X_r, n_drop)
            states = np.concatenate((states, states_r), axis=2)

        return states
    
    def getReservoirEmbedding(self, X,pca, ridge_embedding,  n_drop=5, bidir=True, test = False):

        res_states = self.get_states(X, n_drop=5, bidir=True)


        N_samples = res_states.shape[0]
        res_states = res_states.reshape(-1, res_states.shape[2])                   
        # ..transform..
        if test:
            red_states = pca.transform(res_states)
        else:
            red_states = pca.fit_transform(res_states)          
        # ..and put back in tensor form
        red_states = red_states.reshape(N_samples,-1,red_states.shape[1])  

        coeff_tr = []
        biases_tr = []   

        for i in range(X.shape[0]):
            ridge_embedding.fit(red_states[i, 0:-1, :], red_states[i, 1:, :])
            coeff_tr.append(ridge_embedding.coef_.ravel())
            biases_tr.append(ridge_embedding.intercept_.ravel())
        #print(np.array(coeff_tr).shape,np.array(biases_tr).shape)
        input_repr = np.concatenate((np.vstack(coeff_tr), np.vstack(biases_tr)), axis=1)
        return input_repr

In [3]:
def targetify(s):
    if s == 'BENIGN':
        return 0
    else:
        return 1

In [4]:
def eqArray(a,b):
    return np.where(a == b, 1, 0)

In [5]:
#datasets = ["Wednesday2017.csv"]

#features_Th15022018 = ['Fwd Seg Size Min', 'Init Fwd Win Byts', 'Fwd IAT Min', 'Bwd Pkt Len Max', 'Bwd IAT Mean', 'Pkt Len Max', 'Fwd IAT Tot', 'Fwd IAT Max', 'Bwd Pkt Len Std', 'Flow IAT Mean', 'ACK Flag Cnt', 'Fwd IAT Mean', 'Flow Duration', 'Flow IAT Min', 'Flow IAT Max', 'Bwd IAT Max', 'Idle Max', 'Init Bwd Win Byts', 'Idle Min', 'Flow IAT Std']
#features_Fr16022018 = ['Fwd Pkt Len Std', 'Fwd Pkt Len Max', 'Fwd Pkt Len Mean', 'Pkt Len Std', 'Fwd Seg Size Avg', 'Pkt Len Mean', 'Bwd Pkt Len Mean', 'TotLen Fwd Pkts', 'Subflow Fwd Byts', 'Bwd Pkt Len Max', 'Bwd Pkt Len Std', 'Pkt Len Var', 'Flow IAT Mean', 'ACK Flag Cnt', 'Pkt Len Max']

#numFeatures = [10, 15, 20]
#fracOfData = [0.5, 0.75, 1]
#numInternalUnits = [5, 10, 15, 20]

In [6]:
#dataset = "Wednesday2017raw.csv"
df1 = pd.read_csv('Wednesday2017raw.csv')
df1


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.14-209.48.71.168-49459-80-6,192.168.10.14,49459,209.48.71.168,80,6,5/7/2017 8:42,38308,1,1,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,192.168.10.3-192.168.10.17-389-49453-6,192.168.10.17,49453,192.168.10.3,389,6,5/7/2017 8:42,479,11,5,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,192.168.10.3-192.168.10.17-88-46124-6,192.168.10.17,46124,192.168.10.3,88,6,5/7/2017 8:42,1095,10,6,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,192.168.10.3-192.168.10.17-389-49454-6,192.168.10.17,49454,192.168.10.3,389,6,5/7/2017 8:42,15206,17,12,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.3-192.168.10.17-88-46126-6,192.168.10.17,46126,192.168.10.3,88,6,5/7/2017 8:42,1092,9,6,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692698,192.168.10.3-192.168.10.14-53-51114-17,192.168.10.14,51114,192.168.10.3,53,17,5/7/2017 12:10,32215,4,2,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
692699,192.168.10.3-192.168.10.16-53-24054-17,192.168.10.16,24054,192.168.10.3,53,17,5/7/2017 3:02,324,2,2,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
692700,192.168.10.51-23.208.163.130-58030-443-6,23.208.163.130,443,192.168.10.51,58030,6,5/7/2017 10:06,82,2,1,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
692701,192.168.10.3-192.168.10.14-53-51694-17,192.168.10.14,51694,192.168.10.3,53,17,5/7/2017 1:19,1048635,6,2,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [7]:
df2 = df1.groupby([' Timestamp'])[' Flow Duration'].count()
df2 = pd.DataFrame(df2).reset_index()
df2.columns=[' Timestamp','CountOfOtherBiFlowsAtThisTimestamp']
df = df1.merge(df2, left_on=' Timestamp', right_on=' Timestamp')
df = df.sort_values(' Timestamp')
num_features = 20 # should be 10, 15, or 20
features_wed = ['CountOfOtherBiFlowsAtThisTimestamp', ' Bwd Packet Length Mean', ' Idle Max', ' Fwd IAT Max', 
                ' ACK Flag Count', ' Protocol', ' Destination Port', ' Avg Bwd Segment Size', ' Fwd IAT Std', 
                'Idle Mean', ' Source Port', ' Flow IAT Max', 'Fwd IAT Total', ' Average Packet Size',
                ' Bwd Packet Length Std', ' Packet Length Std', ' Packet Length Mean', 
                'Bwd Packet Length Max', ' min_seg_size_forward', 'Fwd Packets/s',' Min Packet Length'] 
features = features_wed[0:num_features]
df

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,CountOfOtherBiFlowsAtThisTimestamp
70746,133.237.16.47-192.168.10.17-443-32968-6,133.237.16.47,443,192.168.10.17,32968,6,5/7/2017 10:00,3,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,1622
71684,192.168.10.3-192.168.10.17-53-3066-17,192.168.10.17,3066,192.168.10.3,53,17,5/7/2017 10:00,183,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,1622
71683,192.168.10.3-192.168.10.17-53-10868-17,192.168.10.17,10868,192.168.10.3,53,17,5/7/2017 10:00,275,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,1622
71682,192.168.10.3-192.168.10.17-53-3722-17,192.168.10.17,3722,192.168.10.3,53,17,5/7/2017 10:00,194,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,1622
71681,192.168.10.3-192.168.10.5-53-52520-17,192.168.10.5,52520,192.168.10.3,53,17,5/7/2017 10:00,50700,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,1622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69893,192.168.10.1-192.168.10.3-53-62423-17,192.168.10.3,62423,192.168.10.1,53,17,5/7/2017 9:59,81895569,2,2,...,142895.0,0.0,142895.0,142895.0,81700000.0,0.0,81700000.0,81700000.0,BENIGN,1061
69894,192.168.10.1-192.168.10.3-53-61329-17,192.168.10.3,61329,192.168.10.1,53,17,5/7/2017 9:59,60667,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,1061
69895,192.168.10.1-192.168.10.3-53-60371-17,192.168.10.3,60371,192.168.10.1,53,17,5/7/2017 9:59,23450,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,1061
69881,192.168.10.1-192.168.10.3-53-61208-17,192.168.10.3,61208,192.168.10.1,53,17,5/7/2017 9:59,50295,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,1061


In [8]:
fraction = 0.5
print(str(num_features) + " features")
print("fraction:" + str(fraction))
data = df.sample(frac=fraction, replace=True, random_state=1)

# get X and y. Normalize X and make it into 3D shape for reservoir
num_col = data.shape[1]
num_row = data.shape[0]

X_data = data[features]
X_data[features] = X_data[features].apply(pd.to_numeric, errors='coerce', axis=1)
#norm_scaler = preprocessing.StandardScaler()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_data.values)
X = np.nan_to_num(x_scaled)
if len(X.shape) < 3:
    X = np.atleast_3d(X)
y = data[' Label'].apply(targetify)
print("Finished loading X and y......")

# split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

print("X_train shape:" + str(X_train.shape), "y_train shape:" + str(y_train.shape))
print("X_test shape:" + str(X_test.shape), "y_test shape:" + str(y_test.shape))

pca = PCA() #n_components gives number of components to keep for linear dimensionality reduction
ridge_embedding = Ridge(alpha=10, fit_intercept=True)
readout = Ridge(alpha=5)

20 features
fraction:0.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Finished loading X and y......
X_train shape:(277081, 20, 1) y_train shape:(277081,)
X_test shape:(69271, 20, 1) y_test shape:(69271,)


In [9]:
n=30 #number of internal units changed to 90 from 10
print(str(n) + " internal units")

#run through reservoir
start = time.clock()
res = Reservoir(n_internal_units=n, spectral_radius=0.9, leak=0.2,
     connectivity=0.25, input_scaling=0.3, noise_level=0.01, circle=False)
input_repr = res.getReservoirEmbedding(np.array(X_train), pca, ridge_embedding,  n_drop=0, bidir=False, test = False)
print("Finished loading training reservoir embedding......") # n_drop changed to 0 from 5
end = time.clock()
input_repr_te = res.getReservoirEmbedding(np.array(X_test), pca, ridge_embedding,  n_drop=0, bidir=False, test = True)
print("Finished loading testing reservoir embedding......")  # n_drop changed to 0 from 5

#fit output
readout.fit(input_repr, y_train)
pred_class = readout.predict(input_repr_te)
#predictions = [int(round(x)) for x in pred_class]
true_class = list(y_test)

#analysis
compdf = pd.DataFrame({'pred_class':pred_class, 'true_class':true_class})
compdf = compdf.sort_values('pred_class', ascending=False)
print(str(compdf.head(10)))
#compdf.to_csv(str(dataset.split('_')[0]) + '_' + str(fraction) + '_' + str(num_features) + '_' + str(n) + '.csv')
#accuracy = np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)
#f1 = f1_score(true_class, predictions)
#auc = roc_auc_score(true_class, predictions)

#print("# of nonzero:" + str(np.count_nonzero(predictions)))
#print("accuracy is " + str(accuracy))
#print("f1 is " + str(f1))
#print("auc is " + str(auc))
#print("*******************************************************************")

30 internal units
Finished loading training reservoir embedding......
Finished loading testing reservoir embedding......
       pred_class  true_class
59682    1.285959           1
35196    1.285045           1
57692    1.255742           1
42779    1.237151           1
17540    1.213558           1
27532    1.212720           1
19699    1.212586           1
67060    1.208757           1
68346    1.205970           1
10746    1.187439           1


In [10]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [11]:
predictions = list(compdf['pred_class'].apply(myRound, r=225))
true_class = list(compdf['true_class'])
accuracy = np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)
accuracy

0.9735098381718179

In [12]:
from sklearn.metrics import confusion_matrix

In [13]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [14]:
confm = confusion_matrix(true_class, predictions)
confm

array([[42453,  1676],
       [  159, 24983]], dtype=int64)

In [15]:
tn, fp, fn, tp = confm.ravel()
(tn + tp)/(tn+tp+fn+fp)

0.9735098381718179

In [16]:
print("False alarm rate is")
fp/(tn+fp)

False alarm rate is


0.03797955992657889

In [17]:
print("Precision is")
Precision = tp/(tp+fp)
Precision

Precision is


0.9371319254285607

In [18]:
print("Recall is")
Recall = tp/(tp+fn)
Recall

Recall is


0.9936759207700262

In [19]:
print("F1 is")
2*Precision*Recall/(Precision+Recall)

F1 is


0.9645759734368061

In [20]:
print("Run Time: %.4f" % (end-start))

Run Time: 987.9935
