In [1]:
import numpy as np # library that supports large and multi-dimensional arrays and matrices
import pandas as pd # library for data manipulation and analysis
import scipy.io # to read data from and write data to a variety of file formats
from scipy import sparse # 2D sparse matrix package
from sklearn.decomposition import PCA # linear dimensionality reduction
from sklearn.linear_model import Ridge # linear least squares with L2 regularization
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import math
from matplotlib.pyplot import *

In [2]:
class Reservoir(object):
    """
    Build a reservoir and evaluate internal states
    
    Parameters:
        n_internal_units = processing units in the reservoir
        spectral_radius = largest eigenvalue of the reservoir matrix of connection weights
        leak = amount of leakage in the reservoir state update (optional)
        connectivity = percentage of nonzero connection weights (unused in circle reservoir)
        input_scaling = scaling of the input connection weights
        noise_level = deviation of the Gaussian noise injected in the state update
        circle = generate determinisitc reservoir with circle topology
    """
    
    def __init__(self, n_internal_units=100, spectral_radius=0.99, leak=None,
                 connectivity=0.3, input_scaling=0.2, noise_level=0.01, circle=False):
        
        # Initialize attributes
        self._n_internal_units = n_internal_units
        self._input_scaling = input_scaling
        self._noise_level = noise_level
        self._leak = leak

        # Input weights depend on input size: they are set when data is provided
        self._input_weights = None

        # Generate internal weights
        if circle:
            self._internal_weights = self._initialize_internal_weights_Circ(
                    n_internal_units,
                    spectral_radius)
        else:
            self._internal_weights = self._initialize_internal_weights(
                n_internal_units,
                connectivity,
                spectral_radius)


    def _initialize_internal_weights_Circ(self, n_internal_units, spectral_radius):
        
        internal_weights = np.zeros((n_internal_units, n_internal_units))
        internal_weights[0,-1] = spectral_radius
        for i in range(n_internal_units-1):
            internal_weights[i+1,i] = spectral_radius
                
        return internal_weights
    
    
    def _initialize_internal_weights(self, n_internal_units,
                                     connectivity, spectral_radius):

        # Generate sparse, uniformly distributed weights.
        internal_weights = sparse.rand(n_internal_units,
                                       n_internal_units,
                                       density=connectivity).todense()

        # Ensure that the nonzero values are uniformly distributed in [-0.5, 0.5]
        internal_weights[np.where(internal_weights > 0)] -= 0.5
        
        # Adjust the spectral radius.
        E, _ = np.linalg.eig(internal_weights)
        e_max = np.max(np.abs(E))
        internal_weights /= np.abs(e_max)/spectral_radius       

        return internal_weights


    def _compute_state_matrix(self, X, n_drop=0):
        N, T, _ = X.shape
        previous_state = np.zeros((N, self._n_internal_units), dtype=float)

        # Storage
        state_matrix = np.empty((N, T - n_drop, self._n_internal_units), dtype=float)
        for t in range(T):
            current_input = X[:, t, :]

            # Calculate state
            state_before_tanh = self._internal_weights.dot(previous_state.T) + self._input_weights.dot(current_input.T)

            # Add noise
            state_before_tanh += np.random.rand(self._n_internal_units, N)*self._noise_level

            # Apply nonlinearity and leakage (optional)
            if self._leak is None:
                previous_state = np.tanh(state_before_tanh).T
            else:
                previous_state = (1.0 - self._leak)*previous_state + np.tanh(state_before_tanh).T

            # Store everything after the dropout period
            if (t > n_drop - 1):
                state_matrix[:, t - n_drop, :] = previous_state

        return state_matrix


    def get_states(self, X, n_drop=0, bidir=True):
        N, T, V = X.shape
        if self._input_weights is None:
            self._input_weights = (2.0*np.random.binomial(1, 0.5 , [self._n_internal_units, V]) - 1.0)*self._input_scaling

        # compute sequence of reservoir states
        states = self._compute_state_matrix(X, n_drop)
    
        # reservoir states on time reversed input
        if bidir is True:
            X_r = X[:, ::-1, :]
            states_r = self._compute_state_matrix(X_r, n_drop)
            states = np.concatenate((states, states_r), axis=2)

        return states
    
    def getReservoirEmbedding(self, X,pca, ridge_embedding,  n_drop=0, bidir=True, test = False):

        res_states = self.get_states(X, n_drop=0, bidir=True)


        N_samples = res_states.shape[0]
        res_states = res_states.reshape(-1, res_states.shape[2])                   
        # ..transform..
        if test:
            red_states = pca.transform(res_states)
        else:
            red_states = pca.fit_transform(res_states)          
        # ..and put back in tensor form
        red_states = red_states.reshape(N_samples,-1,red_states.shape[1])  

        coeff_tr = []
        biases_tr = []   

        for i in range(X.shape[0]):
            ridge_embedding.fit(red_states[i, 0:-1, :], red_states[i, 1:, :])
            coeff_tr.append(ridge_embedding.coef_.ravel())
            biases_tr.append(ridge_embedding.intercept_.ravel())
        #print(np.array(coeff_tr).shape,np.array(biases_tr).shape)
        input_repr = np.concatenate((np.vstack(coeff_tr), np.vstack(biases_tr)), axis=1)
        return input_repr

In [3]:
def targetify(s):
    if s == 'BENIGN':
        return 0
    else:
        return 1

In [4]:
def eqArray(a,b):
    return np.where(a == b, 1, 0)

In [5]:
df1 = pd.read_csv('short_oversampled_CICDDoS2019.csv')
df1

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label,target
0,331784,2383541,25018,172.16.0.5-192.168.50.1-900-51191-17,172.16.0.5,900,192.168.50.1,51191,17,2018-12-01 11:27:48.991419,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_LDAP,1
1,681538,2787990,3686,172.16.0.5-192.168.50.1-900-30666-17,172.16.0.5,900,192.168.50.1,30666,17,2018-12-01 11:29:40.752984,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_LDAP,1
2,129580,9703,7328,192.168.50.8-74.208.236.171-58739-80-6,192.168.50.8,58739,74.208.236.171,80,6,2018-12-01 09:58:53.243688,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN,0
3,422212,630396,1480,172.16.0.5-192.168.50.1-780-975-17,172.16.0.5,780,192.168.50.1,975,17,2018-12-01 10:40:27.990422,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_NTP,1
4,67101,5034493,1847,172.217.10.98-192.168.50.7-80-52361-6,192.168.50.7,52361,172.217.10.98,80,6,2018-12-01 13:29:02.216554,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,619953,211,6203,192.168.50.7-4.2.2.4-60801-53-17,192.168.50.7,60801,4.2.2.4,53,17,2018-12-01 09:17:35.742876,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN,0
999996,289395,2027963,12376,172.16.0.5-192.168.50.1-798-28534-17,172.16.0.5,798,192.168.50.1,28534,17,2018-12-01 11:26:13.214604,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_LDAP,1
999997,673174,2464450,26748,172.16.0.5-192.168.50.1-975-62605-17,172.16.0.5,975,192.168.50.1,62605,17,2018-12-01 11:28:10.369171,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_LDAP,1
999998,861416,1474353,28971,172.16.0.5-192.168.50.1-518-50242-17,172.16.0.5,518,192.168.50.1,50242,17,2018-12-01 11:23:48.951612,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_LDAP,1


In [6]:
df2 = df1.groupby([' Timestamp'])[' Flow Duration'].count()
df2 = pd.DataFrame(df2).reset_index()
df2.columns=[' Timestamp','BiFlowsCount']
df = df1.merge(df2, left_on=' Timestamp', right_on=' Timestamp')
df = df.sort_values(' Timestamp')
del df['Unnamed: 0']
del df['Unnamed: 0.1']
del df['Unnamed: 0.1.1']
del df['target']
del df[' Inbound']
df

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Label,BiFlowsCount
98143,172.217.3.110-192.168.50.6-80-56085-6,192.168.50.6,56085,172.217.3.110,80,6,2018-12-01 09:17:13.931484,34847273,9,8,...,32.908965,33030.0,32967.0,10004347.0,8980.692846,10014717.0,9999149.0,0,BENIGN,151
98130,172.217.3.110-192.168.50.6-80-56085-6,192.168.50.6,56085,172.217.3.110,80,6,2018-12-01 09:17:13.931484,34847273,9,8,...,32.908965,33030.0,32967.0,10004347.0,8980.692846,10014717.0,9999149.0,0,BENIGN,151
98131,172.217.3.110-192.168.50.6-80-56085-6,192.168.50.6,56085,172.217.3.110,80,6,2018-12-01 09:17:13.931484,34847273,9,8,...,32.908965,33030.0,32967.0,10004347.0,8980.692846,10014717.0,9999149.0,0,BENIGN,151
98132,172.217.3.110-192.168.50.6-80-56085-6,192.168.50.6,56085,172.217.3.110,80,6,2018-12-01 09:17:13.931484,34847273,9,8,...,32.908965,33030.0,32967.0,10004347.0,8980.692846,10014717.0,9999149.0,0,BENIGN,151
98133,172.217.3.110-192.168.50.6-80-56085-6,192.168.50.6,56085,172.217.3.110,80,6,2018-12-01 09:17:13.931484,34847273,9,8,...,32.908965,33030.0,32967.0,10004347.0,8980.692846,10014717.0,9999149.0,0,BENIGN,151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769002,172.16.0.5-192.168.50.1-58456-40970-6,172.16.0.5,58456,192.168.50.1,40970,6,2018-12-01 13:34:27.399353,1,2,0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0,Syn,1
626544,172.16.0.5-192.168.50.1-43281-3482-6,172.16.0.5,43281,192.168.50.1,3482,6,2018-12-01 13:34:27.399459,1,2,0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0,Syn,1
828222,172.16.0.5-192.168.50.1-43286-10991-6,172.16.0.5,43286,192.168.50.1,10991,6,2018-12-01 13:34:27.401133,1,2,0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0,Syn,1
540722,172.16.0.5-192.168.50.1-43297-21257-6,172.16.0.5,43297,192.168.50.1,21257,6,2018-12-01 13:34:27.402247,1,2,0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0,Syn,1


In [7]:
num_features = 20 # should be 10, 15, or 20
df[" Label"].value_counts()

BENIGN        500153
DrDoS_LDAP    204119
Syn           148501
DrDoS_NTP     112981
UDP-lag        34212
WebDDoS           34
Name:  Label, dtype: int64

In [8]:
features = ['BiFlowsCount', ' Source Port', ' Destination Port', ' URG Flag Count', ' ACK Flag Count', ' Fwd Packet Length Min',
' Avg Fwd Segment Size', 'Fwd Packets/s', ' Min Packet Length', ' Fwd Packet Length Mean', ' Flow Packets/s', ' Packet Length Mean',
' CWE Flag Count', ' Protocol', ' Average Packet Size', ' Down/Up Ratio', 'Flow Bytes/s', 'Init_Win_bytes_forward', ' RST Flag Count',
' Bwd IAT Min']

In [9]:
features = features[0:num_features]

In [10]:
fraction = 0.5
print(str(num_features) + " features")
print("fraction:" + str(fraction))
data = df.sample(frac=fraction, replace=True, random_state=1)

# get X and y. Normalize X and make it into 3D shape for reservoir
num_col = data.shape[1]
num_row = data.shape[0]

X_data = data[features]
X_data[features] = X_data[features].apply(pd.to_numeric, errors='coerce', axis=1)
#norm_scaler = preprocessing.StandardScaler()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_data.values)
X = np.nan_to_num(x_scaled)
if len(X.shape) < 3:
    X = np.atleast_3d(X)
y = data[' Label'].apply(targetify)
print("Finished loading X and y......")

# split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

print("X_train shape:" + str(X_train.shape), "y_train shape:" + str(y_train.shape))
print("X_test shape:" + str(X_test.shape), "y_test shape:" + str(y_test.shape))

pca = PCA() #n_components gives number of components to keep for linear dimensionality reduction
ridge_embedding = Ridge(alpha=10, fit_intercept=True)
readout = Ridge(alpha=5)

20 features
fraction:0.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Finished loading X and y......
X_train shape:(400000, 20, 1) y_train shape:(400000,)
X_test shape:(100000, 20, 1) y_test shape:(100000,)


In [11]:
n=30 #number of internal units
print(str(n) + " internal units")

#run through reservoir
res = Reservoir(n_internal_units=n, spectral_radius=0.9, leak=0.2,
     connectivity=0.25, input_scaling=0.3, noise_level=0.01, circle=False)
input_repr = res.getReservoirEmbedding(np.array(X_train), pca, ridge_embedding,  n_drop=0, bidir=False, test = False)
print("Finished loading training reservoir embedding......")
input_repr_te = res.getReservoirEmbedding(np.array(X_test), pca, ridge_embedding,  n_drop=0, bidir=False, test = True)
print("Finished loading testing reservoir embedding......")

#fit output
readout.fit(input_repr, y_train)
pred_class = readout.predict(input_repr_te)
#predictions = [int(round(x)) for x in pred_class]
true_class = list(y_test)

#analysis
compdf = pd.DataFrame({'pred_class':pred_class, 'true_class':true_class})
compdf = compdf.sort_values('pred_class', ascending=False)
print(str(compdf.head(10)))
#compdf.to_csv(str(dataset.split('_')[0]) + '_' + str(fraction) + '_' + str(num_features) + '_' + str(n) + '.csv')
#accuracy = np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)
#f1 = f1_score(true_class, predictions)
#auc = roc_auc_score(true_class, predictions)

#print("# of nonzero:" + str(np.count_nonzero(predictions)))
#print("accuracy is " + str(accuracy))
#print("f1 is " + str(f1))
#print("auc is " + str(auc))
#print("*******************************************************************")

30 internal units
Finished loading training reservoir embedding......
Finished loading testing reservoir embedding......
       pred_class  true_class
60785    1.268058           1
39090    1.243995           1
93481    1.240413           1
14741    1.233377           1
19141    1.217966           1
57148    1.211528           1
48697    1.210378           1
62896    1.209772           1
31679    1.208839           1
21453    1.203685           1


In [12]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [13]:
predictions = list(compdf['pred_class'].apply(myRound, r=225))
true_class = list(compdf['true_class'])
accuracy = np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)
accuracy

0.99929

In [14]:
from sklearn.metrics import confusion_matrix

In [15]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [None]:
confm = confusion_matrix(true_class, predictions)
confm

In [None]:
tn, fp, fn, tp = confm.ravel()
(tn + tp)/(tn+tp+fn+fp)

In [None]:
print("False alarm rate is")
fp/(tn+fp)

In [None]:
print("Precision is")
Precision = tp/(tp+fp)
Precision

In [None]:
print("Recall is")
Recall = tp/(tp+fn)
Recall

In [None]:
print("F1 is")
2*Precision*Recall/(Precision+Recall)