In [1]:
import numpy as np # library that supports large and multi-dimensional arrays and matrices
import pandas as pd # library for data manipulation and analysis
import scipy.io # to read data from and write data to a variety of file formats
from scipy import sparse # 2D sparse matrix package
from sklearn.decomposition import PCA # linear dimensionality reduction
from sklearn.linear_model import Ridge # linear least squares with L2 regularization
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import math
from matplotlib.pyplot import *

In [2]:
class Reservoir(object):
    """
    Build a reservoir and evaluate internal states
    
    Parameters:
        n_internal_units = processing units in the reservoir
        spectral_radius = largest eigenvalue of the reservoir matrix of connection weights
        leak = amount of leakage in the reservoir state update (optional)
        connectivity = percentage of nonzero connection weights (unused in circle reservoir)
        input_scaling = scaling of the input connection weights
        noise_level = deviation of the Gaussian noise injected in the state update
        circle = generate determinisitc reservoir with circle topology
    """
    
    def __init__(self, n_internal_units=100, spectral_radius=0.99, leak=None,
                 connectivity=0.3, input_scaling=0.2, noise_level=0.01, circle=False):
        
        # Initialize attributes
        self._n_internal_units = n_internal_units
        self._input_scaling = input_scaling
        self._noise_level = noise_level
        self._leak = leak

        # Input weights depend on input size: they are set when data is provided
        self._input_weights = None

        # Generate internal weights
        if circle:
            self._internal_weights = self._initialize_internal_weights_Circ(
                    n_internal_units,
                    spectral_radius)
        else:
            self._internal_weights = self._initialize_internal_weights(
                n_internal_units,
                connectivity,
                spectral_radius)


    def _initialize_internal_weights_Circ(self, n_internal_units, spectral_radius):
        
        internal_weights = np.zeros((n_internal_units, n_internal_units))
        internal_weights[0,-1] = spectral_radius
        for i in range(n_internal_units-1):
            internal_weights[i+1,i] = spectral_radius
                
        return internal_weights
    
    
    def _initialize_internal_weights(self, n_internal_units,
                                     connectivity, spectral_radius):

        # Generate sparse, uniformly distributed weights.
        internal_weights = sparse.rand(n_internal_units,
                                       n_internal_units,
                                       density=connectivity).todense()

        # Ensure that the nonzero values are uniformly distributed in [-0.5, 0.5]
        internal_weights[np.where(internal_weights > 0)] -= 0.5
        
        # Adjust the spectral radius.
        E, _ = np.linalg.eig(internal_weights)
        e_max = np.max(np.abs(E))
        internal_weights /= np.abs(e_max)/spectral_radius       

        return internal_weights


    def _compute_state_matrix(self, X, n_drop=0):
        N, T, _ = X.shape
        previous_state = np.zeros((N, self._n_internal_units), dtype=float)

        # Storage
        state_matrix = np.empty((N, T - n_drop, self._n_internal_units), dtype=float)
        for t in range(T):
            current_input = X[:, t, :]

            # Calculate state
            state_before_tanh = self._internal_weights.dot(previous_state.T) + self._input_weights.dot(current_input.T)

            # Add noise
            state_before_tanh += np.random.rand(self._n_internal_units, N)*self._noise_level

            # Apply nonlinearity and leakage (optional)
            if self._leak is None:
                previous_state = np.tanh(state_before_tanh).T
            else:
                previous_state = (1.0 - self._leak)*previous_state + np.tanh(state_before_tanh).T

            # Store everything after the dropout period
            if (t > n_drop - 1):
                state_matrix[:, t - n_drop, :] = previous_state

        return state_matrix


    def get_states(self, X, n_drop=0, bidir=True):
        N, T, V = X.shape
        if self._input_weights is None:
            self._input_weights = (2.0*np.random.binomial(1, 0.5 , [self._n_internal_units, V]) - 1.0)*self._input_scaling

        # compute sequence of reservoir states
        states = self._compute_state_matrix(X, n_drop)
    
        # reservoir states on time reversed input
        if bidir is True:
            X_r = X[:, ::-1, :]
            states_r = self._compute_state_matrix(X_r, n_drop)
            states = np.concatenate((states, states_r), axis=2)

        return states
    
    def getReservoirEmbedding(self, X,pca, ridge_embedding,  n_drop=0, bidir=True, test = False):

        res_states = self.get_states(X, n_drop=0, bidir=True)


        N_samples = res_states.shape[0]
        res_states = res_states.reshape(-1, res_states.shape[2])                   
        # ..transform..
        if test:
            red_states = pca.transform(res_states)
        else:
            red_states = pca.fit_transform(res_states)          
        # ..and put back in tensor form
        red_states = red_states.reshape(N_samples,-1,red_states.shape[1])  

        coeff_tr = []
        biases_tr = []   

        for i in range(X.shape[0]):
            ridge_embedding.fit(red_states[i, 0:-1, :], red_states[i, 1:, :])
            coeff_tr.append(ridge_embedding.coef_.ravel())
            biases_tr.append(ridge_embedding.intercept_.ravel())
        #print(np.array(coeff_tr).shape,np.array(biases_tr).shape)
        input_repr = np.concatenate((np.vstack(coeff_tr), np.vstack(biases_tr)), axis=1)
        return input_repr

In [3]:
def targetify(s):
    if s == 'BENIGN':
        return 0
    else:
        return 1

In [4]:
def eqArray(a,b):
    return np.where(a == b, 1, 0)

In [24]:
df1 = pd.read_csv('DDoS2019.csv')
df2 = df1.groupby(['Time'])['Flow Duration'].count()
df2 = pd.DataFrame(df2).reset_index()
df2.columns=['Time','BiFlowsCount']
df = df1.merge(df2, left_on='Time', right_on='Time')
df = df.sort_values('Time')
del df['Unnamed: 0']
num_features = 20 # should be 10, 15, or 20
df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ACK Flag Count,Active Max,Active Mean,Active Min,Active Std,Average Packet Size,Avg Bwd Segment Size,Avg Fwd Segment Size,Bwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,...,Table Name,Time,Total Backward Packets,Total Fwd Packets,Total Length of Bwd Packets,Total Length of Fwd Packets,URG Flag Count,act_data_pkt_fwd,min_seg_size_forward,BiFlowsCount
2987377,1,0,0.0,0,0.000000,274.285714,476.666667,122.500000,0,0,...,DrDoS_NTP.csv,2018-12-01 10:00:01 AM,9,12,4290,1470,0,4,32,1
2987378,0,0,0.0,0,0.000000,27.166667,10.333333,33.666667,0,0,...,DrDoS_NTP.csv,2018-12-01 10:00:02 AM,3,3,31,101,1,2,20,3
2987379,0,0,0.0,0,0.000000,0.000000,0.000000,0.000000,0,0,...,DrDoS_NTP.csv,2018-12-01 10:00:02 AM,1,1,0,0,1,0,32,3
2987380,0,0,0.0,0,0.000000,0.000000,0.000000,0.000000,0,0,...,DrDoS_NTP.csv,2018-12-01 10:00:02 AM,2,1,0,0,1,0,32,3
2987381,0,0,0.0,0,0.000000,0.000000,0.000000,0.000000,0,0,...,DrDoS_NTP.csv,2018-12-01 10:00:05 AM,0,2,0,0,1,0,32,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2987372,0,0,0.0,0,0.000000,0.000000,0.000000,0.000000,0,0,...,DrDoS_NTP.csv,2018-12-01 9:59:57 AM,2,1,0,0,1,0,32,3
2987373,0,93489,77872.5,62256,22085.066097,29.894737,30.666667,26.000000,0,0,...,DrDoS_NTP.csv,2018-12-01 9:59:57 AM,6,13,184,338,1,7,20,3
2987374,0,0,0.0,0,0.000000,40.000000,0.000000,30.800000,0,0,...,DrDoS_NTP.csv,2018-12-01 9:59:59 AM,0,5,0,154,1,3,20,3
2987375,0,0,0.0,0,0.000000,10.333333,12.400000,0.000000,0,0,...,DrDoS_NTP.csv,2018-12-01 9:59:59 AM,5,1,62,0,1,0,20,3


In [25]:
features = ['Inbound', 'URG Flag Count', 'Bwd Packet Length Min', 'CWE Flag Count', 'min_seg_size_forward',
           'BiFlowsCount', 'Destination Port', 'Source Port', 'RST Flag Count', 'ACK Flag Count', 'Down/Up Ratio', 'Init_Win_bytes_backward', 
           'Init_Win_bytes_forward', 'Bwd Packet Length Mean', 'Protocol', 'Avg Bwd Segment Size', 'Fwd PSH Flags',
           'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Min Packet Length']

In [26]:
features = features[0:num_features]

In [27]:
fraction = 0.125
print(str(num_features) + " features")
print("fraction:" + str(fraction))
data = df.sample(frac=fraction, replace=True, random_state=1)

# get X and y. Normalize X and make it into 3D shape for reservoir
num_col = data.shape[1]
num_row = data.shape[0]

X_data = data[features]
X_data[features] = X_data[features].apply(pd.to_numeric, errors='coerce', axis=1)
#norm_scaler = preprocessing.StandardScaler()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_data.values)
X = np.nan_to_num(x_scaled)
if len(X.shape) < 3:
    X = np.atleast_3d(X)
y = data['Label'].apply(targetify)
print("Finished loading X and y......")

# split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

print("X_train shape:" + str(X_train.shape), "y_train shape:" + str(y_train.shape))
print("X_test shape:" + str(X_test.shape), "y_test shape:" + str(y_test.shape))

pca = PCA() #n_components gives number of components to keep for linear dimensionality reduction
ridge_embedding = Ridge(alpha=10, fit_intercept=True)
readout = Ridge(alpha=5)

15 features
fraction:0.25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Finished loading X and y......
X_train shape:(1070367, 15, 1) y_train shape:(1070367,)
X_test shape:(267592, 15, 1) y_test shape:(267592,)


In [28]:
n=10 #number of internal units
print(str(n) + " internal units")

#run through reservoir
res = Reservoir(n_internal_units=n, spectral_radius=0.9, leak=0.2,
     connectivity=0.25, input_scaling=0.3, noise_level=0.01, circle=False)
input_repr = res.getReservoirEmbedding(np.array(X_train), pca, ridge_embedding,  n_drop=0, bidir=False, test = False)
print("Finished loading training reservoir embedding......")
input_repr_te = res.getReservoirEmbedding(np.array(X_test), pca, ridge_embedding,  n_drop=0, bidir=False, test = True)
print("Finished loading testing reservoir embedding......")

#fit output
readout.fit(input_repr, y_train)
pred_class = readout.predict(input_repr_te)
#predictions = [int(round(x)) for x in pred_class]
true_class = list(y_test)

#analysis
compdf = pd.DataFrame({'pred_class':pred_class, 'true_class':true_class})
compdf = compdf.sort_values('pred_class', ascending=False)
print(str(compdf.head(10)))
#compdf.to_csv(str(dataset.split('_')[0]) + '_' + str(fraction) + '_' + str(num_features) + '_' + str(n) + '.csv')
#accuracy = np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)
#f1 = f1_score(true_class, predictions)
#auc = roc_auc_score(true_class, predictions)

#print("# of nonzero:" + str(np.count_nonzero(predictions)))
#print("accuracy is " + str(accuracy))
#print("f1 is " + str(f1))
#print("auc is " + str(auc))
#print("*******************************************************************")

10 internal units
Finished loading training reservoir embedding......
Finished loading testing reservoir embedding......
        pred_class  true_class
0              1.0           1
178385         1.0           1
178387         1.0           1
178388         1.0           1
178389         1.0           1
178390         1.0           1
178391         1.0           1
178392         1.0           1
178393         1.0           1
178394         1.0           1


In [29]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [30]:
predictions = list(compdf['pred_class'].apply(myRound, r=225))
true_class = list(compdf['true_class'])
accuracy = np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)
accuracy

1.0

In [31]:
from sklearn.metrics import confusion_matrix

In [32]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [33]:
confm = confusion_matrix(true_class, predictions)
confm

array([[267592]], dtype=int64)

In [34]:
tn, fp, fn, tp = confm.ravel()
(tn + tp)/(tn+tp+fn+fp)

ValueError: not enough values to unpack (expected 4, got 1)

In [None]:
print("False alarm rate is")
fp/(tn+fp)

In [None]:
print("Precision is")
Precision = tp/(tp+fp)
Precision

In [None]:
print("Recall is")
Recall = tp/(tp+fn)
Recall

In [None]:
print("F1 is")
2*Precision*Recall/(Precision+Recall)