In [1]:
import numpy as np # library that supports large and multi-dimensional arrays and matrices
import pandas as pd # library for data manipulation and analysis
import scipy.io # to read data from and write data to a variety of file formats
from scipy import sparse # 2D sparse matrix package
from sklearn.decomposition import PCA # linear dimensionality reduction
from sklearn.linear_model import Ridge # linear least squares with L2 regularization
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import math
from matplotlib.pyplot import *

In [2]:
class Reservoir(object):
    """
    Build a reservoir and evaluate internal states
    
    Parameters:
        n_internal_units = processing units in the reservoir
        spectral_radius = largest eigenvalue of the reservoir matrix of connection weights
        leak = amount of leakage in the reservoir state update (optional)
        connectivity = percentage of nonzero connection weights (unused in circle reservoir)
        input_scaling = scaling of the input connection weights
        noise_level = deviation of the Gaussian noise injected in the state update
        circle = generate determinisitc reservoir with circle topology
    """
    
    def __init__(self, n_internal_units=100, spectral_radius=0.99, leak=None,
                 connectivity=0.3, input_scaling=0.2, noise_level=0.01, circle=False):
        
        # Initialize attributes
        self._n_internal_units = n_internal_units
        self._input_scaling = input_scaling
        self._noise_level = noise_level
        self._leak = leak

        # Input weights depend on input size: they are set when data is provided
        self._input_weights = None

        # Generate internal weights
        if circle:
            self._internal_weights = self._initialize_internal_weights_Circ(
                    n_internal_units,
                    spectral_radius)
        else:
            self._internal_weights = self._initialize_internal_weights(
                n_internal_units,
                connectivity,
                spectral_radius)


    def _initialize_internal_weights_Circ(self, n_internal_units, spectral_radius):
        
        internal_weights = np.zeros((n_internal_units, n_internal_units))
        internal_weights[0,-1] = spectral_radius
        for i in range(n_internal_units-1):
            internal_weights[i+1,i] = spectral_radius
                
        return internal_weights
    
    
    def _initialize_internal_weights(self, n_internal_units,
                                     connectivity, spectral_radius):

        # Generate sparse, uniformly distributed weights.
        internal_weights = sparse.rand(n_internal_units,
                                       n_internal_units,
                                       density=connectivity).todense()

        # Ensure that the nonzero values are uniformly distributed in [-0.5, 0.5]
        internal_weights[np.where(internal_weights > 0)] -= 0.5
        
        # Adjust the spectral radius.
        E, _ = np.linalg.eig(internal_weights)
        e_max = np.max(np.abs(E))
        internal_weights /= np.abs(e_max)/spectral_radius       

        return internal_weights


    def _compute_state_matrix(self, X, n_drop=0):
        N, T, _ = X.shape
        previous_state = np.zeros((N, self._n_internal_units), dtype=float)

        # Storage
        state_matrix = np.empty((N, T - n_drop, self._n_internal_units), dtype=float)
        for t in range(T):
            current_input = X[:, t, :]

            # Calculate state
            state_before_tanh = self._internal_weights.dot(previous_state.T) + self._input_weights.dot(current_input.T)

            # Add noise
            state_before_tanh += np.random.rand(self._n_internal_units, N)*self._noise_level

            # Apply nonlinearity and leakage (optional)
            if self._leak is None:
                previous_state = np.tanh(state_before_tanh).T
            else:
                previous_state = (1.0 - self._leak)*previous_state + np.tanh(state_before_tanh).T

            # Store everything after the dropout period
            if (t > n_drop - 1):
                state_matrix[:, t - n_drop, :] = previous_state

        return state_matrix


    def get_states(self, X, n_drop=0, bidir=True):
        N, T, V = X.shape
        if self._input_weights is None:
            self._input_weights = (2.0*np.random.binomial(1, 0.5 , [self._n_internal_units, V]) - 1.0)*self._input_scaling

        # compute sequence of reservoir states
        states = self._compute_state_matrix(X, n_drop)
    
        # reservoir states on time reversed input
        if bidir is True:
            X_r = X[:, ::-1, :]
            states_r = self._compute_state_matrix(X_r, n_drop)
            states = np.concatenate((states, states_r), axis=2)

        return states
    
    def getReservoirEmbedding(self, X,pca, ridge_embedding,  n_drop=0, bidir=True, test = False):

        res_states = self.get_states(X, n_drop=0, bidir=True)

        N_samples = res_states.shape[0]
        res_states = res_states.reshape(-1, res_states.shape[2])                    
        # ..transform..
        if test:
            red_states = pca.transform(res_states)
        else:
            red_states = pca.fit_transform(res_states)          
        # ..and put back in tensor form
        red_states = red_states.reshape(N_samples,-1,red_states.shape[1])
        print("red_states:" + str(red_states.shape))

        coeff_tr = []
        biases_tr = []   
        
        for i in range(X.shape[0]):
            ridge_embedding.fit(red_states[i, 0:-1, :], red_states[i, 1:, :])
            coeff_tr.append(ridge_embedding.coef_.ravel())
            biases_tr.append(ridge_embedding.intercept_.ravel())
        #print(np.array(coeff_tr).shape,np.array(biases_tr).shape)
        input_repr = np.concatenate((np.vstack(coeff_tr), np.vstack(biases_tr)), axis=1)
        return input_repr

In [20]:
df = pd.read_csv('RIPE2019_kmeans_new.csv', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0,0,0,0,2070,132,2805,292,5,18,...,0,0,0,0,0,4913,1,660,366,0
1,1,0,1,0,1426,149,2237,307,5,18,...,0,0,0,0,0,2979,0,441,357,0
2,2,0,2,0,1241,133,2250,311,5,19,...,0,0,0,0,0,3609,0,443,372,0
3,3,0,3,0,1252,107,2451,322,5,20,...,0,0,0,0,0,3474,0,410,373,0
4,4,0,4,0,1414,139,2416,241,5,20,...,0,0,0,0,0,3982,0,539,373,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10075,2355,23,55,0,1352,95,2753,198,5,17,...,0,0,0,0,0,3583,0,427,378,0
10076,2356,23,56,0,1296,65,3239,124,5,18,...,0,0,0,0,0,2894,0,431,371,0
10077,2357,23,57,0,1137,102,2363,200,5,19,...,0,0,0,0,0,3351,0,404,380,0
10078,2358,23,58,0,1219,112,1878,174,5,18,...,0,0,0,0,0,3633,0,489,373,0


In [5]:
def targetify(s):
    if s == 0.0:
        return 0
    else:
        return 1

In [6]:
#df_test = pd.read_csv('slammer_test.csv', header=None)


In [21]:
def eqArray(a,b):
    return np.where(a == b, 1, 0)

In [22]:
cols = [38, 4, 13, 39, 5, 37, 40, 6, 11, 12, 15, 7, 10, 9, 16, 14, 24, 8, 26, 25 ] # checked: correct
features = df.columns[cols]


In [35]:
df['Target']=df.loc[:,41]

X_data = df[features]
X_data[features] = X_data[features].apply(pd.to_numeric, errors='coerce', axis=1)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X_data.values)
X = np.nan_to_num(x_scaled)
if len(X.shape) < 3:
    X = np.atleast_3d(X)
y = df['Target']
print("Finished loading training X and y......")
print("Finished loading test X and y......")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

print("X_train shape:" + str(X_train.shape), "y_train shape:" + str(y_train.shape))
print("X_test shape:" + str(X_test.shape), "y_test shape:" + str(y_test.shape))

pca = PCA() #n_components gives number of components to keep for linear dimensionality reduction
ridge_embedding = Ridge(alpha=10, fit_intercept=True)
readout = Ridge(alpha=5)

Finished loading training X and y......
Finished loading test X and y......
X_train shape:(8064, 20, 1) y_train shape:(8064,)
X_test shape:(2016, 20, 1) y_test shape:(2016,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [36]:
n=30 #number of internal units
print(str(n) + " internal units")

#run through reservoir
res = Reservoir(n_internal_units=n, spectral_radius=0.9, leak=0.2,
     connectivity=0.25, input_scaling=0.3, noise_level=0.01, circle=False)
input_repr = res.getReservoirEmbedding(np.array(X_train), pca, ridge_embedding,  n_drop=0, bidir=False, test = False)
print("Finished loading training reservoir embedding......")
input_repr_te = res.getReservoirEmbedding(np.array(X_test), pca, ridge_embedding,  n_drop=0, bidir=False, test = True)
print("Finished loading testing reservoir embedding......")

#fit output
readout.fit(input_repr, y_train)
pred_class = readout.predict(input_repr_te)
#predictions = [int(round(x)) for x in pred_class]
true_class = list(y_test)

#analysis
compdf = pd.DataFrame({'pred_class':pred_class, 'true_class':true_class})
compdf = compdf.sort_values('pred_class', ascending=False)
print(str(compdf.head(10)))

30 internal units
red_states:(8064, 20, 60)
Finished loading training reservoir embedding......
red_states:(2016, 20, 60)
Finished loading testing reservoir embedding......
      pred_class  true_class
1295    0.342228           1
1875    0.315029           1
1175    0.290790           1
2005    0.240686           0
1582    0.212879           0
94      0.124220           0
1982    0.121625           1
552     0.097386           0
1806    0.089874           0
1485    0.087571           0


In [37]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [38]:
predictions = list(compdf['pred_class'].apply(myRound, r=225))
true_class = list(compdf['true_class'])
accuracy = np.sum(list(map(eqArray, predictions, true_class))) / len(true_class)
accuracy

0.9990079365079365

In [39]:
from sklearn.metrics import confusion_matrix

In [40]:
def myRound(x, r):
    if x>r/float(1000):
        return 1
    else:
        return 0

In [41]:
confm = confusion_matrix(true_class, predictions)
confm

array([[2011,    1],
       [   1,    3]], dtype=int64)

In [42]:
tn, fp, fn, tp = confm.ravel()
(tn + tp)/(tn+tp+fn+fp)

0.9990079365079365

In [43]:
print("False alarm rate is")
fp/(tn+fp)

False alarm rate is


0.0004970178926441351

In [44]:
print("Precision is")
Precision = tp/(tp+fp)
Precision

Precision is


0.75

In [45]:
print("Recall is")
Recall = tp/(tp+fn)
Recall

Recall is


0.75

In [46]:
print("F1 is")
2*Precision*Recall/(Precision+Recall)

F1 is


0.75