In [1]:
import pandas
import numpy
import rrcf
import time

from sklearn import metrics
from sklearn import preprocessing
from urllib.parse import urlparse

In [2]:
data = []

def file_to_dataframe(filepath, label):
    with open(filepath, 'r') as fd:
        lines = fd.readlines()
        request ={}
        for line in lines:
            if len(line.strip()) == 0:
                continue
            elif line.startswith('GET') or line.startswith('POST') or line.startswith('PUT'):
                if request:
                    data.append(request)
                request = {}
                line = line.split()
                parsed_url = urlparse(line[1])
                request['query'] = parsed_url.query
                request['label'] = label
            elif ' ' in line:
                continue
            else:
                request['query'] = line.strip()

In [3]:
file_to_dataframe('normalTrafficTraining.txt', 0)
file_to_dataframe('anomalousTrafficTest.txt', 1)

In [4]:
df = pandas.DataFrame(data=data)
original_df = df.copy(deep=True)

In [5]:
def convert_to_categorical(dataframe, column):
    dataframe[column] = dataframe[column].astype('category').cat.codes
    return dataframe

for col_name in list(df.columns):
    convert_to_categorical(df, col_name)

In [6]:
arr = df.to_numpy()

In [7]:
min_max_scaler = preprocessing.MinMaxScaler()
scaled_arr = min_max_scaler.fit_transform(arr)
scaled_df = pandas.DataFrame(data=scaled_arr.flatten())

In [8]:
def shingle(series, dim):
    height = len(series) - dim + 1
    shingled = numpy.zeros((dim, height))
    for i in range(dim):
        shingled[i] = series[i:i + height]
    return shingled

In [9]:
trials = 5
value = list(scaled_df[0])
arr = numpy.array(value)
y = numpy.array(list(df["label"]))
X = shingle(arr, 10)
X = numpy.transpose(X)
t1, _ = numpy.shape(X)
[n,d] = numpy.shape(X)
y = y[:t1]
time_all = numpy.zeros((trials,4))
precision_all = numpy.zeros((trials,4))
auc_all = numpy.zeros((trials,4))

In [11]:
for j in range(0,trials):
    print('\n\n******'+' trial '+str(j+1)+'*******\n\n')
    num_trees = 500
    tree_size = 256
    start = time.time()
    forest = []
    while len(forest) < num_trees:
        # Select random subsets of points uniformly from point set
        ixs = numpy.random.choice(n, size=(n // tree_size, tree_size), replace=False)
        # Add sampled trees to forest
        trees = [rrcf.RCTree(X[ix], index_labels=ix) for ix in ixs]
        forest.extend(trees)

    end = time.time()
    time_all[j,0] = end - start        
        
    # Compute average CoDisp
    avg_codisp = pandas.Series(0.0, index=numpy.arange(n))
    index = numpy.zeros(n)
    for tree in forest:
        codisp = pandas.Series({leaf : tree.codisp(leaf) for leaf in tree.leaves})
        avg_codisp[codisp.index] += codisp
        numpy.add.at(index, codisp.index.values, 1)
    avg_codisp /= index
    iso_scores = avg_codisp  
    print(avg_codisp)



****** trial 1*******


0         7.045045
1         8.525000
2         5.450000
3         8.016667
4         5.337302
            ...   
122112    4.229730
122113    5.500000
122114    1.986702
122115    5.361111
122116    4.583333
Length: 122117, dtype: float64


****** trial 2*******


0         10.250000
1         26.923077
2         18.433333
3         42.500000
4         41.033333
            ...    
122112     2.589020
122113     4.885802
122114     3.145614
122115    11.000000
122116     2.869631
Length: 122117, dtype: float64


****** trial 3*******


0          7.421875
1          4.547143
2          5.229323
3         22.083333
4         33.600000
            ...    
122112     2.141026
122113    12.250000
122114     4.640625
122115     6.051282
122116     2.664516
Length: 122117, dtype: float64


****** trial 4*******


0          7.622222
1          7.099206
2          7.513889
3         31.100000
4         10.366071
            ...    
122112     2.100000
122113     5.1