In [3]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import genfromtxt
import pandas as pd
import seaborn as sns
from tabulate import tabulate 
import latextable
from texttable import Texttable


# 1. Dataset and Mapping creation

In [4]:
def create_dataset(df):
    '''Encode every column of a dataframe as integers.

    Each column is handed to ``create_mapping`` in turn and the label-to-number
    dictionary it produces is stored under the column name.

    *** input ***
    * df - the pandas dataframe that needs to be changed to numerical data
    *** returns ***
    * df - the dataframe handed back by the last ``create_mapping`` call
    * class_dict - dict with, per column header, the {label: number} mapping
      used for that column
    '''
    class_dict = {}
    frame = df
    for header in df:
        label_codes, frame = create_mapping(frame, header)
        class_dict[header] = label_codes
    return frame, class_dict

def create_mapping(df,column):
    '''Replace the labels in one column by integer codes.

    Codes start at 1 and are handed out in order of first appearance. Values
    that are not strings are converted with ``str()`` before the lookup, so the
    number 3 and the string "3" share one code.

    The column is read positionally in a single pass, so the dataframe may
    carry any index (a ``df.loc[i, column]`` loop over ``range(len(df))`` only
    works for a 0..n-1 index and needs several scalar lookups per row).

    *** input ***
    * df - the dataframe (pandas) to start from; it is modified in place
    * column - the column name
    *** returns ***
    * values - the dict with the label as key and the numerical value as value
    * df - the same dataframe, with integer codes in ``column``
    '''
    values = {}
    codes = []
    for raw in df[column]:
        label = raw if isinstance(raw, str) else str(raw)
        # setdefault only stores the next free code when the label is new
        codes.append(values.setdefault(label, len(values) + 1))

    # one whole-column assignment instead of a scalar write per row
    df[column] = codes
    return values, df
                
            
def feat2vec(file, out_path="vectorized1.pkl"):
    '''Load a pickled dataframe, encode it numerically and pickle the result.

    *** input ***
    * file - path of the pickled pandas dataframe holding the raw features
    * out_path - where the encoded dataframe is pickled; defaults to the
      file name that used to be hard-coded
    *** returns ***
    * data - the numerically encoded dataframe
    * class_dict - per column the {label: number} mapping from create_dataset
    * df1 - the raw dataframe as it was read from ``file``
    '''
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    df1 = pd.read_pickle(file)
    # Encode a copy: the encoding is written into the frame that is passed in,
    # so without the copy `data` and `df1` would be the same (encoded) object.
    data, class_dict = create_dataset(df1.copy())
    data.to_pickle(out_path)
    return data,class_dict,df1


In [5]:
file = 'data/dataframe_new.pkl'
# Run the encoding instead of only loading the cached pickle: class_dict is
# needed later for the Sankey labels and is produced nowhere else, so loading
# 'vectorized1.pkl' alone does not survive a fresh kernel. feat2vec rewrites
# that pickle on every run; expect this cell to take a while on the full data.
data, class_dict, df = feat2vec(file)

# keep `df` as the raw, un-encoded dataframe
df = pd.read_pickle(file)
print(data)


      type    url protocol  Host  User-Agent  Accept-Encoding  Accept  \
0        1      1        1     1           1                1       1   
1        2      2        1     1           1                1       1   
2        2      3        1     1           1                1       1   
3        1      4        1     1           1                1       1   
4        2      5        1     1           1                1       1   
...    ...    ...      ...   ...         ...              ...     ...   
53044    1   2829        1     1           1                1       1   
53045    1   2830        1     1           1                1       1   
53046    2   2831        1     1           1                1       1   
53047    2  15561        1     1           1                1       1   
53048    2     36        1     1           1                1       1   

       Connection  content-type  Cookie  ...  etag  content-length  message  \
0               1             1       1  ...

# 2. Creating a table of features and data preparation

In [6]:
# check how many features there are
# how many different values there are for each feature
# the distribution of these features
# make assumptions about what is useful
# 60001 requests and 35 columns at this time
# what to do with NaNs


def sanitized_data(headers):
    '''Return the headers with newlines, double quotes and opening braces removed.'''
    cleaned = []
    for header in headers:
        cleaned.append(header.replace("\n", "").replace('"', "").replace("{", ""))
    return cleaned
    
def barplot(data):
    '''Create a barplot with one bar per column of the encoded data.

    The height of a bar is the number of ``np.bincount`` bins of that column
    without the zero bin, i.e. the highest integer code found in the column.

    *** input ***
    * data - pandas dataframe whose values can be cast to non-negative integers
    '''
    encoded = data.to_numpy().astype(int)  # converted once instead of once per column
    x = [name[:5] for name in data.columns]
    y = [np.bincount(encoded[:, i])[1:].shape[0] for i in range(encoded.shape[1])]
    fig = plt.figure()
    colors = sns.color_palette('pastel')

    ax = fig.add_axes([0,1,2,3])
    # Bars are placed on integer positions and labelled afterwards: several
    # shortened names are identical (e.g. 'Accept' and 'Accept-Encoding'), and
    # identical category labels would stack those bars on top of each other.
    positions = list(range(len(x)))
    ax.bar(positions,y,color=colors[:35])
    ax.set_xticks(positions)
    ax.set_xticklabels(x)
    plt.show()

def boxplot(data):
    '''Create a horizontal boxplot with one box per column of ``data``.

    *** input ***
    * data - the data handed to ``sns.boxplot``
    '''
    plt.figure(figsize=(10,6), tight_layout=True)
    ax = sns.boxplot(data=data,orient='h', palette='Set2', linewidth=2.5)
    # with orient='h' the y axis lists the columns; the previous label
    # 'Height (cm)' was a leftover that does not describe this data
    ax.set(title='Boxplot', xlabel='', ylabel='Feature')
    
def skewing_data(data):
    '''Display per-feature statistics about how often each class occurs.

    For every column of ``data`` the table shows the count of the most
    frequent class (Max), of the least frequent class (Min), the mean count
    (Avg), the number of bins (Classes) and the sum of all counts (Total).
    This gives insight into which classes are used a lot within each feature.

    *** input ***
    * data - pandas dataframe whose values can be cast to non-negative integers
    '''
    encoded = data.to_numpy().astype(int)

    # how often each code occurs per column; the bin for code 0 is dropped
    counts_per_feature = [
        np.bincount(encoded[:, col])[1:] for col in range(encoded.shape[1])
    ]
    feature_names = sanitized_data(data.columns)

    rows = [
        [name, max(counts), min(counts), np.mean(counts), len(counts), sum(counts)]
        for name, counts in zip(feature_names, counts_per_feature)
    ]
    header = ['Features','Max', 'Min','Avg','Classes','Total']
    df = pd.DataFrame(rows, columns=header)
    display(df)

def create_latex_table(tab, caption='A comparison of rocket features.'):
    '''Print a table as plain text and as LaTeX, via tabulate and texttable/latextable.

    *** input ***
    * tab - list of rows; the first row holds the column headers
    * caption - caption of the latextable output; the default is the text that
      used to be hard-coded, kept for backward compatibility
    *** raises ***
    * ValueError - when ``tab`` contains no rows at all
    '''
    rows = [list(row) for row in tab]
    if not rows:
        raise ValueError('create_latex_table needs at least a header row')

    print('Tabulate Table:')
    print(tabulate(rows, headers='firstrow'))

    table = Texttable()
    # one centred column per header instead of a fixed count of four
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER | Texttable.VLINES)
    # without this the Texttable stays empty and both outputs below have no content
    table.add_rows(rows)
    print('\nTexttable Table:')
    print(table.draw())

    print('\nTabulate Latex:')
    print(tabulate(rows, headers='firstrow', tablefmt='latex'))
    print('\nTexttable Latex:')
    print(latextable.draw_latex(table, caption=caption))
    


# Show the per-feature class statistics for the encoded dataset.
skewing_data(data)

# Marker that the cell ran to completion.
print('end')

Unnamed: 0,Features,Max,Min,Avg,Classes,Total
0,type,42656,1719,17683.0,3,53049
1,url,6717,1,3.4091,15561,53049
2,protocol,53049,53049,53049.0,1,53049
3,Host,53049,53049,53049.0,1,53049
4,User-Agent,53049,53049,53049.0,1,53049
5,Accept-Encoding,53049,53049,53049.0,1,53049
6,Accept,53049,53049,53049.0,1,53049
7,Connection,53049,53049,53049.0,1,53049
8,content-type,21639,4,8841.5,6,53049
9,Cookie,21647,2,4080.692308,13,53049


end


## 2.1 Data Preparation

In [8]:
def data_prep(data, drop_constant=False):
    '''Prepare the encoded data for further analysis.

    *** input ***
    * data - the pandas dataframe with the numerical encodings
    * drop_constant - when True, columns holding one single value are removed
    *** returns ***
    * n_data - a new dataframe; ``data`` itself is left untouched
    '''
    # Features with only one class are the same for every call. This is due to
    # the test environment.
    # Open questions: do they matter for our research? They might matter if the
    # anomalous data does have multiple classes. They might also skew the
    # dataset, since the algorithm could simply learn that anything different
    # is the anomalous data.
    # TODO: decide whether dropping them should become the default.
    n_data = data.copy()
    if drop_constant:
        varying = n_data.nunique(dropna=False) > 1
        n_data = n_data.loc[:, varying]

    return n_data

   ## 2.2 Cosine similarity

In [None]:
from numpy.linalg import norm
def create_cosine_similarity(V, feat1,feat2):
    '''Print and return the cosine similarity between two entries of V.

    *** input ***
    * V - container that maps a feature name to a numeric vector
    * feat1, feat2 - the names (strings) of the two features to compare
    *** returns ***
    * cosine - dot(A, B) / (|A| * |B|), or nan when one of the vectors has
      length zero
    '''
    A = V[feat1]
    B = V[feat2]
    denominator = norm(A)*norm(B)
    # a zero vector has no direction; report nan instead of dividing by zero
    cosine = np.dot(A,B)/denominator if denominator else float('nan')
    print("cosing similarity between: " + feat1 + " and " +feat2 + ':', cosine)
    return cosine
create_cosine_similarity(data, 'url' ,'response code' )

import sklearn
from sklearn.metrics import pairwise
# cosine_similarity compares the rows of its input. The frame is transposed so
# that features (columns) are compared with each other; on the untransposed
# frame the result is a requests x requests matrix.
pairwise.cosine_similarity(data.T)

cosing similarity between: url and response code: 0.451257778147415


There are three different cookie contents right now.

## 2.3 Boxplots


In [180]:
import sklearn


# 3. Anomaly visualisation

In [10]:
# Path of the pickled dataframe with the regular requests (same value as in the data-loading cell).
file = 'data/dataframe_new.pkl'


In [11]:
# Path of the pickled dataframe that is loaded for the anomaly visualisation.
# NOTE(review): presumably the file on disk really is spelled 'anamolies' - verify before renaming.
file2 = 'data/anamolies.pkl'

In [169]:
# Inspect the raw dataframe from file2 before encoding it.
df = pd.read_pickle(file2)
print(df)
print(df.columns)
# NOTE(review): feat2vec pickles its result to "vectorized1.pkl", the same file
# the data-loading cell reads `data` from, so running this cell replaces that
# cache. `df` is also rebound here to feat2vec's third return value.
data2, class_dict2,df = feat2vec(file2)

# skewing_data(data2)


      Id           Request Tijdstempel          Response Tijdstempel Methode  \
0     90  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
1     92  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
2     93  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
3     94  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
4     95  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
..   ...                           ...                           ...     ...   
503  613  Mon Mar 20 21:52:53 CET 2023  Mon Mar 20 21:53:07 CET 2023     GET   
504  614  Mon Mar 20 21:53:09 CET 2023  Mon Mar 20 21:53:09 CET 2023     GET   
505  615  Mon Mar 20 21:53:36 CET 2023  Mon Mar 20 21:53:37 CET 2023     GET   
506  617  Mon Mar 20 21:53:31 CET 2023  Mon Mar 20 21:53:45 CET 2023     GET   
507  616  Mon Mar 20 21:53:31 CET 2023  Mon Mar 20 21:53:45 CET 2023     GET   

                                       

# 4. Sankey diagram of Attack flows

In [14]:
import plotly.graph_objects as go
from collections import OrderedDict
import seaborn as sns
import random
import numpy as np
from collections import defaultdict

In [15]:
df1 = pd.read_pickle(file)
# invert the url mapping so that a numerical code looks up its original label
num_dict = {code: label for label, code in class_dict['url'].items()}

## 4.1 Dataframe for ApiFlows

In [62]:
class APIflows:
    '''This object creates a representation for the apicalls that the user makes to the application.

    Consecutive rows with the same coockie are chained into one path. Every
    path starts at the client, which is node 0.

        self.data - the requests; needs a 'coockie' and a 'url' entry of equal length
        self.coockies - set of already observed coockies
        self.flowdict - OrderedDict {(source url, target url): times this edge was seen}
        self.paths - list of paths in order of appearance, each a list of (source, target) edges
        self.shortest - the same paths sorted by length (ties broken by content)
        self.numbers - dict {tuple(path): number of times this exact path occurs}
    '''
    def __init__(self, data):
        self.coockies = set()
        self.data = data
        self.flowdict = OrderedDict()
        self.paths = []
        self.flows()
        self.path_info()

    def flows(self):
        '''Walk through the requests and split them into per-user paths.

        A request continues the current path when it carries the same coockie
        as the previous request; otherwise the current path is closed and a
        new one is started from the client (node 0).
        '''
        # Plain lists give positional access that does not depend on the index
        # of the dataframe and avoid a label lookup per row.
        cookies = list(self.data['coockie'])
        urls = list(self.data['url'])
        if not cookies:
            return

        self.coockies.add(cookies[0])
        # the first path starts at the first url actually seen, not at a fixed code
        path = [(0, urls[0])]
        self.add_flow_to_dict(0, urls[0])

        # every row is visited, including the last one
        for i in range(1, len(cookies)):
            if cookies[i] == cookies[i-1]:
                # same coockie thus the same user: extend the current path
                edge = (urls[i-1], urls[i])
            else:
                # another user (new or seen before): close the path, restart at the client
                self.paths.append(path)
                path = []
                edge = (0, urls[i])
                self.coockies.add(cookies[i])
            path.append(edge)
            self.add_flow_to_dict(*edge)

        # the path that is still open after the last row has to be stored as well
        self.paths.append(path)

    def add_flow_to_dict(self,flow1,flow2):
        '''Count one more occurrence of the edge flow1 -> flow2 in the flow dict.

            ***input***
            flow1 & flow2: the source and the target of the edge
            '''
        edge = (flow1, flow2)
        self.flowdict[edge] = self.flowdict.get(edge, 0) + 1

    def path_info(self):
        '''Get the shortest and longest paths
            Also get which paths are taken most often'''
        # a sorted copy, so that self.paths keeps the order of appearance
        self.shortest = sorted(self.paths, key=lambda l: (len(l), l))
        self.numbers = {}

        for path in self.shortest:
            key = tuple(path)
            # direct dict lookup instead of searching a list of all keys
            self.numbers[key] = self.numbers.get(key, 0) + 1

In [63]:
# Build the flow representation from `data`; the constructor already runs flows() and path_info().
flows = APIflows(data)

## 4.2 Code for sankey visualisations 

In [162]:
def create_sankey(flows,n,lab,option='default'):
    '''creates sankey diagram according to option for flow
        *** input***
            flows - the object containing the dicts and lists of flows/paths
            n - the number of flows  you want visualised
            lab - the labels for each flow
            option - the option of what needs to be visualised

        *** Options ***
            longest - The longest n paths users have taken
            shortest - The shortest n paths users have taken
            most - The n paths users take most often
            least - The n paths users take least often
            default - Visualising the most seen api calls made from one point to another

        Any other option value is treated as 'default'.
        The selections are made on sorted copies, so ``flows`` is not modified.
        '''
    by_length = lambda l: (len(l), l)

    if option in ('longest', 'shortest'):
        # NOTE(review): flows.shortest holds one entry per occurrence, so a path
        # that was taken several times can be selected several times - confirm
        # that this is intended.
        ranked = sorted(flows.shortest, key=by_length, reverse=(option == 'longest'))
        selection = ranked[:n]
        weights = flows.numbers

    elif option in ('most', 'least'):
        ranked = sorted(flows.numbers.items(), key=lambda x:x[1], reverse=(option == 'most'))
        # The keys of flows.numbers are tuples of edges. They are handed over as
        # lists, like the paths of the other options, because plot_sankey only
        # repeats the value for every edge of a path when the path is a list.
        selection = [list(path) for path, _ in ranked[:n]]
        weights = flows.numbers

    else:
        ranked = sorted(flows.flowdict.items(), key=lambda x:x[1],reverse=True)
        selection = [edge for edge, _ in ranked[:n]]
        weights = flows.flowdict

    plot_sankey(selection, lab, weights, n)
        
        
def plot_sankey(s,lab,flow_dict,n):
    '''Plot the sankey diagram
    *** Input ***
        s - either a list of (source, target) edges, or a list of paths where
            every path is a list/tuple of such edges
        lab - the label for each node, looked up by node number; node 0 is the client
        flow_dict - the values: keyed by edge when s holds edges, keyed by
            tuple(path) when s holds paths
        n - the number of paths; kept for the callers, not needed for drawing
    '''
    if len(s) == 0:
        print('No flows to plot.')
        return

    source, target, value = _sankey_links(s, flow_dict)

    # creating label names instead of numbers: labels are built up to the
    # highest node that is actually linked instead of up to a fixed node count
    highest = max(max(source), max(target))
    l = ['client'] + [str(lab[x])[:12] for x in range(1, highest + 1)]

    # creating the figure
    fig = go.Figure(go.Sankey(
        arrangement='snap',
        node=dict(
            label = l,
            line = dict(color = "black", width = 0.5),
            pad=15,
            thickness=20,
        ),
        link=dict(
            source = source,
            target = target,
            value= value,
        )
    ))

    fig.show()


def _sankey_links(s, flow_dict):
    '''Flatten edges or paths into parallel source / target / value lists.'''
    # sort (a copy) so that the sankey diagram is visualised in the right order
    ordered = sorted(s)
    first = ordered[0]
    holds_paths = len(first) > 0 and isinstance(first[0], tuple)

    if holds_paths:
        # every edge of a path gets the value of the whole path, for list and
        # tuple paths alike, so values and links always have the same length
        edges, value = [], []
        for path in ordered:
            edges.extend(path)
            value.extend([flow_dict[tuple(path)]] * len(path))
    else:
        # this is used when there is no path, just edges
        edges = ordered
        value = [flow_dict[edge] for edge in ordered]

    source = [int(edge[0]) for edge in edges]
    target = [int(edge[1]) for edge in edges]
    return source, target, value

## 4.3 Sankey Visualisations

In [163]:
# The 50 most frequently seen single api calls from one point to another.
create_sankey(flows,50, num_dict,'default')

In [164]:
# The 20 longest paths users have taken.
create_sankey(flows,20, num_dict,'longest')

In [165]:
# The 1000 shortest paths users have taken.
create_sankey(flows,1000, num_dict,'shortest')

In [166]:
# The 50 paths users take most often.
create_sankey(flows,50, num_dict,'most')

In [168]:
# The 50 paths users take least often.
create_sankey(flows,50, num_dict,'least')