In [36]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import genfromtxt
import pandas as pd
import seaborn as sns
from tabulate import tabulate 
import latextable
from texttable import Texttable

# 1. Dataset and Mapping creation

In [37]:
def create_dataset(df):
    '''Create a dataset of numerical encoding for each column
        *** input ***
        * df - the pandas dataframe that needs to be changed to numerical data
        *** returns ***
        * df - the new pandas dataframe with the numerical encodings
        * class_dict - a dictionary containing the mappings for each column header and the label for each numerical value in the new dataframe
    '''
    
    class_dict = {}
    for column in df:
        values, df = create_mapping(df,column)
        class_dict[column] = values

    return df, class_dict

def create_mapping(df,column):
    '''create the mapping from label to number
    *** input ***
    *df - the dataframe (pandas) to start from
    *column - the column name
    *** returns ***
    * values - the dict with as the key the label and as value  the numercal value
    * df - the new dataframe (pandas) with for one column numerical values
    '''
    
    values = {}
    count = 1
    for i in range(len(df[column])):
 
        if isinstance(df.loc[i,column], str):
            if df.loc[i,column] in values.keys():
            
                df.loc[i,column] = values[df.loc[i,column]]
            else:
                values[df.loc[i,column]] = count
                df.loc[i, column] = count
                count+=1
        else:
            if str(df.loc[i,column]) in values.keys():
            
                df.loc[i,column] = values[str(df.loc[i,column])]
            else:
                values[str(df.loc[i,column])] = count
                df.loc[i, column] = count
                count+=1

    return values, df
                
            
def feat2vec(file):
    '''create a boxplot of the data
    *** input ***
    *** returns ***
    '''
    df = pd.read_pickle(file)
    data, class_dict = create_dataset(df)
    return data,class_dict


In [38]:
file = 'data/dataframe_new.pkl'
data, class_dict = feat2vec(file)
# print(class_dict)

# 2. Creating a table of features and data preparation

In [39]:
# check how many features 
# how many different features for each
# distribution of these features
# make assumption of what is usefull
# 60001 requests at this time and 35 columns at this time
# what to do with NANS


def sanitized_data(headers):
    '''further sanitizes the headers'''
    headers = [x.replace("\n","") for x in headers]
    headers = [x.replace('"',"") for x in headers]
    headers = [x.replace("{","") for x in headers]
    return headers
    
def barplot(data):
    '''creates a barplot of the data'''
    x = [name[:5] for name in data.columns]
    y = [np.bincount(data.to_numpy().astype(int)[:,i])[1:].shape[0] for i in range(data.to_numpy().shape[1])]
    y2 =[data.to_numpy().astype(int)[:,i] for i in range(data.to_numpy().shape[1])]
    fig = plt.figure()
    colors = sns.color_palette('pastel')

    ax = fig.add_axes([0,1,2,3])
    ax.bar(x,y,color=colors[:35])
    plt.show()

def boxplot(data):
    '''creates a boxplot of the data'''
    plt.figure(figsize=(10,6), tight_layout=True)
    ax = sns.boxplot(data=data,orient='h', palette='Set2', linewidth=2.5)
    ax.set(title='Boxplot', xlabel='', ylabel='Height (cm)')
    
def skewing_data(data):
    '''Displays a pandas dataframe that shows the number of different classes in each feature, 
    the average amount that the same feature is seen, the minimum amount that one of the classes of the feature is seen,
    and the maximum amount that the class of a feature is seen
    This gives us insight in which classes are used a lot within each feature'''
    
    # the amount of times a feature class is present in the data
    num_featclass = [np.bincount(data.to_numpy().astype(int)[:,i])[1:] for i in range(data.to_numpy().shape[1])]
    
    feature_names = sanitized_data([data.columns][0])
    max_num = [max(x) for x in num_featclass]
    min_num = [min(x) for x in num_featclass]
    avg_num = [np.mean(x) for x in num_featclass]
    classes = [len(x) for x in num_featclass]
    total = [sum(x) for x in num_featclass]
    
    names = ['Features','Max', 'Min','Avg','Classes','Total']
    columns = [[feature_names[i], max_num[i], min_num[i], avg_num[i], classes[i],total[i]] for i in range(len(feature_names))] 
    df = pd.DataFrame(columns, columns=names)
    display(df)

def create_latex_table(tab):
    '''Creates the output for a latex table'''
    print('Tabulate Table:')
    print(tabulate(tab, headers='firstrow'))
    table = Texttable()
    table.set_cols_align(["c"] * 4)
    table.set_deco(Texttable.HEADER | Texttable.VLINES)
    print('\nTexttable Table:')
    print(table.draw())
    
    print('\nTabulate Latex:')
    print(tabulate(rows, headers='firstrow', tablefmt='latex'))
    print('\nTexttable Latex:')
    print(latextable.draw_latex(table, caption='A comparison of rocket features.'))
    


skewing_data(data)



Unnamed: 0,Features,Max,Min,Avg,Classes,Total
0,type,42656,1719,17683.0,3,53049
1,url,6717,1,3.4091,15561,53049
2,protocol,53049,53049,53049.0,1,53049
3,Host,53049,53049,53049.0,1,53049
4,User-Agent,53049,53049,53049.0,1,53049
5,Accept-Encoding,53049,53049,53049.0,1,53049
6,Accept,53049,53049,53049.0,1,53049
7,Connection,53049,53049,53049.0,1,53049
8,content-type,21639,4,8841.5,6,53049
9,Cookie,21647,2,4080.692308,13,53049


In [40]:
def data_prep(data):
    # the features that have only one class means that those features are the same for each call. This is due to the test environment.
    # for our research do they matter?
    return n_data

# 3. Anomaly visualisation

In [41]:
file = 'data/dataframe_new.pkl'


In [42]:
file2 = 'data/anamolies.pkl'

In [43]:
df = pd.read_pickle(file2)
print(df)
print(df.columns)
data2, class_dict2 = feat2vec(file2)

# skewing_data(data2)


      Id           Request Tijdstempel          Response Tijdstempel Methode  \
0     90  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
1     92  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
2     93  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
3     94  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
4     95  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
..   ...                           ...                           ...     ...   
503  613  Mon Mar 20 21:52:53 CET 2023  Mon Mar 20 21:53:07 CET 2023     GET   
504  614  Mon Mar 20 21:53:09 CET 2023  Mon Mar 20 21:53:09 CET 2023     GET   
505  615  Mon Mar 20 21:53:36 CET 2023  Mon Mar 20 21:53:37 CET 2023     GET   
506  617  Mon Mar 20 21:53:31 CET 2023  Mon Mar 20 21:53:45 CET 2023     GET   
507  616  Mon Mar 20 21:53:31 CET 2023  Mon Mar 20 21:53:45 CET 2023     GET   

                                       

# Sankey diagram of Attack flows

In [44]:
import plotly.graph_objects as go
print(df)

      Id           Request Tijdstempel          Response Tijdstempel Methode  \
0     90  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
1     92  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
2     93  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
3     94  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
4     95  Mon Mar 20 21:50:04 CET 2023  Mon Mar 20 21:50:04 CET 2023     GET   
..   ...                           ...                           ...     ...   
503  613  Mon Mar 20 21:52:53 CET 2023  Mon Mar 20 21:53:07 CET 2023     GET   
504  614  Mon Mar 20 21:53:09 CET 2023  Mon Mar 20 21:53:09 CET 2023     GET   
505  615  Mon Mar 20 21:53:36 CET 2023  Mon Mar 20 21:53:37 CET 2023     GET   
506  617  Mon Mar 20 21:53:31 CET 2023  Mon Mar 20 21:53:45 CET 2023     GET   
507  616  Mon Mar 20 21:53:31 CET 2023  Mon Mar 20 21:53:45 CET 2023     GET   

                                       

In [46]:
# for column in data:
#     print(column)
# print(class_dict['c'])
print(data)

      type    url protocol  Host  User-Agent  Accept-Encoding  Accept  \
0        1      1        1     1           1                1       1   
1        2      2        1     1           1                1       1   
2        2      3        1     1           1                1       1   
3        1      4        1     1           1                1       1   
4        2      5        1     1           1                1       1   
...    ...    ...      ...   ...         ...              ...     ...   
53044    1   2829        1     1           1                1       1   
53045    1   2830        1     1           1                1       1   
53046    2   2831        1     1           1                1       1   
53047    2  15561        1     1           1                1       1   
53048    2     36        1     1           1                1       1   

       Connection  content-type  Cookie  ...  etag  content-length  message  \
0               1             1       1  ...

In [75]:
import plotly.graph_objects as go
from collections import OrderedDict

class APIflows:
    def __init__(self, data):
        self.source = []
        self.target = []
        self.coockies = set()
        self.data = data
        self.flowdict = OrderedDict()
        self.flows()
    
    def flows(self):
        for i in range(len(self.data['coockie'])-1):

            if self.data['coockie'][i] in self.coockies:
                self.info(self.data['coockie'][i],self.data['coockie'][i+1],self.data['url'][i],self.data['url'][i+1])

            else:

                self.info(0,self.data['coockie'][i],0,self.data['url'][i])
                self.info(self.data['coockie'][i], self.data['coockie'][i+1],self.data['url'][i],self.data['url'][i+1])
                self.coockies.add(self.data['coockie'][i])
    
    def info(self, coockie1, coockie2,flow1,flow2):
        if coockie1 == coockie2:
            if ((coockie1,coockie2)) not in self.flowdict:
                self.source.append(flow1)
                self.target.append(flow2)
                self.flowdict[((flow1,flow2))] = 1

            else:
                self.flowdict[((flow1,flow2))] += 1

    
flows = APIflows(data)
print(flows.source)
print(flows.target)
print(list(flows.flowdict.values()))
fig = go.Figure(go.Sankey(
    arrangement='snap',
    node=dict(
        label = data['coockie'],
#         x= 
#         y=
        pad= 15,
        color=['blue','red','green','yellow']
    ),
    link=dict(
#         arrowlen=15,
        source=flows.source,
        target=flows.target,
#         value= list(flows.flowdict.values()),
        value=[1,2,3,4],
        color=['blue','red','green','yellow']
    )
))

fig.show()

[2, 1, 3, 4]
[2, 1, 3, 4]
[2892, 1603, 8631, 15143]


In [25]:
import plotly.graph_objects as go
import urllib, json

url = 'https://raw.githubusercontent.com/plotly/plotly.js/master/test/image/mocks/sankey_energy.json'
response = urllib.request.urlopen(url)
data = json.loads(response.read())

# override gray link colors with 'source' colors
opacity = 0.4
# change 'magenta' to its 'rgba' value to add opacity
data['data'][0]['node']['color'] = ['rgba(255,0,255, 0.8)' if color == "magenta" else color for color in data['data'][0]['node']['color']]
data['data'][0]['link']['color'] = [data['data'][0]['node']['color'][src].replace("0.8", str(opacity))
                                    for src in data['data'][0]['link']['source']]

fig = go.Figure(data=[go.Sankey(
#     valueformat = ".0f",
#     valuesuffix = "TWh",
    # Define nodes
    node = dict(
      pad = 15,
#       thickness = 15,
#       line = dict(color = "black", width = 0.5),
      label =  data['data'][0]['node']['label'],
      color =  data['data'][0]['node']['color']
    ),
    # Add links
    link = dict(
      source =  data['data'][0]['link']['source'],
      target =  data['data'][0]['link']['target'],
      value =  data['data'][0]['link']['value'],
      label =  data['data'][0]['link']['label'],
      color =  data['data'][0]['link']['color']
))])

fig.update_layout(title_text="Energy forecast for 2050<br>Source: Department of Energy & Climate Change, Tom Counsell via <a href='https://bost.ocks.org/mike/sankey/'>Mike Bostock</a>",
                  font_size=10)
fig.show()

In [30]:
import plotly.graph_objects as go
import nbformat

fig = go.Figure(go.Sankey(
    arrangement='snap',
    node=dict(
        label=['A', 'B', 'C', 'D', 'E', 'F'],
#         line = dict(color = "black", width = 0.5),

        x=[0.2, 0.1, 0.5, 0.7, 0.3, 0.5],
        y=[0.7, 0.5, 0.2, 0.4, 0.2, 0.3],
        pad=15,
        thickness=15
    ),
    link=dict(
#         arrowlen=15,
        source=[0, 0, 1, 2, 5, 4, 3, 5],
        target=[5, 3, 4, 3, 0, 2, 2, 3],
        value=[1, 2, 1, 1, 1, 1, 1, 2]  
    )
))

fig.show()

'5.7.0'