In [1]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import genfromtxt
import pandas as pd

In [5]:
def create_dataset(df):
    '''Create a dataset of numerical encodings for each column.

        *** input ***
        * df - the pandas dataframe that needs to be changed to numerical data
        *** returns ***
        * df - the dataframe returned by create_mapping after every column has been encoded
        * class_dict - a dictionary keyed by column header; each value is the
          label -> number mapping that create_mapping produced for that column
    '''

    class_dict = {}
    # Snapshot the headers up front: create_mapping assigns into the frame while we loop.
    for column in list(df.columns):
        values, df = create_mapping(df, column)
        # Key by the full column name. The previous code stored the mapping inside
        # itself under column[0], so class_dict was always returned empty.
        class_dict[column] = values

    return df, class_dict

def create_mapping(df,column):
    '''Create the mapping from label to number for one column.

    Labels are numbered from 1 in order of first appearance. Values that are
    not strings are keyed by their str() form, so 3 and '3' share one number.

    *** input ***
    * df - the dataframe (pandas) to start from; the column is overwritten in place
    * column - the column name
    *** returns ***
    * values - the dict with the label as key and the numerical value as value
    * df - the same dataframe (pandas), now with numerical values in that column
    '''

    values = {}
    encoded = []
    # Iterate over the cell values themselves rather than df.loc[i, column]:
    # .loc is label based, so counting 0..n-1 only worked on a default RangeIndex.
    for cell in df[column].tolist():
        label = cell if isinstance(cell, str) else str(cell)
        # setdefault only hands out the next free number for labels not seen before.
        encoded.append(values.setdefault(label, len(values) + 1))

    # A plain list is assigned by position, so the existing index is kept as is.
    df[column] = encoded
    return values, df
                
            
def boxplot(file):
    '''Load a pickled dataframe and return its numerical encoding.

    *** input ***
    * file - path of the pickled pandas dataframe
    *** returns ***
    * data - the dataframe returned by create_dataset
    * class_dict - the dictionary returned by create_dataset

    NOTE(review): despite its name this function draws nothing, and a later
    cell defines boxplot(data), which shadows this one once that cell has run.
    '''
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(file)
    data, class_dict = create_dataset(df)
    # The bare plt.figure() call was dropped: it only produced an empty
    # "Figure ... with 0 Axes" in the cell output.
    print(data)
    return data, class_dict


In [6]:
# Load the pickled dataframe and encode it; boxplot here is the file-based
# loader defined in the cell above.
file = 'data/dataframe_new.pkl'
data, class_dict = boxplot(file)

      type    url protocol  Host  User-Agent  Accept-Encoding  Accept  \
0        1      1        1     1           1                1       1   
1        2      2        1     1           1                1       1   
2        2      3        1     1           1                1       1   
3        1      4        1     1           1                1       1   
4        2      5        1     1           1                1       1   
...    ...    ...      ...   ...         ...              ...     ...   
53044    1   2829        1     1           1                1       1   
53045    1   2830        1     1           1                1       1   
53046    2   2831        1     1           1                1       1   
53047    2  15561        1     1           1                1       1   
53048    2     36        1     1           1                1       1   

       Connection  content-type  Cookie  ...  etag  content-length  message  \
0               1             1       1  ...

<Figure size 640x480 with 0 Axes>

In [7]:
# Check how many features there are,
# how many different values each feature has,
# and how these values are distributed.
# Make an assumption about which features are useful.
# 60001 requests and 35 columns at this time.
# TODO: decide what to do with NaNs.
import seaborn as sns
from tabulate import tabulate 
import latextable
from texttable import Texttable

def sanitized_data(headers):
    '''Return the headers as a list with newlines, double quotes and opening braces removed.'''
    unwanted = ("\n", '"', "{")
    cleaned = []
    for header in headers:
        # Strip each unwanted token in turn from this header.
        for token in unwanted:
            header = header.replace(token, "")
        cleaned.append(header)
    return cleaned
    
def barplot(data):
    '''Create a bar chart with one bar per column of data.

    Every column is cast to int. The bar height is the length of
    np.bincount(column)[1:], which equals the largest value in that column.
    NOTE(review): presumably data holds class numbers starting at 1, so the
    height is the number of classes — confirm against the caller.
    '''
    labels = [name[:5] for name in data.columns]
    # Convert the frame once instead of once per column.
    codes = data.to_numpy().astype(int)
    heights = [np.bincount(codes[:, i])[1:].shape[0] for i in range(codes.shape[1])]

    fig = plt.figure()
    colors = sns.color_palette('pastel')

    ax = fig.add_axes([0,1,2,3])
    # Plot at integer positions and attach the names as tick labels: the
    # truncated names are not unique, and equal strings used as x values
    # share one categorical position, so their bars would overlap.
    positions = range(len(labels))
    ax.bar(positions, heights, color=colors[:35], tick_label=labels)
    plt.show()

def boxplot(data):
    '''Create a horizontal seaborn boxplot of data.

    NOTE(review): this reuses the name of the earlier boxplot(file) loader, so
    after this cell runs, calling boxplot with a file path no longer loads anything.
    '''
    plt.figure(figsize=(10,6), tight_layout=True)
    ax = sns.boxplot(data=data,orient='h', palette='Set2', linewidth=2.5)
    # The y label used to read 'Height (cm)', which does not describe this data.
    # NOTE(review): assumes data is passed in wide form, so the y axis lists columns.
    ax.set(title='Boxplot', xlabel='', ylabel='Feature')
    
def skewing_data(data):
    '''Display a pandas dataframe summarising how often each class of a feature occurs.

    One row per column of data, with:
    * Features - the column name after sanitized_data
    * Max / Min / Avg - largest, smallest and mean count over the class numbers
      1..largest value in the column (a number that never occurs counts as 0)
    * Classes - how many class numbers that range contains
    * Total - the sum of those counts

    This gives insight into which classes are used a lot within each feature.
    Every column is cast to int and counted with np.bincount; index 0 of the
    counts is dropped.
    '''

    # Convert the frame once; doing it inside the comprehension repeated the
    # full conversion for every column.
    codes = data.to_numpy().astype(int)

    # the amount of times a feature class is present in the data
    num_featclass = [np.bincount(codes[:, i])[1:] for i in range(codes.shape[1])]

    feature_names = sanitized_data(data.columns)

    names = ['Features','Max', 'Min','Avg','Classes','Total']
    rows = [
        [feature_names[i], counts.max(), counts.min(), counts.mean(), len(counts), counts.sum()]
        for i, counts in enumerate(num_featclass)
    ]
    df = pd.DataFrame(rows, columns=names)
    display(df)

def create_latex_table(tab, caption='A comparison of rocket features.'):
    '''Print tab as plain-text and LaTeX tables using tabulate and texttable.

    *** input ***
    * tab - a sequence of rows; the first row is used as the header
    * caption - caption passed to latextable.draw_latex (the default is the
      text that used to be hard-coded)
    *** raises ***
    * ValueError - when tab has no rows
    '''
    if len(tab) == 0:
        raise ValueError('tab must contain at least a header row')

    print('Tabulate Table:')
    print(tabulate(tab, headers='firstrow'))

    table = Texttable()
    # One centred column per header cell instead of a fixed 4 columns.
    table.set_cols_align(["c"] * len(tab[0]))
    table.set_deco(Texttable.HEADER | Texttable.VLINES)
    # The table was never filled before, so draw() had nothing to render.
    table.add_rows(tab)
    print('\nTexttable Table:')
    print(table.draw())

    print('\nTabulate Latex:')
    # This call used the undefined name `rows`, which raised NameError.
    print(tabulate(tab, headers='firstrow', tablefmt='latex'))
    print('\nTexttable Latex:')
    print(latextable.draw_latex(table, caption=caption))
    


# Show the per-feature class statistics for the dataframe loaded above.
skewing_data(data)





Unnamed: 0,Features,Max,Min,Avg,Classes,Total
0,type,42656,1719,17683.0,3,53049
1,url,6717,1,3.4091,15561,53049
2,protocol,53049,53049,53049.0,1,53049
3,Host,53049,53049,53049.0,1,53049
4,User-Agent,53049,53049,53049.0,1,53049
5,Accept-Encoding,53049,53049,53049.0,1,53049
6,Accept,53049,53049,53049.0,1,53049
7,Connection,53049,53049,53049.0,1,53049
8,content-type,21639,4,8841.5,6,53049
9,Cookie,21647,2,4080.692308,13,53049


In [8]:
def data_prep(data, drop_constant=False):
    '''Return a prepared copy of data.

    The features that have only one class are the same for each call; this is
    due to the test environment. Whether they matter for the research is
    still open, so they are only removed on request.

    *** input ***
    * data - the pandas dataframe to prepare (left untouched)
    * drop_constant - when True, drop every column that holds a single distinct value
    *** returns ***
    * n_data - the new pandas dataframe
    '''
    # The function used to return n_data without ever defining it (NameError).
    n_data = data.copy()
    if drop_constant:
        # Count NaN as a value of its own so an all-NaN column is also constant.
        varying = n_data.nunique(dropna=False) > 1
        n_data = n_data.loc[:, varying]
    return n_data

In [11]:
# Path of the pickled dataframe (the same file that was loaded earlier in the notebook).
file = 'data/dataframe_new.pkl'


In [12]:
import time

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(file)
print(df[" date"])
# time.strftime() formats local time when no time tuple is given; pass
# gmtime() so the literal "GMT" suffix is correct and matches the date column.
print(time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()))
print(df.columns)

0         Wed, 01 Mar 2023 11:39:11 GMT
1         Wed, 01 Mar 2023 11:39:07 GMT
2         Wed, 01 Mar 2023 11:39:07 GMT
3         Wed, 01 Mar 2023 11:39:03 GMT
4         Wed, 01 Mar 2023 11:38:59 GMT
                      ...              
53044     Mon, 27 Feb 2023 16:47:25 GMT
53045     Mon, 27 Feb 2023 16:47:25 GMT
53046     Mon, 27 Feb 2023 16:47:25 GMT
53047     Mon, 27 Feb 2023 16:47:25 GMT
53048     Mon, 27 Feb 2023 16:47:25 GMT
Name:  date, Length: 53049, dtype: object
Tue, 21 Mar 2023 15:45:57 GMT
Index(['type', 'url', 'protocol', ' Host', ' User-Agent', ' Accept-Encoding',
       ' Accept', ' Connection', ' content-type', ' Cookie', ' Content-Length',
       ' id', 'response code', ' x-powered-by', ' date',
       ' x-envoy-upstream-service-time', ' server',
       ' x-envoy-decorator-operation', ' transfer-encoding', 'body', 'coockie',
       'ip address', 'request time', ' accept-ranges', ' cache-control',
       ' last-modified', ' etag', ' content-length', ' message',
   

docstrings