In [3]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import genfromtxt
import pandas as pd

In [11]:
def create_dataset(df):
    '''Create a dataset of numerical encodings for each column
        *** input ***
        * df - the pandas dataframe that needs to be changed to numerical data
        *** returns ***
        * df - the dataframe handed back by create_mapping after every column has been encoded
        * class_dict - a dictionary keyed by column header; each entry is the
          label -> number mapping that create_mapping produced for that column
    '''

    class_dict = {}
    for column in df:
        values, df = create_mapping(df, column)
        # Store the mapping under the column header. The previous code assigned
        # into `values` itself, so class_dict was always returned empty.
        class_dict[column] = values

    return df, class_dict

def create_mapping(df,column):
    '''Create the mapping from label to number for one column
    *** input ***
    * df - the dataframe (pandas) to start from; it is modified in place
    * column - the column name
    *** returns ***
    * values - dict with the label (always a string) as key and the numerical
      value as value; numbers start at 1 and follow order of first appearance
    * df - the same dataframe object, with `column` replaced by the numerical values

    Non-string cells are converted with str() before lookup, so e.g. the
    integer 5 and the string '5' share one code.
    The encoded column is assigned by position, so the frame may have any
    index (the earlier version used range(len(df)) as .loc labels and only
    worked with a default 0..n-1 index). The resulting column holds integers.
    '''

    values = {}
    codes = []
    for raw in df[column]:
        label = raw if isinstance(raw, str) else str(raw)
        # setdefault returns the existing code, or registers the next free one
        codes.append(values.setdefault(label, len(values) + 1))

    # A plain list is assigned positionally, independent of the index labels
    df[column] = codes

    return values, df
                
            
def boxplot(path='dataframe.pkl'):
    '''Load a pickled dataframe and encode it numerically
    *** input ***
    * path - location of the pickled pandas dataframe (defaults to 'dataframe.pkl')
    *** returns ***
    * data - the dataframe returned by create_dataset
    * class_dict - the per-column mappings returned by create_dataset

    Despite its name this function does not draw anything: the plotting code
    was disabled, and the empty figure it used to open has been removed.
    '''
    # NOTE(review): unpickling can execute arbitrary code - only load files you trust.
    df = pd.read_pickle(path)
    data, class_dict = create_dataset(df)
    print(data)
    return data,class_dict


In [12]:
# Load and encode the pickled dataframe with the no-argument boxplot() defined above.
# A later cell redefines boxplot with a required `data` argument, so this call only
# works as long as that cell has not been run yet.
data, class_dict = boxplot()

      type                                                url    protocol  \
0     POST                                          /register  HTTP/1.1\n   
1     POST                                         /addresses  HTTP/1.1\n   
2     POST                                             /cards  HTTP/1.1\n   
3      GET                                             /login  HTTP/1.1\n   
4      GET                                              /cart  HTTP/1.1\n   
...    ...                                                ...         ...   
6001  POST                                         /addresses  HTTP/1.1\n   
6002  POST                                             /cards  HTTP/1.1\n   
6003   GET                                             /login  HTTP/1.1\n   
6004   GET  /catalogue?tags=green%2Caction%2Cskin%2Cred&pa...  HTTP/1.1\n   
6005   GET  /detail.html?id=808a2de1-1aaa-4c25-a9b9-6612e8...  HTTP/1.1\n   

             Host                 User-Agent       Accept-Encoding  Accept 

<Figure size 640x480 with 0 Axes>

In [6]:
# check how many features there are
# how many different values (classes) there are for each feature
# distribution of these features
# make an assumption about what is useful
# 60001 requests at this time and 35 columns at this time
# what to do with NaNs
import seaborn as sns
from tabulate import tabulate 
import latextable
from texttable import Texttable

def sanitized_data(headers):
    '''Return the headers as a list with newlines, double quotes and opening braces removed'''
    cleaned = []
    for header in headers:
        for unwanted in ("\n", '"', "{"):
            header = header.replace(unwanted, "")
        cleaned.append(header)
    return cleaned
    
def barplot(data):
    '''Creates a barplot with one bar per column of the data
    *** input ***
    * data - pandas dataframe whose cells can be cast to non-negative integers
    *** returns ***
    * nothing; the figure is shown with plt.show()

    The bar height of a column is the length of its np.bincount with bin 0
    dropped, i.e. the largest integer code that occurs in the column.
    '''
    # Convert the frame once instead of once per column
    codes = data.to_numpy().astype(int)
    labels = [name[:5] for name in data.columns]
    heights = [np.bincount(codes[:, i])[1:].shape[0] for i in range(codes.shape[1])]

    # Plot at numeric positions: labels are truncated to 5 characters, and
    # identical strings used directly as x would be drawn at the same
    # categorical position, hiding one bar behind another.
    positions = np.arange(len(labels))

    fig = plt.figure()
    colors = sns.color_palette('pastel')

    # rect is [left, bottom, width, height] in figure fractions.
    # NOTE(review): these values extend beyond the figure canvas and seem to
    # rely on the notebook's tight bounding box to be visible - confirm.
    ax = fig.add_axes([0,1,2,3])
    ax.bar(positions, heights, color=colors[:35])
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    plt.show()

def boxplot(data):
    '''Creates a horizontal boxplot of the data
    *** input ***
    * data - the data handed to seaborn's boxplot
    *** returns ***
    * nothing; the plot is drawn on a new 10x6 figure

    NOTE: this definition replaces the earlier no-argument boxplot() once the
    cell has run, so boxplot() without arguments no longer works afterwards.
    '''
    plt.figure(figsize=(10,6), tight_layout=True)
    ax = sns.boxplot(data=data,orient='h', palette='Set2', linewidth=2.5)
    # The y label used to read 'Height (cm)', which does not describe this data.
    # NOTE(review): 'Feature' assumes one box per dataframe column - confirm.
    ax.set(title='Boxplot', xlabel='', ylabel='Feature')
    
def skewing_data(data):
    '''Displays a pandas dataframe with one row per feature (column) of the data.

    For every feature it shows the number of different classes, the average
    number of times a class is seen, the minimum and the maximum number of
    times one of its classes is seen, and the total number of observations.
    This gives us insight in which classes are used a lot within each feature.

    *** input ***
    * data - pandas dataframe whose cells can be cast to non-negative integers
    *** returns ***
    * nothing; the summary is shown with display()
    '''

    # Convert the frame once instead of once per column
    codes = data.to_numpy().astype(int)

    # the amount of times a feature class is present in the data
    # (bin 0 is dropped, so only codes >= 1 are counted)
    class_counts = [np.bincount(codes[:, i])[1:] for i in range(codes.shape[1])]
    feature_names = sanitized_data(data.columns)

    names = ['Features','Max', 'Min','Avg','Classes','Total']
    rows = [
        [name, max(counts), min(counts), np.mean(counts), len(counts), sum(counts)]
        for name, counts in zip(feature_names, class_counts)
    ]
    df = pd.DataFrame(rows, columns=names)
    display(df)

def create_latex_table(tab, caption=None):
    '''Prints a table as plain text and as LaTeX, using both tabulate and texttable
    *** input ***
    * tab - a sequence of rows (each a sequence of cells); the first row is the header
    * caption - optional caption passed to latextable.draw_latex
    *** returns ***
    * nothing; all four renderings are printed
    *** raises ***
    * ValueError - when tab contains no rows
    '''
    rows = [list(row) for row in tab]
    if not rows:
        raise ValueError('tab must contain at least a header row')

    table = Texttable()
    # One centred column per header cell (this was hard-coded to 4 columns)
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER | Texttable.VLINES)
    # The table was never filled before, so it always rendered empty
    table.add_rows(rows)

    print('Tabulate Table:')
    print(tabulate(rows, headers='firstrow'))
    print('\nTexttable Table:')
    print(table.draw())

    print('\nTabulate Latex:')
    # This call used the undefined name `rows` and raised NameError
    print(tabulate(rows, headers='firstrow', tablefmt='latex'))
    print('\nTexttable Latex:')
    print(latextable.draw_latex(table, caption=caption))
    


# Show the per-feature class statistics; `data` is the encoded dataframe
# produced by the earlier `data, class_dict = boxplot()` cell.
skewing_data(data)





Unnamed: 0,Features,Max,Min,Avg,Classes,Total
0,type,4767,1239,3003.0,2,6006
1,url,797,1,3.154412,1904,6006
2,protocol,6006,6006,6006.0,1,6006
3,Host,6006,6006,6006.0,1,6006
4,User-Agent,6006,6006,6006.0,1,6006
5,Accept-Encoding,6006,6006,6006.0,1,6006
6,Accept,6006,6006,6006.0,1,6006
7,Connection,6006,6006,6006.0,1,6006
8,content-type,2414,7,1001.0,6,6006
9,Content-Length,4767,3,750.75,8,6006


In [7]:
def data_prep(data):
    '''Return a copy of the data without the features that have only one class
    *** input ***
    * data - pandas dataframe
    *** returns ***
    * n_data - new dataframe holding only the columns with more than one
      distinct value; the input dataframe is left untouched
    '''
    # the features that have only one class means that those features are the same for each call. This is due to the test environment.
    # for our research do they matter?
    # NOTE(review): the function used to return the undefined name `n_data`.
    # Dropping the single-class features is an interpretation of the note
    # above - confirm that this is the intended preparation step.
    is_informative = data.nunique(dropna=False) > 1
    n_data = data.loc[:, is_informative.to_numpy()].copy()

    return n_data

0        1
1        1
2        1
3        1
4        1
        ..
6001    10
6002    10
6003    10
6004    10
6005    10
Name:  date, Length: 6006, dtype: object

In [None]:
import time

# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle('dataframe2.pkl')
# The column name is used with its leading space
print(df[" date"])
# time.strftime() without a time tuple formats the *local* time; pass
# time.gmtime() so that the "GMT" suffix in the format string is correct.
print(time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()))
print(df.columns)

docstrings