# CIC-Darknet2020 Dataset Statistics

Here we load data from the [CIC-Darknet2020](https://www.unb.ca/cic/datasets/darknet2020.html) dataset and process it for our experiments.

First we import all relevant libraries, set a random seed, and print the Python and library versions for reproducibility.

In [5]:
import datetime, os, platform, pprint, sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# fixed seed so that random draws made later in the notebook are repeatable
seed: int = 14
# apply the seed to NumPy's global generator; assigning the variable alone seeds nothing
np.random.seed(seed)
# moment this cell was executed, reported in the summary below
time = datetime.datetime.now()
# set up pretty printer for easier data evaluation
pretty = pprint.PrettyPrinter(indent=4, width=30).pprint

# record the interpreter and library versions next to the results
print(
    f'''
    Last Execution: {time}
    python:\t{platform.python_version()}

    \tmatplotlib:\t{mpl.__version__}
    \tnumpy:\t\t{np.__version__}
    \tpandas:\t\t{pd.__version__}
    '''
)


    Last Execution: 2022-02-12 19:58:09.957448
    python:	3.7.10

    	matplotlib:	3.3.4
    	numpy:		1.20.3
    	pandas:		1.2.5
    


Next we prepare some helper functions to help process the data

In [6]:
def get_file_path(directory: str):
    '''
        Binds a directory and hands back a one-argument path builder.
        The returned function joins a file name onto that directory
        with os.path.join and returns the resulting path string.
    '''

    def build_path(file: str) -> str:
        full_path: str = os.path.join(directory, file)
        return full_path

    return build_path



def load_data(filePath):
    '''
        Loads the dataset stored at filePath and caches it as a pickle for quick access in the future.
        Function will only work when filePath is a .csv file.

        Parameters:
            filePath: path of the .csv file to load. For a path of the form
                      './<directory>/<rest>' the cache entry is named after <rest>;
                      any other path is used as the cache name unchanged.

        Returns:
            The dataset as a pandas DataFrame. It is read from the pickle under
            ./cache/ when one exists, otherwise from the .csv file, which is then
            written to the cache unmodified.
    '''

    # drop the leading './<directory>/' so that ./original/Darknet.csv is cached as
    # ./cache/Darknet.csv.pickle, whatever the length of the directory name
    if filePath.startswith('./'):
        relative: str = filePath[2:]
        _, _, remainder = relative.partition('/')
        # a path with no directory component (./file.csv) keeps its file name
        filePathClean: str = remainder or relative
    else:
        filePathClean: str = filePath
    pickleDump: str = f'./cache/{filePathClean}.pickle'

    print(f'Loading Dataset: {filePath}')
    print(f'\tTo Dataset Cache: {pickleDump}\n')

    # check if data already exists within cache
    # NOTE(review): unpickling can execute arbitrary code; this is only safe while
    # ./cache/ holds files written by this notebook
    if os.path.exists(pickleDump):
        return pd.read_pickle(pickleDump)

    # if not, load the data and cache it; the cache directory is created on first use
    df = pd.read_csv(filePath, low_memory=True)
    os.makedirs(os.path.dirname(pickleDump), exist_ok=True)
    df.to_pickle(pickleDump)

    return df



def features_with_bad_values(df: pd.DataFrame, datasetName: str) -> pd.DataFrame:
    '''
        Function will scan the dataframe for features with Inf, NaN, or Zero values.
        Returns a new dataframe describing the distribution of these values in the original dataframe

        Parameters:
            df:          dataframe to scan, every column is examined
            datasetName: label copied into the 'Dataset' column of the result

        Returns:
            A dataframe with three rows ('Inf', 'NaN', 'Zero' in the 'Value' column).
            Besides 'Dataset' and 'Value' it has one column per column of df holding
            the number of matching entries as floats.
    '''

    # Inf and NaN values can take different forms so we screen for every one of them
    infs: list = [ np.inf, 'Infinity', 'inf' ]
    NaNs: list = [ 'NaN', 'nan' ]

    # We will collect stats on the dataset, specifically how many instances of Infs, NaNs, and 0s are present.
    # using a dictionary that will be converted into a dataframe of 3 rows and 2 + (number of features) columns
    stats: dict = {
        'Dataset':[ datasetName, datasetName, datasetName ],
        'Value'  :['Inf', 'NaN', 'Zero']
    }

    for col in df.columns:
        column = df[col]

        # summing the boolean mask counts the matches without building a filtered copy of df
        inf_count = sum(int((column == marker).sum()) for marker in infs)

        # NaN never compares equal to anything, itself included, so real missing
        # values have to be found with isna(); the string spellings are matched by equality
        nan_count = int(column.isna().sum())
        nan_count += sum(int((column == marker).sum()) for marker in NaNs)

        zero_count = int((column == 0).sum())

        stats[col] = np.array([inf_count, nan_count, zero_count], dtype=float)

    return pd.DataFrame(stats)


Before we do any processing on the data, we need to list out all their filepaths. If trying to reproduce the process carried out here, place files in the same location relative to the notebook.

In [7]:
# directory holding the raw CSV files
data_path_1: str = './original/'

# names of the CSV files found in data_path_1
data_set_1: list = ['Darknet.csv']

# only one collection is used, so the complete set is that same list
data_set: list = data_set_1

# path builder bound to data_path_1
file_path_1 = get_file_path(data_path_1)

# location of every file in data_set_1
file_set: list = [file_path_1(file_name) for file_name in data_set_1]

This gives us a set of file locations. Let's look at the set of files that make up the CIC-Darknet2020 dataset.

In [8]:
print(f'We will be cleaning {len(file_set)} files:')
pretty(file_set)

We will be cleaning 1 files:
['./original/Darknet.csv']


It will also come in handy to record some statistics about the data as it is being processed

In [9]:
# column layout of the composition table
composition_columns = [
    'File',
    'Benign',
    'Malicious',
    'Total',
    'Percent Benign',
]
# empty table with exactly those columns
data_composition = pd.DataFrame(columns=composition_columns)

In [10]:
# position in file_set of the collection being examined
current_job = 0
banner = f'''
    Dataset {current_job+1}/{len(data_set)}: We now look at {file_set[current_job]}
'''
print(banner)

# read the collection (from the cache when it is already there)
df = load_data(file_set[current_job])


    Dataset 1/1: We now look at ./original/Darknet.csv

Loading Dataset: ./original/Darknet.csv
	To Dataset Cache: ./cache/Darknet.csv.pickle



In [11]:
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Label1
0,10.152.152.11-216.58.220.99-57158-443-6,10.152.152.11,57158,216.58.220.99,443,6,24/07/2015 04:09:48 PM,229,1,1,...,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,AUDIO-STREAMING
1,10.152.152.11-216.58.220.99-57159-443-6,10.152.152.11,57159,216.58.220.99,443,6,24/07/2015 04:09:48 PM,407,1,1,...,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,AUDIO-STREAMING
2,10.152.152.11-216.58.220.99-57160-443-6,10.152.152.11,57160,216.58.220.99,443,6,24/07/2015 04:09:48 PM,431,1,1,...,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,AUDIO-STREAMING
3,10.152.152.11-74.125.136.120-49134-443-6,10.152.152.11,49134,74.125.136.120,443,6,24/07/2015 04:09:48 PM,359,1,1,...,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,AUDIO-STREAMING
4,10.152.152.11-173.194.65.127-34697-19305-6,10.152.152.11,34697,173.194.65.127,19305,6,24/07/2015 04:09:45 PM,10778451,591,400,...,0,0,0,0,1437760000000000.0,3117718.131,1437760000000000.0,1437760000000000.0,Non-Tor,AUDIO-STREAMING


In [12]:
features = list(df.columns)

In [14]:
features

['Flow ID',
 'Src IP',
 'Src Port',
 'Dst IP',
 'Dst Port',
 'Protocol',
 'Timestamp',
 'Flow Duration',
 'Total Fwd Packet',
 'Total Bwd packets',
 'Total Length of Fwd Packet',
 'Total Length of Bwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Min',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Std',
 'Bwd Packet Length Max',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Std',
 'Flow Bytes/s',
 'Flow Packets/s',
 'Flow IAT Mean',
 'Flow IAT Std',
 'Flow IAT Max',
 'Flow IAT Min',
 'Fwd IAT Total',
 'Fwd IAT Mean',
 'Fwd IAT Std',
 'Fwd IAT Max',
 'Fwd IAT Min',
 'Bwd IAT Total',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd IAT Min',
 'Fwd PSH Flags',
 'Bwd PSH Flags',
 'Fwd URG Flags',
 'Bwd URG Flags',
 'Fwd Header Length',
 'Bwd Header Length',
 'Fwd Packets/s',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'FIN Flag Count',
 'SYN Flag Count',
 'R

In [15]:
df.shape

(141530, 85)

In [16]:
df.groupby('Label').size()

Label
Non-Tor    93356
NonVPN     23863
Tor         1392
VPN        22919
dtype: int64

In [17]:
df.groupby('Label1').size()

Label1
AUDIO-STREAMING     1484
Audio-Streaming    16580
Browsing           32808
Chat               11478
Email               6145
File-Transfer      11098
File-transfer         84
P2P                48520
VOIP                3566
Video-Streaming     9486
Video-streaming      281
dtype: int64

In [20]:
current_job = 0
print(f'''
    Dataset {current_job+1}/{len(data_set)}: We now look at {file_set[current_job]}
''')

# df          = load_data(file_set[current_job])
# df          = df.rename(columns=new_column_names)
# benign_df   = df[df['Label'] == 'BENIGN']

# data_composition = data_composition.append(pd.DataFrame([
#     [file_set[current_job][11:], benign_df.shape[0], df.shape[0]-benign_df.shape[0], df.shape[0], 100*benign_df.shape[0]/df.shape[0]]
# ], columns = composition_columns))


print(f"""
File:\t\t\t\t{file_set[current_job]}  
Job Number:\t\t\t{current_job+1}
Shape:\t\t\t\t{df.shape}
Samples:\t\t\t{df.shape[0]} 
Features:\t\t\t{df.shape[1]}
""")


    Dataset 1/1: We now look at ./original/Darknet.csv


File:				./original/Darknet.csv  
Job Number:			1
Shape:				(141530, 85)
Samples:			141530 
Features:			85



Now that we have a dataset loaded, let's explore the features and find which ones we want to eliminate, creating a 'pruning' list to reduce the size of the dataset. We will use a few simple heuristics to eliminate features before examining particular methodologies. One of those heuristics is to eliminate non-numerical data. We could encode these values, but at this stage the goal is dimension reduction. If we meet poor performance, we can come back and re-examine our heuristics.

In [21]:
prune: list = [] # prune is a list of all features we know we don't want to use
clip : list = [] # clip is a list of all values we do not want to use


# list every column together with the Python type of its value in the first row
values = df.values
columns = df.columns
for i, column in enumerate(columns):
    # pruning of string valued features is switched off for now:
    # if type(values[0][i]) == str and columns[i] != 'Label':
        # prune.append(columns[i])
    # '\t' before 'Feature' - the former '\F' is not an escape sequence and printed a literal backslash
    print(f"Column: {i}\tType: {type(values[0][i])}\tFeature: {column}")

Column: 0	Type: <class 'str'>	Label: Flow ID
Column: 1	Type: <class 'str'>	Label: Src IP
Column: 2	Type: <class 'int'>	Label: Src Port
Column: 3	Type: <class 'str'>	Label: Dst IP
Column: 4	Type: <class 'int'>	Label: Dst Port
Column: 5	Type: <class 'int'>	Label: Protocol
Column: 6	Type: <class 'str'>	Label: Timestamp
Column: 7	Type: <class 'int'>	Label: Flow Duration
Column: 8	Type: <class 'int'>	Label: Total Fwd Packet
Column: 9	Type: <class 'int'>	Label: Total Bwd packets
Column: 10	Type: <class 'int'>	Label: Total Length of Fwd Packet
Column: 11	Type: <class 'int'>	Label: Total Length of Bwd Packet
Column: 12	Type: <class 'int'>	Label: Fwd Packet Length Max
Column: 13	Type: <class 'int'>	Label: Fwd Packet Length Min
Column: 14	Type: <class 'float'>	Label: Fwd Packet Length Mean
Column: 15	Type: <class 'float'>	Label: Fwd Packet Length Std
Column: 16	Type: <class 'int'>	Label: Bwd Packet Length Max
Column: 17	Type: <class 'int'>	Label: Bwd Packet Length Min
Column: 18	Type: <class 'fl

In [None]:
assert False, 'Nothing is complete after this point'

AssertionError: Nothing is complete after this point

In [8]:
prune: list = [] # prune is a list of all features we know we don't want to use
clip : list = [] # clip is a list of all values we do not want to use

# we extract the data from the benign_df and use it to layout our features
# we use the benign_df because it is smaller and will process faster
# NOTE(review): no active cell visible in this notebook assigns benign_df - confirm where it comes from
values = benign_df.values
columns = benign_df.columns
for i, column in enumerate(columns):
    sample = values[0][i]
    # if the feature is string valued, we add it to our pruning list
    if isinstance(sample, str) and column != 'Label':
        prune.append(column)
    print(f"Column: {i}\tType: {type(sample)}\tLabel: {column}")

Column: 0	Type: <class 'int'>	Label: Unnamed
Column: 1	Type: <class 'str'>	Label: Flow ID
Column: 2	Type: <class 'str'>	Label: Source IP
Column: 3	Type: <class 'int'>	Label: Source Port
Column: 4	Type: <class 'str'>	Label: Destination IP
Column: 5	Type: <class 'int'>	Label: Destination Port
Column: 6	Type: <class 'int'>	Label: Protocol
Column: 7	Type: <class 'str'>	Label: Timestamp
Column: 8	Type: <class 'int'>	Label: Flow Duration
Column: 9	Type: <class 'int'>	Label: Total Fwd Packets
Column: 10	Type: <class 'int'>	Label: Total Backward Packets
Column: 11	Type: <class 'float'>	Label: Total Length of Fwd Packets
Column: 12	Type: <class 'float'>	Label: Total Length of Bwd Packets
Column: 13	Type: <class 'float'>	Label: Fwd Packet Length Max
Column: 14	Type: <class 'float'>	Label: Fwd Packet Length Min
Column: 15	Type: <class 'float'>	Label: Fwd Packet Length Mean
Column: 16	Type: <class 'float'>	Label: Fwd Packet Length Std
Column: 17	Type: <class 'float'>	Label: Bwd Packet Length Max
C

Next, we use our previously defined function to examine the dataset and see if any features have unappealing values mixed in with the Real number valued features. These include infinite and NaN (Not a number) values that could interfere with our model's ability to process the data

In [9]:
feature_stats = features_with_bad_values(df, file_set[current_job])

Now that we have compiled the stats on the undesirable values in the dataset, we inspect the data to find out what features we should get rid of.

Our stats take the form of a dataframe with the dataset location, value being looked for, and the value count for each feature in the dataset

In [None]:
feature_stats

We can see there are plenty of features with a large number of 0 values, but this tells us little about the distribution of Inf and NaN values. Let's take a closer look at the stats.

In [None]:
f = feature_stats[feature_stats['Value'] == 'Inf'].T
f[f[0] != 0]

Flow Bytes per Second and Flow Packets per Second have over 162 thousand inf values. This makes these features candidates for pruning, but 162 thousand out of 5 million samples may not justify pruning the entire feature; we may instead just remove the samples with the inf values.

In [None]:
f = feature_stats[feature_stats['Value'] == 'NaN'].T
f[f[1] != 0]

No NaN values in our set so far. This is pretty surprising because NaN values cropped up a lot when cleaning the TOR dataset created with the same tool. So let's add Inf and NaN values to our clip list, since they take up a small fraction of the number of samples in the dataset. Our clip list just specifies which samples to remove if they have a given value.

In [13]:
# every spelling of Inf and NaN that marks a sample for removal
toClip = [ np.inf, np.nan, 'Infinity', 'inf', 'NaN', 'nan' ]
for candidate in toClip:
    # values already on the clip list are not added twice
    if candidate in clip:
        continue
    clip.append(candidate)

Now we investigate the distribution of 0-valued features in the dataset. Unlike Inf and NaN values, we don't necessarily have to remove them. However, if a feature is overwhelmingly populated with 0 values, it would be pointless to include the feature in our experiments.

In [None]:
f = feature_stats[feature_stats['Value'] == 'Zero'].T
f[f[2] != 0]

In [None]:
f_top = f[:2]
f_bottom = f[2:]
f_bottom[f_bottom[2] > 0]

When it comes to 0 values, 79 out of 88 of our features have more than 0. This isn't necessarily bad, since we expect a fair number of 0 values in any distribution of numbers, but features with >99% 0 values are obvious candidates for pruning.

In [None]:
f_bottom[f_bottom[2] > 5000]

In [17]:
f_bottom[f_bottom[2] > 5000].shape

(61, 1)

Filtering the 0 values for instances greater than 5000 still gave us 60 features. Still 5000 is rather arbitrary, but filtering for it helps us see all of the large counts. We can split the data into 4 partitions with regards to the number of 0 valued features
    
    0-5,000

    5,000-200,000

    200,000-1,000,000

    1,000,000-5,071,011


The range from 0-200,000 seems reasonable in any normal distribution of samples, but features with more than 200,000 are questionable. So next we filter for instances of 0 values greater than 200,000

In [None]:
f_bottom[f_bottom[2] > 200000]

In [19]:
f_bottom[f_bottom[2] > 200000].shape

(52, 1)

This still leaves us with a set of 51 out of our original 88 features. Narrowing our search to just values greater than 1,000,000 then shows

In [None]:
f_bottom[f_bottom[2] > 1000000]

In [None]:
# same 1,000,000 threshold as the table shown just before, so the count refers to that table
f_bottom[f_bottom[2] > 1000000].shape

Which shows no change. Filtering for instances greater than 5,000,000, we find

In [None]:
f_bottom[f_bottom[2] > 5000000]

In [23]:
f_bottom[f_bottom[2] > 5000000].shape

(48, 1)

So we have 48 features with almost nothing but 0 values, 3 features with between 1,000,000 and 5,000,000 0 values, 9 features with between 5,000 and 200,000 0 values, and 18 features with less than 5,000 zero values

In [24]:
pruneCandidates: list = list(f_bottom[f_bottom[2] > 5000000].T.columns)

In [None]:
pruneCandidates

We add any feature with more than 5 million 0 values to the prune list, giving us a preliminary list of 53/88 features to remove. 

In [26]:
# toPrune = f_bottom[f_bottom[2] > 5000000].T.columns
# for i in toPrune:
#     if i not in prune:
#         prune.append(i)
# len(prune) 

In [None]:
prune

We will also add the Unnamed feature to this list due to our inability to identify what characteristic of the dataset it represents, as well as Fwd Header Length.1 due to it being a duplicate.

In [28]:
# features removed by hand rather than by the zero-count heuristic
toPrune = ['Fwd Header Length.1', 'Unnamed']

for feature_name in toPrune:
    # skip anything that is already on the prune list
    if feature_name in prune:
        continue
    prune.append(feature_name)
len(prune)

7

Now, let's make a few functions to do everything we did above so we can evaluate the features of the other 17 collections of data in the CIC_DDoS2019 dataset.

In [29]:
def examine_dataset(job_id: int) -> dict:
    '''
        Function will return a dictionary containing dataframe of the job_id passed in as well as that dataframe's
        feature stats, data composition, and file name.

        Parameters:
            job_id: 1-based position of the file in file_set

        Returns:
            A dictionary with the keys
                'File'             : path of the file that was loaded (str)
                'Dataset'          : the loaded dataframe with renamed columns
                'Feature_stats'    : result of features_with_bad_values for that dataframe
                'Data_composition' : data_composition plus one row describing this file
    '''

    job_id = job_id - 1  # adjusts for indexing while enumerating jobs from 1
    print(f'Dataset {job_id+1}/{len(data_set)}: We now look at {file_set[job_id]}\n\n')

    # Load the dataset
    df: pd.DataFrame = load_data(file_set[job_id])
    df = df.rename(columns=new_column_names)
    benign_df: pd.DataFrame = df[df['Label'] == 'BENIGN']

    total_count: int = df.shape[0]
    benign_count: int = benign_df.shape[0]
    malicious_count: int = total_count - benign_count

    # an empty file or a file without malicious samples must not abort the run
    percent_benign: float = 100*benign_count/total_count if total_count else 0.0
    benign_to_malicious: float = benign_count/malicious_count if malicious_count else float('inf')

    # Record the data composition of the dataset
    # [11:] drops the first 11 characters of the path, the length of './original/'
    new_row = pd.DataFrame([
        [file_set[job_id][11:], benign_count, malicious_count, total_count, percent_benign]
    ], columns = composition_columns)
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0
    composition = pd.concat([data_composition, new_row], ignore_index=False)

    # print the data composition
    print(f'''
        File:\t\t\t\t{file_set[job_id]}  
        Job Number:\t\t\t{job_id+1}
        Shape:\t\t\t\t{df.shape}
        Samples:\t\t\t{total_count} 
        Features:\t\t\t{df.shape[1]}
        Benign Samples:\t\t\t{benign_count}
        Malicious Samples:\t\t{malicious_count}
        Benign-to-Malicious Ratio:\t{benign_to_malicious}
    ''')
    
    # return the dataframe and the feature stats
    data_summary =  {'File':file_set[job_id] , 'Dataset':df, 'Feature_stats':features_with_bad_values(df, file_set[job_id]), 'Data_composition':composition}
    return data_summary


def check_infs(data_summary: dict) -> pd.DataFrame:
    '''
        Picks the 'Inf' row out of data_summary['Feature_stats'], transposes it and
        returns the entries of the resulting column 0 that are not equal to 0.
    '''

    stats: pd.DataFrame = data_summary['Feature_stats']
    inf_row = stats.loc[stats['Value'] == 'Inf']
    by_feature = inf_row.transpose()

    # column 0 is the index label of the 'Inf' row in the stats frame
    has_infs = by_feature[0] != 0
    return by_feature[has_infs]


def check_nans(data_summary: dict) -> pd.DataFrame:
    '''
        Picks the 'NaN' row out of data_summary['Feature_stats'], transposes it and
        returns the entries of the resulting column 1 that are not equal to 0.
    '''

    stats: pd.DataFrame = data_summary['Feature_stats']
    nan_row = stats.loc[stats['Value'] == 'NaN']
    by_feature = nan_row.transpose()

    # column 1 is the index label of the 'NaN' row in the stats frame
    has_nans = by_feature[1] != 0
    return by_feature[has_nans]


def check_zeros(data_summary: dict) -> pd.DataFrame:
    '''
        Picks the 'Zero' row out of data_summary['Feature_stats'], transposes it and
        returns the entries of the resulting column 2 that are not equal to 0.
    '''

    stats: pd.DataFrame = data_summary['Feature_stats']
    zero_row = stats.loc[stats['Value'] == 'Zero']
    by_feature = zero_row.transpose()

    # column 2 is the index label of the 'Zero' row in the stats frame
    has_zeros = by_feature[2] != 0
    return by_feature[has_zeros]


def check_zeros_over_threshold(data_summary: dict, threshold: int) -> pd.DataFrame:
    '''
        Returns the features whose count of 0 values, as recorded in
        data_summary['Feature_stats'], is greater than threshold.
    '''

    stats: pd.DataFrame = data_summary['Feature_stats']
    by_feature = stats.loc[stats['Value'] == 'Zero'].transpose()

    # the first two rows of the transposed frame are 'Dataset' and 'Value', not features
    zero_counts = by_feature.iloc[2:]

    over_threshold = zero_counts[2] > threshold
    return zero_counts[over_threshold]


def check_zeros_over_threshold_percentage(data_summary: dict, threshold: float) -> pd.DataFrame:
    '''
        Returns the features whose count of 0 values is greater than
        threshold times the number of rows of data_summary['Dataset'].
    '''

    stats: pd.DataFrame = data_summary['Feature_stats']
    sample_count: int = data_summary['Dataset'].shape[0]
    by_feature = stats.loc[stats['Value'] == 'Zero'].transpose()

    # the first two rows of the transposed frame are 'Dataset' and 'Value', not features
    zero_counts = by_feature.iloc[2:]

    # turn the fraction into an absolute number of samples
    cutoff = threshold*sample_count
    return zero_counts[zero_counts[2] > cutoff]


def create_new_prune_candidates(zeros_df: pd.DataFrame) -> list:
    '''
        Turns a dataframe of features with a high frequency of 0 values into a list of
        prune candidates: the row labels of zeros_df, in their original order.
    '''

    # the columns of the transposed frame are the row labels of zeros_df
    feature_labels = zeros_df.transpose().columns
    return list(feature_labels)


def intersection_of_prune_candidates(pruneCandidates: list, newPruneCandidates: list) -> list:
    '''
        Returns the features found in both pruneCandidates and newPruneCandidates
        as a list without duplicates; the order of the result is not defined.
    '''

    known_candidates = set(pruneCandidates)
    shared = known_candidates.intersection(newPruneCandidates)
    return list(shared)

### First, we test out our new functions on the first collection of data we evaluated above

## Data Collection #1

In [None]:
dataset_1 = examine_dataset(1)

In [None]:
check_infs(dataset_1)

In [None]:
check_nans(dataset_1)

In [None]:
check_zeros(dataset_1)

In [None]:
check_zeros_over_threshold(dataset_1, 5000000)

In [35]:
check_zeros_over_threshold_percentage(dataset_1, .95).shape

(48, 1)

So let's add the features that are made up of 95% or more 0 values to a pruneCandidates list. We will go through each collection of data within CIC_DDoS2019, and the intersection of all the pruneCandidates will be added to our prune list for preliminary feature selection.

In [None]:
newPruneCandidates: list = create_new_prune_candidates(check_zeros_over_threshold_percentage(dataset_1, .95))
pruneCandidates   : list = intersection_of_prune_candidates(pruneCandidates, newPruneCandidates)
pretty(pruneCandidates)

We skipped testing the add_to_comp_stats function because this data collection's stats are already in the data_composition dataframe

In [37]:
data_composition

Unnamed: 0,File,Benign,Malicious,Total,Percent Benign
0,01-12/DrDoS_DNS.csv,3402,5071011,5074413,0.067042


## Data Collection #2

Now, let's examine the next collection of data

In [None]:
dataset_2 = examine_dataset(2)

Here we see that the ratio of benign to malicious in this data collection is similar to the first. This collection is about half the size of the first and has around 20% of the inf values found in the first as well

In [None]:
check_infs(dataset_2)

We can see this collection also has no NaN valued entries

In [None]:
check_nans(dataset_2)

Checking our second collection for 0 values reveals a situation mirroring that of the first collection. Let's go through and check the number of features with 0 values over a particular threshold.

In [None]:
check_zeros(dataset_2)

In [None]:
print(f'''
Features with a frequency of 0 values greater than
    2,000,000: {check_zeros_over_threshold(dataset_2, 2000000).shape[0]}
    1,000,000: {check_zeros_over_threshold(dataset_2, 1000000).shape[0]}
    500,000  : {check_zeros_over_threshold(dataset_2, 500000).shape[0]}
    200,000  : {check_zeros_over_threshold(dataset_2, 200000).shape[0]}
    50,000   : {check_zeros_over_threshold(dataset_2, 50000).shape[0]}
    5,000    : {check_zeros_over_threshold(dataset_2, 5000).shape[0]}
    0        : {check_zeros_over_threshold(dataset_2, 0).shape[0]}
''')

We can see that there is a similar distribution of 0 values in this data collection as there was in the first. Just as in the first, 48 features consist of 95% 0 values. So we add them to our pruneCandidates list

In [43]:
check_zeros_over_threshold_percentage(dataset_2, .95).shape

(48, 1)

In [None]:
newPruneCandidates: list = create_new_prune_candidates(check_zeros_over_threshold_percentage(dataset_2, .95))
pruneCandidates   : list = intersection_of_prune_candidates(pruneCandidates, newPruneCandidates)
pretty(pruneCandidates)

In [None]:
data_composition

In [None]:
data_composition = dataset_2['Data_composition']
data_composition

## Breakdown

In [159]:
benign_samples = data_composition['Benign'].sum()

In [160]:
ddos_samples = data_composition['Malicious'].sum()

In [161]:
total_samples = data_composition['Total'].sum()

In [None]:
# one extra row holding the totals over all collections
dataset_totals = pd.DataFrame([
    ['CIC_DDoS2019', benign_samples, ddos_samples, total_samples, 100*benign_samples/total_samples]
], columns = composition_columns)

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0;
# ignore_index renumbers the rows of the combined table from 0
pd.concat([data_composition, dataset_totals], ignore_index=True)

In [None]:
# Adds the per-feature stats of every collection onto sumStats.
# NOTE(review): neither `datasets` nor `sumStats` is defined in the visible part of the
# notebook; presumably `datasets` holds the dictionaries returned by examine_dataset and
# `sumStats` starts out as a zeroed frame over `features` - TODO confirm before running
for collection in datasets:
    sumStats += collection['Feature_stats'][features]

In [None]:
sumStats

In [None]:
sumStats

Here we create a dictionary that maps all the raw CSV column labels with more meaningful, human interpretable labels. Extra whitespace is stripped, and superfluous information is eliminated.

In [None]:
# Mapping from raw CSV column labels (left) to cleaned labels (right), passed to
# DataFrame.rename in examine_dataset. Most keys carry a leading space and some
# replace underscores with spaces; labels that are already clean map to themselves.
# NOTE(review): many keys (' Source IP', ' Total Fwd Packets', ...) do not match the
# Darknet.csv column names listed earlier in this notebook ('Src IP', 'Total Fwd Packet', ...);
# presumably this table was written for a different CSV layout - TODO confirm
new_column_names = {
    'Unnamed: 0'                :'Unnamed'                  , 'Flow ID'                     :'Flow ID'                      ,
    ' Source IP'                :'Source IP'                , ' Source Port'                :'Source Port'                  ,
    ' Destination IP'           :'Destination IP'           , ' Destination Port'           :'Destination Port'             ,
    ' Protocol'                 :'Protocol'                 , ' Total Length of Bwd Packets':'Total Length of Bwd Packets'  ,     
    ' Flow Duration'            :'Flow Duration'            , ' Total Fwd Packets'          :'Total Fwd Packets'            , 
    ' Total Backward Packets'   :'Total Backward Packets'   , 'Total Length of Fwd Packets' :'Total Length of Fwd Packets'  ,
    ' Timestamp'                :'Timestamp'                , ' Init_Win_bytes_backward'    :'Init Win bytes backward'      ,
    ' Fwd Packet Length Max'    :'Fwd Packet Length Max'    , ' Fwd Packet Length Min'      :'Fwd Packet Length Min'        ,
    ' Fwd Packet Length Mean'   :'Fwd Packet Length Mean'   , ' Fwd Packet Length Std'      :'Fwd Packet Length Std'        ,
    'Bwd Packet Length Max'     :'Bwd Packet Length Max'    , ' Bwd Packet Length Min'      :'Bwd Packet Length Min'        ,
    ' Bwd Packet Length Mean'   :'Bwd Packet Length Mean'   , ' Bwd Packet Length Std'      :'Bwd Packet Length Std'        ,
    'Flow Bytes/s'              :'Flow Bytes/s'             , ' Flow Packets/s'             :'Flow Packets/s'               ,
    ' Flow IAT Mean'            :'Flow IAT Mean'            , ' Flow IAT Std'               :'Flow IAT Std'                 ,
    ' Flow IAT Max'             :'Flow IAT Max'             , ' Flow IAT Min'               :'Flow IAT Min'                 ,
    'Fwd IAT Total'             :'Fwd IAT Total'            , ' Fwd IAT Mean'               :'Fwd IAT Mean'                 ,
    ' Fwd IAT Std'              :'Fwd IAT Std'              , ' Fwd IAT Max'                :'Fwd IAT Max'                  ,
    ' Fwd IAT Min'              :'Fwd IAT Min'              , 'Bwd IAT Total'               :'Bwd IAT Total'                ,    
    ' Bwd IAT Mean'             :'Bwd IAT Mean'             , ' Bwd IAT Std'                :'Bwd IAT Std'                  ,
    ' Bwd IAT Max'              :'Bwd IAT Max'              , ' Bwd IAT Min'                :'Bwd IAT Min'                  ,
    'Fwd PSH Flags'             :'Fwd PSH Flags'            , ' Bwd PSH Flags'              :'Bwd PSH Flags'                , 
    ' Fwd URG Flags'            :'Fwd URG Flags'            , ' Bwd URG Flags'              :'Bwd URG Flags'                ,
    ' Fwd Header Length'        :'Fwd Header Length'        , ' Bwd Header Length'          :'Bwd Header Length'            , 
    'Fwd Packets/s'             :'Fwd Packets/s'            , ' Bwd Packets/s'              :'Bwd Packets/s'                , 
    ' Min Packet Length'        :'Min Packet Length'        , ' Max Packet Length'          :'Max Packet Length'            , 
    ' Packet Length Mean'       :'Packet Length Mean'       , ' Packet Length Std'          :'Packet Length Std'            , 
    ' Packet Length Variance'   :'Packet Length Variance'   , 'FIN Flag Count'              :'FIN Flag Count'               ,
    ' SYN Flag Count'           :'SYN Flag Count'           , ' RST Flag Count'             :'RST Flag Count'               ,
    ' PSH Flag Count'           :'PSH Flag Count'           , ' ACK Flag Count'             :'ACK Flag Count'               , 
    ' URG Flag Count'           :'URG Flag Count'           , ' CWE Flag Count'             :'CWE Flag Count'               , 
    ' ECE Flag Count'           :'ECE Flag Count'           , ' Down/Up Ratio'              :'Down/Up Ratio'                ,
    ' Average Packet Size'      :'Average Packet Size'      , ' Avg Fwd Segment Size'       :'Avg Fwd Segment Size'         ,
    ' Avg Bwd Segment Size'     :'Avg Bwd Segment Size'     , ' Fwd Header Length.1'        :'Fwd Header Length.1'          , 
    'Fwd Avg Bytes/Bulk'        :'Fwd Avg Bytes/Bulk'       , ' Inbound'                    :'Inbound'                      , 
    ' Fwd Avg Packets/Bulk'     :'Fwd Avg Packets/Bulk'     , ' Fwd Avg Bulk Rate'          :'Fwd Avg Bulk Rate'            , 
    ' Bwd Avg Bytes/Bulk'       :'Bwd Avg Bytes/Bulk'       , ' Bwd Avg Packets/Bulk'       :'Bwd Avg Packets/Bulk'         ,
    'Bwd Avg Bulk Rate'         :'Bwd Avg Bulk Rate'        , 'Subflow Fwd Packets'         :'Subflow Fwd Packets'          ,
    ' Subflow Fwd Bytes'        :'Subflow Fwd Bytes'        , ' Subflow Bwd Packets'        :'Subflow Bwd Packets'          ,
    ' Subflow Bwd Bytes'        :'Subflow Bwd Bytes'        , 'Init_Win_bytes_forward'      :'Init Win bytes forward'       ,
    ' act_data_pkt_fwd'         :'act data pkt fwd'         , ' min_seg_size_forward'       :'min seg size forward'         ,     
    'Active Mean'               :'Active Mean'              , ' Active Std'                 :'Active Std'                   ,
    ' Active Max'               :'Active Max'               , ' Active Min'                 :'Active Min'                   , 
    'Idle Mean'                 :'Idle Mean'                , ' Idle Std'                   :'Idle Std'                     ,
    ' Idle Max'                 :'Idle Max'                 , ' Idle Min'                   :'Idle Min'                     ,
    'SimillarHTTP'              :'SimillarHTTP'             , ' Label'                      :'Label'                        ,
}

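Here we try out the tabgan library, using its `OriginalGenerator` to generate new training data from a small, randomly generated toy dataset.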

In [None]:
from tabgan.sampler import OriginalGenerator, GANGenerator

In [None]:
# toy data: random integer features A-D for train and test, and a target Y drawn from 0, 1, 2
feature_names = list("ABCD")
train_values = np.random.randint(-10, 150, size=(150, 4))
train = pd.DataFrame(train_values, columns=feature_names)
target_values = np.random.randint(0, 3, size=(150, 1))
target = pd.DataFrame(target_values, columns=list("Y"))
test_values = np.random.randint(0, 100, size=(100, 4))
test = pd.DataFrame(test_values, columns=list("ABCD"))

# generate data
generator = OriginalGenerator()
new_train1, new_target1 = generator.generate_data_pipe(train, target, test)
# new_train2, new_target2 = GANGenerator().generate_data_pipe(train, target, test, )

In [None]:
new_train1.shape
new_target1.shape