## Prepare First N Packet Datasets

In [1]:
import os
import pandas as pd
import numpy as np

# Directory holding both the input CSV files and the generated Parquet files
CSV_DIR = 'datasets'

# Packet-count cut-offs (N) to process: 2 through 20 inclusive
Ns = range(2, 21)

# Complete-flow (CF) table, reduced to the three columns needed below:
# the join key, the total packet count of the flow, and its label.
df1 = pd.read_csv(os.path.join(CSV_DIR, 'wednesday_cf.csv'))[
    ['flow_key_hash', 'bidirectional_packets', 'label']
]

for n in Ns:
    # Per-N table: keep one row per 'flow_key_hash', namely the one with the lowest 'id'
    df2 = (
        pd.read_csv(os.path.join(CSV_DIR, f'wednesday_pc_{n}.csv'))
        .sort_values(by='id')
        .drop_duplicates(subset='flow_key_hash', keep='first')
    )

    # Left-join the CF columns onto the per-N rows. Columns present in both
    # frames keep their name for the per-N side and get '_df1' for the CF side.
    # NOTE(review): the 'bidirectional_packets_df1' lookup below presumes the
    # per-N CSV has its own 'bidirectional_packets' column - confirm.
    merged_df = df2.merge(df1, on='flow_key_hash', how='left', suffixes=('', '_df1'))

    # Keep flows whose CF packet count reaches n and that carry a label
    has_enough_packets = merged_df['bidirectional_packets_df1'] >= n
    is_labelled = merged_df['label'].notna()

    # De-duplicate on 'flow_key_hash' again (the join can multiply rows when the
    # CF table repeats a key) and drop the helper column; 'flow_key_hash' stays.
    final_df = (
        merged_df[has_enough_packets & is_labelled]
        .drop_duplicates(subset='flow_key_hash', keep='first')
        .drop(columns=['bidirectional_packets_df1'])
    )

    # Shrink every integer/float column to the smallest dtype that holds its values
    for column in final_df.columns:
        dtype = final_df[column].dtype

        if np.issubdtype(dtype, np.integer):
            target = 'integer'
        elif np.issubdtype(dtype, np.floating):
            target = 'float'
        else:
            continue

        final_df[column] = pd.to_numeric(final_df[column], downcast=target)

    # Write the filtered, labelled flows as Parquet next to the source CSV
    final_df.to_parquet(os.path.join(CSV_DIR, f'wednesday_pc_{n}.parquet'), index=False)

In [2]:
# Convert the complete-flow (CF) CSV to Parquet with downcast numeric columns.
# Relies on `os`, `pd`, `np` and `CSV_DIR` from the first cell.
cf_df = pd.read_csv(os.path.join(CSV_DIR, 'wednesday_cf.csv'))

# Downcast integers and floats.
# Iterate over cf_df's own columns: looping over the columns of the per-N frame
# left behind by the previous cell would skip CF-only columns and raise a
# KeyError for any column that exists only in the per-N data.
for col in cf_df.columns:
    col_type = cf_df[col].dtype

    if np.issubdtype(col_type, np.integer):
        cf_df[col] = pd.to_numeric(cf_df[col], downcast='integer')
    elif np.issubdtype(col_type, np.floating):
        cf_df[col] = pd.to_numeric(cf_df[col], downcast='float')

cf_df.to_parquet(os.path.join(CSV_DIR, 'wednesday_cf.parquet'), index=False)

In [3]:
import json  # needed for the JSON dump at the end of this cell
import os
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# Summarise every Parquet dataset (complete flows plus each first-N-packets
# variant): print a fixed-width table and save the same numbers as JSON.
# Relies on CSV_DIR defined in the first cell.
DAY = "wednesday"

# One summary dictionary per dataset, dumped to JSON at the end
data_list = []

# Fixed-width layouts: `rowh` (8 centred fields) is used for the header and the
# separator line, `row` (9 fields) for data lines, where the wide
# "Anomaly breakdown" header column is split into a label and a count field.
header = ["DS", "TOTAL", "BENIGN", "ANOMALY", "Anomaly breakdown", "Min Dur.", "Mean Dur.", "Max Dur."]
rowh = "{:^12} " * 4 + "{:^26} "            + "{:^10} " + "{:^12} " + "{:^12} "
row  = "{:^12} " * 4 + "{:<18}  " + "{:>6} " + "{:>10} " + "{:>12} " + "{:>12} "
sep  = ["-"*12]  * 4 + ["-"*26]             + ["-"*10]  + ["-"*12] + ["-"*12]

print(rowh.format(*header))

Ns = ['cf'] + list(range(2, 20+1))

for n in Ns:
    print(rowh.format(*sep))

    # Determine the filename based on the value of n
    filename = f"{DAY}_cf.parquet" if n == 'cf' else f"{DAY}_pc_{n}.parquet"

    # Load the Parquet dataset (the name `csv` is kept from the original cell)
    csv = pd.read_parquet(os.path.join(CSV_DIR, filename))

    TOTAL = len(csv)
    BENIGN = len(csv[csv["label"] == "BENIGN"])
    # Everything that is not BENIGN; rows with a missing label would be counted here too
    ANOMALY = len(csv[(csv["label"] != "BENIGN")])

    # Create and append the summary row for each file
    summary_row = {
        "DS": DAY + '_' + str(n),
        "TOTAL": TOTAL,
        "BENIGN": BENIGN,
        "ANOMALY": ANOMALY,
        "Anomaly breakdown": [],
    }

    print(row.format(DAY+'_'+str(n), TOTAL, BENIGN, ANOMALY, "", "", "", "", ""))

    # dropna() really skips missing labels; without it sorted() raises a
    # TypeError as soon as a NaN is mixed with the string labels.
    for label in sorted(csv["label"].dropna().unique().tolist()):
        if label == "BENIGN":
            continue

        # Durations of all flows carrying this anomaly label
        anomaly_data = csv.loc[csv["label"] == label, 'bidirectional_duration_ms']
        if anomaly_data.empty:
            # Nothing to summarise; never fall through with stale statistics
            continue

        # Convert NumPy scalars to plain Python numbers so json.dumps accepts them
        min_duration = int(np.min(anomaly_data))
        mean_duration = float(np.mean(anomaly_data))
        max_duration = int(np.max(anomaly_data))
        count = len(anomaly_data)

        print(row.format("", "", "", "", label, count, min_duration, "{:.2f}".format(mean_duration), max_duration))

        # Append the anomaly details to the "Anomaly breakdown" list
        summary_row["Anomaly breakdown"].append({
            "Anomaly Type": label,
            "Count": count,
            "Min Duration": min_duration,
            "Mean Duration": mean_duration,
            "Max Duration": max_duration,
        })

    data_list.append(summary_row)

# Now, `data_list` contains all the information.
# Convert the list to a JSON string and write it to a file, creating the
# output directory first so a fresh checkout does not fail on open().
os.makedirs('results', exist_ok=True)
json_data = json.dumps(data_list, indent=4)
with open('results/pc_anomaly_distribution.json', 'w') as file:
    file.write(json_data)

     DS         TOTAL        BENIGN      ANOMALY        Anomaly breakdown       Min Dur.   Mean Dur.     Max Dur.   
------------ ------------ ------------ ------------ -------------------------- ---------- ------------ ------------ 
wednesday_cf    502350       326363       175987                                                                    
                                                    DoS GoldenEye         7917          0     11028.96       106793 
                                                    DoS Hulk            158680          0       693.68       128843 
                                                    DoS Slowhttptest      3707          0      9562.98       167903 
                                                    DoS Slowloris         5683          0     33548.82       105745 
------------ ------------ ------------ ------------ -------------------------- ---------- ------------ ------------ 
wednesday_2     500493       324508       175985                

## Sanity check

In [4]:
import pandas as pd

# Minimum number of packets a flow must have to be counted
X = 8

# Load the complete-flow dataset written earlier in the notebook
df = pd.read_parquet('datasets/wednesday_cf.parquet')

# Packet-threshold mask, computed once and shared by every per-label count
has_min_packets = df['bidirectional_packets'] >= X

# Number of flows per label that reach the packet threshold
benign_flows = len(df[(df['label'] == 'BENIGN') & has_min_packets])
goldeneye_flows = len(df[(df['label'] == 'DoS GoldenEye') & has_min_packets])
hulk_flows = len(df[(df['label'] == 'DoS Hulk') & has_min_packets])
slowhttptest_flows = len(df[(df['label'] == 'DoS Slowhttptest') & has_min_packets])
slowloris_flows = len(df[(df['label'] == 'DoS Slowloris') & has_min_packets])

# Report the counts, one line per label
print(f"Number of flows with label 'BENIGN' and 'bidirectional_packets' >= {X}: {benign_flows}")
print(f"Number of flows with label 'DoS GoldenEye' and 'bidirectional_packets' >= {X}: {goldeneye_flows}")
print(f"Number of flows with label 'DoS Hulk' and 'bidirectional_packets' >= {X}: {hulk_flows}")
print(f"Number of flows with label 'DoS Slowhttptest' and 'bidirectional_packets' >= {X}: {slowhttptest_flows}")
print(f"Number of flows with label 'DoS Slowloris' and 'bidirectional_packets' >= {X}: {slowloris_flows}")

Number of flows with label 'BENIGN' and 'bidirectional_packets' >= 8: 87891
Number of flows with label 'DoS GoldenEye' and 'bidirectional_packets' >= 8: 7567
Number of flows with label 'DoS Hulk' and 'bidirectional_packets' >= 8: 153941
Number of flows with label 'DoS Slowhttptest' and 'bidirectional_packets' >= 8: 458
Number of flows with label 'DoS Slowloris' and 'bidirectional_packets' >= 8: 1861


## Convert the values into a LaTeX table

In [5]:
import json

def add_thousand_sep(number):
    """Format a number with a space as the thousands separator.

    Args:
        number: Value to format. Floats are rendered with two decimals,
            integers as-is; any other type is passed through untouched.

    Returns:
        A string such as "11 028.96" or "502 350" for floats and integers,
        otherwise `number` itself, unchanged.
    """
    if isinstance(number, float):
        # Let the format spec do the grouping. Splitting the text on "." and
        # re-parsing the integer part loses the sign of values in (-1, 0) and
        # fails on "nan"/"inf", which contain no decimal point.
        return f"{number:_.2f}".replace("_", " ")
    if isinstance(number, int):
        return f"{number:_}".replace("_", " ")
    # Not a number: return the value as it is
    return number

# Load the per-dataset summaries from the JSON file; `data_list` becomes a list
# of dictionaries, one per dataset.
with open('results/pc_anomaly_distribution.json', 'r') as file:
    data_list = json.load(file)

# Start the LaTeX table and define the header.
# Each doubled backslash in this (non-raw) literal is one backslash in the
# output. The column spec `lrrrrrrrr` declares nine columns, matching the nine
# header cells. Table rows are appended to this string further down.
# NOTE(review): \toprule/\midrule and the \multirow cells emitted later
# presumably need the booktabs and multirow packages in the consuming
# document - confirm.
latex_code = """
\\begin{table*}[htbp]
\\scriptsize
\\centering
\\caption{Your Table Caption}
\\renewcommand{\\arraystretch}{0.6} % Reduce spacing
\\begin{tabular}{lrrrrrrrr}
\\toprule
\\textbf{DS} & \\textbf{TOTAL} & \\textbf{BENIGN} & \\textbf{ANOMALY} & \\textbf{Anomaly Type} & \\textbf{Count} & \\textbf{Min Dur. [ms]} & \\textbf{Mean Dur. [ms]} & \\textbf{Max Dur. [ms]} \\\\
\\midrule
"""

def count_anomalies(entry):
    """Return how many anomaly breakdown records a dataset entry holds.

    An entry without an "Anomaly breakdown" key counts as zero.
    """
    breakdown = entry.get("Anomaly breakdown", [])
    return len(breakdown)

# Build one group of table rows per dataset. The groups are joined with
# \midrule afterwards, so a rule appears between datasets only and never
# directly in front of the closing \bottomrule.
row_groups = []
for entry in data_list:
    # Translate the dataset identifier into the label shown in the table
    ds_value = entry['DS']
    if ds_value == "wednesday_cf":
        ds_label = "CF"
    elif ds_value.startswith("wednesday_"):
        ds_number = ds_value.split("_")[-1]  # Extract the number part
        ds_label = f"PC={ds_number}"
    else:
        ds_label = ds_value  # Fallback to the original DS value if none of the above rules apply

    # Read the breakdown tolerantly, the same way count_anomalies() does
    anomalies = entry.get("Anomaly breakdown", [])
    anomaly_count = count_anomalies(entry)

    # The four summary cells: dataset label followed by the three totals
    summary_cells = [ds_label] + [
        str(add_thousand_sep(entry[key])) for key in ("TOTAL", "BENIGN", "ANOMALY")
    ]

    if anomaly_count == 0:
        # No anomaly rows follow: pad the five anomaly columns and terminate
        # the row, otherwise the next dataset would continue on this line.
        row_groups.append(" & ".join(summary_cells + [""] * 5) + "\\\\\n")
        continue

    # Summary cells span all anomaly rows of this dataset
    summary_prefix = " & ".join(
        f"\\multirow{{{anomaly_count}}}{{*}}{{{cell}}}" for cell in summary_cells
    ) + " & "

    group = ""
    for index, anomaly in enumerate(anomalies):
        # Only the first anomaly row carries the multirow summary cells;
        # the following rows leave those four columns empty.
        group += summary_prefix if index == 0 else " &  &  &  & "
        group += f"{anomaly['Anomaly Type']} & {add_thousand_sep(anomaly['Count'])} & "
        group += f"{add_thousand_sep(anomaly['Min Duration'])} & {add_thousand_sep(anomaly['Mean Duration'])} & "
        group += f"{add_thousand_sep(anomaly['Max Duration'])} \\\\\n"
    row_groups.append(group)

latex_code += "\\midrule\n".join(row_groups)

# Close the LaTeX table structure
latex_code += """
\\bottomrule
\\end{tabular}
\\end{table*}
"""

# Display the generated LaTeX code (for testing purposes)
print(latex_code)

with open('results/pc_anomaly_distribution_table.tex', 'w') as file:
    file.write(latex_code)


\begin{table*}[htbp]
\scriptsize
\centering
\caption{Your Table Caption}
\renewcommand{\arraystretch}{0.6} % Reduce spacing
\begin{tabular}{lrrrrrrrr}
\toprule
\textbf{DS} & \textbf{TOTAL} & \textbf{BENIGN} & \textbf{ANOMALY} & \textbf{Anomaly Type} & \textbf{Count} & \textbf{Min Dur. [ms]} & \textbf{Mean Dur. [ms]} & \textbf{Max Dur. [ms]} \\
\midrule
\multirow{4}{*}{CF} & \multirow{4}{*}{502 350} & \multirow{4}{*}{326 363} & \multirow{4}{*}{175 987} & DoS GoldenEye & 7 917 & 0 & 11 028.96 & 106 793 \\
 &  &  &  & DoS Hulk & 158 680 & 0 & 693.68 & 128 843 \\
 &  &  &  & DoS Slowhttptest & 3 707 & 0 & 9 562.98 & 167 903 \\
 &  &  &  & DoS Slowloris & 5 683 & 0 & 33 548.82 & 105 745 \\
\midrule
\multirow{4}{*}{PC=2} & \multirow{4}{*}{500 493} & \multirow{4}{*}{324 508} & \multirow{4}{*}{175 985} & DoS GoldenEye & 7 917 & 0 & 130.35 & 58 333 \\
 &  &  &  & DoS Hulk & 158 680 & 0 & 456.34 & 32 096 \\
 &  &  &  & DoS Slowhttptest & 3 705 & 0 & 152.66 & 36 864 \\
 &  &  &  & DoS Slowloris 