## Producing Complete Flows

In [1]:
# Presumably the directory for result artefacts; not referenced by the cells shown here — TODO confirm.
RESULTS_DIR = "results"
# Directory the flow CSV files are written to and read back from.
CSV_DIR = "datasets"
# Directory that holds the input PCAP files.
INPUT_DIR = "PCAP/deduplicated_reordered"
# Capture day to process; used in file names and handed to the flow labeller.
DAY = "wednesday"

# Tag embedded in the CSV file names; presumably short for "complete flows" — TODO confirm.
SCENARIO = "cf"

## For this flow measurement 
 - we set `idle timeout` to `60 seconds`
 - we set `active timeout` to `18000 seconds (5 hours)` to prevent long flows being separated into multiple entries
 - we expire flows on the first RST/FIN packet. With this, we want to avoid observing the consequence of the attacks, which manifests in the connections being terminated. Otherwise, we could not focus solely on the attack characteristics themselves.
 - we drop all the flows which are being 'cut' from the active flows by expiring them on the first RST/FIN flag.
 - we make flow start part of the flow ID calculated as a hash to better identify flows later with their subflow counterparts.
 - we drop all flows associated with Heartbleed attack.

In [2]:
import os
import sys
from datetime import timedelta
import time
import logging
from nfstream import NFPlugin, NFStreamer
import nfstream
from labeller import cicids2017
import hashlib


# set up logging
def setup_logging(log_filename="generate-complete-flows.log"):
    """Route INFO-level log messages to a fresh log file and to stdout.

    Args:
        log_filename: Path of the log file. Any existing content is discarded.

    The root logger is reconfigured on every call. Without ``force=True``
    ``logging.basicConfig`` silently does nothing when the root logger already
    has handlers (for example when the cell is run a second time), and the
    newly created handlers would be left dangling.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
        handlers=[
            # mode="w" truncates the file, so no separate clearing step is needed.
            logging.FileHandler(log_filename, mode="w"),
            logging.StreamHandler(sys.stdout),
        ],
        force=True,  # replace (and close) handlers installed earlier
    )


def consistent_hash(value):
    """Return the hexadecimal SHA-256 digest of the string *value*.

    The digest depends only on the text itself, so the same input yields the
    same result across runs and processes.
    """
    digest = hashlib.sha256()
    digest.update(value.encode())
    return digest.hexdigest()
    

class PayloadManager(NFPlugin):
    """Keep per-direction payload byte totals on ``flow.udps``.

    ``src2dst_payload`` accumulates the payload size of packets whose
    ``direction`` is 0, ``dst2src_payload`` that of packets whose
    ``direction`` is 1.
    """

    def on_init(self, packet, flow):
        # Both counters start at zero; the first packet is then accounted for
        # in whichever direction it travels.
        flow.udps.src2dst_payload = 0
        flow.udps.dst2src_payload = 0
        if packet.direction == 0:
            flow.udps.src2dst_payload = packet.payload_size
        elif packet.direction == 1:
            flow.udps.dst2src_payload = packet.payload_size

    def on_update(self, packet, flow):
        # Only the counter matching the packet direction grows.
        if packet.direction == 0:
            flow.udps.src2dst_payload += packet.payload_size
        elif packet.direction == 1:
            flow.udps.dst2src_payload += packet.payload_size


class FlowExpirationManager(NFPlugin):
    """Flag a flow for custom expiration as soon as a RST or FIN packet is seen."""

    def on_init(self, packet, flow):
        # The very first packet of a flow may already carry RST/FIN.
        self._expire_on_teardown(packet, flow)

    def on_update(self, packet, flow):
        # Every later packet is checked the same way.
        self._expire_on_teardown(packet, flow)

    @staticmethod
    def _expire_on_teardown(packet, flow):
        """Set ``flow.expiration_id`` to -1 when *packet* has RST or FIN set."""
        if packet.rst or packet.fin:
            flow.expiration_id = -1


class FlowLabelManager(NFPlugin):
    """Label each flow with the ``cicids2017`` labeller when it expires."""

    def __init__(self, day):
        # Day name handed to the labeller for every expired flow.
        self.day = day

    def on_expire(self, flow):
        # Label first: the payload counters are still present on flow.udps
        # at this point and are only removed afterwards.
        label = cicids2017(self.day, flow, label_reverse=True, signal_reverse=False)
        flow.udps.label = label
        self.cleanup_payload(flow)

    @staticmethod
    def cleanup_payload(flow):
        """Remove the payload counters from ``flow.udps`` if they are present."""
        for counter_name in ("src2dst_payload", "dst2src_payload"):
            if hasattr(flow.udps, counter_name):
                delattr(flow.udps, counter_name)


class HashManager(NFPlugin):
    """Stamp every new flow with a hash built from its first packet and start time."""

    def on_init(self, packet, flow):
        # Key = addresses, ports and protocol of the first packet plus the
        # flow's first-seen timestamp, joined with dashes.
        key_fields = (
            packet.src_ip,
            packet.src_port,
            packet.dst_ip,
            packet.dst_port,
            packet.protocol,
            flow.bidirectional_first_seen_ms,
        )
        flow_key = "-".join(str(field) for field in key_fields)
        flow.udps.flow_key_hash = consistent_hash(flow_key)


def process_files_in_directory(input_dir: str, day: str, output_dir: str) -> None:
    """Generate labelled flows for one day's PCAP and store them as CSV.

    Reads ``rd<Day>.pcap`` (day capitalised) from *input_dir*, builds flows
    with NFStream, drops flows that consist of a single packet carrying RST
    or FIN, and writes the rest to ``<day>_<SCENARIO>_wzpl.csv`` in
    *output_dir*.

    Args:
        input_dir: Directory containing the input PCAP file.
        day: Day name; selects the PCAP file, is passed (capitalised) to the
            label plugin and is part of the output file name.
        output_dir: Directory the CSV file is written to.

    Returns:
        None. When the input PCAP does not exist an error is logged and
        nothing is written.
    """
    input_file = os.path.join(input_dir, f"rd{day.capitalize()}.pcap")
    if not os.path.isfile(input_file):
        # Report the problem instead of returning without any trace.
        logging.error(f"Input PCAP not found, no flows generated: {input_file}")
        return

    BPF = "ip and (ip proto \\tcp or \\udp)"  # only ipv4 tcp and udp traffic to capture
    output_file = os.path.join(output_dir, f"{day}_{SCENARIO}_wzpl.csv")

    start = time.time()

    streamer = NFStreamer(
        source=input_file,
        decode_tunnels=False,                                 # Default: True
        bpf_filter=BPF,                                       # Default: None
        promiscuous_mode=True,                                # Default: True
        snapshot_length=1536,                                 # Default: 1536
        idle_timeout=60,                                      # Default: 120
        active_timeout=18000,                                 # Default: 1800
        accounting_mode=1,                                    # Default: 0
        udps=[                                                # Default: None
            FlowExpirationManager(),
            PayloadManager(),
            HashManager(),
            FlowLabelManager(day.capitalize()),
        ],
        n_dissections=0,                                      # Default: 20
        statistical_analysis=True,                            # Default: False
        splt_analysis=20,                                     # Default: 0
        n_meters=1,                                           # Default: 0
        performance_report=0,                                 # Default: 0
    )

    # Convert the stream to a DataFrame
    df = streamer.to_pandas(columns_to_anonymize=[])
    logging.info(f"NFStream generated flows: {len(df)}")

    # Timing is kept at DEBUG level so the default INFO output stays unchanged.
    elapsed = timedelta(seconds=time.time() - start)
    logging.debug(f"Time required to generate flows: {elapsed}")

    # A flow made of one single RST or FIN packet is dropped.
    lone_teardown = (df["bidirectional_packets"] == 1) & (
        (df["bidirectional_rst_packets"] == 1)
        | (df["bidirectional_fin_packets"] == 1)
    )
    df_filtered = df[~lone_teardown]
    logging.info(f"Flows filtered based on RST/FIN: {len(df)-len(df_filtered)}")
    logging.info(f"Number of complete flows: {len(df_filtered)}")

    # rename() returns a new frame; renaming a filtered slice in place risks
    # pandas' SettingWithCopy warning.
    df_filtered = df_filtered.rename(
        columns={
            "udps.label": "label",
            "udps.flow_key_hash": "flow_key_hash",
        }
    )
    df_filtered.to_csv(output_file, index=False)

    # Report the name of the file that was actually written.
    logging.info(f"Flows stored as: {os.path.basename(output_file)}")
    logging.info("\n")


if __name__ == "__main__":
    # Bind the notebook-level settings to the names used for this run.
    input_dir, output_dir, day = INPUT_DIR, CSV_DIR, DAY

    # The CSV destination must exist before any flows are written.
    os.makedirs(output_dir, exist_ok=True)

    setup_logging()
    logging.info(f"Generating flows with NFStream v{nfstream.__version__}")
    logging.info("\n")

    process_files_in_directory(input_dir, day, output_dir)

Generating flows with NFStream v6.5.4a


NFStream generated flows: 1074383
Flows filtered based on RST/FIN: 219536
Number of  complete flows: 854847
Flows stored as: wednesday_cf.csv




## Show dataset distribution

In [3]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

day = DAY

# Initializing the table.
header = ["DS", "TOTAL", "BENIGN", "ANOMALY", "Anomaly breakdown"]
rowh = "{:^18}  "*(len(header)-1) + "{:^36}"
row  = "{:^18}  "*(len(header)-1) + "{:<26}" + "{:>6}"
sep  = ["-"*18]*(len(header)-1) + ["-"*36]
print(rowh.format(*header))
print(rowh.format(*sep))

csv = pd.read_csv(os.path.join(CSV_DIR,f"{day}_{SCENARIO}_wzpl.csv"))

# One pass over the labels. value_counts() leaves out missing labels, which a
# comparison against the literal string "NaN" never did.
label_counts = csv["label"].value_counts()

TOTAL = len(csv)
BENIGN = int(label_counts.get("BENIGN", 0))
# Every labelled flow that is not plain BENIGN.
ANOMALY = int(label_counts.sum()) - BENIGN

print(row.format(DAY, TOTAL, BENIGN, ANOMALY, "", ""))
for label, count in label_counts.sort_index().items():
    if label == "BENIGN":
        continue
    print(row.format("", "", "", "", label, int(count)))

        DS                TOTAL               BENIGN             ANOMALY                 Anomaly breakdown          
------------------  ------------------  ------------------  ------------------  ------------------------------------
    wednesday             854847              327470              527377                                        
                                                                                BENIGN - ZPL              122828
                                                                                DoS GoldenEye               7917
                                                                                DoS GoldenEye - ZPL         7483
                                                                                DoS Hulk                  158680
                                                                                DoS Hulk - ZPL            216413
                                                                                DoS Slow

## Filter out all flows with zero payload and drop Heartbleed flows

In [4]:
# Flows whose label carries the "ZPL" marker. The pattern is a plain substring
# (regex=False); a missing label counts as "no marker" (na=False) so that the
# negation below cannot fail on NaN.
has_zpl_marker = csv['label'].str.contains("ZPL", regex=False, na=False)
filtered_df = csv[~has_zpl_marker]
# Heartbleed flows are dropped as well; copy() detaches the result from `csv`.
filtered_df = filtered_df[filtered_df['label'] != 'Heartbleed'].copy()
filtered_df.to_csv(os.path.join(CSV_DIR, f"{day}_{SCENARIO}_wozpl.csv"), index=False)

## Show dataset distribution after filtering

In [5]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

day = DAY

# Initializing the table.
header = ["DS", "TOTAL", "BENIGN", "ANOMALY", "Anomaly breakdown"]
rowh = "{:^18}  "*(len(header)-1) + "{:^36}"
row  = "{:^18}  "*(len(header)-1) + "{:<26}" + "{:>6}"
sep  = ["-"*18]*(len(header)-1) + ["-"*36]
print(rowh.format(*header))
print(rowh.format(*sep))

csv = pd.read_csv(os.path.join(CSV_DIR,f"{day}_{SCENARIO}_wozpl.csv"))

# One pass over the labels. value_counts() leaves out missing labels, which a
# comparison against the literal string "NaN" never did.
label_counts = csv["label"].value_counts()

TOTAL = len(csv)
BENIGN = int(label_counts.get("BENIGN", 0))
# Every labelled flow that is not plain BENIGN.
ANOMALY = int(label_counts.sum()) - BENIGN

print(row.format(DAY, TOTAL, BENIGN, ANOMALY, "", ""))
for label, count in label_counts.sort_index().items():
    if label == "BENIGN":
        continue
    print(row.format("", "", "", "", label, int(count)))

        DS                TOTAL               BENIGN             ANOMALY                 Anomaly breakdown          
------------------  ------------------  ------------------  ------------------  ------------------------------------
    wednesday             503457              327470              175987                                        
                                                                                DoS GoldenEye               7917
                                                                                DoS Hulk                  158680
                                                                                DoS Slowhttptest            3707
                                                                                DoS Slowloris               5683


## Show FIN and RST statistics

### Count of flows whose FIN or RST count is higher than 1, 2, and 3

In [6]:
# Row masks that do not depend on the threshold are built once.
is_benign = csv["label"] == "BENIGN"
is_anomaly = ~is_benign
fin_packets = csv["bidirectional_fin_packets"]
rst_packets = csv["bidirectional_rst_packets"]

for N in range(1, 1 + 3):
    # Initializing the table.
    header = ["DS", "TOTAL", "BENIGN", "ANOMALY",
              f"T_FIN>{N}", f"B_FIN>{N}", f"A_FIN>{N}",
              f"T_RST>{N}", f"B_RST>{N}", f"A_RST>{N}"]
    rowh = "{:^9}  " * len(header)
    row = "{:^9}  " * len(header)
    # Exactly one separator per column.
    sep = ["-" * 9] * len(header)

    print("\n")

    print(rowh.format(*header))
    print(rowh.format(*sep))

    fin_gt = fin_packets > N
    rst_gt = rst_packets > N

    # Summing a boolean mask counts its True entries without building a
    # filtered copy of the frame for every cell of the table.
    TOTAL = len(csv)
    FIN_GT = int(fin_gt.sum())
    RST_GT = int(rst_gt.sum())
    BENIGN = int(is_benign.sum())
    BENIGN_FIN_GT = int((is_benign & fin_gt).sum())
    BENIGN_RST_GT = int((is_benign & rst_gt).sum())
    ANOMALY = int(is_anomaly.sum())
    ANOMALY_FIN_GT = int((is_anomaly & fin_gt).sum())
    ANOMALY_RST_GT = int((is_anomaly & rst_gt).sum())

    print(row.format(day, TOTAL, BENIGN, ANOMALY,
                     FIN_GT, BENIGN_FIN_GT, ANOMALY_FIN_GT,
                     RST_GT, BENIGN_RST_GT, ANOMALY_RST_GT))

print("\n")



   DS        TOTAL     BENIGN     ANOMALY    T_FIN>1    B_FIN>1    A_FIN>1    T_RST>1    B_RST>1    A_RST>1   
---------  ---------  ---------  ---------  ---------  ---------  ---------  ---------  ---------  ---------  
wednesday   503457     327470     175987        0          0          0          0          0          0      


   DS        TOTAL     BENIGN     ANOMALY    T_FIN>2    B_FIN>2    A_FIN>2    T_RST>2    B_RST>2    A_RST>2   
---------  ---------  ---------  ---------  ---------  ---------  ---------  ---------  ---------  ---------  
wednesday   503457     327470     175987        0          0          0          0          0          0      


   DS        TOTAL     BENIGN     ANOMALY    T_FIN>3    B_FIN>3    A_FIN>3    T_RST>3    B_RST>3    A_RST>3   
---------  ---------  ---------  ---------  ---------  ---------  ---------  ---------  ---------  ---------  
wednesday   503457     327470     175987        0          0          0          0          0          0  

## Show statistics for the FIN and RST counts

In [7]:
# Summary statistics of the per-flow FIN and RST packet counts.
flag_columns = ['bidirectional_fin_packets', 'bidirectional_rst_packets']
print(csv[flag_columns].describe())

       bidirectional_fin_packets  bidirectional_rst_packets
count              503457.000000              503457.000000
mean                    0.539005                   0.036164
std                     0.498477                   0.186698
min                     0.000000                   0.000000
25%                     0.000000                   0.000000
50%                     1.000000                   0.000000
75%                     1.000000                   0.000000
max                     1.000000                   1.000000


## Show the top 5 FIN and RST counts with the highest occurrence

In [8]:
N = 5

print(f"Statistics for {day}:")

# For each flag, list the N most frequent per-flow packet counts
# (index = packet count, value = number of flows).
for flag_name, column in (("FIN", 'bidirectional_fin_packets'),
                          ("RST", 'bidirectional_rst_packets')):
    top_counts = csv[column].value_counts().head(N)
    print(f"\nTop {N} Value Counts for {flag_name} Packets:")
    print(top_counts.to_string(header=False, index=True))

Statistics for wednesday:

Top 5 Value Counts for FIN Packets:
1    271366
0    232091

Top 5 Value Counts for RST Packets:
0    485250
1     18207


## Show flow expiration statistics

expiration_id =
 - 0 for idle timeout,
 - 1 for active timeout, or
 - -1 for custom expiration.
    

In [9]:
# `df` is the name the following cells work with.
df = csv
expiration_counts = df['expiration_id'].value_counts()
print(expiration_counts)

expiration_id
-1    289573
 0    213884
Name: count, dtype: int64


## Show time related statistics for the dataset

In [10]:
durations = df['bidirectional_duration_ms']

print("Stats about bidirectional_duration_ms:")
print(durations.describe().to_string())

# Distinct duration values, largest first, with the number of flows sharing each.
print("\nShow top N longest flow values and their occurence:")
longest_values = durations.value_counts().sort_index(ascending=False).head(10)
print(longest_values.to_string(header=False))

# Identifying columns of the flow(s) that reach the maximum duration.
print("\nShow limited info of flow with the longest duration:")
summary_columns = ['src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol',
                   'bidirectional_duration_ms', 'label']
longest_flows = df.loc[durations == durations.max(), summary_columns]
print(longest_flows.head(10).to_string(index=False))

Stats about bidirectional_duration_ms:
count    5.034570e+05
mean     1.041577e+04
std      3.887668e+04
min      0.000000e+00
25%      6.000000e+00
50%      7.500000e+01
75%      3.550000e+02
max      1.652774e+06

Show top N longest flow values and their occurence:
1652774    1
1551220    1
1533020    1
1526352    1
1517830    1
1517829    5
1516166    1
1464626    1
1453045    1
1438665    1

Show limited info of flow with the longest duration:
       src_ip  src_port       dst_ip  dst_port  protocol  bidirectional_duration_ms  label
192.168.10.17     60326 52.79.87.176       443         6                    1652774 BENIGN


## Show the number of flows whose unique ID appears more than once in the DS

Such flows are essentially repeated across the dataset.

In [11]:
# For every label (in order of first appearance) count how many distinct
# `flow_key_hash` values occur on more than one row carrying that label.
results_list = [
    {
        'Label': label,
        'Hash_More_Than_One': int(
            (df.loc[df['label'] == label, 'flow_key_hash'].value_counts() > 1).sum()
        ),
    }
    for label in df['label'].unique()
]

# One row per label.
results_df = pd.DataFrame(results_list)
print(results_df)

              Label  Hash_More_Than_One
0            BENIGN                 843
1     DoS Slowloris                   0
2  DoS Slowhttptest                   0
3          DoS Hulk                   0
4     DoS GoldenEye                   0


## Let's drop the duplicate entries for the BENIGN flows

In [12]:
# Order the rows by the `id` column so that "first occurrence" below is well defined.
df_sorted = df.sort_values(by='id')

# Keep a single row per `flow_key_hash`: the one that comes first in that order.
df_unique = df_sorted.drop_duplicates(subset=['flow_key_hash'], keep='first')

# Re-run the per-label check: count the hashes that still occur more than once.
results_list = [
    {
        'Label': label,
        'Hash_More_Than_One': int(
            (df_unique.loc[df_unique['label'] == label, 'flow_key_hash'].value_counts() > 1).sum()
        ),
    }
    for label in df_unique['label'].unique()
]
results_df = pd.DataFrame(results_list)
print(results_df)

# The de-duplicated frame becomes the working dataset and is written to disk.
df = df_unique
df.to_csv(os.path.join(CSV_DIR, f"{DAY}_{SCENARIO}.csv"), index=False)

              Label  Hash_More_Than_One
0            BENIGN                   0
1     DoS Slowloris                   0
2  DoS Slowhttptest                   0
3          DoS Hulk                   0
4     DoS GoldenEye                   0


## Insights into the temporal distribution of flows for each label based on the `bidirectional_first_seen_ms` timestamp

The output is a new DataFrame (`stats`) where each row corresponds to a unique label from our data, and the columns include:

- `label`: The unique identifier for each group of flows.
- `min`: The minimum time difference between consecutive flows within the same label group.
- `max`: The maximum time difference between consecutive flows within the same label group.
- `mean`: The average time difference between consecutive flows within the same label group.

In [13]:
first_seen = 'bidirectional_first_seen_ms'

# Within each label, order the flows by start time ...
df_sorted = df.sort_values(by=['label', first_seen])

# ... so that diff() yields the gap between each flow and the previous flow
# of the same label (the first flow of a label has no predecessor -> NaN).
df_sorted['time_diff'] = df_sorted.groupby('label')[first_seen].diff()

# Smallest, largest and average gap per label.
stats = (
    df_sorted.groupby('label')['time_diff']
    .agg(['min', 'max', 'mean'])
    .reset_index()
)
print(stats)

print("\nThe same in a human readable form:\n")

# Convert milliseconds to more readable units
def convert_to_readable_time(ms):
    """Format a duration given in milliseconds with the largest fitting unit.

    Missing values give ``'N/A'``. Values below one second stay in ms, then
    seconds, minutes and finally hours are used; every number is printed with
    two decimals.
    """
    if pd.isna(ms):
        return 'N/A'
    if ms < 1000:
        return f"{ms:.2f} ms"
    # (exclusive upper bound in ms, divisor in ms, unit name)
    for upper_bound, divisor, unit in ((60000, 1000, "seconds"),
                                       (3600000, 60000, "minutes")):
        if ms < upper_bound:
            return f"{ms / divisor:.2f} {unit}"
    return f"{ms / 3600000:.2f} hours"

# Replace the raw millisecond figures of all three statistics with formatted text.
for column in ('min', 'max', 'mean'):
    stats[column] = stats[column].apply(convert_to_readable_time)

# Show the table again, now with units.
print(stats)

              label  min       max        mean
0            BENIGN  0.0   93789.0   93.308881
1     DoS GoldenEye  0.0  158048.0   76.513896
2          DoS Hulk  0.0   52243.0    6.877520
3  DoS Slowhttptest  0.0  174425.0  359.253643
4     DoS Slowloris  0.0    9120.0  230.603133

The same in a human readable form:

              label      min            max       mean
0            BENIGN  0.00 ms   1.56 minutes   93.31 ms
1     DoS GoldenEye  0.00 ms   2.63 minutes   76.51 ms
2          DoS Hulk  0.00 ms  52.24 seconds    6.88 ms
3  DoS Slowhttptest  0.00 ms   2.91 minutes  359.25 ms
4     DoS Slowloris  0.00 ms   9.12 seconds  230.60 ms


## Show final dataset distribution

In [14]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

day = DAY

# Initializing the table.
header = ["DS", "TOTAL", "BENIGN", "ANOMALY", "Anomaly breakdown"]
rowh = "{:^18}  "*(len(header)-1) + "{:^36}"
row  = "{:^18}  "*(len(header)-1) + "{:<26}" + "{:>6}"
sep  = ["-"*18]*(len(header)-1) + ["-"*36]
print(rowh.format(*header))
print(rowh.format(*sep))

csv = pd.read_csv(os.path.join(CSV_DIR,f"{day}_{SCENARIO}.csv"))

# One pass over the labels. value_counts() leaves out missing labels, which a
# comparison against the literal string "NaN" never did.
label_counts = csv["label"].value_counts()

TOTAL = len(csv)
BENIGN = int(label_counts.get("BENIGN", 0))
# Every labelled flow that is not plain BENIGN.
ANOMALY = int(label_counts.sum()) - BENIGN

print(row.format(DAY, TOTAL, BENIGN, ANOMALY, "", ""))
for label, count in label_counts.sort_index().items():
    if label == "BENIGN":
        continue
    print(row.format("", "", "", "", label, int(count)))

        DS                TOTAL               BENIGN             ANOMALY                 Anomaly breakdown          
------------------  ------------------  ------------------  ------------------  ------------------------------------
    wednesday             502350              326363              175987                                        
                                                                                DoS GoldenEye               7917
                                                                                DoS Hulk                  158680
                                                                                DoS Slowhttptest            3707
                                                                                DoS Slowloris               5683
