# Preliminary Measurement

In [1]:
# Directory name for analysis results.
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
RESULTS_DIR = "results"
# Directory the generated flow dataset is written to and later read from
# (despite the name, the flows are stored as parquet, not CSV).
CSV_DIR = "datasets"
# Directory holding the input capture, looked up as rd<Day>.pcap.
INPUT_DIR = "PCAP/deduplicated_reordered"
# Capture day to process; selects the input PCAP and prefixes the output file name.
DAY = "wednesday"

# Scenario tag used in the output file name: <DAY>_<SCENARIO>.parquet.
SCENARIO = "initial"

## For this flow measurement 
we set
 - `idle timeout` to `60 seconds`
 - `active timeout` to `120 seconds`

to reflect the measurement in the original dataset as closely as possible

In [2]:
import os
import sys
from datetime import timedelta
import time
import logging
from nfstream import NFPlugin, NFStreamer
import nfstream
from labeller import cicids2017
import hashlib
import numpy as np
import pandas as pd

# set up logging
def setup_logging(log_filename="generate-initial-flows.log"):
    """Configure root logging to emit bare messages to a log file and to stdout.

    The log file is truncated on every call, so each run starts with a clean
    log.

    ``force=True`` removes and closes any handlers already attached to the
    root logger before installing the new ones. Without it
    ``logging.basicConfig`` is a silent no-op whenever the root logger already
    has handlers (for example when this cell is re-run in the same kernel),
    and the freshly created ``FileHandler`` would be leaked unused.

    Args:
        log_filename: Path of the log file to (re)create.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
        handlers=[
            # mode="w" clears an existing log file instead of appending to it.
            logging.FileHandler(log_filename, mode="w"),
            logging.StreamHandler(sys.stdout),
        ],
        force=True,
    )


def consistent_hash(value):
    """Return the SHA-256 hex digest of the string *value*.

    Unlike the built-in ``hash()``, the result is the same in every process
    and on every run, so it can be stored and compared across sessions.
    """
    digest = hashlib.sha256()
    digest.update(value.encode())
    return digest.hexdigest()
    

class PayloadManager(NFPlugin):
    """Accumulates per-direction payload sizes on each flow.

    The running totals live in ``flow.udps.src2dst_payload`` and
    ``flow.udps.dst2src_payload``.
    """

    @staticmethod
    def _payload_by_direction(packet):
        """Return the ``(src2dst, dst2src)`` payload contributed by *packet*."""
        forward = packet.payload_size if packet.direction == 0 else 0
        backward = packet.payload_size if packet.direction == 1 else 0
        return forward, backward

    def on_init(self, packet, flow):
        # The first packet of a flow seeds both counters.
        forward, backward = self._payload_by_direction(packet)
        flow.udps.src2dst_payload = forward
        flow.udps.dst2src_payload = backward

    def on_update(self, packet, flow):
        # Every later packet adds its payload to the counter of its direction.
        forward, backward = self._payload_by_direction(packet)
        flow.udps.src2dst_payload += forward
        flow.udps.dst2src_payload += backward


class FlowExpirationManager(NFPlugin):
    """Marks a flow for custom expiration once a RST or FIN packet is seen."""

    @staticmethod
    def _expire_on_teardown(packet, flow):
        """Set ``expiration_id`` to -1 (custom expiration) on RST or FIN."""
        if packet.fin or packet.rst:
            flow.expiration_id = -1

    def on_init(self, packet, flow):
        # A flow may already start with a teardown packet.
        self._expire_on_teardown(packet, flow)

    def on_update(self, packet, flow):
        # Same policy for every subsequent packet of the flow.
        self._expire_on_teardown(packet, flow)


class FlowLabelManager(NFPlugin):
    """Labels each flow when it expires and drops the helper payload counters."""

    def __init__(self, day):
        # Day name handed to the labeller for every expired flow.
        self.day = day

    def on_expire(self, flow):
        # Label first: the payload counters are only removed afterwards.
        label = cicids2017(
            self.day, flow, label_reverse=True, signal_reverse=False
        )
        flow.udps.label = label
        self.cleanup_payload(flow)

    @staticmethod
    def cleanup_payload(flow):
        """Remove the per-direction payload counters from ``flow.udps`` if present."""
        for field in ("src2dst_payload", "dst2src_payload"):
            if hasattr(flow.udps, field):
                delattr(flow.udps, field)


class HashManager(NFPlugin):
    """Tags each flow with a hash of the 5-tuple of its first packet.

    The key is built in source-to-destination order, so the hash is
    direction-sensitive: the reverse direction of the same connection yields a
    different value.
    """

    def on_init(self, packet, flow):
        five_tuple = (
            packet.src_ip,
            packet.src_port,
            packet.dst_ip,
            packet.dst_port,
            packet.protocol,
        )
        flow_key = "-".join(str(part) for part in five_tuple)
        flow.udps.flow_key_hash = consistent_hash(flow_key)

            
def process_files_in_directory(input_dir: str, day: str, output_dir: str) -> None:
    """Generate labelled flows for one day's capture and store them as parquet.

    Despite its name, the function handles a single capture file,
    ``<input_dir>/rd<Day>.pcap``. The flows produced by NFStream are written
    to ``<output_dir>/<day>_<SCENARIO>.parquet``. If the capture does not
    exist, a warning is logged and nothing is written.

    Args:
        input_dir: Directory containing the ``rd<Day>.pcap`` capture.
        day: Day name (e.g. ``"wednesday"``); it is capitalised for the capture
            file name and for the labeller.
        output_dir: Existing directory that receives the parquet file.
    """
    input_file = os.path.join(input_dir, f"rd{day.capitalize()}.pcap")
    if not os.path.isfile(input_file):
        # Report the missing capture instead of returning silently, so an
        # empty run is distinguishable from a wrong path or day name.
        logging.warning("Input file not found, no flows generated: %s", input_file)
        return

    output_name = f"{day}_{SCENARIO}.parquet"
    output_file = os.path.join(output_dir, output_name)

    BPF = "ip and (ip proto \\tcp or \\udp)"  # only ipv4 tcp and udp traffic to capture

    streamer = NFStreamer(
          source=input_file
        , decode_tunnels=False                                # Default: True
        , bpf_filter=BPF                                      # Default: None
        , promiscuous_mode=True                               # Default: True
        , snapshot_length=1536                                # Default: 1536
        , idle_timeout=60                                     # Default: 120
        , active_timeout=120                                  # Default: 1800
        , accounting_mode=1                                   # Default: 0
        , udps=[                                              # Default: None
            # FlowExpirationManager(),
            PayloadManager(),
            HashManager(),
            FlowLabelManager(day.capitalize())
        ]
        , n_dissections=0                                     # Default: 20
        , statistical_analysis=True                           # Default: False
        , splt_analysis=20                                    # Default: 0
        , n_meters=1                                          # Default: 0
        , performance_report=0                                # Default: 0
    )

    # Convert the stream to a DataFrame
    df = streamer.to_pandas(columns_to_anonymize=[])
    logging.info(f"NFStream generated flows: {len(df)}")

    # Give the plugin-generated columns plain names
    df.rename(columns={
                       "udps.label": "label",
                       "udps.flow_key_hash": "flow_key_hash"
                      }, inplace=True)

    # Downcast integers and floats to the smallest dtype that holds the values
    for col in df.columns:
        col_type = df[col].dtype

        if np.issubdtype(col_type, np.integer):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif np.issubdtype(col_type, np.floating):
            df[col] = pd.to_numeric(df[col], downcast='float')

    # Save the DataFrame to a parquet file
    df.to_parquet(output_file, index=False)

    logging.info(f"Flows stored as: {output_name}")
    logging.info("\n")


if __name__ == "__main__":
    # Take the run parameters from the notebook-level configuration.
    input_dir, output_dir, day = INPUT_DIR, CSV_DIR, DAY

    # The parquet writer does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    setup_logging()

    # Record the NFStream version, followed by a blank separator line.
    for message in (f"Generating flows with NFStream v{nfstream.__version__}", "\n"):
        logging.info(message)

    process_files_in_directory(input_dir, day, output_dir)

Generating flows with NFStream v6.5.4a


NFStream generated flows: 504474
Flows stored as: wednesday_initial.parquet




## Show dataset distribution

In [3]:
import os
import warnings

import pandas as pd

# Suppress all warnings from here on to keep the report output clean.
warnings.filterwarnings("ignore")

day = DAY

# Table layout: six centred count columns followed by the anomaly breakdown.
header = ["DS", "TOTAL", "BENIGN", "BENIGN - ZPL", "ANOMALY", "ANOMALY - ZPL", "Anomaly breakdown"]
count_widths = [10, 10, 10, 12, 10, 12]
count_cells = "".join(f"{{:^{width}}}  " for width in count_widths)
rowh = count_cells + "{:^32}"
row = count_cells + "{:<24}{:>6}"
sep = ["-" * width for width in count_widths + [32]]
print(rowh.format(*header))
print(rowh.format(*sep))

csv = pd.read_parquet(os.path.join(CSV_DIR, f"{day}_{SCENARIO}.parquet"))

# One boolean mask per category; counting True values replaces filtering the frame.
labels = csv["label"]
is_benign = labels == "BENIGN"
is_benign_zpl = labels == "BENIGN - ZPL"
is_attack = ~(is_benign | is_benign_zpl)
mentions_zpl = labels.str.contains("ZPL")

TOTAL = len(csv)
BENIGN = int(is_benign.sum())
BENIGNZPL = int(is_benign_zpl.sum())
ANOMALY = int((is_attack & ~mentions_zpl).sum())
ANOMALYZPL = int((is_attack & mentions_zpl).sum())

print(row.format(DAY, TOTAL, BENIGN, BENIGNZPL, ANOMALY, ANOMALYZPL, "", ""))

# Per-label breakdown of everything that is not one of the two benign classes.
per_label = labels.value_counts()
for label in sorted(labels.unique().tolist()):
    if label in ("BENIGN", "BENIGN - ZPL"):
        continue
    print(row.format("", "", "", "", "", "", label, int(per_label[label])))

    DS        TOTAL       BENIGN    BENIGN - ZPL   ANOMALY    ANOMALY - ZPL         Anomaly breakdown        
----------  ----------  ----------  ------------  ----------  ------------  --------------------------------
wednesday     504474      310381       13580        174156        6357                                    
                                                                            DoS GoldenEye             7916
                                                                            DoS GoldenEye - ZPL        870
                                                                            DoS Hulk                158027
                                                                            DoS Hulk - ZPL             594
                                                                            DoS Slowhttptest          3010
                                                                            DoS Slowhttptest - ZPL    3088
                                

We observe a considerable number of flows with zero packet payload, a pattern that is not characteristic of any of the attacks in the Wednesday dataset.

## Show FIN and RST statistics

### Count of flows whose FIN or RST count is higher than 1, 2, and 3

In [4]:
# NOTE(review): "anomaly" below is every label other than exactly "BENIGN", so
# "BENIGN - ZPL" flows are counted in the A_* columns — confirm this is intended.
is_benign = csv["label"] == "BENIGN"

for N in range(1, 4):
    # Columns: Total / Benign / Anomaly counts, first for FIN and then for RST.
    header = [f"{scope}_{flag}>{N}" for flag in ("FIN", "RST") for scope in ("T", "B", "A")]
    rowh = "{:^9}  " * len(header)
    row = rowh
    sep = ["-" * 9] * len(header)

    print("\n")

    print(rowh.format(*header))
    print(rowh.format(*sep))

    # Flows carrying more than N FIN (resp. RST) packets in both directions combined.
    fin_above = csv["bidirectional_fin_packets"] > N
    rst_above = csv["bidirectional_rst_packets"] > N

    FIN_GT = int(fin_above.sum())
    RST_GT = int(rst_above.sum())

    BENIGN_FIN_GT = int((is_benign & fin_above).sum())
    BENIGN_RST_GT = int((is_benign & rst_above).sum())

    ANOMALY_FIN_GT = int((~is_benign & fin_above).sum())
    ANOMALY_RST_GT = int((~is_benign & rst_above).sum())

    print(row.format(FIN_GT, BENIGN_FIN_GT, ANOMALY_FIN_GT,
                     RST_GT, BENIGN_RST_GT, ANOMALY_RST_GT))

print("\n")



 T_FIN>1    B_FIN>1    A_FIN>1    T_RST>1    B_RST>1    A_RST>1   
---------  ---------  ---------  ---------  ---------  ---------  
 153551      81030      72521      96880      10019      86861    


 T_FIN>2    B_FIN>2    A_FIN>2    T_RST>2    B_RST>2    A_RST>2   
---------  ---------  ---------  ---------  ---------  ---------  
  8827       3008       5819       40832      2273       38559    


 T_FIN>3    B_FIN>3    A_FIN>3    T_RST>3    B_RST>3    A_RST>3   
---------  ---------  ---------  ---------  ---------  ---------  
  1106        585        521       7090       1290       5800     




## Show statistics for the FIN and RST counts

In [5]:
# Summary statistics of the per-flow FIN and RST packet counts.
flag_columns = ['bidirectional_fin_packets', 'bidirectional_rst_packets']
flag_summary = csv[flag_columns].describe()
print(flag_summary)

       bidirectional_fin_packets  bidirectional_rst_packets
count              504474.000000              504474.000000
mean                    0.856978                   0.678846
std                     0.927540                   1.058333
min                     0.000000                   0.000000
25%                     0.000000                   0.000000
50%                     1.000000                   0.000000
75%                     2.000000                   1.000000
max                    13.000000                 139.000000


## Show top 10 FIN and RST counts with the highest occurrence

In [6]:
N = 10

print(f"Statistics for {day}:")

# Render the N most frequent values of each counter as "<value>  <occurrences>"
# lines, without the series name or dtype footer.
fin_counts, rst_counts = (
    csv[column].value_counts().head(N).to_string(header=False, index=True)
    for column in ('bidirectional_fin_packets', 'bidirectional_rst_packets')
)

for flag, rendered in (("FIN", fin_counts), ("RST", rst_counts)):
    print(f"\nTop {N} Value Counts for {flag} Packets:")
    print(rendered)

Statistics for wednesday:

Top 10 Value Counts for FIN Packets:
0     237715
2     144724
1     113208
3       7721
4        377
5        215
6        202
7         96
10        84
8         83

Top 10 Value Counts for RST Packets:
0    309024
1     98570
2     56048
3     33742
4      6117
5       641
6       155
7        74
9        32
8        16


## Show flow expiration statistics

expiration_id =
 - 0 for idle timeout,
 - 1 for active timeout, or
 - -1 for custom expiration.
    

In [7]:
# `df` refers to the same DataFrame object as `csv`; it is not a copy.
df = csv

# Number of flows per expiration reason.
expiration_counts = df['expiration_id'].value_counts()
print(expiration_counts)

expiration_id
0    493226
1     11248
Name: count, dtype: int64


## Check flows associated with `Heartbleed` attack

In [8]:
# Key identifying and timing fields of every flow labelled as Heartbleed.
heartbleed_columns = [
    'expiration_id',
    'src_ip',
    'src_port',
    'dst_ip',
    'dst_port',
    'protocol',
    'bidirectional_duration_ms',
    'bidirectional_packets',
    'label',
]
heartbleed_flows = df.loc[df['label'] == 'Heartbleed', heartbleed_columns]
print(heartbleed_flows.to_string())

        expiration_id         src_ip  src_port         dst_ip  dst_port  protocol  bidirectional_duration_ms  bidirectional_packets       label
430915              1     172.16.0.1     45022  192.168.10.51       444         6                     119303                   4414  Heartbleed
433835              1  192.168.10.51       444     172.16.0.1     45022         6                     119262                   4902  Heartbleed
437445              1  192.168.10.51       444     172.16.0.1     45022         6                     119261                   4924  Heartbleed
439999              1  192.168.10.51       444     172.16.0.1     45022         6                     119260                   4905  Heartbleed
444467              1  192.168.10.51       444     172.16.0.1     45022         6                     119298                   4871  Heartbleed
446675              1  192.168.10.51       444     172.16.0.1     45022         6                     119260                   4873  Hea

This attack was one long attack divided by active timeout into multiple instances. Each flow except the last was 120 seconds long.

## Show time related statistics for the dataset

In [9]:
durations = df['bidirectional_duration_ms']

print("Stats about bidirectional_duration_ms:")
print(durations.describe().to_string())

# The ten largest distinct duration values, each with its number of flows.
print("\nShow top N longest flow values and their occurence:")
longest_values = durations.value_counts().sort_index(ascending=False).head(10)
print(longest_values.to_string(header=False))

# Up to ten flows whose duration equals the maximum observed duration.
print("\nShow a sample of rows with the longest duration:")
sample_columns = ['src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol',
                  'bidirectional_duration_ms', 'label']
longest_flows = df.loc[durations == durations.max(), sample_columns]
print(longest_flows.head(10).to_string(index=False))

Stats about bidirectional_duration_ms:
count    504474.000000
mean      10310.873072
std       28498.878528
min           0.000000
25%          25.000000
50%         136.000000
75%        1015.000000
max      119999.000000

Show top N longest flow values and their occurence:
119999    39
119998    10
119997     8
119996    13
119995    10
119994     6
119993     3
119992     6
119991     5
119990     6

Show a sample of rows with the longest duration:
       src_ip  src_port        dst_ip  dst_port  protocol  bidirectional_duration_ms  label
192.168.10.14     49533   13.107.4.50        80         6                     119999 BENIGN
  13.107.4.50        80 192.168.10.14     49533         6                     119999 BENIGN
  13.107.4.50        80 192.168.10.14     49533         6                     119999 BENIGN
192.168.10.15     49672   13.107.4.50        80         6                     119999 BENIGN
  13.107.4.50        80 192.168.10.15     49672         6                     119999

## Show the number of flows whose unique ID appears more than once in the DS

Such flows are essentially repeated across the dataset

In [10]:
# Labels in the order in which they first appear in the dataset
unique_labels = df['label'].unique()


def _repeated_key_count(label):
    """Count the distinct 'flow_key_hash' values occurring more than once within *label*."""
    key_counts = df.loc[df['label'] == label, 'flow_key_hash'].value_counts()
    return int((key_counts > 1).sum())


# One result row per label
results_list = [
    {'Label': label, 'Hash_More_Than_One': _repeated_key_count(label)}
    for label in unique_labels
]

results_df = pd.DataFrame(results_list)

print(results_df)

                     Label  Hash_More_Than_One
0                   BENIGN               34903
1             BENIGN - ZPL                  85
2            DoS Slowloris                 278
3      DoS Slowloris - ZPL                   0
4         DoS Slowhttptest                1042
5   DoS Slowhttptest - ZPL                   0
6                 DoS Hulk               14267
7           DoS Hulk - ZPL                   1
8            DoS GoldenEye                 414
9      DoS GoldenEye - ZPL                   0
10              Heartbleed                   1


## Insights into the temporal distribution of flows for each label based on the `bidirectional_first_seen_ms` timestamp

The output is a new DataFrame (`stats`) where each row corresponds to a unique label from our data, and the columns include:

- `label`: The unique identifier for each group of flows.
- `min`: The minimum time difference between consecutive flows within the same label group.
- `max`: The maximum time difference between consecutive flows within the same label group.
- `mean`: The average time difference between consecutive flows within the same label group.

In [11]:
# Order the flows chronologically within each label so that consecutive rows
# of a label are consecutive flow starts.
df_sorted = df.sort_values(by=['label', 'bidirectional_first_seen_ms'])

# Gap between the start of a flow and the start of the previous flow with the
# same label (NaN for the first flow of each label).
first_seen_by_label = df_sorted.groupby('label')['bidirectional_first_seen_ms']
df_sorted['time_diff'] = first_seen_by_label.diff()

# Smallest, largest and average gap per label
stats = (
    df_sorted.groupby('label')['time_diff']
    .agg(['min', 'max', 'mean'])
    .reset_index()
)

print(stats)

print("\nThe same in a human readable form:\n")

# Convert milliseconds to more readable units
def convert_to_readable_time(ms):
    """Format a duration given in milliseconds using the largest fitting unit.

    Returns 'N/A' for missing values; otherwise the value with two decimals in
    ms (below one second), seconds (below one minute), minutes (below one
    hour) or hours.
    """
    if pd.isna(ms):
        return 'N/A'
    if ms < 1000:
        return f"{ms:.2f} ms"
    # (exclusive upper bound in ms, divisor, unit) for the remaining bounded units
    for ceiling, divisor, unit in ((60000, 1000, "seconds"), (3600000, 60000, "minutes")):
        if ms < ceiling:
            return f"{ms / divisor:.2f} {unit}"
    return f"{ms / 3600000:.2f} hours"

# Replace the numeric gaps with their human-readable form, column by column
for column in ('min', 'max', 'mean'):
    stats[column] = stats[column].apply(convert_to_readable_time)

# Show the converted table
print(stats)

                     label       min       max           mean
0                   BENIGN       0.0   93789.0      98.113516
1             BENIGN - ZPL       0.0  234860.0    2236.216953
2            DoS GoldenEye       0.0  158048.0      76.523563
3      DoS GoldenEye - ZPL       0.0  130868.0     270.453395
4                 DoS Hulk       0.0   52243.0       6.905940
5           DoS Hulk - ZPL       0.0  286686.0    2290.623946
6         DoS Slowhttptest       0.0  174425.0     442.470588
7   DoS Slowhttptest - ZPL       0.0  397290.0     373.832523
8            DoS Slowloris       0.0   11879.0     252.041803
9      DoS Slowloris - ZPL       0.0   33944.0     712.571508
10              Heartbleed  120254.0  120297.0  120259.200000

The same in a human readable form:

                     label           min            max          mean
0                   BENIGN       0.00 ms   1.56 minutes      98.11 ms
1             BENIGN - ZPL       0.00 ms   3.91 minutes  2.24 seconds
2        

## Analyze the flow duration in the dataset

In [12]:
# Summary statistics of the flow durations in milliseconds.
duration_summary = df['bidirectional_duration_ms'].describe()
print(duration_summary)

count    504474.000000
mean      10310.873072
std       28498.878528
min           0.000000
25%          25.000000
50%         136.000000
75%        1015.000000
max      119999.000000
Name: bidirectional_duration_ms, dtype: float64


By looking at these statistics, we can get a good overall picture of the distribution and scale of the bidirectional durations in our dataset. For instance, 
 - **Large Gap Between 75th Percentile and Maximum Value:** The 75th percentile is at 1,015 ms (just over 1 second), while the maximum value is 119,999 ms (about 120 seconds or 2 minutes). This large gap suggests the presence of outliers or very large durations compared to the majority of the data.
 - **Median Much Lower Than Mean:** The median (50th percentile) value is 136 ms, which is significantly lower than the mean value of 10,310.87 ms. This discrepancy indicates that the data distribution is right-skewed. In other words, while the majority of the bidirectional durations are on the lower end (as evidenced by a relatively low median), there are some durations that are much longer, which increases the average (mean) and indicates the presence of outliers or a long tail to the right of the distribution.

These points support the overall interpretation that the distribution is right-skewed with outliers. The data is concentrated on the lower end (short durations), but there are enough high duration values to pull the mean significantly above the median. This pattern is characteristic of a right-skewed distribution.

Let's investigate the long durations by assessing the packet inter-arrival time characteristics of the flows.

In [13]:
import ast  # For converting string representations of lists into actual lists


def _parse_piat(value):
    """Parse *value* as a Python literal if it is a string; otherwise return it unchanged."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value


# Make sure every 'splt_piat_ms' entry is an actual sequence rather than its text form
df['splt_piat_ms'] = df['splt_piat_ms'].apply(_parse_piat)

# Largest packet inter-arrival time recorded for each flow
df['max_splt_piat_ms'] = df['splt_piat_ms'].apply(max)

# Flows with the largest single inter-arrival gap come first
df_sorted = df.sort_values('max_splt_piat_ms', ascending=False)

# Keep the 40 flows with the largest gap
largest_gap_rows = df_sorted.head(40)

# Shorter column titles for display only; `rename` returns a new DataFrame
display_df = largest_gap_rows.rename(columns={
    'bidirectional_packets': 'packets',
    'bidirectional_duration_ms': 'duration_ms',
})

shown_columns = ['id', 'packets', 'duration_ms', 'label', 'splt_piat_ms']
print(display_df[shown_columns].to_string(index=False))

    id  packets  duration_ms        label                                                                           splt_piat_ms
 66540       10        60184       BENIGN          [0, 61, 0, 0, 62, 1, 0, 59999, 61, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
 18202       12        60073       BENIGN            [0, 23, 0, 0, 23, 1, 0, 0, 0, 59999, 27, 0, -1, -1, -1, -1, -1, -1, -1, -1]
483152       12        60236       BENIGN            [0, 78, 0, 0, 78, 1, 0, 0, 0, 59999, 80, 0, -1, -1, -1, -1, -1, -1, -1, -1]
   668       17        60413       BENIGN             [0, 23, 0, 0, 23, 1, 0, 291, 24, 0, 0, 5, 24, 0, 59999, 23, 0, -1, -1, -1]
434935        4        59999       BENIGN       [0, 0, 59999, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
315252       12        60077       BENIGN            [0, 23, 1, 0, 24, 1, 0, 0, 0, 59999, 29, 0, -1, -1, -1, -1, -1, -1, -1, -1]
441102        4        59999       BENIGN       [0, 0, 59999, 0, -1, -1, -1, -1, -1, -1, -1, -1, 

From the observed PIAT values, it is evident that there are some notably large PIATs within each of the analyzed flows. These values could suggest that separate flows have been merged into a single flow due to the idle timeout being set to 60 seconds in NFStream.

Upon reviewing the default Linux kernel values, a 60-second timeout appears to be a legitimate setting. The variables under [/proc/sys/net/netfilter/nf_conntrack_*](https://www.kernel.org/doc/Documentation/networking/nf_conntrack-sysctl.txt), which relate to the Linux kernel's netfilter connection tracking system—especially when configured with firewall support (iptables/nftables) and connection tracking—support this observation.

Moreover, additional default settings in the Linux kernel, such as `tcp_syn_retries`, `tcp_synack_retries`, `tcp_fin_timeout`, and `tcp_keepalive_time` under the [/proc/sys/net/ipv4/*](https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt) variables, corroborate that the configured 60 seconds idle timeout falls within a normal range. These variables, integral to IPv4 networking, enable system administrators to adjust various elements of the IPv4 network stack's behavior, including TCP-specific configurations.

Consequently, we classify these flows as valid, where large PIATs signify intervals of inactivity within a TCP connection, aligning with the operational patterns of applications that transmit data in an intermittent manner.

As a side note, it is important to recognize that TCP session timeout settings can vary significantly across different firewall vendors. For instance, CheckPoint, a notable firewall vendor, implements [these timeouts](https://support.checkpoint.com/results/sk/sk41248). 

## Conclusions
 - We observed a considerable number of flows with zero packet payload (ZPL), a characteristic not associated with any of the attacks identified in the Wednesday traffic trace.
 - There is a considerable number of flows that, while being part of an attack, also capture the consequence of a successful attack: the server starts terminating connections (via RST/FIN) due to saturation.
 - The Heartbleed attack appears as one prolonged attack that is segmented into multiple flow records by the active timeout setting.
 - Some flows are repeated with identical 5-tuples across the entire dataset.
 - There are several flows whose PIAT is just a few milliseconds below the idle timeout, which was set to 60 seconds. Upon reviewing the default Linux kernel values, a 60-second timeout appears to be a legitimate setting.