## Produce Partial Flows based on Packet Count

In [1]:
# Run configuration for this notebook.
CSV_DIR = "datasets"  # output directory for the generated per-packet-count CSV files
INPUT_DIR = "PCAP/deduplicated_reordered"  # directory holding the input capture rd<Day>.pcap
DAY = "wednesday"  # capture day; capitalized for the PCAP name, used as-is in the CSV names

In [2]:
import os
import sys
from datetime import timedelta
import time
import logging
from nfstream import NFPlugin, NFStreamer
import nfstream
# from labeller import cicids2017
import hashlib


# set up logging
def setup_logging(log_filename="generate-n-pc-flows.log"):
    """Configure root logging to write to a fresh log file and to stdout.

    The log file is truncated on every call, and records are emitted as the
    bare message (no timestamp or level prefix) at INFO level and above.

    Args:
        log_filename: Path of the log file to (re)create.
    """
    root_logger = logging.getLogger()

    # logging.basicConfig() silently does nothing when the root logger already
    # has handlers (e.g. when this cell is executed a second time), which would
    # leave the new handlers unattached. Detach and close the existing ones
    # first so the configuration below always takes effect.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        stale_handler.close()

    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
        handlers=[
            # mode="w" clears any existing log file when the handler opens it.
            logging.FileHandler(log_filename, mode="w"),
            logging.StreamHandler(sys.stdout),
        ],
    )

def consistent_hash(value):
    """Return the SHA-256 hex digest of the string ``value``.

    The string is encoded with ``str.encode()``'s default encoding before
    hashing, so equal strings always map to the same 64-character digest.
    """
    hasher = hashlib.sha256()
    hasher.update(value.encode())
    return hasher.hexdigest()
    

class PayloadManager(NFPlugin):
    """Accumulate per-direction payload byte counts on each flow.

    Two counters are kept on ``flow.udps``: ``src2dst_payload`` for packets
    with ``direction == 0`` and ``dst2src_payload`` for packets with
    ``direction == 1``.
    """

    def on_init(self, packet, flow):
        # Seed both counters from the flow's first packet; the direction the
        # packet does not belong to starts at zero.
        forward_bytes = packet.payload_size if packet.direction == 0 else 0
        backward_bytes = packet.payload_size if packet.direction == 1 else 0
        flow.udps.src2dst_payload = forward_bytes
        flow.udps.dst2src_payload = backward_bytes

    def on_update(self, packet, flow):
        # Add this packet's payload size to the counter matching its direction
        # (the other counter is incremented by zero).
        forward_bytes = packet.payload_size if packet.direction == 0 else 0
        backward_bytes = packet.payload_size if packet.direction == 1 else 0
        flow.udps.src2dst_payload += forward_bytes
        flow.udps.dst2src_payload += backward_bytes


class FlowExpirationManager(NFPlugin):
    """Force a flow to expire as soon as a packet carries a TCP RST or FIN flag."""

    @staticmethod
    def _expire_on_teardown(packet, flow):
        # An expiration_id of -1 marks the flow for expiration.
        teardown_seen = packet.rst or packet.fin
        if teardown_seen:
            flow.expiration_id = -1

    def on_init(self, packet, flow):
        # The very first packet of a flow may already carry RST/FIN.
        self._expire_on_teardown(packet, flow)

    def on_update(self, packet, flow):
        # Apply the same check to every subsequent packet of the flow.
        self._expire_on_teardown(packet, flow)


class FlowLabelManager(NFPlugin):
    """Label each flow when it expires and drop its payload counters.

    NOTE(review): ``cicids2017`` is not defined in the visible part of this
    file (its ``from labeller import cicids2017`` import is commented out), so
    ``on_expire`` raises NameError unless that name is provided elsewhere.
    """

    def __init__(self, day):
        # Day identifier handed to cicids2017() for every expired flow.
        self.day = day

    def on_expire(self, flow):
        # Store the label returned by cicids2017() on the flow, then remove
        # the payload counters from flow.udps.
        label = cicids2017(
            self.day, flow, label_reverse=True, signal_reverse=False
        )
        flow.udps.label = label
        self.cleanup_payload(flow)

    @staticmethod
    def cleanup_payload(flow):
        # Delete whichever payload counters are present on flow.udps;
        # counters that were never set are skipped.
        for counter_name in ("src2dst_payload", "dst2src_payload"):
            if hasattr(flow.udps, counter_name):
                delattr(flow.udps, counter_name)


class PacketCountManager(NFPlugin):
    """Expire flows once they reach a specific bidirectional packet count.

    Args:
        max_packets: Number of packets (both directions combined) after which
            a flow is marked for expiration.
    """

    def __init__(self, max_packets):
        self.max_packets = max_packets

    def on_init(self, packet, flow):
        # on_update is not invoked for the packet that creates the flow, so a
        # limit of 1 can only be enforced here.
        # NOTE(review): assumes flow.bidirectional_packets already counts the
        # first packet when on_init runs - confirm against the NFStream docs.
        self._expire_if_limit_reached(flow)

    def on_update(self, packet, flow):
        # Check for expiration after every subsequent packet.
        self._expire_if_limit_reached(flow)

    def _expire_if_limit_reached(self, flow):
        # ">=" rather than "==" so the limit still triggers if the counter is
        # ever observed past the threshold.
        if flow.bidirectional_packets >= self.max_packets:
            flow.expiration_id = -1  # Mark for expiration in NFStream

class HashManager(NFPlugin):
    """Attach a hash of the flow key to every new flow as ``udps.flow_key_hash``."""

    def on_init(self, packet, flow):
        # The key joins the first packet's source/destination address and
        # port, its protocol, and the flow's first-seen timestamp (ms).
        flow_key = f"{packet.src_ip}-{packet.src_port}-{packet.dst_ip}-{packet.dst_port}-{packet.protocol}-{flow.bidirectional_first_seen_ms}"
        flow.udps.flow_key_hash = consistent_hash(flow_key)


def process_files_in_directory(input_dir: str, day: str, output_dir: str, Ns: list):
    """Generate one CSV of n-packet partial flows per packet count in ``Ns``.

    The capture ``<input_dir>/rd<Day>.pcap`` is streamed through NFStream once
    per value ``n``. Flows are force-expired by ``PacketCountManager(n)``, only
    flows with exactly ``n`` bidirectional packets are kept, and the result is
    written to ``<output_dir>/<day>_pc_<n>.csv``.

    Args:
        input_dir: Directory containing the input PCAP file.
        day: Day name; capitalized to build the PCAP file name and used as-is
            in the output CSV file names.
        output_dir: Directory the CSV files are written to (must exist).
        Ns: Packet counts to generate partial flows for (any iterable of ints).
    """

    BPF = "ip and (ip proto \\tcp or \\udp)"  # only ipv4 tcp and udp traffic to capture

    # The capture path does not depend on n: resolve and validate it once, and
    # report a missing file instead of silently producing nothing.
    input_file = os.path.join(input_dir, f"rd{day.capitalize()}.pcap")
    if not os.path.isfile(input_file):
        logging.warning(f"Input file not found, nothing to process: {input_file}")
        return

    for n in Ns:
        output_file = os.path.join(output_dir, f"{day}_pc_{n}.csv")

        logging.info(f"----- PC={n} -----")

        start = time.time()

        streamer = NFStreamer(
              source=input_file
            , decode_tunnels=False                                # Default: True
            , bpf_filter=BPF                                      # Default: None
            , promiscuous_mode=True                               # Default: True
            , snapshot_length=1536                                # Default: 1536
            , idle_timeout=60                                     # Default: 120
            , active_timeout=18000                                # Default: 1800
            , accounting_mode=1                                   # Default: 0
            , udps=[                                              # Default: None
                FlowExpirationManager(),
                # PayloadManager(),
                # FlowLabelManager(day.capitalize()),
                HashManager(),
                PacketCountManager(n)
            ]
            , n_dissections=0                                     # Default: 20
            , statistical_analysis=True                           # Default: False
            , splt_analysis=20                                    # Default: 0
            , n_meters=1                                          # Default: 0
            , performance_report=0                                # Default: 0
        )

        # Convert the stream to a DataFrame
        df = streamer.to_pandas(columns_to_anonymize=[])
        logging.info(f"NFStream generated flows: {len(df)}")

        # Timing is only reported at DEBUG level, so the INFO log is unchanged.
        delta = timedelta(seconds=time.time() - start)
        logging.debug("Time required to generate flows: %s", delta)

        # Keep only flows that reached exactly n packets; shorter flows ended
        # (idle timeout, RST/FIN, end of capture) before hitting the limit.
        df_filtered = df[df["bidirectional_packets"] == n]
        logging.info(f"Number of flows with packet count = {n}: {len(df_filtered)}")

        # Rename into a new frame rather than in place: df_filtered is a slice
        # of df, and in-place edits on a slice can raise SettingWithCopyWarning.
        df_filtered = df_filtered.rename(columns={
            # "udps.label": "label",
            "udps.flow_key_hash": "flow_key_hash",
        })

        # Save the filtered DataFrame to a CSV file
        df_filtered.to_csv(output_file, index=False)

        logging.info(f"Flows stored as: {day}_pc_{n}.csv")
        logging.info("\n")


if __name__ == "__main__":
    # Bind the notebook-level configuration to the names used below.
    input_dir, output_dir, day = INPUT_DIR, CSV_DIR, DAY

    # Packet counts to generate partial flows for: 2 through 20 inclusive.
    Ns = range(2, 21)

    # The CSV output directory must exist before any file is written to it.
    os.makedirs(output_dir, exist_ok=True)

    setup_logging()

    # Record the NFStream version at the top of the log, followed by a blank record.
    logging.info(f"Generating flows with NFStream v{nfstream.__version__}")
    logging.info(f"\n")

    process_files_in_directory(input_dir, day, output_dir, Ns)

Generating flows with NFStream v6.5.4a


----- PC=2 -----
NFStream generated flows: 6903639
Number of flows with packet count = 2: 6370622
Flows stored as: wednesday_pc_2.csv


----- PC=3 -----
NFStream generated flows: 4856398
Number of flows with packet count = 3: 3979821
Flows stored as: wednesday_pc_3.csv


----- PC=4 -----
NFStream generated flows: 3887544
Number of flows with packet count = 4: 2909344
Flows stored as: wednesday_pc_4.csv


----- PC=5 -----
NFStream generated flows: 3240103
Number of flows with packet count = 5: 2245608
Flows stored as: wednesday_pc_5.csv


----- PC=6 -----
NFStream generated flows: 2876898
Number of flows with packet count = 6: 1840940
Flows stored as: wednesday_pc_6.csv


----- PC=7 -----
NFStream generated flows: 2622305
Number of flows with packet count = 7: 1574465
Flows stored as: wednesday_pc_7.csv


----- PC=8 -----
NFStream generated flows: 2414889
Number of flows with packet count = 8: 1379201
Flows stored as: wednesday_pc_8.csv


----- P