# Preliminaries and helper functions

## Measure flows via NFStream

In [None]:
import nfstream
import sys
from nfstream import NFPlugin

print(nfstream.__version__)

class TCP_expiration(NFPlugin):
    """NFStream plugin that marks a flow for expiration on TCP teardown packets.

    Both hooks apply the same rule: when the packet has its ``rst`` or ``fin``
    attribute set, ``flow.expiration_id`` is set to -1.
    NOTE(review): -1 is presumably NFStream's code for a plugin-triggered
    expiration -- confirm against the NFStream documentation.
    """

    @staticmethod
    def _expire_on_teardown(packet, flow):
        """Set ``flow.expiration_id`` to -1 if ``packet`` carries RST or FIN."""
        is_teardown = packet.rst or packet.fin
        if is_teardown:
            flow.expiration_id = -1

    def on_init(self, packet, flow):
        """Run the teardown check from the plugin's ``on_init`` hook."""
        self._expire_on_teardown(packet, flow)

    def on_update(self, packet, flow):
        """Run the teardown check from the plugin's ``on_update`` hook."""
        self._expire_on_teardown(packet, flow)

# Capture filter: restrict processing to TCP and UDP traffic carried over IPv4 or IPv6.
bpf = "(ip proto \\tcp or \\udp) or (ip6 proto \\tcp or \\udp)"

if __name__ == '__main__':  # Mandatory if you are running on Windows Platform
    # The capture source (file or interface) is taken from the first command-line argument.
    path = sys.argv[1]
    print("nfstream processing started. Use Ctrl+C to interrupt and save.")

    # Build the streamer first, then export; the export call drives the whole capture.
    streamer = nfstream.NFStreamer(
        source=path,
        bpf_filter=bpf,
        decode_tunnels=True,
        statistical_analysis=True,
        splt_analysis=0,
        idle_timeout=120,
        active_timeout=1800,
        performance_report=1,
        udps=TCP_expiration(),  # expire flows as soon as FIN/RST is observed
    )
    # NOTE(review): flows_per_file presumably caps the number of flows written per CSV file,
    # and the return value is presumably the total number of exported flows -- confirm.
    total_flows = streamer.to_csv(flows_per_file=2000000)

## Combine all CSVs into one

In [None]:
# Concatenate every CSV in the working directory into combined.csv: the very first input line
# (NR == 1) is kept as the header, and the first line of each file is otherwise dropped (FNR > 1).
# NOTE(review): the *.csv glob also matches combined.csv and the filtered*.csv files written by
# later cells once they exist, so run this only in a directory holding just the raw exports.
!awk 'FNR > 1 || NR == 1' *.csv > combined.csv

## Measurement stats

Total flows collected: 50,000,000

First flow collected at 1689779893668 = Wednesday, 19 July 2023 17:18:13.668 GMT+02:00 DST

Last flow collected at 1689827196977 = Thursday, 20 July 2023 06:26:36.977 GMT+02:00 DST

The difference between the two epoch times is 13:08:23 (47,303,309 ms)

## Show column names

In [None]:
# Print the first 10 lines of combined.csv: the header row with the column names plus sample rows.
!head -n 10 combined.csv

## Drop columns with no value for ML

In [None]:
# Keep only the columns selected by position below; the remaining ones carry no value for ML.
# Selected fields: 2, 11, 12, 14, 17-19, 22-24 and the contiguous run 27-81.
!cut -d, -f 2,11,12,14,17-19,22-24,27-81 combined.csv > filtered1.csv

# More elegant but SLOWER ways:

# awk -F, -v OFS=',' '
#     NR==1 {
#         for(i=1; i<=NF; i++) {
#             if ($i ~ /(.*first_seen.*|.*last_seen.*|id|expiration_id|requested_server_name|client_fingerprint|server_fingerprint|user_agent|content_type|src_ip|src_mac|dst_ip|dst_mac|src_oui|dst_oui|ip_version|vlan_id|tunnel_id|src_port|dst_port|protocol)$/) {
#                 cols[i]
#             }
#         }
#     }
#     {
#         sep = ""
#         for(i=1; i<=NF; i++) {
#             if (!(i in cols)) {
#                 printf "%s%s", sep, $i
#                 sep = OFS
#             }
#         }
#         printf ORS
#     }
# ' combined.csv > pruned.csv


# csvcut -C 'id,expiration_id,requested_server_name,client_fingerprint,server_fingerprint,user_agent,content_type,src_ip,src_mac,dst_ip,dst_mac,src_oui,dst_oui,ip_version,vlan_id,tunnel_id,src_port,dst_port,protocol,first_seen,last_seen' combined.csv > pruned.csv

## Filter out flows with low label accuracy and unknown labels

In [None]:
# Keep the header row plus the flows where the nDPI labeling confidence (last column) is the
# maximum (6), the second-to-last column is 0 (presumably the "is guessed" flag -- confirm),
# and the label (fourth column from the end) is not "Unknown".
# The application-type filter further down matches this same label column against values wrapped
# in double quotes, so the quoted spelling is rejected here as well as the bare one; comparing
# against the bare word alone would let quoted "Unknown" labels through.
!awk -F, 'BEGIN{OFS=","} {if (NR==1 || ($NF == 6 && $(NF-1) == 0 && $(NF-3) != "Unknown" && $(NF-3) != "\"Unknown\"")) print $0}' filtered1.csv > filtered2.csv

## Get rid of the last 3 columns as we do not need them anymore

In [None]:
# Keep fields 1 through 62 only, which drops the trailing columns that are no longer needed.
!cut -d, -f -62 filtered2.csv > filtered3.csv

## Filter out flows whose packet count is lower than 20

In [None]:
# Keep the header row and every flow whose sixth column (the packet count used by this step) is >= 20.
# A pattern without an action prints the matching line unchanged.
!awk -F, 'NR == 1 || $6 >= 20' filtered3.csv > filtered4.csv

## Show number of unique application types

In [None]:
# Count the distinct values in the last column (the application type), skipping the header row.
!awk -F, 'NR > 1 {print $NF}' filtered4.csv | sort -u | wc -l

## Keep a subset of application types for clarity

In [None]:
%%bash
# Keep the header row plus the flows whose application type (last column) is one of the
# selected labels. The labels are compared including their surrounding double quotes.
# This cell uses %%bash because an IPython "!" escape only covers a single logical line,
# so a multi-line awk program cannot be passed through it; %%bash also hands the script
# to the shell verbatim, with no IPython {} / $ expansion.
awk -F, '
    NR == 1 ||
    $NF == "\"QUIC.YouTube\"" ||
    $NF == "\"QUIC.Facebook\"" ||
    $NF == "\"TLS.TikTok\"" ||
    $NF == "\"TLS.Apple\"" ||
    $NF == "\"Discord\"" ||
    $NF == "\"BitTorrent\"" ||
    $NF == "\"STUN\"" ||
    $NF == "\"HTTP\"" ||
    $NF == "\"SSH\"" ||
    $NF == "\"WhatsApp\"" {print $0}
' filtered4.csv > filtered5.csv