In [1]:
from pcapkit import IP, extract, TCP
import time

In [2]:
from collections import Counter
import pandas as pd

In [3]:
from scapy.all import *



In [4]:
packets = rdpcap('sample_dataset/1perFirstHalfFriday.pcap')

In [5]:
# from collections import defaultdict

In [84]:
# TCP Flags Mapping
# Check here for bitmap: https://www.noction.com/blog/tcp-flags#:~:text=The%20hexadecimal%20number%200x02%20tells,a%20particular%20flag%20is%20set.
FIN = 0x01
SYN = 0x02
RST = 0x04
PSH = 0x08
ACK = 0x10
URG = 0x20
ECE = 0x40
CWR = 0x80

In [93]:
list_data = []
flow_byte_counts = {}
flow_fwd_states = []
# list_flow_byte = []
for pkt in packets:
    tmp_pack_dict = {}

    # Check if packet contain an IP layer
    if IP in pkt:
        
        tmp_pack_dict['SourceIP'] = pkt[IP].src
        tmp_pack_dict['DestinationIP'] = pkt[IP].dst
        tmp_pack_dict['Source Port'] = pkt[IP].sport if hasattr(pkt[IP], 'sport') else None
        tmp_pack_dict['Destination Port'] = pkt[IP].dport if hasattr(pkt[IP], 'dport') else None
    
        # Check https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml for Assigned Internet Protocol Numbers
        tmp_pack_dict['Protocol'] = pkt.proto

        # Unique identifier for the flow
        flow_key = frozenset([ 
            tmp_pack_dict['SourceIP'],
            tmp_pack_dict['DestinationIP'],
            tmp_pack_dict['Source Port'],
            tmp_pack_dict['Destination Port']
        ]) # does not matter the order
        flow_key_ordered = (tmp_pack_dict['SourceIP'], tmp_pack_dict['DestinationIP'], tmp_pack_dict['Source Port'], tmp_pack_dict['Destination Port']) # save it in order

        # Flow size
        flow_size = pkt.len

        if flow_key not in flow_byte_counts:
            flow_byte_counts[flow_key] = {
                'bytes': flow_size, 'first_timestamp': pkt.time, 'last_timestamp': pkt.time,
                'flow_duration': 0, 'total_fwd_packet': 1, 'total_bwd_packet': 0, 
                'total_fwd_packet_len': pkt.len , 'total_bwd_packet_len': 0, 
                'last_timestamp_fwd': pkt.time, 'last_timestamp_bwd': None, 'time_btw_packets_fwd': 0, 'time_btw_packets_bwd': 0, 
                'syn_flag_count': 0, 'fin_flag_count': 0, 'rst_flag_count': 0, 'psh_flag_count': 0,
                'ack_flag_count': 0 , 'urg_flag_count': 0, 'cwr_flag_count': 0, 'ece_flag_count': 0}
            # Save forward key
            flow_fwd_states.append(flow_key_ordered)
        else:
            # Update 'timestamp' with the minimum and maximum value
            flow_byte_counts[flow_key]['first_timestamp'] = min(flow_byte_counts[flow_key]['first_timestamp'], pkt.time)
            flow_byte_counts[flow_key]['last_timestamp'] = max(flow_byte_counts[flow_key]['last_timestamp'], pkt.time)
            # Flow duration in Seconds
            flow_byte_counts[flow_key]['flow_duration'] = flow_byte_counts[flow_key]['last_timestamp'] - flow_byte_counts[flow_key]['first_timestamp']
            flow_byte_counts[flow_key]['bytes'] += flow_size
            # Flow Bytes/s
            flow_byte_counts[flow_key]['bytes_s'] = flow_byte_counts[flow_key]['bytes']/flow_byte_counts[flow_key]['flow_duration']

            # Check if it's a forward pass
            if flow_key_ordered in flow_fwd_states: # Forward Pass
                flow_byte_counts[flow_key]['total_fwd_packet'] += 1
                flow_byte_counts[flow_key]['total_fwd_packet_len'] += pkt.len
                # IAT (Inter Arrival Time)
                flow_byte_counts[flow_key]['time_btw_packets_fwd'] += pkt.time - flow_byte_counts[flow_key]['last_timestamp_fwd'] 

                # Update last pckt timestamp
                flow_byte_counts[flow_key]['last_timestamp_fwd'] = pkt.time

            else:
                flow_byte_counts[flow_key]['total_bwd_packet'] += 1
                flow_byte_counts[flow_key]['total_bwd_packet_len'] += pkt.len
                # IAT (Inter Arrival Time)
                if flow_byte_counts[flow_key]['last_timestamp_fwd'] > 0: # check if there is a previous packet:
                    flow_byte_counts[flow_key]['time_btw_packets_fwd'] += pkt.time - flow_byte_counts[flow_key]['time_btw_packets_bwd'] # SUM Time diff between packets

                # Update last pckt timestamp
                flow_byte_counts[flow_key]['last_timestamp_bwd'] = pkt.time

            # Flags Decoding
            if TCP in pkt[IP]:
    
                flow_byte_counts[flow_key]['syn_flag_count'] += 1 if pkt[IP][TCP].flags & SYN else 0
                flow_byte_counts[flow_key]['fin_flag_count'] += 1 if pkt[IP][TCP].flags & FIN else 0
                flow_byte_counts[flow_key]['rst_flag_count'] += 1 if pkt[IP][TCP].flags & RST else 0
                flow_byte_counts[flow_key]['psh_flag_count'] += 1 if pkt[IP][TCP].flags & PSH else 0
                flow_byte_counts[flow_key]['ack_flag_count'] += 1 if pkt[IP][TCP].flags & ACK else 0
                flow_byte_counts[flow_key]['urg_flag_count'] += 1 if pkt[IP][TCP].flags & URG else 0
                flow_byte_counts[flow_key]['cwr_flag_count'] += 1 if pkt[IP][TCP].flags & CWR else 0
                flow_byte_counts[flow_key]['ece_flag_count'] += 1 if pkt[IP][TCP].flags & ECE else 0
            
        
    list_data.append(tmp_pack_dict)

In [94]:
# Convert the data to a list of dictionaries
data_list = [
    {
        'SourceIP': tuple(key)[0] if len(key) > 0 else None,
        'DestinationIP': tuple(key)[1] if len(key) > 1 else None,
        'Source Port': tuple(key)[2] if len(key) > 2 else None,
        'Destination Port': tuple(key)[3] if len(key) > 3 else None,
        'FlowID': key,
        'Bytes': value['bytes'],
        'First Timestamp': float(value['first_timestamp']),
        'Last Timestamp': float(value['last_timestamp']),
        'Flow Duration': value['flow_duration'],
        'Total Fwd Packets': value['total_fwd_packet'],
        'Total Bwd Packets': value['total_bwd_packet'],
        'Fwd IAT Mean': value['time_btw_packets_fwd']/(value['total_fwd_packet'] - 1) if value['total_fwd_packet'] > 1  else 0, # Minus 1 because of it's time between packets,
        'Bwd IAT Mean': value['time_btw_packets_bwd']/(value['total_bwd_packet'] - 1) if value['total_bwd_packet'] > 1  else 0, # Minus 1 because of it's time between packets
        'Flow IAT Mean': (value['time_btw_packets_fwd'] + value['time_btw_packets_bwd'])/(value['total_fwd_packet'] + value['total_bwd_packet'] - 2) if (value['total_fwd_packet'] + value['total_bwd_packet']) > 2 else 0,
        'SYN Flag Count': value['syn_flag_count'],
        'FIN Flag Count': value['fin_flag_count'],
        'RST Flag Count': value['rst_flag_count'],
        'PSH Flag Count': value['psh_flag_count'],
        'ACK Flag Count': value['ack_flag_count'],
        'URG Flag Count': value['urg_flag_count'],
        'CWR Flag Count': value['cwr_flag_count'],
        'ECE Flag Count': value['ece_flag_count']
    }
    for key, value in flow_byte_counts.items()
]

# Create a Pandas DataFrame
df = pd.DataFrame(data_list)

In [47]:
# Examples to verify Forward and Backward pass
# FlowID = (192.168.10.3, 49159, 192.168.10.5, 445)

In [95]:
df

Unnamed: 0,SourceIP,DestinationIP,Source Port,Destination Port,FlowID,Bytes,First Timestamp,Last Timestamp,Flow Duration,Total Fwd Packets,...,Bwd IAT Mean,Flow IAT Mean,SYN Flag Count,FIN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWR Flag Count,ECE Flag Count
0,192.168.10.9,63210,5355,224.0.0.252,"(192.168.10.9, 63210, 5355, 224.0.0.252)",56,1.499429e+09,1.499429e+09,0,1,...,0.0,0,0,0,0,0,0,0,0,0
1,192.168.10.9,192.168.10.3,1029,49671,"(192.168.10.9, 192.168.10.3, 1029, 49671)",190,1.499429e+09,1.499429e+09,0,1,...,0.0,0,0,0,0,0,0,0,0,0
2,1032,192.168.10.9,192.168.10.3,88,"(1032, 192.168.10.9, 192.168.10.3, 88)",353,1.499429e+09,1.499429e+09,0,1,...,0.0,0,0,0,0,0,0,0,0,0
3,88,192.168.10.9,192.168.10.3,1034,"(88, 192.168.10.9, 192.168.10.3, 1034)",40,1.499429e+09,1.499429e+09,0,1,...,0.0,0,0,0,0,0,0,0,0,0
4,5353,224.0.0.251,192.168.10.25,,"(5353, 224.0.0.251, 192.168.10.25)",362,1.499429e+09,1.499429e+09,91.246272,3,...,0.0,91.246272,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10050,192.168.10.9,192.168.10.3,62410,53,"(192.168.10.9, 192.168.10.3, 62410, 53)",70,1.499436e+09,1.499436e+09,0,1,...,0.0,0,0,0,0,0,0,0,0,0
10051,80,192.168.10.9,5292,104.16.27.216,"(80, 192.168.10.9, 5292, 104.16.27.216)",52,1.499436e+09,1.499436e+09,0,1,...,0.0,0,0,0,0,0,0,0,0,0
10052,192.168.10.9,192.168.10.3,61474,53,"(192.168.10.9, 192.168.10.3, 61474, 53)",70,1.499436e+09,1.499436e+09,0,1,...,0.0,0,0,0,0,0,0,0,0,0
10053,192.168.10.9,5284,443,65.39.202.99,"(192.168.10.9, 5284, 443, 65.39.202.99)",52,1.499436e+09,1.499436e+09,0,1,...,0.0,0,0,0,0,0,0,0,0,0
