In [69]:
from pcapkit import IP, extract, TCP
import time

In [2]:
from collections import Counter
import pandas as pd

In [86]:
from scapy.all import *

In [87]:
# Capture file to analyse; the relative path is resolved against the working directory.
pcap_path = 'sample_dataset/1perFirstHalfFriday.pcap'
# rdpcap is presumably scapy's capture reader (brought in by the star import) -- confirm.
packets = rdpcap(pcap_path)

In [88]:
# from collections import defaultdict

In [114]:
# Build two structures in a single pass over the capture:
#   list_data        -- one dict per packet (empty for packets without an IP layer)
#   flow_byte_counts -- running statistics per flow, keyed by
#                       (source IP, destination IP, source port, destination port)
# NOTE(review): the protocol number is not part of the flow key, so packets of
# different protocols that share addresses and ports are aggregated together,
# and the two directions of a conversation are tracked as separate flows.
list_data = []
flow_byte_counts = {}
for pkt in packets:
    # Stays empty for non-IP packets so list_data keeps one entry per packet.
    tmp_pack_dict = {}

    # Check if the packet contains an IP layer
    if IP in pkt:
        # Read every IP field from the IP layer itself rather than from the
        # outermost layer, so a link layer that happens to define a field with
        # the same name (e.g. `proto` or `len`) cannot shadow the IP value.
        ip_layer = pkt[IP]

        tmp_pack_dict['SourceIP'] = ip_layer.src
        tmp_pack_dict['DestinationIP'] = ip_layer.dst
        # Ports are None when no `sport`/`dport` attribute can be resolved.
        tmp_pack_dict['Source Port'] = getattr(ip_layer, 'sport', None)
        tmp_pack_dict['Destination Port'] = getattr(ip_layer, 'dport', None)

        # Check https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml for Assigned Internet Protocol Numbers
        tmp_pack_dict['Protocol'] = ip_layer.proto

        # Unique identifier for the flow
        flow_key = (
            tmp_pack_dict['SourceIP'],
            tmp_pack_dict['DestinationIP'],
            tmp_pack_dict['Source Port'],
            tmp_pack_dict['Destination Port'],
        )

        # Size contributed by this packet: the IP layer's `len` field
        flow_size = ip_layer.len

        flow = flow_byte_counts.get(flow_key)
        if flow is None:
            # First packet of the flow. The rate is undefined until the flow
            # spans a non-zero duration, so 'bytes_s' starts as None; this also
            # guarantees every flow record carries the same set of keys.
            flow_byte_counts[flow_key] = {
                'bytes': flow_size,
                'first_timestamp': pkt.time,
                'last_timestamp': pkt.time,
                'flow_duration': 0,
                'bytes_s': None,
            }
        else:
            # Widen the observed time window with this packet's timestamp
            flow['first_timestamp'] = min(flow['first_timestamp'], pkt.time)
            flow['last_timestamp'] = max(flow['last_timestamp'], pkt.time)
            # Flow duration in Seconds
            duration = flow['last_timestamp'] - flow['first_timestamp']
            flow['flow_duration'] = duration
            flow['bytes'] += flow_size
            # Flow Bytes/s -- guarded: packets sharing one timestamp give a
            # zero duration, which previously raised a division-by-zero error.
            flow['bytes_s'] = flow['bytes'] / duration if duration > 0 else None

    list_data.append(tmp_pack_dict)

In [116]:
# Flatten the per-flow statistics into one record (dict) per flow.
# Timestamps and the duration are all cast to float so the resulting columns
# are numeric; previously 'Flow Duration' kept the raw values (the int 0 set
# at flow creation mixed with the results of timestamp subtraction), while the
# two timestamp columns were already converted.
data_list = [
    {
        'SourceIP': key[0],
        'DestinationIP': key[1],
        'Source Port': key[2],
        'Destination Port': key[3],
        'Bytes': value['bytes'],
        'First Timestamp': float(value['first_timestamp']),
        'Last Timestamp': float(value['last_timestamp']),
        'Flow Duration': float(value['flow_duration']),
    }
    for key, value in flow_byte_counts.items()
]

# Create a Pandas DataFrame with one row per flow
df = pd.DataFrame(data_list)

In [117]:
# Display df; the recorded output below is truncated to its first and last rows.
df

Unnamed: 0,SourceIP,DestinationIP,Source Port,Destination Port,Bytes,First Timestamp,Last Timestamp,Flow Duration
0,192.168.10.9,224.0.0.252,63210.0,5355.0,56,1.499429e+09,1.499429e+09,0
1,192.168.10.9,192.168.10.3,1029.0,49671.0,190,1.499429e+09,1.499429e+09,0
2,192.168.10.9,192.168.10.3,1032.0,88.0,353,1.499429e+09,1.499429e+09,0
3,192.168.10.9,192.168.10.3,1034.0,88.0,40,1.499429e+09,1.499429e+09,0
4,192.168.10.25,224.0.0.251,5353.0,5353.0,362,1.499429e+09,1.499429e+09,91.246272
...,...,...,...,...,...,...,...,...
11486,192.168.10.9,192.168.10.3,62410.0,53.0,70,1.499436e+09,1.499436e+09,0
11487,192.168.10.9,104.16.27.216,5292.0,80.0,52,1.499436e+09,1.499436e+09,0
11488,192.168.10.9,192.168.10.3,61474.0,53.0,70,1.499436e+09,1.499436e+09,0
11489,192.168.10.9,65.39.202.99,5284.0,443.0,52,1.499436e+09,1.499436e+09,0


In [59]:
# Preview the first rows of df.
# NOTE(review): the recorded output below has per-packet columns (including
# 'Protocol') and an all-empty first row, which does not match the per-flow
# frame built above -- it appears to come from a different definition of df
# (this cell's execution count is lower than the cells above); confirm.
df.head()

Unnamed: 0,SourceIP,DestinationIP,Source Port,Destination Port,Protocol
0,,,,,
1,192.168.10.9,224.0.0.252,63210.0,5355.0,17.0
2,192.168.10.9,192.168.10.3,1029.0,49671.0,6.0
3,192.168.10.9,192.168.10.3,1032.0,88.0,6.0
4,192.168.10.9,192.168.10.3,1034.0,88.0,6.0


In [60]:
# Keep only the rows going from 192.168.10.9 to 192.168.10.3.
sent_by_host = df['SourceIP'] == '192.168.10.9'
sent_to_server = df['DestinationIP'] == '192.168.10.3'
df[sent_by_host & sent_to_server]

Unnamed: 0,SourceIP,DestinationIP,Source Port,Destination Port,Protocol
2,192.168.10.9,192.168.10.3,1029.0,49671.0,6.0
3,192.168.10.9,192.168.10.3,1032.0,88.0,6.0
4,192.168.10.9,192.168.10.3,1034.0,88.0,6.0
44,192.168.10.9,192.168.10.3,52921.0,53.0,17.0
47,192.168.10.9,192.168.10.3,1041.0,49666.0,6.0
...,...,...,...,...,...
49975,192.168.10.9,192.168.10.3,55434.0,53.0,17.0
49976,192.168.10.9,192.168.10.3,60946.0,53.0,17.0
49977,192.168.10.9,192.168.10.3,49750.0,53.0,17.0
49983,192.168.10.9,192.168.10.3,62410.0,53.0,17.0


In [6]:
# Count how often each value occurs in srcIP and tabulate the result.
# NOTE(review): srcIP is not defined in any visible cell -- confirm where it
# comes from. This cell also rebinds the name df.
src_ip_counts = Counter(srcIP)
src_ip_frame = pd.DataFrame.from_dict(src_ip_counts, orient='index')
# reset_index() moves the counted values out of the index into an 'index' column.
df = src_ip_frame.reset_index()

In [7]:
# Display df; the recorded output below has an 'index' column of addresses and a '0' column of counts.
df

Unnamed: 0,index,0
0,192.168.10.9,1090
1,192.168.10.25,1239
2,192.168.10.3,1262
3,192.168.10.14,941
4,192.168.10.5,1291
...,...,...
1728,72.5.205.36,1
1729,104.79.143.90,1
1730,52.4.232.179,3
1731,66.242.15.233,1
