In [None]:
from pcap_splitter.splitter import PcapSplitter
import os
import pandas as pd
import sklearn
import zat
import shutil

from zat.log_to_dataframe import LogToDataFrame
import numpy as np
from scapy.all import rdpcap

def list_files_in_directory(directory):
    file_list = []
    for filename in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, filename)):
            file_list.append(filename)
    return file_list



def extract_5_tuple(pcap_file):
    tuples = set()  # Using a set to ensure unique tuples
    packets = rdpcap(pcap_file)
    
    print(f"found {len(packets)} packets in file")

    for idx, packet in enumerate(packets):
        if idx>0 and idx%50==0:
            print(".")
        if packet.haslayer('IP'):
            ip_layer = packet['IP']
            transport_layer = packet[ip_layer.payload.name]

            # Extracting the 5-tuple information
            src_ip = ip_layer.src
            dst_ip = ip_layer.dst
            src_port = transport_layer.sport
            dst_port = transport_layer.dport
            proto = ip_layer.proto

            # Forming the 5-tuple
            tuple_info = (src_ip, src_port, dst_ip, dst_port, proto)
            tuples.add(tuple_info)

    return tuples


def extract_fist_useful_tuple(pcap_file, useful_ip):
    packets = rdpcap(pcap_file)
    
    # print(f"found {len(packets)} packets in file")

    for idx, packet in enumerate(packets):
        if idx>0 and idx%50==0:
            print(".")
        if packet.haslayer('IP'):
            ip_layer = packet['IP']
            transport_layer = packet[ip_layer.payload.name]

            # Extracting the 5-tuple information
            src_ip = ip_layer.src
            dst_ip = ip_layer.dst
            if not hasattr(transport_layer, 'sport'):
                continue
            src_port = transport_layer.sport
            dst_port = transport_layer.dport
            proto = ip_layer.proto

            # Forming the 5-tuple
            tuple_info = (src_ip, src_port, dst_ip, dst_port, proto)
            if src_ip == useful_ip:
                return tuple_info

    return None


def move_files(file_list, source_folder, destination_folder):
    for file_name in file_list:
        source_file = os.path.join(source_folder, file_name)
        destination_file = os.path.join(destination_folder, file_name)
        
        try:
            shutil.move(source_file, destination_file)
        except FileNotFoundError:
            print(f"Error: {file_name} not found in {source_folder}")
        except Exception as e:
            print(f"Error while moving {file_name}: {e}")

In [None]:
# Good to print out versions of stuff
print('zat: {:s}'.format(zat.__version__))
print('Pandas: {:s}'.format(pd.__version__))
print('Numpy: {:s}'.format(np.__version__))
print('Scikit Learn Version:', sklearn.__version__)

# Splitting by flows:

In [3]:

prefix_malware_1 = "../../../Downloads/iot_23_datasets_full/CTU-IoT-Malware-Capture-1-1/"
ps = PcapSplitter(prefix_malware_1 + "2018-05-09-192.168.100.103.pcap")
MALICIOUS_IP = "192.168.100.103"
!mkdir {prefix_malware_1}/splitted
print(ps.split_by_session(prefix_malware_1 + "splitted", pkts_bpf_filter=f"src host {MALICIOUS_IP}"))

"""
prefix_bening_1 = "../../../Downloads/iot_23_datasets_full/CTU-Honeypot-Capture-4-1/"
ps = PcapSplitter(prefix_bening_1 + "2018-10-25-14-06-32-192.168.1.132.pcap")
VICTIM_IP = "192.168.1.132"
!mkdir {prefix_bening_1}/splitted
print(ps.split_by_session(prefix_bening_1 + "splitted", pkts_bpf_filter=f"src host {VICTIM_IP}"))
"""

"""
prefix_bening_2 = "../../../Downloads/iot_23_datasets_full/CTU-Honeypot-Capture-5-1/"
ps = PcapSplitter(prefix_bening_2 + "2018-09-21-capture.pcap")
VICTIM_IP = "192.168.2.3"
!mkdir -p {prefix_bening_2}/splitted
print(ps.split_by_session(prefix_bening_2 + "splitted", pkts_bpf_filter=f"src host {VICTIM_IP}"))
"""

"""
prefix_bening_3 = "../../../Downloads/iot_23_datasets_full/CTU-Honeypot-Capture-7-1/Somfy-03/"
ps = PcapSplitter(prefix_bening_3 + "2019-07-04-16-41-10-192.168.1.158.pcap")
VICTIM_IP = "192.168.1.158"
!mkdir -p {prefix_bening_3}/splitted
print(ps.split_by_session(prefix_bening_3 + "splitted", pkts_bpf_filter=f"src host {VICTIM_IP}"))
"""


"""
prefix_malware_2 = "../../../Downloads/iot_23_datasets_full/CTU-IoT-Malware-Capture-3-1/"
ps = PcapSplitter(prefix_malware_2 + "2018-05-21_capture.pcap")
MALICIOUS_IP = "192.168.2.5"
!mkdir {prefix_malware_2}/splitted
print(ps.split_by_session(prefix_malware_2 + "splitted", pkts_bpf_filter=f"src host {MALICIOUS_IP}"))
"""

"""
# To big to process...
prefix_malware_3 = "../../../Downloads/iot_23_datasets_full/CTU-IoT-Malware-Capture-7-1/"
ps = PcapSplitter(prefix_malware_3 + "2018-07-20-17-31-20-192.168.100.108.pcap")
MALICIOUS_IP = "192.168.100.108"
!mkdir  {prefix_malware_3}/splitted
print(ps.split_by_session(prefix_malware_3 + "splitted", pkts_bpf_filter=f"src host {MALICIOUS_IP}"))
"""

"""
prefix_malware_4 = "../../../Downloads/iot_23_datasets_full/CTU-IoT-Malware-Capture-8-1/"
ps = PcapSplitter(prefix_malware_4 + "2018-07-31-15-15-09-192.168.100.113.pcap")
MALICIOUS_IP = "192.168.100.113"
!mkdir  {prefix_malware_4}/splitted
print(ps.split_by_session(prefix_malware_4 + "splitted", pkts_bpf_filter=f"src host {MALICIOUS_IP}"))
"""

"""
# NOT USED
prefix_malware_5 = "../../../Downloads/iot_23_datasets_full/CTU-IoT-Malware-Capture-9-1/"
ps = PcapSplitter(prefix_malware_5 + "2018-07-25-10-53-16-192.168.100.111.pcap")
MALICIOUS_IP = "192.168.100.111"
!mkdir  {prefix_malware_5}/splitted
print(ps.split_by_session(prefix_malware_5 + "splitted", pkts_bpf_filter=f"src host {MALICIOUS_IP}"))
"""

"""
prefix_malware_6 = "../../../Downloads/iot_23_datasets_full/CTU-IoT-Malware-Capture-17-1/"
ps = PcapSplitter(prefix_malware_6 + "2018-09-06-11-43-12-192.168.100.111.only15000000.pcap")
MALICIOUS_IP = "192.168.100.111"
!mkdir  {prefix_malware_6}/splitted
print(ps.split_by_session(prefix_malware_6 + "splitted", pkts_bpf_filter=f"src host {MALICIOUS_IP}"))
"""

mkdir: cannot create directory ‘../../../Downloads/iot_23_datasets_full/CTU-IoT-Malware-Capture-1-1//splitted’: File exists
Started...
[ERROR: src/DnsLayer.cpp: parseResources:156         ] DNS layer contains more than 300 resources, probably a bad packet. Skipping parsing DNS resources
[ERROR: src/DnsLayer.cpp: parseResources:156         ] DNS layer contains more than 300 resources, probably a bad packet. Skipping parsing DNS resources
[ERROR: src/DnsLayer.cpp: parseResources:156         ] DNS layer contains more than 300 resources, probably a bad packet. Skipping parsing DNS resources
[ERROR: src/DnsLayer.cpp: parseResources:156         ] DNS layer contains more than 300 resources, probably a bad packet. Skipping parsing DNS resources
[ERROR: src/DnsLayer.cpp: parseResources:156         ] DNS layer contains more than 300 resources, probably a bad packet. Skipping parsing DNS resources
[ERROR: src/DnsLayer.cpp: parseResources:156         ] DNS layer contains more than 300 resources, p

'\nprefix_malware_6 = "../../../Downloads/iot_23_datasets_full/CTU-IoT-Malware-Capture-17-1/"\nps = PcapSplitter(prefix_malware_6 + "2018-09-06-11-43-12-192.168.100.111.only15000000.pcap")\nMALICIOUS_IP = "192.168.100.111"\n!mkdir  {prefix_malware_6}/splitted\nprint(ps.split_by_session(prefix_malware_6 + "splitted", pkts_bpf_filter=f"src host {MALICIOUS_IP}"))\n'

In [5]:
prefix = prefix_malware_1
useful_ip=MALICIOUS_IP

In [None]:
files = list_files_in_directory(prefix + "splitted")

In [None]:
len(files)

# Zeek Analysis:

In [6]:
# Create a Pandas dataframe from a Zeek log
log_to_df = LogToDataFrame()
bro_df = log_to_df.create_dataframe(prefix + 'bro/conn.log.labeled', usecols = ['id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'orig_pkts', 'resp_pkts', 'tunnel_parents   label   detailed-label'])

# Print out the head of the dataframe
bro_df.head()

Unnamed: 0_level_0,id.orig_h,id.orig_p,id.resp_h,id.resp_p,orig_pkts,resp_pkts,tunnel_parents label detailed-label
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-05-09 15:30:31.015810966,192.168.100.103,51524,65.127.233.163,23,3,0,(empty) Malicious PartOfAHorizontalPortScan
2018-05-09 15:30:31.025054932,192.168.100.103,56305,63.150.16.171,23,1,0,(empty) Malicious PartOfAHorizontalPortScan
2018-05-09 15:30:31.045044899,192.168.100.103,41101,111.40.23.49,23,1,0,(empty) Malicious PartOfAHorizontalPortScan
2018-05-09 15:30:32.016239882,192.168.100.103,60905,131.174.215.147,23,3,0,(empty) Malicious PartOfAHorizontalPortScan
2018-05-09 15:30:32.024985075,192.168.100.103,44301,91.42.47.63,23,1,0,(empty) Malicious PartOfAHorizontalPortScan


In [7]:
bro_df['tunnel_parents   label   detailed-label'].unique()

['(empty)   Malicious   PartOfAHorizontalPortScan', '(empty)   Benign   -', '(empty)   Malicious   C&C']
Categories (3, object): ['(empty)   Benign   -', '(empty)   Malicious   PartOfAHorizontalPortScan', '(empty)   Malicious   C&C']

In [8]:
bro_df.groupby('tunnel_parents   label   detailed-label').count()

  bro_df.groupby('tunnel_parents   label   detailed-label').count()


Unnamed: 0_level_0,id.orig_h,id.orig_p,id.resp_h,id.resp_p,orig_pkts,resp_pkts
tunnel_parents label detailed-label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
(empty) Benign -,469275,469275,469275,469275,469275,469275
(empty) Malicious PartOfAHorizontalPortScan,539465,539465,539465,539465,539465,539465
(empty) Malicious C&C,8,8,8,8,8,8


In [40]:
filtered_df = bro_df.copy()[bro_df['tunnel_parents   label   detailed-label']=='(empty)   Malicious   PartOfAHorizontalPortScan']
filtered_df = filtered_df.copy()[filtered_df['id.orig_h']==MALICIOUS_IP]

In [41]:
filtered_df.groupby('id.resp_h').count().sort_values('id.orig_p')

  filtered_df.groupby('id.resp_h').count().sort_values('id.orig_p')


Unnamed: 0_level_0,id.orig_h,id.orig_p,id.resp_p,orig_pkts,resp_pkts,tunnel_parents label detailed-label
id.resp_h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.100.28.163,0,0,0,0,0,0
18.16.88.154,0,0,0,0,0,0
18.158.108.71,0,0,0,0,0,0
18.157.18.228,0,0,0,0,0,0
18.155.105.88,0,0,0,0,0,0
...,...,...,...,...,...,...
118.163.192.88,124,124,124,124,124,124
175.196.5.46,125,125,125,125,125,125
92.255.209.3,125,125,125,125,125,125
70.45.29.240,128,128,128,128,128,128


# Flow Pcap Analysis:

The following code cell would take many ours to complete. However, it can be interrupted at any time. If the subsequent cells are the runned, the work done so far will be saved in disk by file moving... 

If we need more data, we can get it by running this cells when we want...

In [None]:
doorlock_start = []
echo = [] # Bening Victim Amazon Echo 
generic_cc = []
cc_heartbeat = []
okiru = []
generic_ddos = []
muhstik_botnet_file_names = []
horizontal_scan_file_names = []
bening_flow_file_names = []

for idx, current_file_name in enumerate(files):
    if idx%500==0:
        print(f'{idx/len(files)}% done')
    if idx>10000:
        break
    pcap_file = prefix + "splitted/" + current_file_name
    useful_tuple = extract_fist_useful_tuple(
        pcap_file=pcap_file,
        useful_ip=useful_ip)
    
    if useful_tuple is not None:
        
        match = filered_df[(filered_df['id.orig_h']==useful_tuple[0]) &\
            (filered_df['id.orig_p']==useful_tuple[1]) &\
                (filered_df['id.resp_h']==useful_tuple[2]) &\
                    (filered_df['id.resp_p']==useful_tuple[3])] 
        
        if len(match)>=1:
            if len(match['tunnel_parents   label   detailed-label'].unique()) > 1:
                print('discarding ambiguous 5-tuple')

            label = match['tunnel_parents   label   detailed-label'].iloc[0]

            if '-   benign   -' in label:
                doorlock_start.append(current_file_name)
            elif 'Malicious   Okiru' in label:
                okiru.append(current_file_name)
            elif 'Malicious   DDoS' in label:
                generic_ddos.append(current_file_name)
            elif 'C&C-HeartBeat' in label:
                cc_heartbeat.append(current_file_name)
            elif 'Malicious   Attack' in label:
                muhstik_botnet_file_names.append(current_file_name)
            elif 'Horizontal' in label:
                horizontal_scan_file_names.append(current_file_name)
            elif 'Malicious   C&C' in label:
                generic_cc.append(current_file_name)
            else:
                bening_flow_file_names.append(current_file_name)

In [None]:
print(len(doorlock_start))
print(len(okiru))
print(len(generic_ddos))
print(len(cc_heartbeat))

print(len(horizontal_scan_file_names))
print(len(bening_flow_file_names))
print(len(generic_cc))
print(len(muhstik_botnet_file_names))

In [None]:
"""
!mkdir -p {prefix}/okiru
!mkdir -p {prefix}/generic_ddos
!mkdir -p {prefix}/cc_heartbeat

!mkdir -p {prefix}/h_scan
!mkdir -p {prefix}/bening_traffic
!mkdir -p {prefix}/generic_cc
!mkdir -p {prefix}/muhstik
"""

!mkdir -p {prefix}/doorlock_start


In [None]:
move_files(doorlock_start, prefix + "splitted", prefix + "doorlock_start")

move_files(okiru, prefix + "splitted", prefix + "okiru")
move_files(generic_ddos, prefix + "splitted", prefix + "generic_ddos")
move_files(cc_heartbeat, prefix + "splitted", prefix + "cc_heartbeat")

move_files(horizontal_scan_file_names, prefix + "splitted", prefix + "h_scan")
move_files(bening_flow_file_names, prefix + "splitted", prefix + "bening_traffic")
move_files(generic_cc, prefix + "splitted", prefix + "generic_cc")
move_files(muhstik_botnet_file_names, prefix + "splitted", prefix + "muhstik")

files = list_files_in_directory(prefix + "splitted")

In [None]:
doorlock_start_file_names = list_files_in_directory(prefix + "doorlock_start")
"""
horizontal_scan_file_names = list_files_in_directory(prefix + "horizontal_scan_flows")
bening_flow_file_names = list_files_in_directory(prefix + "bening_flows")
command_and_conquer_file_names = list_files_in_directory(prefix + "cc_flows")
muhstik_botnet_file_names = list_files_in_directory(prefix + "muhstik_botnet_flows")

print(len(horizontal_scan_file_names))
print(len(bening_flow_file_names))
print(len(command_and_conquer_file_names))
print(len(muhstik_botnet_file_names))
"""
print(len(doorlock_start_file_names))

# Where did we get the attacks?

## malicious:

- bening_traffic (for attacker) and h_scan came from capture1-1
- muhstik from capture 3-1
- okiru, cc_heartbeat and generic_ddos from capture 7-1

## bening:

- hue from honeypot-4-1
- echo from honeypot-5-1
- doorlock from honeypot-7-1 (Somfy-01, Somfy-02, and Somfy-03)


# File merging:

In [42]:
def merge_pcap_files(directory_path, output_file):
    packets = []
    
    # Iterate through all files in the directory
    for file_name in os.listdir(directory_path):
        file_path = os.path.join(directory_path, file_name)
        
        # Load each pcap file and append its packets to the list
        if file_path.endswith('.pcap'):
            print("Processing file:", file_name)
            packets.extend(rdpcap(file_path))
    
    # Sort packets by their timestamps
    packets.sort(key=lambda x: x.time)
    
    # Write the merged packets to a new pcap file
    wrpcap(output_file, packets)
    print("Merged packets written to:", output_file)