In [None]:
from pcap_splitter.splitter import PcapSplitter
import os
import pandas as pd
import sklearn
import zat
import shutil

from zat.log_to_dataframe import LogToDataFrame
from zat.dataframe_to_matrix import DataFrameToMatrix
import numpy as np
from scapy.all import rdpcap
from tqdm.notebook import tqdm

def list_files_in_directory(directory):
    file_list = []
    for filename in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, filename)):
            file_list.append(filename)
    return file_list



def extract_5_tuple(pcap_file):
    tuples = set()  # Using a set to ensure unique tuples
    packets = rdpcap(pcap_file)
    
    print(f"found {len(packets)} packets in file")

    for idx, packet in enumerate(packets):
        if idx>0 and idx%50==0:
            print(".")
        if packet.haslayer('IP'):
            ip_layer = packet['IP']
            transport_layer = packet[ip_layer.payload.name]

            # Extracting the 5-tuple information
            src_ip = ip_layer.src
            dst_ip = ip_layer.dst
            src_port = transport_layer.sport
            dst_port = transport_layer.dport
            proto = ip_layer.proto

            # Forming the 5-tuple
            tuple_info = (src_ip, src_port, dst_ip, dst_port, proto)
            tuples.add(tuple_info)

    return tuples


def extract_fist_useful_tuple(pcap_file, useful_ip):
    packets = rdpcap(pcap_file)
    
    # print(f"found {len(packets)} packets in file")

    for idx, packet in enumerate(packets):
        if idx>0 and idx%50==0:
            print(".")
        if packet.haslayer('IP'):
            ip_layer = packet['IP']
            transport_layer = packet[ip_layer.payload.name]

            # Extracting the 5-tuple information
            src_ip = ip_layer.src
            dst_ip = ip_layer.dst
            src_port = transport_layer.sport
            dst_port = transport_layer.dport
            proto = ip_layer.proto

            # Forming the 5-tuple
            tuple_info = (src_ip, src_port, dst_ip, dst_port, proto)
            if src_ip == useful_ip:
                return tuple_info

    return None


def move_files(file_list, source_folder, destination_folder):
    for file_name in file_list:
        source_file = os.path.join(source_folder, file_name)
        destination_file = os.path.join(destination_folder, file_name)
        
        try:
            shutil.move(source_file, destination_file)
        except FileNotFoundError:
            print(f"Error: {file_name} not found in {source_folder}")
        except Exception as e:
            print(f"Error while moving {file_name}: {e}")

In [None]:
# Good to print out versions of stuff
print('zat: {:s}'.format(zat.__version__))
print('Pandas: {:s}'.format(pd.__version__))
print('Numpy: {:s}'.format(np.__version__))
print('Scikit Learn Version:', sklearn.__version__)

# Splitting by flows:

In [None]:
prefix = "../../../Downloads/iot_23_datasets_full/CTU-IoT-Malware-Capture-1-1/"
ps = PcapSplitter(prefix + "2018-05-09-192.168.100.103.pcap")
MALICIOUS_IP = "192.168.100.103"

In [None]:
print(ps.split_by_session(prefix + "splitted", pkts_bpf_filter=f"src host {MALICIOUS_IP}"))

In [None]:
files = list_files_in_directory(prefix + "splitted")

In [None]:
len(files)

# Zeek Analysis:

In [None]:
# Create a Pandas dataframe from a Zeek log
log_to_df = LogToDataFrame()
bro_df = log_to_df.create_dataframe(prefix + 'bro/conn.log.labeled')

# Print out the head of the dataframe
bro_df.head()

In [None]:
bro_df['tunnel_parents   label   detailed-label'.unique()]

In [None]:
bro_df.groupby('tunnel_parents   label   detailed-label').count()

# Flow Pcap Analysis:

The following code cell would take many ours to complete. However, it can be interrupted at any time. If the subsequent cells are the runned, the work done so far will be saved in disk by file moving... 

If we need more data, we can get it by running this cells when we want...

In [None]:
horizontal_scan_file_names = []
command_and_conquer_file_names = []
bening_flow_file_names = []

for idx, current_file_name in enumerate(files):
    if idx%500==0:
        print(f'{idx/len(files)}% done')
        
    pcap_file = prefix + "splitted/" + current_file_name
    useful_tuple = extract_fist_useful_tuple(
        pcap_file=pcap_file,
        useful_ip="192.168.100.103")
    if useful_tuple is not None:
        match = bro_df[(bro_df['id.orig_h']==useful_tuple[0]) &\
            (bro_df['id.orig_p']==useful_tuple[1]) &\
                (bro_df['id.resp_h']==useful_tuple[2]) &\
                    (bro_df['id.resp_p']==useful_tuple[3])] 
        if len(match)>1:
            if len(match['tunnel_parents   label   detailed-label'].unique()) > 1:
                print('discarding ambiguous 5-tuple')

            label = match['tunnel_parents   label   detailed-label'].iloc[0]

            if 'Horizontal' in label:
                horizontal_scan_file_names.append(current_file_name)
            elif 'C&C' in label:
                command_and_conquer_file_names.append(current_file_name)
            else:
                bening_flow_file_names.append(current_file_name)

In [110]:
print(len(horizontal_scan_file_names))
print(len(bening_flow_file_names))
print(len(command_and_conquer_file_names))

4122
329
0


In [111]:
move_files(horizontal_scan_file_names, prefix + "splitted", prefix + "horizontal_scan_flows")
move_files(bening_flow_file_names, prefix + "splitted", prefix + "bening_flows")
move_files(command_and_conquer_file_names, prefix + "splitted", prefix + "cc_flows")
files = list_files_in_directory(prefix + "splitted")

In [112]:
horizontal_scan_file_names = list_files_in_directory(prefix + "horizontal_scan_flows")
bening_flow_file_names = list_files_in_directory(prefix + "bening_flows")
command_and_conquer_file_names = list_files_in_directory(prefix + "cc_flows")
print(len(horizontal_scan_file_names))
print(len(bening_flow_file_names))
print(len(command_and_conquer_file_names))

7978
671
0
