In [1]:
import pandas as pd
import numpy as np
import ipaddress
import os
import sys

In [10]:
def create_features_from_csv(date, source_folder, destination_folder):
    """
    Turn raw packet data into one feature sequence per device and store each sequence as CSV.

    The packet file ``<source_folder>/<date>.csv`` is read once. For every MAC address in the
    hard-coded device list, the packets sent or received by that device are selected, ordered
    by time stamp, enriched with the features needed for LSTM training and written to
    ``<destination_folder>/<date>/<mac-with-dashes>.csv``.

    Derived columns:
        Outbound: +1 if the device is the MAC source of the packet, -1 otherwise.
        Time_since_last_packet: TIME difference to the previous packet (0 for the first packet).
        IP_source_internal, IP_destination_internal: +1 for private addresses, -1 otherwise.
        Port_class_source, Port_class_destination: the port itself if 0 < port < 1024, else -1.
    The columns TIME, MAC_source and MAC_destination are dropped before export.

    Arguments:
        date: Name of the CSV file with packet data (without file extension). The results are also stored in a folder with the date name.
        source_folder: path to folder with raw CSV data
        destination_folder: path to folder where the result files will be written to

    Returns:
        None. If the result folder for ``date`` already exists, nothing is processed.

    Raises:
        SystemExit: (via sys.exit) if a computed time interval is NaN, infinite or negative.

    Note:
        The result folder is created before any device is processed. If processing aborts
        half way, the partially filled folder makes later calls skip this date, so it has
        to be removed by hand before re-running.
    """

    # skip if destination folder already exists
    path = "{}/{}".format(destination_folder, date)
    if os.path.exists(path):
        print("Folder {} already exists. Skip processing.".format(path))
        return

    print("Start processing data for {}.".format(path))
    os.makedirs(path)

    macs = [
        "d0:52:a8:00:67:5e", "44:65:0d:56:cc:d3", "70:ee:50:18:34:43",
        "f4:f2:6d:93:51:f1", "00:16:6c:ab:6b:88", "30:8c:fb:2f:e4:b2",
        "00:62:6e:51:27:2e", "e8:ab:fa:19:de:4f", "00:24:e4:11:18:a8",
        "ec:1a:59:79:f4:89", "50:c7:bf:00:56:39", "74:c6:3b:29:d7:1d",
        "ec:1a:59:83:28:11", "18:b4:30:25:be:e4", "70:ee:50:03:b8:ac",
        "00:24:e4:1b:6f:96", "74:6a:89:00:2e:25", "00:24:e4:20:28:c6",
        "d0:73:d5:01:83:08", "18:b7:9e:02:20:44", "e0:76:d0:33:bb:85",
        "70:5a:0f:e4:9b:c0", "08:21:ef:3b:fc:e3", "30:8c:fb:b6:ea:45",
        "40:f3:08:ff:1e:da", "74:2f:68:81:69:42", "ac:bc:32:d4:6f:2f",
        "b4:ce:f6:a7:a3:c2", "d0:a6:37:df:a1:e1", "f4:5c:89:93:cc:85",
        "14:cc:20:51:33:ea",
    ]

    # import csv to pandas dataframe; the file is the same for every device,
    # so it is parsed once here instead of once per loop iteration
    file = "{}/{}.csv".format(source_folder, date)
    packets = pd.read_csv(file)

    for device in macs:
        # filter by one device: keep packets where the device is source or destination.
        # The MAC is matched as a literal substring; rows without a MAC never match.
        involves_device = (
            packets['MAC_source'].str.contains(device, regex=False, na=False)
            | packets['MAC_destination'].str.contains(device, regex=False, na=False)
        )

        # sort by time stamp
        # note: some packets in the PCAP data are out of order, thus this sorting step is important
        # (sort_values returns a new frame, so the columns added below never touch `packets`)
        df = packets[involves_device].sort_values('TIME')

        # add Outbound: +1: outbound, -1: inbound
        df['Outbound'] = np.where(df['MAC_source'] == device, 1, -1)

        # Time since last packet: prepending the first time stamp makes the first interval 0.
        # Assigning a numeric array keeps the column numeric even when the device has no packets.
        timestamps = df['TIME'].to_numpy()
        df['Time_since_last_packet'] = np.diff(timestamps, prepend=timestamps[:1])

        # compute field IP_source_internal and IP_destination_internal.
        # internal: +1
        # external: -1
        df['IP_source_internal'] = [
            1 if ipaddress.ip_address(ip).is_private else -1 for ip in df['IP_source']
        ]
        df['IP_destination_internal'] = [
            1 if ipaddress.ip_address(ip).is_private else -1 for ip in df['IP_destination']
        ]

        # compute column "Port_class_source" and "Port_class_destination":
        # ports in the range 1..1023 are kept, every other port is mapped to -1
        df['Port_class_source'] = [
            port if (0 < int(port) < 1024) else -1 for port in df['Port_source']
        ]
        df['Port_class_destination'] = [
            port if (0 < int(port) < 1024) else -1 for port in df['Port_destination']
        ]

        # keep only data for training
        df = df.drop(['TIME', 'MAC_source', 'MAC_destination'], axis=1)

        # verify there is no NaN, infinite or negative data in "Time_since_last_packet".
        # NaN is checked first: np.isfinite is also False for NaN, so a combined
        # "not finite" check would report NaN values as infinite.
        time_intervals = df['Time_since_last_packet'].to_numpy()
        if np.isnan(time_intervals).any():
            sys.exit('There are NaN time intervals in csv {}'.format(file))
        if np.isinf(time_intervals).any():
            sys.exit('There are infinite time intervals in csv {}'.format(file))
        if (time_intervals < 0).any():
            sys.exit('There are negative time intervals in csv {}'.format(file))

        # export data for training
        device_mac_for_file_name = device.replace(':', '-')
        df.to_csv("{}/{}/{}.csv".format(destination_folder, date, device_mac_for_file_name), index=False)



In [11]:
# Build the per-device CSV sequences for a small, hand-picked set of dates.

dates = ["16-10-28"]

for date in dates:
    create_features_from_csv(
        date,
        source_folder="../../csv_from_pcap",
        destination_folder="../../csv_sequences",
    )

Start processing data for ../../csv_sequences/16-10-28.


In [12]:
# Build the per-device CSV sequences for all capture dates.
# The dates are processed in exactly the order in which they are listed here.

dates = [
    "16-09-23", "16-10-04", "16-10-15", "16-10-26", "16-11-07", "16-11-18",
    "16-09-24", "16-10-05", "16-10-16", "16-10-27", "16-11-08", "16-11-19",
    "16-09-25", "16-10-06", "16-10-17", "16-10-28", "16-11-09", "16-11-20",
    "16-09-26", "16-10-07", "16-10-18", "16-10-29", "16-11-10", "16-11-21",
    "16-09-27", "16-10-08", "16-10-19", "16-10-30", "16-11-11", "16-11-22",
    "16-09-28", "16-10-09", "16-10-20", "16-10-31", "16-11-12",
    "16-09-29", "16-10-10", "16-10-21", "16-11-01", "16-11-13",
    "16-09-30", "16-10-11", "16-10-22", "16-11-02", "16-11-14",
    "16-10-01", "16-10-12", "16-10-23", "16-11-04", "16-11-15",
    "16-10-02", "16-10-13", "16-10-24", "16-11-05", "16-11-16",
    "16-10-03", "16-10-14", "16-10-25", "16-11-06", "16-11-17",
]

for date in dates:
    create_features_from_csv(
        date,
        source_folder="../../csv_from_pcap",
        destination_folder="../../csv_sequences",
    )

Start processing data for ../../csv_sequences/16-09-23.
Start processing data for ../../csv_sequences/16-10-04.
Start processing data for ../../csv_sequences/16-10-15.
Start processing data for ../../csv_sequences/16-10-26.
Start processing data for ../../csv_sequences/16-11-07.
Start processing data for ../../csv_sequences/16-11-18.
Start processing data for ../../csv_sequences/16-09-24.
Start processing data for ../../csv_sequences/16-10-05.
Start processing data for ../../csv_sequences/16-10-16.
Start processing data for ../../csv_sequences/16-10-27.
Start processing data for ../../csv_sequences/16-11-08.
Start processing data for ../../csv_sequences/16-11-19.
Start processing data for ../../csv_sequences/16-09-25.
Start processing data for ../../csv_sequences/16-10-06.
Start processing data for ../../csv_sequences/16-10-17.
Start processing data for ../../csv_sequences/16-10-28.
Start processing data for ../../csv_sequences/16-11-09.
Start processing data for ../../csv_sequences/16