In [1]:
# google colab paths

# Mount Google Drive so that the project folders below become reachable
# under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

# Root folder of the packet CSVs; files are addressed as <root>/<day>/<device>.csv.
CSV_SEQUENCES = "/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences"
# Folder in which the generated *.npz window files are stored.
NPZ_WINDOWS = "/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/npz_windows"
# Folder holding the auxiliary list_of_days.csv and list_of_devices.csv files.
LIST_OF_DEVICES = "/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/list_of_devices"

Mounted at /content/gdrive


In [None]:
# # local paths

# CSV_SEQUENCES = "C:/work_c/2022-09-20_unsw_dataset_iot_2018/csv_sequences"
# NPZ_WINDOWS = "C:/work_c/2022-09-20_unsw_dataset_iot_2018/npz_windows"
# LIST_OF_DEVICES = "C:/work_c/2022-09-20_unsw_dataset_iot_2018/list_of_devices"

In [2]:
# Number of consecutive packets (CSV rows) that form one window.
WINDOW_SIZE = 200
# Oversampling factor for the random window extraction: for a sequence of
# NUM_SAMPLES rows, NUM_WINDOWS = WINDOW_COVERAGE * NUM_SAMPLES / WINDOW_SIZE
# windows (rounded down) are drawn, i.e. the windows together are
# WINDOW_COVERAGE times as long as the sequence itself.
WINDOW_COVERAGE = 5

import pandas as pd
import numpy as np
import random

def csv_sequence_to_numpy_windows(csv_file, label, num_of_labels):
    """
    Cut randomly positioned, fixed-length windows out of one packet CSV.

    The CSV is read with pandas, the identifying columns (packet id, IP
    addresses and ports) are dropped and every remaining column is used as one
    feature. int(WINDOW_COVERAGE * rows / WINDOW_SIZE) windows of WINDOW_SIZE
    consecutive rows are then taken at random start offsets, so windows can
    overlap each other.

    Arguments:
        - csv_file: the file path to the packet data
        - label: a number representing the device (index of the one-hot entry)
        - num_of_labels: the total number of labels
    Return:
        A tuple (windows, labels_ohe) of numpy arrays:
        - windows: float32, shape (num_of_windows, WINDOW_SIZE, num_of_features)
        - labels_ohe: bool, shape (num_of_windows, 1, num_of_labels)
        Both contain zero windows when the CSV has fewer than WINDOW_SIZE rows.
    """
    print("Generating windows for {}".format(csv_file))
    # import as pandas dataframe
    df = pd.read_csv(csv_file)
    # remove unnecessary cols
    df = df.drop(['Packet_ID', 'IP_source', 'IP_destination', 'Port_source', 'Port_destination'], axis=1)
    num_of_samples, num_of_features = df.shape
    # special case: not enough data for one full window
    if num_of_samples < WINDOW_SIZE:
        num_of_windows = 0
    else:
        num_of_windows = int(WINDOW_COVERAGE*num_of_samples/WINDOW_SIZE)
    # create windows at random locations
    windows = np.empty((num_of_windows, WINDOW_SIZE, num_of_features), dtype=np.float32)
    if num_of_windows > 0:
        # Convert the frame to float32 once instead of converting a DataFrame
        # slice for every single window.
        data = df.to_numpy(dtype=np.float32)
        # random.randint() includes both bounds, so the last window may end
        # exactly on the last row.
        last_start = num_of_samples - WINDOW_SIZE
        for i in range(num_of_windows):
            start = random.randint(0, last_start)
            windows[i] = data[start:start + WINDOW_SIZE]
    # create labels (one-hot): every window of this file gets the same label
    labels_ohe = np.zeros((num_of_windows, 1, num_of_labels), dtype=np.bool_)
    labels_ohe[:, 0, label] = True
    return (windows, labels_ohe)

In [3]:
# test csv_sequence_to_numpy_windows()
# (enable exactly one of the calls below to pick the case under test)

# normal case
windows, labels = csv_sequence_to_numpy_windows(
    "{}/16-09-23/00-16-6c-ab-6b-88.csv".format(CSV_SEQUENCES),
    label=0,
    num_of_labels=2,
)
# empty day
#windows, labels = csv_sequence_to_numpy_windows("{}/16-09-23/00-24-e4-20-28-c6.csv".format(CSV_SEQUENCES), 0, 2)
# special day
#windows, labels = csv_sequence_to_numpy_windows("{}/16-09-23/18-b4-30-25-be-e4.csv".format(CSV_SEQUENCES), 0, 2)

# shape and data type of both returned arrays, one line each
print(
    "Shape of windows: {}".format(windows.shape),
    "Data type for windows: {}".format(windows.dtype),
    "Shape of labels: {}".format(labels.shape),
    "Data type for labels: {}".format(labels.dtype),
    sep="\n",
)
# a peek into the data, only possible if at least one window was created
if windows.shape[0] > 0:
    print("First window, part:", windows[0, :10], sep="\n")
    print("First label:", labels[0], sep="\n")

Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-09-23/00-16-6c-ab-6b-88.csv
Shape of windows: (1429, 200, 8)
Data type for windows: float32
Shape of labels: (1429, 1, 2)
Data type for labels: bool
First window, part:
[[ 1.4700000e+02  1.7000000e+01 -1.0000000e+00  2.7990341e-04
   1.0000000e+00  1.0000000e+00  5.3000000e+01 -1.0000000e+00]
 [ 1.4300000e+02  1.7000000e+01 -1.0000000e+00  3.9410591e-04
   1.0000000e+00  1.0000000e+00  5.3000000e+01 -1.0000000e+00]
 [ 1.8800000e+02  6.0000000e+00  1.0000000e+00  7.2016954e-02
   1.0000000e+00 -1.0000000e+00 -1.0000000e+00  5.2220000e+03]
 [ 7.4000000e+01  6.0000000e+00  1.0000000e+00  3.9536953e-02
   1.0000000e+00 -1.0000000e+00 -1.0000000e+00  4.6500000e+02]
 [ 2.0400000e+02  6.0000000e+00 -1.0000000e+00  1.2252593e-01
  -1.0000000e+00  1.0000000e+00  5.2220000e+03 -1.0000000e+00]
 [ 6.6000000e+01  6.0000000e+00  1.0000000e+00  1.7001629e-03
   1.0000000e+00 -1.0000000e+00 -1

In [4]:
import itertools

def get_csv_file_list():
  """
  Build the paths of all CSV sequence files together with the matching labels.

  Days and devices are read from the auxiliary files list_of_days.csv and
  list_of_devices.csv in LIST_OF_DEVICES. One path is produced for every
  (device, day) combination, ordered device by device.

  Returns:
    - A list of absolute file paths to CSV files
    - A numpy array of labels, one per file path
  """

  # days: column 'Day' of the auxiliary days CSV
  day_table = pd.read_csv("{}/list_of_days.csv".format(LIST_OF_DEVICES))
  days = list(day_table['Day'])
  print("Total number of days: {}".format(len(days)))

  # devices: auxiliary devices CSV without the rows indexed 29 and 30
  # (rare devices and the gateway device, according to the original note)
  device_table = pd.read_csv("{}/list_of_devices.csv".format(LIST_OF_DEVICES))
  device_table = device_table.drop([29, 30])
  macs = list(device_table['MAC ADDRESS'])
  print("Total number of devices {}".format(len(macs)))
  # file names use dashes: "d0:52:a8:00:67:5e" -> "d0-52-a8-00-67-5e"
  macs = [mac.replace(":", "-") for mac in macs]

  # Every device has one file per day. itertools.product() iterates the days
  # fastest, hence each device label is repeated len(days) times in a row.
  labels = np.repeat(list(device_table['Label']), len(days))
  # turn the (device, day) pairs into file paths
  csv_files = [
      "{}/{}/{}.csv".format(CSV_SEQUENCES, day, mac)
      for mac, day in itertools.product(macs, days)
  ]
  return (csv_files, labels)


In [5]:
# run get_csv_file_list()
csv_files, labels = get_csv_file_list()

# both results must have the same length: one label per CSV file
print(
    "Number of csv files: {}".format(len(csv_files)),
    "Number of labels: {}".format(len(labels)),
    sep="\n",
)
# preview of the first entries only, the full lists are too long to print
print("First 30 csv files: ", csv_files[:30], sep="\n")
print("First 30 labels:", labels[:30], sep="\n")


Total number of days: 60
Total number of devices 29
Number of csv files: 1740
Number of labels: 1740
First 30 csv files: 
['/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-09-23/d0-52-a8-00-67-5e.csv', '/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-09-30/d0-52-a8-00-67-5e.csv', '/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-10-07/d0-52-a8-00-67-5e.csv', '/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-10-14/d0-52-a8-00-67-5e.csv', '/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-10-21/d0-52-a8-00-67-5e.csv', '/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-10-28/d0-52-a8-00-67-5e.csv', '/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-11-05/d0-52-a8-00-67-5e.csv', '/content/gdrive/MyDrive/Colab Notebooks/iot_device

In [6]:
from itertools import permutations
from numpy import savez_compressed

def csv_sequences_to_npz_windows(csvs, labels, num_classes, npz_file):
    """
    Create windows from CSV packet data, shuffle the windows, and store windows plus labels in *.NPZ file.
    In the NPZ file dictionary, the windows will be stored under 'x', and the labels under 'y'.

    Arguments:
        - csvs: a list of CSV file paths
        - labels: a list of device classes as number, one entry per CSV file
        - num_classes: total number of classes (required for one-hot encoding)
        - npz_file: the file path where the result should be stored, without *.npz file extension
    Returns:
        - None
    Raises:
        - ValueError: if no CSV/label pair is given, because there is nothing to store
    """
    # for each pair of csv/label, create windows using csv_sequence_to_numpy_windows()
    # The per-file results are collected and joined once at the end; joining
    # inside the loop would copy all previously collected windows for every file.
    window_chunks = []
    label_chunks = []
    for csv_file, label in zip(csvs, labels):
        file_windows, file_labels = csv_sequence_to_numpy_windows(csv_file, label, num_classes)
        window_chunks.append(file_windows)
        label_chunks.append(file_labels)
    if not window_chunks:
        raise ValueError("No CSV/label pairs given, nothing to store in {}".format(npz_file))
    windows_all = np.concatenate(window_chunks)
    labels_all = np.concatenate(label_chunks)
    # shuffle the data in "synced" way: the same permutation is applied to
    # windows and labels so that every window keeps its label
    permutation = np.random.permutation(len(windows_all))
    windows_all = windows_all[permutation]
    labels_all = labels_all[permutation]
    # store everything in one NPZ file
    savez_compressed(npz_file, x = windows_all, y = labels_all)

In [7]:
# test csv_sequences_to_npz_windows

# Two devices on two days. The paths are derived from CSV_SEQUENCES so that
# the cell follows whichever path configuration is active.
csv_files = [
    "{}/16-09-23/d0-52-a8-00-67-5e.csv".format(CSV_SEQUENCES),
    "{}/16-09-23/00-16-6c-ab-6b-88.csv".format(CSV_SEQUENCES),
    "{}/16-09-24/d0-52-a8-00-67-5e.csv".format(CSV_SEQUENCES),
    "{}/16-09-24/00-16-6c-ab-6b-88.csv".format(CSV_SEQUENCES),
]

# One label per file, in the same order as csv_files. A label identifies the
# DEVICE, not the day: d0-52-a8-00-67-5e -> 0, 00-16-6c-ab-6b-88 -> 1.
labels = [
    0,1,0,1
]

num_classes = 2

npz_file = "{}/new_2_days_2_devices".format(NPZ_WINDOWS)

csv_sequences_to_npz_windows(csv_files, labels, num_classes, npz_file)

# read the file back; savez_compressed() has appended the .npz extension
from numpy import load
npz_file = "{}.npz".format(npz_file)
# the with-block closes the underlying file handle once both arrays are read
with load(npz_file) as dict_data:
    x = dict_data['x']
    y = dict_data['y']
# every window needs exactly one one-hot label of width num_classes
assert x.shape[0] == y.shape[0]
assert y.shape[1:] == (1, num_classes)
print("shape of windws: {}".format(x.shape))
print("shape of labels: {}".format(y.shape))
print("First window, part:")
print(x[0][:5])
print("First labels:")
print(y[:3])

Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-09-23/d0-52-a8-00-67-5e.csv
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-09-23/00-16-6c-ab-6b-88.csv
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-09-24/d0-52-a8-00-67-5e.csv
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-09-24/00-16-6c-ab-6b-88.csv
shape of windws: (3822, 200, 8)
shape of labels: (3822, 1, 2)
First window, part:
[[ 72.          17.           1.           0.99893093   1.
    1.          -1.          53.        ]
 [ 72.          17.           1.           0.998935     1.
    1.          -1.          53.        ]
 [ 60.           6.           1.           0.4735372    1.
   -1.          -1.         443.        ]
 [ 72.          17.           1.           0.525352     1.
    

In [8]:
#create NPZ windows
# Generates the windows for every (device, day) CSV file and writes them,
# shuffled, into a single compressed NPZ file.

# Width of the one-hot label vectors stored in the NPZ file.
# NOTE(review): get_csv_file_list() reports 29 devices while only 28 classes
# are used here. This only works if every value of the 'Label' column lies in
# 0..27 (e.g. two MAC addresses sharing one label) - confirm against
# list_of_devices.csv.
NUM_CLASSES = 28

# all CSV file paths plus one label per file
csv_files, labels = get_csv_file_list()

# target path without extension; the function's docstring expects it that way
npz_file = "{}/update_new_feature_all_days_all_devices".format(NPZ_WINDOWS)
csv_sequences_to_npz_windows(csv_files, labels, NUM_CLASSES, npz_file)


Total number of days: 60
Total number of devices 29
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-09-23/d0-52-a8-00-67-5e.csv
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-09-30/d0-52-a8-00-67-5e.csv
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-10-07/d0-52-a8-00-67-5e.csv
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-10-14/d0-52-a8-00-67-5e.csv
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-10-21/d0-52-a8-00-67-5e.csv
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-10-28/d0-52-a8-00-67-5e.csv
Generating windows for /content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences/16-11-05/d0-5

In [10]:
#look at generated NPZ windows

from numpy import load
npz_file = "{}/update_new_feature_all_days_all_devices.npz".format(NPZ_WINDOWS)
# The object returned by load() keeps the file open; the with-block closes it
# as soon as both arrays have been read into memory.
with load(npz_file) as dict_data:
    x = dict_data['x']  # windows
    y = dict_data['y']  # one-hot labels
# every window needs exactly one label
assert x.shape[0] == y.shape[0]
print("shape of windws: {}".format(x.shape))
print("shape of labels: {}".format(y.shape))
print("First window, part:")
print(x[0][:5])
print("First labels:")
print(y[:3])

shape of windws: (1339324, 200, 8)
shape of labels: (1339324, 1, 28)
First window, part:
[[ 7.9000000e+01  6.0000000e+00 -1.0000000e+00  2.9087067e-05
  -1.0000000e+00  1.0000000e+00  8.0000000e+01 -1.0000000e+00]
 [ 2.3300000e+02  6.0000000e+00 -1.0000000e+00  2.8848648e-05
  -1.0000000e+00  1.0000000e+00  8.0000000e+01 -1.0000000e+00]
 [ 1.8100000e+02  6.0000000e+00 -1.0000000e+00  3.0994415e-05
  -1.0000000e+00  1.0000000e+00  8.0000000e+01 -1.0000000e+00]
 [ 1.9000000e+02  6.0000000e+00 -1.0000000e+00  2.9087067e-05
  -1.0000000e+00  1.0000000e+00  8.0000000e+01 -1.0000000e+00]
 [ 1.9100000e+02  6.0000000e+00 -1.0000000e+00  3.0040741e-05
  -1.0000000e+00  1.0000000e+00  8.0000000e+01 -1.0000000e+00]]
First labels:
[[[False False False False False False False False False False False
   False False False False False False False False False False False
   False False False False False  True]]

 [[False False False False False False False False False False False
   False False False F