In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tiktoken
import openai

# Take the API key from the OPENAI_API_KEY environment variable instead of
# storing a credential in the notebook; openai.api_key is None when it is unset.
import os
openai.api_key = os.environ.get( 'OPENAI_API_KEY' )

from mac_vendor_lookup import MacLookup, VendorNotFoundError

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

import nest_asyncio
nest_asyncio.apply()

In [2]:
# Shared vendor-lookup client; get_vendor() calls mac.lookup() on it.
mac = MacLookup()

In [3]:
# Let pandas show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [4]:
# Columns that every labelled dataset provides under the same name.
common_features = [
    'src_oui', 'dst_oui', 'protocol',
    'bidirectional_min_ps', 'bidirectional_mean_ps', 'bidirectional_max_ps',
    'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_max_ps',
    'src2dst_min_ps', 'src2dst_mean_ps', 'src2dst_max_ps',
    'label',
]

# df1's CSV column name -> the name the other datasets use for that column.
df1_rename_map = dict( [
    ( 'bidirectional_bytes', 'bidirectional_total_bytes' ),
    ( 'bidirectional_duration_ms', 'bidirectional_total_duration_ms' ),
    ( 'bidirectional_packets', 'bidirectional_total_packets' ),
    ( 'bidirectional_stddev_ps', 'bidirectional_stdev_ps' ),
    ( 'dst2src_bytes', 'dst2src_total_bytes' ),
    ( 'dst2src_duration_ms', 'dst2src_total_duration_ms' ),
    ( 'dst2src_packets', 'dst2src_total_packets' ),
    ( 'dst2src_stddev_ps', 'dst2src_stdev_ps' ),
    ( 'src2dst_bytes', 'src2dst_total_bytes' ),
    ( 'src2dst_duration_ms', 'src2dst_total_duration_ms' ),
    ( 'src2dst_packets', 'src2dst_total_packets' ),
    ( 'src2dst_stddev_ps', 'src2dst_stdev_ps' ),
] )

# df1 is read with its original column names (the keys of the rename map).
df1_extra_features = [ *df1_rename_map ]
df1_features = [ *common_features, *df1_extra_features ]

# The other datasets are read with the renamed columns plus six
# transmission-rate columns.
df2_extra_features = [
    *df1_rename_map.values(),
    'bidirectional_transmission_rate_byte_ms',
    'bidirectional_transmission_rate_ms',
    'dst2src_transmission_rate_bytes_ms',
    'dst2src_transmission_rate_ms',
    'src2dst_transmission_rate_bytes_ms',
    'src2dst_transmission_rate_ms',
]
df2_features = [ *common_features, *df2_extra_features ]
df3_features = df2_features
# df4 uses the same columns minus 'label'.
df4_features = list( filter( lambda name: name != 'label', df2_features ) )

all_features = df2_features

In [5]:
def _read_flows( csv_path, columns, rename_map=None ):
    """Read the selected CSV columns, optionally rename them, cast
    'protocol' to the pandas string dtype and drop rows with missing values."""
    flows = pd.read_csv( csv_path, usecols=columns )
    if rename_map:
        flows = flows.rename( columns=rename_map )
    return flows.astype( { 'protocol' : 'string' } ).dropna()

# Unencrypted
df1 = _read_flows( '../data/raw/unswiotan18_labelled.csv', df1_features, df1_rename_map )
df2 = _read_flows( '../data/raw/Sentinel_Consolidated_Protoflow_aggregate.csv', df2_features )

# Encrypted
df3 = _read_flows( '../data/raw/consolidated_z_b_w.csv', df3_features )
df4 = _read_flows( '../data/raw/capout_vb.csv', df4_features )

# Row/column counts after dropping incomplete rows
for _loaded in ( df1, df2, df3, df4 ):
    print( _loaded.shape )

(946646, 25)
(1951, 31)
(10227, 31)
(2347, 30)


In [6]:
_DURATION_COLUMNS = (
    'bidirectional_total_duration_ms',
    'dst2src_total_duration_ms',
    'src2dst_total_duration_ms',
)

def _zero_durations_to_one( frame ):
    """Replace 0.0 with 1.0 in the three duration columns of `frame`, in place."""
    for duration_column in _DURATION_COLUMNS:
        frame[duration_column] = frame[duration_column].replace( 0.0, 1.0 )

# df1 does not ship transmission rates, so derive them here:
# new column -> ( amount column, duration column it is divided by ).
_df1_rate_sources = {
    'bidirectional_transmission_rate_byte_ms' : ( 'bidirectional_total_bytes', 'bidirectional_total_duration_ms' ),
    'bidirectional_transmission_rate_ms' : ( 'bidirectional_total_packets', 'bidirectional_total_duration_ms' ),
    'dst2src_transmission_rate_bytes_ms' : ( 'dst2src_total_bytes', 'dst2src_total_duration_ms' ),
    'dst2src_transmission_rate_ms' : ( 'dst2src_total_packets', 'dst2src_total_duration_ms' ),
    'src2dst_transmission_rate_bytes_ms' : ( 'src2dst_total_bytes', 'src2dst_total_duration_ms' ),
    'src2dst_transmission_rate_ms' : ( 'src2dst_total_packets', 'src2dst_total_duration_ms' ),
}

_zero_durations_to_one( df1 )
for _rate_column, ( _amount_column, _duration_column ) in _df1_rate_sources.items():
    df1[_rate_column] = df1[_amount_column] / df1[_duration_column]
df1 = df1.dropna()


_zero_durations_to_one( df2 )
df2 = df2.dropna()


_zero_durations_to_one( df3 )
df3 = df3.dropna()


_zero_durations_to_one( df4 )
df4 = df4.dropna()

In [7]:
def process_oui( oui ):
    """Normalise a raw address value to an 'aa:bb:cc' style prefix.

    Values that do not start with '0x' are cut down to their first eight
    characters (shorter values are returned unchanged). Values starting with
    '0x' are translated through a fixed table; '0xffff' becomes 'ff:ff:ff'
    and any other '0x...' value yields None.
    """
    if oui[:2] != '0x':
        return oui if len( oui ) <= 8 else oui[:8]

    # '0x...' short address -> OUI prefix (the original table was named
    # zigbee_map, so these are presumably ZigBee short addresses).
    short_address_to_oui = {
        '0x0001' : '00:17:88',
        '0x2ce0' : '00:17:88',
        '0xf70f' : '00:17:88',
        '0x8b39' : '00:17:88',
        '0x1de6' : '00:12:4b',
        '0x55fb' : '00:12:4b',
        '0xddaf' : '00:12:4b',
        '0xe645' : '00:12:4b',
        '0x5d81' : '00:12:4b',
        '0x3a95' : '00:12:4b',
        '0x7bd2' : '00:12:4b',
        '0x74fa' : '00:12:4b',
        '0x8a91' : '00:12:4b',
        '0x237f' : '00:12:4b',
        '0x000' : '00:12:4b',
        '0xff4f' : '00:12:4b',
        '0x335b' : '00:12:4b',
        '0xf770' : '00:12:4b',
        '0x0000' : '00:15:5f',
    }

    if oui == '0xffff':
        return 'ff:ff:ff'

    # Unknown short addresses fall through to None.
    return short_address_to_oui.get( oui )

In [8]:
# Normalise both address columns of df3, then drop the rows whose address
# process_oui could not translate (it returns None for those).
for _oui_column in ( 'src_oui', 'dst_oui' ):
    df3[_oui_column] = df3[_oui_column].apply( process_oui )

df3 = df3.dropna()

In [9]:
# Todo: Improve this function

# Prefixes answered locally, without consulting the vendor database.
_KNOWN_VENDORS = {
    'da:a1:19' : 'Google, Inc.',
    'ff:ff:ff' : 'Broadcast',
    '01:00:5e' : 'Multicast',
    '70:88:6b' : 'Awair',
    '00:62:6e' : 'Insteon',
}

# Results of earlier database lookups, keyed by the value passed in. The
# frames hold the same few prefixes over and over, so each distinct prefix
# is looked up only once.
_vendor_cache = {}

def get_vendor( oui ):
    """Return a vendor / traffic-class name for an OUI prefix string.

    Parameters
    ----------
    oui : str
        Prefix such as 'aa:bb:cc'.

    Returns
    -------
    str
        The hard-coded name for the known special prefixes,
        'IPV6 Multicast' for anything starting with '33:33', otherwise the
        result of mac.lookup( oui ), or 'Not Found' when that lookup raises
        VendorNotFoundError. Other exceptions from the lookup propagate.

    Lookup results are cached for the lifetime of the kernel; re-run this
    cell to clear the cache if the vendor database is refreshed.
    """
    if oui in _KNOWN_VENDORS:
        return _KNOWN_VENDORS[oui]

    if oui[:5] == '33:33':
        return 'IPV6 Multicast'

    if oui not in _vendor_cache:
        try:
            _vendor_cache[oui] = mac.lookup( oui )
        except VendorNotFoundError:
            _vendor_cache[oui] = 'Not Found'

    return _vendor_cache[oui]

In [10]:
# Replace the raw prefixes in both address columns of every frame with the
# name get_vendor returns for them.
for _frame in ( df1, df2, df3, df4 ):
    for _oui_column in ( 'src_oui', 'dst_oui' ):
        _frame[_oui_column] = _frame[_oui_column].apply( get_vendor )

In [11]:
# TODO: When making a classification decision, look at the number of samples and judge according to sample size

# Tuple of dataset-specific device labels -> abstract device class.
# Every key must be a tuple: a single-label key needs a trailing comma,
# otherwise ( 'router' ) is just the string 'router' and the membership test
# in process_labels degrades to substring matching.
abstraction_table = {
    ( 'smart_camera', 'insteon_camera', 'samsung_smart_cam', 'tp_link_smart_cam', 'netatmo_welcome', 'dropcam', 'wthings_smart_baby_monitor' ) : 'smart_camera', # Withings smart baby monitor is not ideal; is a threat; solution: do packet analysis to see which type of packet (audio, video) is the most frequent
    ( 'router', ) : 'router',
    ( 'smartphone', 'android_phone', 'iphone', 'samsung_galaxy_tab', 'pixstar_photo_frame' ) : 'mobile', # Is photo frame a tablet? TODO: Provide justification
    ( 'smart_switch', 'wemo_switch' ) : 'smart_switch',
    ( 'smart_speaker', 'triby_speaker', 'smart_assistant', 'echo', 'iHome' ) : 'smart_speaker',
    ( 'smart_bulb', 'LiFX_smart_bulb' ) : 'smart_bulb',
    ( 'macbook', 'laptop' ) : 'computer',
    ( 'wemo_motion_sensor', ) : 'smart_motion_sensor',
    ( 'hp_printer', ) : 'smart_printer',
}

def process_labels( label ):
    """Map a dataset-specific device label to its abstract device class.

    Returns the class whose label tuple in abstraction_table contains
    `label` exactly, or None when no tuple lists it.
    """
    for device_set, device_class in abstraction_table.items():
        if label in device_set:
            return device_class

    return None

In [12]:
def _abstract_labels( frame ):
    """Overwrite frame['label'] with process_labels' result and return the
    frame without rows that contain missing values (unmapped labels become
    None and are dropped here)."""
    frame['label'] = frame['label'].apply( process_labels )
    return frame.dropna()

df1 = _abstract_labels( df1 )

df2 = _abstract_labels( df2 )

df3 = _abstract_labels( df3 )

In [13]:
# Row/column counts of the four frames, in order df1..df4
for _frame in ( df1, df2, df3, df4 ):
    print( _frame.shape )

(928711, 31)
(1028, 31)
(9749, 31)
(2347, 30)


In [14]:
# Inspect df1 with vendor names and abstract labels applied; it still carries
# the row index from the CSV at this point.
df1

Unnamed: 0,src_oui,dst_oui,protocol,bidirectional_total_duration_ms,bidirectional_total_packets,bidirectional_total_bytes,src2dst_total_duration_ms,src2dst_total_packets,src2dst_total_bytes,dst2src_total_duration_ms,dst2src_total_packets,dst2src_total_bytes,bidirectional_min_ps,bidirectional_mean_ps,bidirectional_stdev_ps,bidirectional_max_ps,src2dst_min_ps,src2dst_mean_ps,src2dst_stdev_ps,src2dst_max_ps,dst2src_min_ps,dst2src_mean_ps,dst2src_stdev_ps,dst2src_max_ps,label,bidirectional_transmission_rate_byte_ms,bidirectional_transmission_rate_ms,dst2src_transmission_rate_bytes_ms,dst2src_transmission_rate_ms,src2dst_transmission_rate_bytes_ms,src2dst_transmission_rate_ms
0,Netatmo,"TP-LINK TECHNOLOGIES CO.,LTD.",1,1313.0,4.0,392.0,1003.0,2.0,196.0,1003.0,2.0,196.0,98.0,98.000000,0.000000,98.0,98.0,98.000000,0.000000,98.0,98.0,98.000000,0.000000,98.0,smart_camera,0.298553,0.003046,0.195414,0.001994,0.195414,0.001994
1,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,router,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
2,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,router,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
3,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,router,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
4,Hewlett Packard,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,smart_printer,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
946650,Belkin International Inc.,"Samsung Electronics Co.,Ltd",6,24.0,10.0,2032.0,22.0,5.0,524.0,23.0,5.0,1508.0,66.0,203.200000,367.481761,1236.0,66.0,104.800000,82.360185,252.0,66.0,301.600000,522.356966,1236.0,smart_motion_sensor,84.666667,0.416667,65.565217,0.217391,23.818182,0.227273
946651,Belkin International Inc.,Multicast,2,1.0,1.0,46.0,1.0,1.0,46.0,1.0,0.0,0.0,46.0,46.000000,0.000000,46.0,46.0,46.000000,0.000000,46.0,0.0,0.000000,0.000000,0.0,smart_motion_sensor,46.000000,1.000000,0.000000,0.000000,46.000000,1.000000
946652,Belkin International Inc.,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,smart_motion_sensor,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
946653,Hewlett Packard,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,smart_printer,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000


In [15]:
# Give every row across the four frames a unique, consecutive entry_id and
# make it the index. Numbering runs through df1, then df2, df3 and df4, so an
# id identifies a row across all datasets. The ids are assigned as whole
# ranges (no per-row Python loop), which also means the result does not
# depend on the frames' current index labels being unique.
global_pk = 0

_indexed_frames = []
for _frame in ( df1, df2, df3, df4 ):
    _row_count = len( _frame )
    _entry_ids = np.arange( global_pk, global_pk + _row_count, dtype='int64' )
    _indexed_frames.append( _frame.assign( entry_id=_entry_ids ).set_index( 'entry_id' ) )
    # global_pk ends up as the total number of rows, as before.
    global_pk += _row_count

df1, df2, df3, df4 = _indexed_frames

In [16]:
# df1 again, now indexed by entry_id.
df1

Unnamed: 0_level_0,src_oui,dst_oui,protocol,bidirectional_total_duration_ms,bidirectional_total_packets,bidirectional_total_bytes,src2dst_total_duration_ms,src2dst_total_packets,src2dst_total_bytes,dst2src_total_duration_ms,dst2src_total_packets,dst2src_total_bytes,bidirectional_min_ps,bidirectional_mean_ps,bidirectional_stdev_ps,bidirectional_max_ps,src2dst_min_ps,src2dst_mean_ps,src2dst_stdev_ps,src2dst_max_ps,dst2src_min_ps,dst2src_mean_ps,dst2src_stdev_ps,dst2src_max_ps,label,bidirectional_transmission_rate_byte_ms,bidirectional_transmission_rate_ms,dst2src_transmission_rate_bytes_ms,dst2src_transmission_rate_ms,src2dst_transmission_rate_bytes_ms,src2dst_transmission_rate_ms
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
0,Netatmo,"TP-LINK TECHNOLOGIES CO.,LTD.",1,1313.0,4.0,392.0,1003.0,2.0,196.0,1003.0,2.0,196.0,98.0,98.000000,0.000000,98.0,98.0,98.000000,0.000000,98.0,98.0,98.000000,0.000000,98.0,smart_camera,0.298553,0.003046,0.195414,0.001994,0.195414,0.001994
1,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,router,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
2,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,router,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
3,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,router,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
4,Hewlett Packard,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,smart_printer,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928706,Belkin International Inc.,"Samsung Electronics Co.,Ltd",6,24.0,10.0,2032.0,22.0,5.0,524.0,23.0,5.0,1508.0,66.0,203.200000,367.481761,1236.0,66.0,104.800000,82.360185,252.0,66.0,301.600000,522.356966,1236.0,smart_motion_sensor,84.666667,0.416667,65.565217,0.217391,23.818182,0.227273
928707,Belkin International Inc.,Multicast,2,1.0,1.0,46.0,1.0,1.0,46.0,1.0,0.0,0.0,46.0,46.000000,0.000000,46.0,46.0,46.000000,0.000000,46.0,0.0,0.000000,0.000000,0.0,smart_motion_sensor,46.000000,1.000000,0.000000,0.000000,46.000000,1.000000
928708,Belkin International Inc.,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,smart_motion_sensor,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
928709,Hewlett Packard,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,smart_printer,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000


In [17]:
# df2 indexed by entry_id; its ids continue after df1's.
df2

Unnamed: 0_level_0,src_oui,src2dst_transmission_rate_ms,src2dst_transmission_rate_bytes_ms,src2dst_total_packets,src2dst_total_duration_ms,src2dst_total_bytes,src2dst_stdev_ps,src2dst_min_ps,src2dst_mean_ps,src2dst_max_ps,protocol,dst_oui,dst2src_transmission_rate_ms,dst2src_transmission_rate_bytes_ms,dst2src_total_packets,dst2src_total_duration_ms,dst2src_total_bytes,dst2src_stdev_ps,dst2src_min_ps,dst2src_mean_ps,dst2src_max_ps,bidirectional_transmission_rate_ms,bidirectional_transmission_rate_byte_ms,bidirectional_total_packets,bidirectional_total_duration_ms,bidirectional_total_bytes,bidirectional_stdev_ps,bidirectional_min_ps,bidirectional_mean_ps,bidirectional_max_ps,label
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
928711,D-Link International,0.001296,0.353241,85,65572.885986,23163.0,45.991719,179.0,272.505882,327.0,w,Multicast,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,smart_camera
928712,Intel Corporate,0.000553,0.227369,40,72270.144775,16432.0,479.361826,42.0,410.800000,1434.0,w,D-Link International,0.000761,0.104197,55,72267.015869,7530.0,157.782505,42.0,136.909091,828.0,0.001314,0.331558,95,72270.992920,23962.0,349.746991,42.0,270.263636,1434.0,smart_camera
928713,D-Link International,0.002320,0.477704,244,105153.079102,50232.0,85.941931,77.0,205.868852,415.0,w,Multicast,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,smart_camera
928714,D-Link International,0.000178,0.044421,18,101212.398682,4496.0,147.050844,42.0,249.777778,359.0,w,Broadcast,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,smart_camera
928715,D-Link International,0.000499,0.028480,36,72190.995117,2056.0,3.899984,54.0,57.111111,62.0,w,Multicast,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,smart_camera
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929734,Belkin International Inc.,0.389872,48.344184,2,5.129883,248.0,11.000000,113.0,124.000000,135.0,w,Intel Corporate,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,smart_switch
929735,Belkin International Inc.,0.001074,0.211064,4,3723.988037,786.0,145.639109,42.0,196.500000,342.0,w,Broadcast,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,smart_switch
929736,"David Electronics Co., LTD.",0.001679,0.391330,113,67303.871826,26338.0,416.749753,42.0,233.079646,1434.0,w,Belkin International Inc.,0.002265,1.757640,144,63576.163086,111744.0,658.063288,42.0,776.000000,1434.0,0.003819,2.051620,257,67303.871826,138082.0,585.125390,42.0,466.979167,1434.0,smart_switch
929737,Belkin International Inc.,0.000603,0.032574,2,3315.518066,108.0,0.000000,54.0,54.000000,54.0,w,Multicast,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0,1.000000,0.0,0.000000,0.0,0.000000,0.0,smart_switch


In [18]:
# df3 indexed by entry_id; its ids continue after df2's.
df3

Unnamed: 0_level_0,src_oui,src2dst_transmission_rate_ms,src2dst_transmission_rate_bytes_ms,src2dst_total_packets,src2dst_total_duration_ms,src2dst_total_bytes,src2dst_stdev_ps,src2dst_min_ps,src2dst_mean_ps,src2dst_max_ps,protocol,dst_oui,dst2src_transmission_rate_ms,dst2src_transmission_rate_bytes_ms,dst2src_total_packets,dst2src_total_duration_ms,dst2src_total_bytes,dst2src_stdev_ps,dst2src_min_ps,dst2src_mean_ps,dst2src_max_ps,bidirectional_transmission_rate_ms,bidirectional_transmission_rate_byte_ms,bidirectional_total_packets,bidirectional_total_duration_ms,bidirectional_total_bytes,bidirectional_stdev_ps,bidirectional_min_ps,bidirectional_mean_ps,bidirectional_max_ps,label
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
929739,GreenPeak Technologies,0.000179,0.010524,51,284789.872070,2997.0,9.551898,51.0,58.764706,77.0,z,Broadcast,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,smart_speaker
929740,Texas Instruments,0.000083,0.004012,15,180711.858154,725.0,1.885618,47.0,48.333333,51.0,z,Broadcast,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,router
929741,GreenPeak Technologies,0.000183,0.010365,141,770782.792236,7989.0,6.870463,51.0,56.659574,65.0,z,Broadcast,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,smart_speaker
929742,Philips Lighting BV,1.000000,55.000000,1,1.000000,55.0,0.000000,55.0,55.000000,55.0,z,Philips Lighting BV,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,smart_bulb
929743,Philips Lighting BV,0.000045,0.002541,14,308482.526123,784.0,0.000000,56.0,56.000000,56.0,z,Broadcast,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,smart_bulb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939483,ASUSTek COMPUTER INC.,0.253512,115.601659,4,15.778320,1824.0,0.000000,456.0,456.000000,456.0,w,Not Found,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,router
939484,Amazon Technologies Inc.,0.176182,39.640841,4,22.703857,900.0,0.000000,225.0,225.000000,225.0,w,Not Found,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,smart_camera
939485,Amazon Technologies Inc.,1.000000,225.000000,1,1.000000,225.0,0.000000,225.0,225.000000,225.0,w,Not Found,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,smart_camera
939486,Amazon Technologies Inc.,0.546899,123.052273,2,3.656982,450.0,0.000000,225.0,225.000000,225.0,w,Not Found,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,smart_camera


In [19]:
# df4 indexed by entry_id; it has no 'label' column (df4_features excludes it).
df4

Unnamed: 0_level_0,src_oui,src2dst_transmission_rate_ms,src2dst_transmission_rate_bytes_ms,src2dst_total_packets,src2dst_total_duration_ms,src2dst_total_bytes,src2dst_stdev_ps,src2dst_min_ps,src2dst_mean_ps,src2dst_max_ps,protocol,dst_oui,dst2src_transmission_rate_ms,dst2src_transmission_rate_bytes_ms,dst2src_total_packets,dst2src_total_duration_ms,dst2src_total_bytes,dst2src_stdev_ps,dst2src_min_ps,dst2src_mean_ps,dst2src_max_ps,bidirectional_transmission_rate_ms,bidirectional_transmission_rate_byte_ms,bidirectional_total_packets,bidirectional_total_duration_ms,bidirectional_total_bytes,bidirectional_stdev_ps,bidirectional_min_ps,bidirectional_mean_ps,bidirectional_max_ps
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
939488,ASUSTek COMPUTER INC.,0.000000,0.000000,32,1.000000,43322.0,0.000000,0.0,0.0,0.0,w,Not Found,1.0,169.0,1,1.0,169.0,0.0,169.0,169.0,169.0,10.289107,13560.107787,33,3.207275,43491.0,592.0,169.0,761.0,1353.0
939489,ASUSTek COMPUTER INC.,0.006273,2.076464,2,318.811279,662.0,0.000000,331.0,331.0,331.0,w,Broadcast,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0,1.000000,0.0,0.0,0.0,0.0,0.0
939490,"Hon Hai Precision Ind. Co.,Ltd.",0.003040,0.512297,4,1315.643311,674.0,7.794229,164.0,168.5,182.0,w,Broadcast,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0,1.000000,0.0,0.0,0.0,0.0,0.0
939491,ASUSTek COMPUTER INC.,0.000000,0.000000,2,1.000000,306.0,0.000000,0.0,0.0,0.0,w,"Google, Inc.",1.0,275.0,1,1.0,275.0,0.0,275.0,275.0,275.0,0.044258,8.571260,3,67.784668,581.0,61.0,153.0,214.0,275.0
939492,Juniper Networks,0.000000,0.000000,2,1.000000,324.0,0.000000,0.0,0.0,0.0,w,"Apple, Inc.",1.0,162.0,1,1.0,162.0,0.0,162.0,162.0,162.0,0.044518,7.211842,3,67.389160,486.0,0.0,162.0,162.0,162.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941830,"Aruba, a Hewlett Packard Enterprise Company",1.000000,220.000000,1,1.000000,220.0,0.000000,220.0,220.0,220.0,w,"ALFA, INC.",0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0,1.000000,0.0,0.0,0.0,0.0,0.0
941831,"Aruba, a Hewlett Packard Enterprise Company",1.000000,235.000000,1,1.000000,235.0,0.000000,235.0,235.0,235.0,w,Broadcast,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0,1.000000,0.0,0.0,0.0,0.0,0.0
941832,"Aruba, a Hewlett Packard Enterprise Company",1.000000,249.000000,1,1.000000,249.0,0.000000,249.0,249.0,249.0,w,Broadcast,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0,1.000000,0.0,0.0,0.0,0.0,0.0
941833,Not Found,0.086658,13.951869,2,23.079346,322.0,0.000000,161.0,161.0,161.0,w,Broadcast,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0,1.000000,0.0,0.0,0.0,0.0,0.0


In [21]:
# Rows per abstract device class in df1.
# NOTE(review): the "# UNSW" cell further down repeats this exact call — one of the two can go.
df1['label'].value_counts()

label
computer               192326
smart_camera           189719
router                 134828
smart_motion_sensor    131625
smart_printer           85929
smart_speaker           80370
smart_switch            61800
mobile                  35807
smart_bulb              16307
Name: count, dtype: int64

In [20]:
# Persist the processed df1 (index = entry_id) as CSV.
_unsw_export_path = '../data/csv/unswiotan18_labelled_dups.csv'
df1.to_csv( _unsw_export_path )

In [22]:
# UNSW (df1): number of rows per abstract device class
df1['label'].value_counts()

label
computer               192326
smart_camera           189719
router                 134828
smart_motion_sensor    131625
smart_printer           85929
smart_speaker           80370
smart_switch            61800
mobile                  35807
smart_bulb              16307
Name: count, dtype: int64

In [23]:
# IOT Sentinel (df2): number of rows per abstract device class
df2['label'].value_counts()

label
smart_switch    593
smart_camera    435
Name: count, dtype: int64

In [24]:
# CONSOLIDATED Z_B_W (df3): number of rows per abstract device class
df3['label'].value_counts( )

label
router           8325
smart_camera     1171
mobile            134
smart_speaker      68
smart_switch       46
smart_bulb          5
Name: count, dtype: int64

In [25]:
# ENCRYPTED (CAPOUT)
# Left disabled: df4 is loaded with df4_features, which excludes 'label',
# so there is no label column to count.
# df4['label'].value_counts()

In [26]:
# Split each labelled frame into feature matrix (everything but 'label')
# and target (the 'label' column).
X_unsw, y_unsw = df1.drop( columns='label' ), df1['label']

X_iot, y_iot = df2.drop( columns='label' ), df2['label']

X_consolidated, y_consolidated = df3.drop( columns='label' ), df3['label']

# df4 has no label column, so the whole frame serves as the feature matrix.
X_encrypted = df4

In [27]:
# The unencrypted training set is the UNSW data as-is (same objects, no copy).
X_train_unencrypted, y_train_unencrypted = X_unsw, y_unsw

In [28]:
# First five rows of the training features (no 'label' column).
X_train_unencrypted.head( 5 )

Unnamed: 0_level_0,src_oui,dst_oui,protocol,bidirectional_total_duration_ms,bidirectional_total_packets,bidirectional_total_bytes,src2dst_total_duration_ms,src2dst_total_packets,src2dst_total_bytes,dst2src_total_duration_ms,dst2src_total_packets,dst2src_total_bytes,bidirectional_min_ps,bidirectional_mean_ps,bidirectional_stdev_ps,bidirectional_max_ps,src2dst_min_ps,src2dst_mean_ps,src2dst_stdev_ps,src2dst_max_ps,dst2src_min_ps,dst2src_mean_ps,dst2src_stdev_ps,dst2src_max_ps,bidirectional_transmission_rate_byte_ms,bidirectional_transmission_rate_ms,dst2src_transmission_rate_bytes_ms,dst2src_transmission_rate_ms,src2dst_transmission_rate_bytes_ms,src2dst_transmission_rate_ms
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,Netatmo,"TP-LINK TECHNOLOGIES CO.,LTD.",1,1313.0,4.0,392.0,1003.0,2.0,196.0,1003.0,2.0,196.0,98.0,98.0,0.0,98.0,98.0,98.0,0.0,98.0,98.0,98.0,0.0,98.0,0.298553,0.003046,0.195414,0.001994,0.195414,0.001994
1,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.0,0.0,86.0,86.0,86.0,0.0,86.0,0.0,0.0,0.0,0.0,86.0,1.0,0.0,0.0,86.0,1.0
2,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.0,0.0,86.0,86.0,86.0,0.0,86.0,0.0,0.0,0.0,0.0,86.0,1.0,0.0,0.0,86.0,1.0
3,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.0,0.0,86.0,86.0,86.0,0.0,86.0,0.0,0.0,0.0,0.0,86.0,1.0,0.0,0.0,86.0,1.0
4,Hewlett Packard,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.0,0.0,86.0,86.0,86.0,0.0,86.0,0.0,0.0,0.0,0.0,86.0,1.0,0.0,0.0,86.0,1.0


In [29]:
# df1 once more for reference; unlike X_train_unencrypted it still holds 'label'.
df1

Unnamed: 0_level_0,src_oui,dst_oui,protocol,bidirectional_total_duration_ms,bidirectional_total_packets,bidirectional_total_bytes,src2dst_total_duration_ms,src2dst_total_packets,src2dst_total_bytes,dst2src_total_duration_ms,dst2src_total_packets,dst2src_total_bytes,bidirectional_min_ps,bidirectional_mean_ps,bidirectional_stdev_ps,bidirectional_max_ps,src2dst_min_ps,src2dst_mean_ps,src2dst_stdev_ps,src2dst_max_ps,dst2src_min_ps,dst2src_mean_ps,dst2src_stdev_ps,dst2src_max_ps,label,bidirectional_transmission_rate_byte_ms,bidirectional_transmission_rate_ms,dst2src_transmission_rate_bytes_ms,dst2src_transmission_rate_ms,src2dst_transmission_rate_bytes_ms,src2dst_transmission_rate_ms
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
0,Netatmo,"TP-LINK TECHNOLOGIES CO.,LTD.",1,1313.0,4.0,392.0,1003.0,2.0,196.0,1003.0,2.0,196.0,98.0,98.000000,0.000000,98.0,98.0,98.000000,0.000000,98.0,98.0,98.000000,0.000000,98.0,smart_camera,0.298553,0.003046,0.195414,0.001994,0.195414,0.001994
1,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,router,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
2,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,router,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
3,"TP-LINK TECHNOLOGIES CO.,LTD.",IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,router,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
4,Hewlett Packard,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,smart_printer,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928706,Belkin International Inc.,"Samsung Electronics Co.,Ltd",6,24.0,10.0,2032.0,22.0,5.0,524.0,23.0,5.0,1508.0,66.0,203.200000,367.481761,1236.0,66.0,104.800000,82.360185,252.0,66.0,301.600000,522.356966,1236.0,smart_motion_sensor,84.666667,0.416667,65.565217,0.217391,23.818182,0.227273
928707,Belkin International Inc.,Multicast,2,1.0,1.0,46.0,1.0,1.0,46.0,1.0,0.0,0.0,46.0,46.000000,0.000000,46.0,46.0,46.000000,0.000000,46.0,0.0,0.000000,0.000000,0.0,smart_motion_sensor,46.000000,1.000000,0.000000,0.000000,46.000000,1.000000
928708,Belkin International Inc.,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,smart_motion_sensor,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000
928709,Hewlett Packard,IPV6 Multicast,58,1.0,1.0,86.0,1.0,1.0,86.0,1.0,0.0,0.0,86.0,86.000000,0.000000,86.0,86.0,86.000000,0.000000,86.0,0.0,0.000000,0.000000,0.0,smart_printer,86.000000,1.000000,0.000000,0.000000,86.000000,1.000000


In [30]:
def create_fine_tuning_file( df, output_path=None, verbose=True ):
    """Build chat-style fine-tuning examples from a DataFrame of flow records.

    Every row becomes one example holding three messages: the shared system
    directive, a user query listing the row's 30 attributes, and an assistant
    answer taken from the row's 'label' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column interpolated into the user query below, plus
        'label'. A missing column raises KeyError on the first row.
    output_path : str or path-like, optional
        When given, the examples are additionally written to this path as JSON
        Lines (one example per line). Nothing is written when None (default).
    verbose : bool, default True
        Print each generated user query.

    Returns
    -------
    list of dict
        One {"messages": [...]} dict per row, in row order.
    """
    fine_tuning_examples = []
    # NOTE(review): the device types listed in the directive are title-cased
    # ("Smart Printer") whereas the assistant answer is the raw row['label']
    # value -- confirm both use the same spelling before training.
    system_directive = {
        "role": "system",
        "content": """Your task is to use various attributes of encrypted Layer 2 (Data Link Layer of OSI Model) Frames transmitted/received by different IoT devices to classify them into one of 9 specific device types. You'll be provided the following attributes:

Communication Attributes

src_oui (e.g., 'Cisco'): Manufacturer of the device sending the frame.
dst_oui (e.g., 'Samsung' or 'multicast'): The destination that the frame is headed for.
protocol (e.g., 58 for IPV6 Multicast): Protocol number inside the Layer 2 Frame.

Transmission Metrics (Time in milliseconds, Size in bytes, Packets in count)

bidirectional_total_duration_ms: Total duration of bidirectional communication between the source and destination.
bidirectional_total_packets: Total number of packets transmitted bidirectionally across the source-destination channel.
bidirectional_total_bytes: Total bytes transmitted bidirectionally across the source-destination channel.
src2dst_total_duration_ms: Total duration of unidirectional communication from source to destination.
src2dst_total_packets: Total number of packets transmitted from source to destination.
src2dst_total_bytes: Total bytes transmitted from source to destination.
dst2src_total_duration_ms: Total duration of unidirectional communication from destination to source.
dst2src_total_packets: Total number of packets transmitted from destination to source.
dst2src_total_bytes: Total bytes transmitted from destination to source.

Packet Size Metrics (Size in bytes)

bidirectional_min_ps: Smallest packet size across bidirectional communication.
bidirectional_mean_ps: Mean packet size across bidirectional communication.
bidirectional_stdev_ps: Standard deviation in packet size across bidirectional communication.
bidirectional_max_ps: Largest packet size across bidirectional communication.
src2dst_min_ps: Smallest packet size from source to destination.
src2dst_mean_ps: Mean packet size from source to destination.
src2dst_stdev_ps: Standard deviation in packet size from source to destination.
src2dst_max_ps: Largest packet size from source to destination.
dst2src_min_ps: Smallest packet size from destination to source.
dst2src_mean_ps: Mean packet size from destination to source.
dst2src_stdev_ps: Standard deviation in packet size from destination to source.
dst2src_max_ps: Largest packet size from destination to source.

Transmission Rate Metrics (Bytes per millisecond, Packets per millisecond)

bidirectional_transmission_rate_byte_ms: Number of bytes transmitted per millisecond bidirectionally.
bidirectional_transmission_rate_ms: Number of packets transmitted per millisecond bidirectionally.
dst2src_transmission_rate_bytes_ms: Number of bytes transmitted per millisecond from destination to source.
dst2src_transmission_rate_ms: Number of packets transmitted per millisecond from destination to source.
src2dst_transmission_rate_bytes_ms: Number of bytes transmitted per millisecond from source to destination.
src2dst_transmission_rate_ms: Number of packets transmitted per millisecond from source to destination.

You can only classify the frame into one of the following 9 IoT device types:

Smart Camera
Router
Mobile
Smart Switch
Smart Speaker
Smart Bulb
Computer
Smart Motion Sensor
Smart Printer

Note: Not all attributes might be available for every frame, and some may be more telling than others in identifying the device type."""
    }

    for index, row in df.iterrows():
        # Attribute 30 previously read 'src2dst_tranmission_rate_ms'; it now
        # matches the name announced in the system directive.
        user_query = f"""Given the following attributes about the Layer 2 (Data Link Layer of OSI Model) Frames transmitted/received by this IOT device, what is the type of this device?:

1. src_oui: {row['src_oui']}
2. dst_oui: {row['dst_oui']}
3. protocol: {row['protocol']}
4. bidirectional_total_duration_ms: {row['bidirectional_total_duration_ms']}
5. bidirectional_total_packets: {row['bidirectional_total_packets']}
6. bidirectional_total_bytes: {row['bidirectional_total_bytes']}
7. src2dst_total_duration_ms: {row['src2dst_total_duration_ms']}
8. src2dst_total_packets: {row['src2dst_total_packets']}
9. src2dst_total_bytes: {row['src2dst_total_bytes']}
10. dst2src_total_duration_ms: {row['dst2src_total_duration_ms']}
11. dst2src_total_packets: {row['dst2src_total_packets']}
12. dst2src_total_bytes: {row['dst2src_total_bytes']}
13. bidirectional_min_ps: {row['bidirectional_min_ps']}
14. bidirectional_mean_ps: {row['bidirectional_mean_ps']}
15. bidirectional_stdev_ps: {row['bidirectional_stdev_ps']}
16. bidirectional_max_ps: {row['bidirectional_max_ps']}
17. src2dst_min_ps: {row['src2dst_min_ps']}
18. src2dst_mean_ps: {row['src2dst_mean_ps']}
19. src2dst_stdev_ps: {row['src2dst_stdev_ps']}
20. src2dst_max_ps: {row['src2dst_max_ps']}
21. dst2src_min_ps: {row['dst2src_min_ps']}
22. dst2src_mean_ps: {row['dst2src_mean_ps']}
23. dst2src_stdev_ps: {row['dst2src_stdev_ps']}
24. dst2src_max_ps: {row['dst2src_max_ps']}
25. bidirectional_transmission_rate_byte_ms: {row['bidirectional_transmission_rate_byte_ms']}
26. bidirectional_transmission_rate_ms: {row['bidirectional_transmission_rate_ms']}
27. dst2src_transmission_rate_bytes_ms: {row['dst2src_transmission_rate_bytes_ms']}
28. dst2src_transmission_rate_ms: {row['dst2src_transmission_rate_ms']}
29. src2dst_transmission_rate_bytes_ms: {row['src2dst_transmission_rate_bytes_ms']}
30. src2dst_transmission_rate_ms: {row['src2dst_transmission_rate_ms']}"""

        user_directive = {
            "role": "user",
            "content": user_query
        }

        assistant_directive = {
            "role": "assistant",
            # Message content has to be text, so the label is coerced to str.
            "content": str( row['label'] )
        }

        # The original built this example and then dropped it; keep it.
        fine_tuning_examples.append(
            { "messages": [ system_directive, user_directive, assistant_directive ] }
        )

        if verbose:
            print( user_query )

    if output_path is not None:
        import json  # local import: only needed when a file is actually written

        with open( output_path, "w", encoding="utf-8" ) as handle:
            for example in fine_tuning_examples:
                handle.write( json.dumps( example ) + "\n" )

    return fine_tuning_examples

In [31]:
def count_tokens( s ):
    """Return how many tokens the string `s` encodes to with the gpt-3.5-turbo tokenizer."""
    tokenizer = tiktoken.encoding_for_model( "gpt-3.5-turbo" )
    token_ids = tokenizer.encode( s )
    return len( token_ids )

In [32]:
# Token count of the system-directive text (the same instructions that are sent
# as the system message elsewhere in this notebook).
count_tokens( """Your task is to use various attributes of encrypted Layer 2 (Data Link Layer of OSI Model) Frames transmitted/received by different IoT devices to classify them into one of 9 specific device types. You'll be provided the following attributes:

Communication Attributes

src_oui (e.g., 'Cisco'): Manufacturer of the device sending the frame.
dst_oui (e.g., 'Samsung' or 'multicast'): The destination that the frame is headed for.
protocol (e.g., 58 for IPV6 Multicast): Protocol number inside the Layer 2 Frame.

Transmission Metrics (Time in milliseconds, Size in bytes, Packets in count)

bidirectional_total_duration_ms: Total duration of bidirectional communication between the source and destination.
bidirectional_total_packets: Total number of packets transmitted bidirectionally across the source-destination channel.
bidirectional_total_bytes: Total bytes transmitted bidirectionally across the source-destination channel.
src2dst_total_duration_ms: Total duration of unidirectional communication from source to destination.
src2dst_total_packets: Total number of packets transmitted from source to destination.
src2dst_total_bytes: Total bytes transmitted from source to destination.
dst2src_total_duration_ms: Total duration of unidirectional communication from destination to source.
dst2src_total_packets: Total number of packets transmitted from destination to source.
dst2src_total_bytes: Total bytes transmitted from destination to source.

Packet Size Metrics (Size in bytes)

bidirectional_min_ps: Smallest packet size across bidirectional communication.
bidirectional_mean_ps: Mean packet size across bidirectional communication.
bidirectional_stdev_ps: Standard deviation in packet size across bidirectional communication.
bidirectional_max_ps: Largest packet size across bidirectional communication.
src2dst_min_ps: Smallest packet size from source to destination.
src2dst_mean_ps: Mean packet size from source to destination.
src2dst_stdev_ps: Standard deviation in packet size from source to destination.
src2dst_max_ps: Largest packet size from source to destination.
dst2src_min_ps: Smallest packet size from destination to source.
dst2src_mean_ps: Mean packet size from destination to source.
dst2src_stdev_ps: Standard deviation in packet size from destination to source.
dst2src_max_ps: Largest packet size from destination to source.

Transmission Rate Metrics (Bytes per millisecond, Packets per millisecond)

bidirectional_transmission_rate_byte_ms: Number of bytes transmitted per millisecond bidirectionally.
bidirectional_transmission_rate_ms: Number of packets transmitted per millisecond bidirectionally.
dst2src_transmission_rate_bytes_ms: Number of bytes transmitted per millisecond from destination to source.
dst2src_transmission_rate_ms: Number of packets transmitted per millisecond from destination to source.
src2dst_transmission_rate_bytes_ms: Number of bytes transmitted per millisecond from source to destination.
src2dst_transmission_rate_ms: Number of packets transmitted per millisecond from source to destination.

You can only classify the frame into one of the following 9 IoT device types:

Smart Camera
Router
Mobile
Smart Switch
Smart Speaker
Smart Bulb
Computer
Smart Motion Sensor
Smart Printer

Note: Not all attributes might be available for every frame, and some may be more telling than others in identifying the device type.""" )

672

In [33]:
# Token count of one fully populated user query, including the trailing
# answer-format instruction.
# NOTE: attribute 30 is spelled 'src2dst_tranmission_rate_ms' (missing an 's')
# in this sample text; it is left untouched here because editing the literal
# would change the count being measured.
count_tokens( """Given the following attributes about the Layer 2 (Data Link Layer of OSI Model) Frames transmitted/received by this IOT device, what is the type of this device?:

1. src_oui: Netatmo
2. dst_oui: TP-LINK TECHNOLOGIES CO.,LTD.
3. protocol: 1
4. bidirectional_total_duration_ms: 1313.0
5. bidirectional_total_packets: 4.0
6. bidirectional_total_bytes: 392.0
7. src2dst_total_duration_ms: 1003.0
8. src2dst_total_packets: 2.0
9. src2dst_total_bytes: 196.0
10. dst2src_total_duration_ms: 1003.0
11. dst2src_total_packets: 2.0
12. dst2src_total_bytes: 196.0
13. bidirectional_min_ps: 98.0
14. bidirectional_mean_ps: 98.0
15. bidirectional_stdev_ps: 0.0
16. bidirectional_max_ps: 98.0
17. src2dst_min_ps: 98.0
18. src2dst_mean_ps: 98.0
19. src2dst_stdev_ps: 0.0
20. src2dst_max_ps: 98.0
21. dst2src_min_ps: 98.0
22. dst2src_mean_ps: 98.0
23. dst2src_stdev_ps: 0.0
24. dst2src_max_ps: 98.0
25. bidirectional_transmission_rate_byte_ms: 0.2985529322162985
26. bidirectional_transmission_rate_ms: 0.0030464584920030465
27. dst2src_transmission_rate_bytes_ms: 0.19541375872382852
28. dst2src_transmission_rate_ms: 0.0019940179461615153
29. src2dst_transmission_rate_bytes_ms: 0.19541375872382852
30. src2dst_tranmission_rate_ms: 0.0019940179461615153

Do not include any words in your answer other than the device type.""" )

483

In [34]:
# One-shot sanity check of the prompt against the base chat model: define the
# system and user messages, report their token counts, then request a completion.
system_directive = """Your task is to use various attributes of encrypted Layer 2 (Data Link Layer of OSI Model) Frames transmitted/received by different IoT devices to classify them into one of 9 specific device types. You'll be provided the following attributes:

Communication Attributes

src_oui (e.g., 'Cisco'): Manufacturer of the device sending the frame.
dst_oui (e.g., 'Samsung' or 'multicast'): The destination that the frame is headed for.
protocol (e.g., 58 for IPV6 Multicast): Protocol number inside the Layer 2 Frame.

Transmission Metrics (Time in milliseconds, Size in bytes, Packets in count)

bidirectional_total_duration_ms: Total duration of bidirectional communication between the source and destination.
bidirectional_total_packets: Total number of packets transmitted bidirectionally across the source-destination channel.
bidirectional_total_bytes: Total bytes transmitted bidirectionally across the source-destination channel.
src2dst_total_duration_ms: Total duration of unidirectional communication from source to destination.
src2dst_total_packets: Total number of packets transmitted from source to destination.
src2dst_total_bytes: Total bytes transmitted from source to destination.
dst2src_total_duration_ms: Total duration of unidirectional communication from destination to source.
dst2src_total_packets: Total number of packets transmitted from destination to source.
dst2src_total_bytes: Total bytes transmitted from destination to source.

Packet Size Metrics (Size in bytes)

bidirectional_min_ps: Smallest packet size across bidirectional communication.
bidirectional_mean_ps: Mean packet size across bidirectional communication.
bidirectional_stdev_ps: Standard deviation in packet size across bidirectional communication.
bidirectional_max_ps: Largest packet size across bidirectional communication.
src2dst_min_ps: Smallest packet size from source to destination.
src2dst_mean_ps: Mean packet size from source to destination.
src2dst_stdev_ps: Standard deviation in packet size from source to destination.
src2dst_max_ps: Largest packet size from source to destination.
dst2src_min_ps: Smallest packet size from destination to source.
dst2src_mean_ps: Mean packet size from destination to source.
dst2src_stdev_ps: Standard deviation in packet size from destination to source.
dst2src_max_ps: Largest packet size from destination to source.

Transmission Rate Metrics (Bytes per millisecond, Packets per millisecond)

bidirectional_transmission_rate_byte_ms: Number of bytes transmitted per millisecond bidirectionally.
bidirectional_transmission_rate_ms: Number of packets transmitted per millisecond bidirectionally.
dst2src_transmission_rate_bytes_ms: Number of bytes transmitted per millisecond from destination to source.
dst2src_transmission_rate_ms: Number of packets transmitted per millisecond from destination to source.
src2dst_transmission_rate_bytes_ms: Number of bytes transmitted per millisecond from source to destination.
src2dst_transmission_rate_ms: Number of packets transmitted per millisecond from source to destination.

You can only classify the frame into one of the following 9 IoT device types:

Smart Camera
Router
Mobile
Smart Switch
Smart Speaker
Smart Bulb
Computer
Smart Motion Sensor
Smart Printer

Note: Not all attributes might be available for every frame, and some may be more telling than others in identifying the device type."""

# Attribute 30 previously read 'src2dst_tranmission_rate_ms'; it is corrected so
# the query uses exactly the attribute name declared in system_directive above.
user_query = """Given the following attributes about the Layer 2 (Data Link Layer of OSI Model) Frames transmitted/received by this IOT device, what is the type of this device?:

1. src_oui: Netatmo
2. dst_oui: TP-LINK TECHNOLOGIES CO.,LTD.
3. protocol: 1
4. bidirectional_total_duration_ms: 1313.0
5. bidirectional_total_packets: 4.0
6. bidirectional_total_bytes: 392.0
7. src2dst_total_duration_ms: 1003.0
8. src2dst_total_packets: 2.0
9. src2dst_total_bytes: 196.0
10. dst2src_total_duration_ms: 1003.0
11. dst2src_total_packets: 2.0
12. dst2src_total_bytes: 196.0
13. bidirectional_min_ps: 98.0
14. bidirectional_mean_ps: 98.0
15. bidirectional_stdev_ps: 0.0
16. bidirectional_max_ps: 98.0
17. src2dst_min_ps: 98.0
18. src2dst_mean_ps: 98.0
19. src2dst_stdev_ps: 0.0
20. src2dst_max_ps: 98.0
21. dst2src_min_ps: 98.0
22. dst2src_mean_ps: 98.0
23. dst2src_stdev_ps: 0.0
24. dst2src_max_ps: 98.0
25. bidirectional_transmission_rate_byte_ms: 0.2985529322162985
26. bidirectional_transmission_rate_ms: 0.0030464584920030465
27. dst2src_transmission_rate_bytes_ms: 0.19541375872382852
28. dst2src_transmission_rate_ms: 0.0019940179461615153
29. src2dst_transmission_rate_bytes_ms: 0.19541375872382852
30. src2dst_transmission_rate_ms: 0.0019940179461615153

Do not include any words in your answer other than device type."""

# Tokenize each message once and reuse the counts for the total.
system_token_count = count_tokens( system_directive )
user_token_count = count_tokens( user_query )

print( system_token_count )
print( user_token_count )
print( system_token_count + user_token_count )

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": system_directive },
        {"role": "user", "content": user_query },
    ]
)

print( response )

672
482
1154
{
  "id": "chatcmpl-84XF9Tds0yQnZ3oavjoFeH81WnKXD",
  "object": "chat.completion",
  "created": 1696091791,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Smart Camera"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 1165,
    "completion_tokens": 2,
    "total_tokens": 1167
  }
}


In [None]:
# Try the prompt builder on just the first row of df1.
create_fine_tuning_file( df1.head( 1 ) )

In [None]:
# X_iot / y_iot act as the unencrypted test set (plain aliases, no copy is made).
X_test_unencrypted, y_test_unencrypted = X_iot, y_iot

In [None]:
# X_consolidated / y_consolidated act as the encrypted test set (plain aliases, no copy is made).
X_test_encrypted, y_test_encrypted = X_consolidated, y_consolidated

In [None]:
# X_encrypted acts as the encrypted validation features (alias; no labels are assigned here).
X_validation_encrypted = X_encrypted

In [None]:
# Pool the source and destination manufacturer columns of every dataset so a
# single encoder knows all categories and yields the same width everywhere.
oui_columns = [
    frame[ column ]
    for frame in ( X_unsw, X_iot, X_consolidated, X_encrypted )
    for column in ( 'src_oui', 'dst_oui' )
]
all_ouis = pd.concat( oui_columns, axis=0 )
oui_ohe = OneHotEncoder().fit( all_ouis.to_frame() )


class ManufacturerOHE( BaseEstimator, TransformerMixin ):
    """Transformer that delegates to the module-level, already fitted `oui_ohe` encoder."""

    def fit( self, X, y=None ):
        """Do nothing: the shared encoder is fitted outside this class. Returns self."""
        return self

    def transform( self, X, y=None ):
        """One-hot encode X with the shared `oui_ohe` encoder."""
        return oui_ohe.transform( X )

In [None]:
# Number of distinct manufacturer categories the shared encoder learned.
len( oui_ohe.categories_[0] )

In [None]:
# Pool the protocol column of every dataset so a single encoder knows all
# protocol values and yields the same width everywhere.
protocol_columns = [
    frame[ 'protocol' ]
    for frame in ( X_unsw, X_iot, X_consolidated, X_encrypted )
]
all_protocols = pd.concat( protocol_columns, axis=0 )
protocol_ohe = OneHotEncoder().fit( all_protocols.to_frame() )

class ProtocolOHE( BaseEstimator, TransformerMixin ):
    """Transformer that delegates to the module-level, already fitted `protocol_ohe` encoder."""

    def fit( self, X, y=None ):
        """Do nothing: the shared encoder is fitted outside this class. Returns self."""
        return self

    def transform( self, X, y=None ):
        """One-hot encode X with the shared `protocol_ohe` encoder."""
        return protocol_ohe.transform( X )

In [None]:
# Inspect the protocol values the shared encoder learned.
protocol_ohe.categories_

In [None]:
# Columns that get a dedicated encoder (or are the target) and are therefore
# excluded from standard scaling.
non_scaled_columns = { 'src_oui', 'dst_oui', 'protocol', 'label' }
scaled_columns = [ col for col in all_features if col not in non_scaled_columns ]

col_transformer = make_column_transformer(
    ( ManufacturerOHE(), [ 'src_oui' ] ),
    ( ManufacturerOHE(), [ 'dst_oui' ] ),
    ( ProtocolOHE(), [ 'protocol' ] ),
    ( StandardScaler(), scaled_columns ),
    remainder='passthrough'
)

In [None]:
# Fit the preprocessing on the training split only. The column transformer
# contains a StandardScaler, so calling fit_transform on each evaluation set
# (as before) re-estimated the mean/variance per dataset and fed the model
# features on a different scale than it was trained on. Evaluation sets must be
# transformed with the statistics learned from the training data.
X_train_unencrypted = col_transformer.fit_transform( X_train_unencrypted )
X_test_unencrypted = col_transformer.transform( X_test_unencrypted )
X_test_encrypted = col_transformer.transform( X_test_encrypted )
X_validation_encrypted = col_transformer.transform( X_validation_encrypted )

In [None]:
# all_labels = pd.concat( [ y_train_unencrypted, y_test_unencrypted, y_test_encrypted ], axis=0 )
# le = LabelEncoder()
# le.fit( all_labels.to_frame() )

# y_train_unencrypted = le.transform( y_train_unencrypted )
# y_test_unencrypted = le.transform( y_test_unencrypted )
# y_test_encrypted = le.transform( y_test_encrypted )

In [None]:
# Print the shape of every split, one per line: features then labels for each
# labelled split, followed by the validation features.
print(
    *(
        split.shape
        for split in (
            X_train_unencrypted, y_train_unencrypted,
            X_test_unencrypted, y_test_unencrypted,
            X_test_encrypted, y_test_encrypted,
            X_validation_encrypted,
        )
    ),
    sep='\n'
)

In [None]:
# Cross-validation splitter: k shuffled folds with a fixed seed.
k = 10
kf = KFold( n_splits=k, shuffle=True, random_state=42 )

# Two hidden layers of 100 units each.
# TODO: grid search to find out hyperparameters.
mlp = MLPClassifier(
    hidden_layer_sizes=( 100, 100 ),
    activation='relu',
    random_state=42,
)

In [None]:
# scores = cross_validate( mlp, X_train_unencrypted, y_train_unencrypted, cv=kf )
# scores

In [None]:
# Train the classifier on the transformed unencrypted training split.
mlp.fit( X_train_unencrypted, y_train_unencrypted )

In [None]:
# Predict on the unencrypted test set and collect per-class metrics as a dict.
y_pred_unencrypted = mlp.predict( X_test_unencrypted )
cr_unencrypted = classification_report(
    y_true=y_test_unencrypted,
    y_pred=y_pred_unencrypted,
    output_dict=True,
)
cr_unencrypted

In [None]:
# Confusion matrix for the unencrypted test set (true labels vs. predictions).
cm = confusion_matrix(
    y_true=y_test_unencrypted,
    y_pred=y_pred_unencrypted,
)
cm

In [None]:
# Predict on the encrypted test set and collect per-class metrics as a dict.
y_pred_encrypted = mlp.predict( X_test_encrypted )
cr = classification_report(
    y_true=y_test_encrypted,
    y_pred=y_pred_encrypted,
    output_dict=True,
)
cr

In [None]:
# Confusion matrix for the encrypted test set (true labels vs. predictions).
cm_encrypted = confusion_matrix(
    y_true=y_test_encrypted,
    y_pred=y_pred_encrypted,
)
cm_encrypted

In [None]:
# Draw the encrypted-set confusion matrix as a shaded grid.
fig, ax = plt.subplots( figsize=( 7.5, 7.5 ) )
ax.matshow( cm_encrypted, cmap=plt.cm.Blues, alpha=0.3 )

# Write each cell's count at its position: column index on x, row index on y.
for ( i, j ), cell_count in np.ndenumerate( cm_encrypted ):
    ax.text( x=j, y=i, s=cell_count, va='center', ha='center', size='xx-large' )

ax.set_xlabel( 'Predictions', fontsize=18 )
ax.set_ylabel( 'Actuals', fontsize=18 )
ax.set_title( 'Confusion Matrix', fontsize=18 )
plt.show()

In [None]:
# Confusion matrix for the unencrypted test set, kept under an explicit name for plotting.
cm_unencrypted = confusion_matrix(
    y_true=y_test_unencrypted,
    y_pred=y_pred_unencrypted,
)
cm_unencrypted

In [None]:
# Draw the unencrypted-set confusion matrix as a shaded grid.
fig, ax = plt.subplots( figsize=( 7.5, 7.5 ) )
ax.matshow( cm_unencrypted, cmap=plt.cm.Blues, alpha=0.3 )

# Write each cell's count at its position: column index on x, row index on y.
for ( i, j ), cell_count in np.ndenumerate( cm_unencrypted ):
    ax.text( x=j, y=i, s=cell_count, va='center', ha='center', size='xx-large' )

ax.set_xlabel( 'Predictions', fontsize=18 )
ax.set_ylabel( 'Actuals', fontsize=18 )
ax.set_title( 'Confusion Matrix', fontsize=18 )
plt.show()