In [1]:
%run chatgpt_utils.ipynb

  TRAINING_INDICES_1_EXAMPLE = TRAINING_INDICES_0_EXAMPLE.append( pd.Index( [ 41032, 425, 405, 48, 106, 39748, 673855, 60, 37 ] ) )


In [29]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import json

In [3]:
common_features = [
    'src_oui',
    'dst_oui',
    'protocol',
    'bidirectional_min_ps',
    'bidirectional_mean_ps',
    'bidirectional_max_ps',
    'dst2src_min_ps',
    'dst2src_mean_ps',
    'dst2src_max_ps',
    'src2dst_min_ps',
    'src2dst_mean_ps',
    'src2dst_max_ps',
    'label'
]

df1_rename_map = {
     'bidirectional_bytes' : 'bidirectional_total_bytes',
     'bidirectional_duration_ms' : 'bidirectional_total_duration_ms',
     'bidirectional_packets': 'bidirectional_total_packets',
     'bidirectional_stddev_ps' : 'bidirectional_stdev_ps',
     'dst2src_bytes' : 'dst2src_total_bytes',
     'dst2src_duration_ms' : 'dst2src_total_duration_ms',
     'dst2src_packets' : 'dst2src_total_packets',
     'dst2src_stddev_ps' : 'dst2src_stdev_ps',
     'src2dst_bytes' : 'src2dst_total_bytes',
     'src2dst_duration_ms' : 'src2dst_total_duration_ms',
     'src2dst_packets' : 'src2dst_total_packets',
     'src2dst_stddev_ps' : 'src2dst_stdev_ps'
}

df1_extra_features = list( df1_rename_map.keys() )
df1_features = common_features + df1_extra_features

df2_extra_features = list( df1_rename_map.values() ) + [ 
    'bidirectional_transmission_rate_byte_ms', 
    'bidirectional_transmission_rate_ms', 
    'dst2src_transmission_rate_bytes_ms', 
    'dst2src_transmission_rate_ms',
    'src2dst_transmission_rate_bytes_ms',
    'src2dst_transmission_rate_ms'
]
df2_features = common_features + df2_extra_features
df3_features = df2_features
df4_features = [ f for f in df2_features if f != 'label' ]

all_features = df2_features

In [4]:
unsw_dups_df = pd.read_csv( '../data/csv/unswiotan18_labelled_dups.csv', index_col='entry_id' )
unsw_selected_df = pd.read_csv( '../data/csv/unsw_selected.csv', index_col='entry_id' )
zbw_selected_df = pd.read_csv( '../data/csv/zbw_selected.csv', index_col='entry_id' )

In [5]:
training_df = unsw_dups_df.loc[TRAINING_INDICES_7_EXAMPLE].drop( 'label', axis=1 )
training_df['protocol'] = training_df['protocol'].astype( 'str' )
training_labels = unsw_dups_df.loc[TRAINING_INDICES_7_EXAMPLE]['label']

In [6]:
unsw_testing_df = unsw_selected_df.drop( 'label', axis=1 )
unsw_testing_df['protocol'] = unsw_testing_df['protocol'].astype( 'str' )
unsw_testing_labels = unsw_selected_df['label']

zbw_testing_df = zbw_selected_df.drop( 'label', axis=1 )
zbw_testing_labels = zbw_selected_df['label']

In [7]:
ouis = pd.concat( [ unsw_dups_df['src_oui'], unsw_dups_df['dst_oui'], zbw_selected_df['src_oui'], zbw_selected_df['dst_oui'] ] ).drop_duplicates().to_numpy().reshape( -1, 1 )
labels = unsw_dups_df['label'].drop_duplicates().to_numpy().reshape( -1, 1 )
protocols = pd.concat( [ unsw_dups_df['protocol'].astype( 'str' ), zbw_selected_df['protocol'] ] ).drop_duplicates().to_numpy().reshape( -1, 1 ) 

ouiOHE = OneHotEncoder( handle_unknown='ignore' )
labelOHE = OneHotEncoder( handle_unknown='ignore' )
protocolOHE = OneHotEncoder( handle_unknown='ignore' )

ouiOHE.fit( ouis )
labelOHE.fit( labels )
protocolOHE.fit( protocols )

class OuiOHE( BaseEstimator, TransformerMixin ):
    def transform( self, X, y=None ):
        global ouiOHE
        return ouiOHE.transform( X )
    
    def fit( self, X, y=None ):
        return self

class LabelOHE( BaseEstimator, TransformerMixin ):
    def transform( self, X, y=None ):
        global labelOHE
        return labelOHE.transform( X )
    
    def fit( self, X, y=None ):
        return self

class ProtocolOHE( BaseEstimator, TransformerMixin ):
    def transform( self, X, y=None ):
        global protocolOHE
        return protocolOHE.transform( X )
    
    def fit( self, X, y=None ):
        return self

In [8]:
col_transformer = make_column_transformer(
    ( OuiOHE(), [ 'src_oui' ] ),
    ( OuiOHE(), [ 'dst_oui' ] ),
    ( ProtocolOHE(), [ 'protocol' ] ),
    ( StandardScaler(), [ col for col in all_features if col not in { 'src_oui', 'dst_oui', 'protocol', 'label' } ] ),
    remainder='passthrough'
)

In [9]:
X_train = col_transformer.fit_transform( training_df )
y_train = training_labels

X_test_unsw = col_transformer.transform( unsw_testing_df )
y_test_unsw = unsw_testing_labels

X_test_zbw = col_transformer.transform( zbw_testing_df )
y_test_zbw = zbw_testing_labels



In [10]:
print( X_train.shape )
print( y_train.shape )
print()

print( X_test_unsw.shape )
print( y_test_unsw.shape )
print()

print( X_test_zbw.shape )
print( y_test_zbw.shape )

(63, 140)
(63,)

(1023, 140)
(1023,)

(1012, 140)
(1012,)


In [17]:
knn = KNeighborsClassifier()
knn.fit( X_train, y_train )
y_pred_unsw_knn = knn.predict( X_test_unsw )
y_pred_zbw_knn = knn.predict( X_test_zbw )

In [18]:
rf = RandomForestClassifier()
rf.fit( X_train, y_train )
y_pred_unsw_rf = rf.predict( X_test_unsw )
y_pred_zbw_rf = rf.predict( X_test_zbw )

In [19]:
svc = SVC()
svc.fit( X_train, y_train )
y_pred_unsw_svc = svc.predict( X_test_unsw )
y_pred_zbw_svc = svc.predict( X_test_zbw )

In [30]:
dict1 = classification_report( y_test_unsw, y_pred_unsw_knn, output_dict=True )

with open( '../data/json/report_knn_7_examples_unsw.json', 'w' ) as f:
    json.dump( dict1, f, indent=4 )

In [31]:
dict2 = classification_report( y_test_unsw, y_pred_unsw_rf, output_dict=True )

with open( '../data/json/report_rf_7_examples_unsw.json', 'w' ) as f:
    json.dump( dict2, f, indent=4 )

In [32]:
dict3 = classification_report( y_test_unsw, y_pred_unsw_svc, output_dict=True )

with open( '../data/json/report_svc_7_examples_unsw.json', 'w' ) as f:
    json.dump( dict3, f, indent=4 )

In [33]:
dict4 = classification_report( y_test_zbw, y_pred_zbw_knn, output_dict=True )

with open( '../data/json/report_knn_7_examples_zbw.json', 'w' ) as f:
    json.dump( dict4, f, indent=4 )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
dict5 = classification_report( y_test_zbw, y_pred_zbw_rf, output_dict=True )

with open( '../data/json/report_rf_7_examples_zbw.json', 'w' ) as f:
    json.dump( dict5, f, indent=4 )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
dict6 = classification_report( y_test_zbw, y_pred_zbw_svc, output_dict=True )

with open( '../data/json/report_svc_7_examples_zbw.json', 'w' ) as f:
    json.dump( dict6, f, indent=4 )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
