In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier

from firewall import *

# Show up to 100 columns when rendering wide DataFrames (the results tables
# loaded further down have far more columns than the pandas default displays).
pd.set_option('display.max_columns', 100)

# Load Data

In [2]:
# Read the raw log from a CSV in the working directory and preview the first rows.
df = pd.read_csv("./log2.csv")
df.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


In [3]:
# Summarise column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65532 entries, 0 to 65531
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Source Port           65532 non-null  int64 
 1   Destination Port      65532 non-null  int64 
 2   NAT Source Port       65532 non-null  int64 
 3   NAT Destination Port  65532 non-null  int64 
 4   Action                65532 non-null  object
 5   Bytes                 65532 non-null  int64 
 6   Bytes Sent            65532 non-null  int64 
 7   Bytes Received        65532 non-null  int64 
 8   Packets               65532 non-null  int64 
 9   Elapsed Time (sec)    65532 non-null  int64 
 10  pkts_sent             65532 non-null  int64 
 11  pkts_received         65532 non-null  int64 
dtypes: int64(11), object(1)
memory usage: 6.0+ MB


In [4]:
# Count missing values per column.
df.isna().sum()

Source Port             0
Destination Port        0
NAT Source Port         0
NAT Destination Port    0
Action                  0
Bytes                   0
Bytes Sent              0
Bytes Received          0
Packets                 0
Elapsed Time (sec)      0
pkts_sent               0
pkts_received           0
dtype: int64

In [5]:
# Report how many distinct values each port column takes in the raw frame.
port_columns = [name for name in df.columns if "Port" in name]
for col, unique_count in df[port_columns].nunique().items():
    print(f"{col} --> Number of Unique Values = {unique_count}")

Source Port --> Number of Unique Values = 22724
Destination Port --> Number of Unique Values = 3273
NAT Source Port --> Number of Unique Values = 29152
NAT Destination Port --> Number of Unique Values = 2533


In [6]:
# Class balance of the target in the raw frame, as fractions of all rows.
df['Action'].value_counts(normalize=True)

allow         0.574376
deny          0.228697
drop          0.196103
reset-both    0.000824
Name: Action, dtype: float64

# Create Cross-Validation and Final Test Sets

In [7]:
# The training split is loaded from a previously saved CSV instead of being
# regenerated on every run.
# NOTE(review): the commented-out call below presumably produced that file via a
# helper from the `firewall` module; re-running it may yield a different split
# than the one the cached results further down were computed on -- confirm
# before uncommenting.
# train_df, test_df = create_train_and_final_test_sets(df=df)
train_df = pd.read_csv("./datasets/train_20221018_1118.csv")

In [8]:
# (rows, columns) of the training split.
train_df.shape

(58978, 12)

In [9]:
# Report how many distinct values each port column takes in the training split.
port_columns = [name for name in train_df.columns if "Port" in name]
for col, unique_count in train_df[port_columns].nunique().items():
    print(f"{col} --> Number of Unique Values = {unique_count}")

Source Port --> Number of Unique Values = 21768
Destination Port --> Number of Unique Values = 3098
NAT Source Port --> Number of Unique Values = 26848
NAT Destination Port --> Number of Unique Values = 2393


# Baseline Models

In [10]:
# Class balance of the target in the training split, as fractions of all rows.
train_df['Action'].value_counts(normalize=True)

allow         0.574367
deny          0.228695
drop          0.196107
reset-both    0.000831
Name: Action, dtype: float64

In [11]:
# NOTE(review): NuSVC is not used by any cell visible in this section, and this
# import sits apart from the notebook's main import cell -- move it there or
# remove it.
from sklearn.svm import NuSVC

In [12]:
# Baseline comparison of kernel/linear SVMs and SGD linear classifiers.
#
# NOTE(review): the original cell opened with a bare "STANDARD SCALER!" reminder.
# Presumably it means the numeric features must be standardised before fitting
# these models -- confirm that get_all_baseline_model_performance does so.

# Single seed shared by every estimator and by the CV splitter below.
RANDOM_STATE = 7742

# The cross-validated sweep is slow (the recorded fit times reach ~35 s per fold
# for the linear-kernel SVC), so its results are cached on disk. Set
# RERUN_BASELINES to True to recompute the sweep and overwrite the cache.
RERUN_BASELINES = False
BASELINE_RESULTS_CSV = "./models/baseline_models_ohe_10192022.csv"

# LinearSVC and SGDClassifier each appear twice: once with the default iteration
# budget and once with max_iter raised to 20,000.
all_estimators = [
    SVC(kernel='rbf', random_state=RANDOM_STATE),
    SVC(kernel='poly', random_state=RANDOM_STATE),
    SVC(kernel='sigmoid', random_state=RANDOM_STATE),
    SVC(kernel='linear', random_state=RANDOM_STATE),
    LinearSVC(random_state=RANDOM_STATE),
    LinearSVC(max_iter=20_000, random_state=RANDOM_STATE),
    SGDClassifier(random_state=RANDOM_STATE),
    SGDClassifier(max_iter=20_000, random_state=RANDOM_STATE),
]

# scikit-learn scorer names evaluated for every estimator.
metrics = ['f1_weighted', 'f1_micro', 'f1_macro',
           'recall_weighted', 'recall_micro', 'recall_macro',
           'precision_weighted', 'precision_micro', 'precision_macro',
           'accuracy', 'balanced_accuracy']

if RERUN_BASELINES:
    # NOTE(review): get_all_baseline_model_performance is presumably provided by
    # `from firewall import *`; n_jobs=50 assumes a machine with that many
    # cores -- lower it when running elsewhere.
    base_df = get_all_baseline_model_performance(
        df=train_df,
        target_column="Action",
        estimators=all_estimators,
        metrics=metrics,
        n_cv_splits=5,
        random_state=RANDOM_STATE,
        shuffle=True,
        n_jobs=50,
        return_estimator=False,
        sort_metric="test_f1_weighted",
        smaller_is_better=False,
        candidate_ohe_columns=['Source Port', 'Destination Port',
                               'NAT Source Port', 'NAT Destination Port'],
    )
    base_df.to_csv(BASELINE_RESULTS_CSV, index=False)

# Always read the cached table so downstream cells see the same frame whether or
# not the sweep was just re-run.
base_df = pd.read_csv(BASELINE_RESULTS_CSV)
base_df.head()

Unnamed: 0,model,columns_encoded,test_f1_weighted_mean,test_f1_weighted_min,test_f1_weighted_max,test_f1_weighted_std,train_f1_weighted_mean,train_f1_weighted_min,train_f1_weighted_max,train_f1_weighted_std,test_f1_micro_mean,test_f1_micro_min,test_f1_micro_max,test_f1_micro_std,train_f1_micro_mean,train_f1_micro_min,train_f1_micro_max,train_f1_micro_std,test_f1_macro_mean,test_f1_macro_min,test_f1_macro_max,test_f1_macro_std,train_f1_macro_mean,train_f1_macro_min,train_f1_macro_max,train_f1_macro_std,test_recall_weighted_mean,test_recall_weighted_min,test_recall_weighted_max,test_recall_weighted_std,train_recall_weighted_mean,train_recall_weighted_min,train_recall_weighted_max,train_recall_weighted_std,test_recall_micro_mean,test_recall_micro_min,test_recall_micro_max,test_recall_micro_std,train_recall_micro_mean,train_recall_micro_min,train_recall_micro_max,train_recall_micro_std,test_recall_macro_mean,test_recall_macro_min,test_recall_macro_max,test_recall_macro_std,train_recall_macro_mean,train_recall_macro_min,train_recall_macro_max,train_recall_macro_std,test_precision_weighted_mean,test_precision_weighted_min,test_precision_weighted_max,test_precision_weighted_std,train_precision_weighted_mean,train_precision_weighted_min,train_precision_weighted_max,train_precision_weighted_std,test_precision_micro_mean,test_precision_micro_min,test_precision_micro_max,test_precision_micro_std,train_precision_micro_mean,train_precision_micro_min,train_precision_micro_max,train_precision_micro_std,test_precision_macro_mean,test_precision_macro_min,test_precision_macro_max,test_precision_macro_std,train_precision_macro_mean,train_precision_macro_min,train_precision_macro_max,train_precision_macro_std,test_accuracy_mean,test_accuracy_min,test_accuracy_max,test_accuracy_std,train_accuracy_mean,train_accuracy_min,train_accuracy_max,train_accuracy_std,test_balanced_accuracy_mean,test_balanced_accuracy_min,test_balanced_accuracy_max,test_balanced_accuracy_std,train_balanced_accurac
y_mean,train_balanced_accuracy_min,train_balanced_accuracy_max,train_balanced_accuracy_std,fit_time_mean,fit_time_min,fit_time_max,fit_time_std,score_time_mean,score_time_min,score_time_max,score_time_std
0,"SVC(kernel='linear', random_state=7742)","Source Port, Destination Port, NAT Source Port",0.996877,0.996503,0.997261,0.000252,0.998613,0.998515,0.998706,6.4e-05,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.917364,0.890109,0.969532,0.030743,0.998862,0.998775,0.998939,5.3e-05,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.879764,0.847652,0.947903,0.038566,0.9988,0.998374,0.99928,0.000394,0.996956,0.996621,0.997375,0.000255,0.998618,0.998519,0.998709,6.3e-05,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.997483,0.996503,0.998292,0.000733,0.998928,0.998424,0.999296,0.000386,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.879764,0.847652,0.947903,0.038566,0.9988,0.998374,0.99928,0.000394,29.592915,26.055254,33.352509,2.718526,2.758888,2.690408,2.827435,0.04622
1,"SVC(kernel='linear', random_state=7742)","Source Port, Destination Port, NAT Source Port...",0.99686,0.996503,0.997261,0.000264,0.998613,0.998515,0.998706,6.4e-05,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.917352,0.890109,0.969532,0.030736,0.998862,0.998775,0.998939,5.3e-05,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.879757,0.847652,0.947903,0.038563,0.9988,0.998374,0.99928,0.000394,0.996938,0.996621,0.997375,0.000269,0.998618,0.998519,0.998709,6.3e-05,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.997464,0.996503,0.998292,0.000726,0.998928,0.998424,0.999296,0.000386,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.879757,0.847652,0.947903,0.038563,0.9988,0.998374,0.99928,0.000394,34.868756,31.684043,38.076043,2.468839,3.0629,2.895521,3.187483,0.105158
2,"SVC(kernel='linear', random_state=7742)","Source Port, Destination Port, NAT Destination...",0.99686,0.996503,0.997261,0.000264,0.998588,0.998515,0.998685,5.6e-05,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.917352,0.890109,0.969532,0.030736,0.997565,0.995617,0.998923,0.001565,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.879757,0.847652,0.947903,0.038563,0.99625,0.992217,0.999247,0.003056,0.996938,0.996621,0.997375,0.000269,0.998592,0.998519,0.998688,5.6e-05,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.997464,0.996503,0.998292,0.000726,0.998917,0.998414,0.999287,0.000388,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.879757,0.847652,0.947903,0.038563,0.99625,0.992217,0.999247,0.003056,28.130873,25.686436,29.971153,1.703711,2.740681,2.692439,2.818419,0.047076
3,LinearSVC(random_state=7742),"Source Port, Destination Port, NAT Destination...",0.99682,0.996503,0.997261,0.000281,0.998601,0.998515,0.998706,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.913163,0.890109,0.969596,0.029517,0.998852,0.998775,0.998939,5.3e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.998786,0.998374,0.999257,0.000386,0.996904,0.996621,0.997375,0.000276,0.998605,0.998519,0.998709,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.997426,0.996503,0.998292,0.000665,0.998922,0.998414,0.999296,0.000389,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.998786,0.998374,0.999257,0.000386,2.02978,0.713588,2.849905,0.787562,0.431883,0.424043,0.447101,0.008292
4,"LinearSVC(max_iter=20000, random_state=7742)","Source Port, Destination Port, NAT Source Port...",0.99682,0.996503,0.997261,0.000281,0.998613,0.998515,0.998706,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.913163,0.890109,0.969596,0.029517,0.998862,0.998775,0.998939,5.3e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,0.996904,0.996621,0.997375,0.000276,0.998618,0.998519,0.998709,6.3e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.997426,0.996503,0.998292,0.000665,0.998928,0.998424,0.999296,0.000386,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,3.033141,1.782557,4.13989,1.017081,0.420869,0.41006,0.432104,0.008354


In [13]:
# Show a longer slice (up to 35 rows) of the baseline results table.
base_df.head(35)

Unnamed: 0,model,columns_encoded,test_f1_weighted_mean,test_f1_weighted_min,test_f1_weighted_max,test_f1_weighted_std,train_f1_weighted_mean,train_f1_weighted_min,train_f1_weighted_max,train_f1_weighted_std,test_f1_micro_mean,test_f1_micro_min,test_f1_micro_max,test_f1_micro_std,train_f1_micro_mean,train_f1_micro_min,train_f1_micro_max,train_f1_micro_std,test_f1_macro_mean,test_f1_macro_min,test_f1_macro_max,test_f1_macro_std,train_f1_macro_mean,train_f1_macro_min,train_f1_macro_max,train_f1_macro_std,test_recall_weighted_mean,test_recall_weighted_min,test_recall_weighted_max,test_recall_weighted_std,train_recall_weighted_mean,train_recall_weighted_min,train_recall_weighted_max,train_recall_weighted_std,test_recall_micro_mean,test_recall_micro_min,test_recall_micro_max,test_recall_micro_std,train_recall_micro_mean,train_recall_micro_min,train_recall_micro_max,train_recall_micro_std,test_recall_macro_mean,test_recall_macro_min,test_recall_macro_max,test_recall_macro_std,train_recall_macro_mean,train_recall_macro_min,train_recall_macro_max,train_recall_macro_std,test_precision_weighted_mean,test_precision_weighted_min,test_precision_weighted_max,test_precision_weighted_std,train_precision_weighted_mean,train_precision_weighted_min,train_precision_weighted_max,train_precision_weighted_std,test_precision_micro_mean,test_precision_micro_min,test_precision_micro_max,test_precision_micro_std,train_precision_micro_mean,train_precision_micro_min,train_precision_micro_max,train_precision_micro_std,test_precision_macro_mean,test_precision_macro_min,test_precision_macro_max,test_precision_macro_std,train_precision_macro_mean,train_precision_macro_min,train_precision_macro_max,train_precision_macro_std,test_accuracy_mean,test_accuracy_min,test_accuracy_max,test_accuracy_std,train_accuracy_mean,train_accuracy_min,train_accuracy_max,train_accuracy_std,test_balanced_accuracy_mean,test_balanced_accuracy_min,test_balanced_accuracy_max,test_balanced_accuracy_std,train_balanced_accurac
y_mean,train_balanced_accuracy_min,train_balanced_accuracy_max,train_balanced_accuracy_std,fit_time_mean,fit_time_min,fit_time_max,fit_time_std,score_time_mean,score_time_min,score_time_max,score_time_std
0,"SVC(kernel='linear', random_state=7742)","Source Port, Destination Port, NAT Source Port",0.996877,0.996503,0.997261,0.000252,0.998613,0.998515,0.998706,6.4e-05,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.917364,0.890109,0.969532,0.030743,0.998862,0.998775,0.998939,5.3e-05,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.879764,0.847652,0.947903,0.038566,0.9988,0.998374,0.99928,0.000394,0.996956,0.996621,0.997375,0.000255,0.998618,0.998519,0.998709,6.3e-05,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.997483,0.996503,0.998292,0.000733,0.998928,0.998424,0.999296,0.000386,0.996948,0.996609,0.997372,0.000257,0.998614,0.998516,0.998707,6.4e-05,0.879764,0.847652,0.947903,0.038566,0.9988,0.998374,0.99928,0.000394,29.592915,26.055254,33.352509,2.718526,2.758888,2.690408,2.827435,0.04622
1,"SVC(kernel='linear', random_state=7742)","Source Port, Destination Port, NAT Source Port...",0.99686,0.996503,0.997261,0.000264,0.998613,0.998515,0.998706,6.4e-05,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.917352,0.890109,0.969532,0.030736,0.998862,0.998775,0.998939,5.3e-05,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.879757,0.847652,0.947903,0.038563,0.9988,0.998374,0.99928,0.000394,0.996938,0.996621,0.997375,0.000269,0.998618,0.998519,0.998709,6.3e-05,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.997464,0.996503,0.998292,0.000726,0.998928,0.998424,0.999296,0.000386,0.996931,0.996609,0.997372,0.00027,0.998614,0.998516,0.998707,6.4e-05,0.879757,0.847652,0.947903,0.038563,0.9988,0.998374,0.99928,0.000394,34.868756,31.684043,38.076043,2.468839,3.0629,2.895521,3.187483,0.105158
2,"SVC(kernel='linear', random_state=7742)","Source Port, Destination Port, NAT Destination...",0.99686,0.996503,0.997261,0.000264,0.998588,0.998515,0.998685,5.6e-05,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.917352,0.890109,0.969532,0.030736,0.997565,0.995617,0.998923,0.001565,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.879757,0.847652,0.947903,0.038563,0.99625,0.992217,0.999247,0.003056,0.996938,0.996621,0.997375,0.000269,0.998592,0.998519,0.998688,5.6e-05,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.997464,0.996503,0.998292,0.000726,0.998917,0.998414,0.999287,0.000388,0.996931,0.996609,0.997372,0.00027,0.998588,0.998516,0.998686,5.6e-05,0.879757,0.847652,0.947903,0.038563,0.99625,0.992217,0.999247,0.003056,28.130873,25.686436,29.971153,1.703711,2.740681,2.692439,2.818419,0.047076
3,LinearSVC(random_state=7742),"Source Port, Destination Port, NAT Destination...",0.99682,0.996503,0.997261,0.000281,0.998601,0.998515,0.998706,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.913163,0.890109,0.969596,0.029517,0.998852,0.998775,0.998939,5.3e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.998786,0.998374,0.999257,0.000386,0.996904,0.996621,0.997375,0.000276,0.998605,0.998519,0.998709,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.997426,0.996503,0.998292,0.000665,0.998922,0.998414,0.999296,0.000389,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.998786,0.998374,0.999257,0.000386,2.02978,0.713588,2.849905,0.787562,0.431883,0.424043,0.447101,0.008292
4,"LinearSVC(max_iter=20000, random_state=7742)","Source Port, Destination Port, NAT Source Port...",0.99682,0.996503,0.997261,0.000281,0.998613,0.998515,0.998706,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.913163,0.890109,0.969596,0.029517,0.998862,0.998775,0.998939,5.3e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,0.996904,0.996621,0.997375,0.000276,0.998618,0.998519,0.998709,6.3e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.997426,0.996503,0.998292,0.000665,0.998928,0.998424,0.999296,0.000386,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,3.033141,1.782557,4.13989,1.017081,0.420869,0.41006,0.432104,0.008354
5,LinearSVC(random_state=7742),"Source Port, Destination Port, NAT Source Port...",0.99682,0.996503,0.997261,0.000281,0.998613,0.998515,0.998706,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.913163,0.890109,0.969596,0.029517,0.998862,0.998775,0.998939,5.3e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,0.996904,0.996621,0.997375,0.000276,0.998618,0.998519,0.998709,6.3e-05,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.997426,0.996503,0.998292,0.000665,0.998928,0.998424,0.999296,0.000386,0.996897,0.996609,0.997372,0.000277,0.998614,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,2.105816,1.823953,2.26005,0.154713,0.426471,0.423064,0.432097,0.003084
6,"LinearSVC(max_iter=20000, random_state=7742)","Source Port, Destination Port, NAT Destination...",0.99682,0.996503,0.997261,0.000281,0.998601,0.998515,0.998706,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.913163,0.890109,0.969596,0.029517,0.998852,0.998775,0.998939,5.3e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.998786,0.998374,0.999257,0.000386,0.996904,0.996621,0.997375,0.000276,0.998605,0.998519,0.998709,6.4e-05,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.997426,0.996503,0.998292,0.000665,0.998922,0.998414,0.999296,0.000389,0.996897,0.996609,0.997372,0.000277,0.998601,0.998516,0.998707,6.4e-05,0.874749,0.847652,0.94794,0.03765,0.998786,0.998374,0.999257,0.000386,3.003364,1.250654,6.071458,1.805114,0.409644,0.399075,0.428989,0.011062
7,LinearSVC(random_state=7742),"Source Port, Destination Port, NAT Source Port",0.996803,0.996503,0.997261,0.000295,0.998613,0.998515,0.998706,6.4e-05,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.91315,0.890109,0.969596,0.029517,0.998862,0.998775,0.998939,5.3e-05,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.874742,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,0.996887,0.996611,0.997375,0.00029,0.998618,0.998519,0.998709,6.3e-05,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.997407,0.996503,0.998292,0.000659,0.998928,0.998424,0.999296,0.000386,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.874742,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,1.54247,1.215883,1.842916,0.200462,0.429475,0.423096,0.4421,0.006604
8,"LinearSVC(max_iter=20000, random_state=7742)","Source Port, Destination Port, NAT Source Port",0.996803,0.996503,0.997261,0.000295,0.998613,0.998515,0.998706,6.4e-05,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.91315,0.890109,0.969596,0.029517,0.998862,0.998775,0.998939,5.3e-05,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.874742,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,0.996887,0.996611,0.997375,0.00029,0.998618,0.998519,0.998709,6.3e-05,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.997407,0.996503,0.998292,0.000659,0.998928,0.998424,0.999296,0.000386,0.99688,0.996609,0.997372,0.000291,0.998614,0.998516,0.998707,6.4e-05,0.874742,0.847652,0.94794,0.03765,0.9988,0.998374,0.99928,0.000394,3.507039,1.309311,8.274426,2.575209,0.420475,0.394088,0.431096,0.013684
9,"LinearSVC(max_iter=20000, random_state=7742)","Destination Port, NAT Source Port, NAT Destina...",0.996689,0.995908,0.99726,0.000501,0.997666,0.997537,0.997839,0.000106,0.996762,0.996016,0.997372,0.000487,0.997669,0.997541,0.997838,0.000105,0.917277,0.889689,0.969866,0.030957,0.995396,0.994607,0.998127,0.001366,0.996762,0.996016,0.997372,0.000487,0.997669,0.997541,0.997838,0.000105,0.996762,0.996016,0.997372,0.000487,0.997669,0.997541,0.997838,0.000105,0.879577,0.847372,0.948187,0.038823,0.992731,0.990967,0.998459,0.00288,0.996768,0.99602,0.997378,0.000487,0.997673,0.997548,0.997841,0.000104,0.996762,0.996016,0.997372,0.000487,0.997669,0.997541,0.997838,0.000105,0.997497,0.996293,0.998406,0.000731,0.998137,0.997643,0.998494,0.000348,0.996762,0.996016,0.997372,0.000487,0.997669,0.997541,0.997838,0.000105,0.879577,0.847372,0.948187,0.038823,0.992731,0.990967,0.998459,0.00288,6.718797,2.176435,11.169912,2.903355,0.417698,0.394096,0.427097,0.012154


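## Best Baseline Model Error Analysis

**TODO:** this section is currently empty — the error analysis of the best baseline model has not been written yet.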

# Hyperparameter Search

In [14]:
# Hyperparameter search for the linear-kernel SVC pipeline.
#
# NOTE(review): ColumnTransformer, OneHotEncoder, StandardScaler, Pipeline and
# the run_gridsearch / get_gs_save_name / load_gs_from_pickle / gs_to_clean_df
# helpers are not imported explicitly in the visible cells; presumably they
# arrive via `from firewall import *` -- confirm and import them explicitly.

# Single seed shared by the estimator and the grid search.
RANDOM_STATE = 7742

# The grid below is 800 candidates x 5 folds = 4000 fits, so the fitted search
# object is cached as a pickle. Set RERUN_GRIDSEARCH to True to fit it again.
RERUN_GRIDSEARCH = False

target = "Action"
drop_cols = ["NAT Destination Port"]

X = train_df.drop(columns=drop_cols+[target])
y = train_df[target].to_numpy()

# The three remaining port columns are one-hot encoded as categories; every
# other remaining column is standardised.
ohe_cols = ["Source Port", "Destination Port", "NAT Source Port"]
scale_cols = [c for c in X.columns if c not in ohe_cols]

# handle_unknown='infrequent_if_exist' is used without min_frequency or
# max_categories, so no infrequent bucket is created and ports unseen during fit
# are encoded as all zeros, the same as handle_unknown='ignore'.
# ohe_cols and scale_cols together cover every column of X, so
# remainder="passthrough" has nothing left to pass through here.
preprocess = ColumnTransformer(transformers=[("ohe", OneHotEncoder(handle_unknown='infrequent_if_exist'), ohe_cols),
                                             ("scale", StandardScaler(), scale_cols)],
                               remainder="passthrough",
                               n_jobs=10)

model = Pipeline(steps=[("preprocess", preprocess),
                        ("model", SVC(kernel='linear',
                                      random_state=RANDOM_STATE,
                                      tol=5e-4,
                                      cache_size=10_000,
                                      break_ties=True))])

# 400 log-spaced values of C in [1e-2, 1e2], each with and without class
# re-weighting: 800 candidates in total.
parameter_grid = {"model__C": np.logspace(-2, 2, 400),
                  "model__class_weight": ["balanced", None]}

# scikit-learn scorer names evaluated for every candidate.
metrics = ['f1_weighted', 'f1_micro', 'f1_macro',
           'recall_weighted', 'recall_micro', 'recall_macro',
           'precision_weighted', 'precision_micro', 'precision_macro',
           'accuracy', 'balanced_accuracy']

PATH = "./models/SVC_Linear_20221019_0951.pkl"

if RERUN_GRIDSEARCH:
    # NOTE(review): the new search is saved under the name returned by
    # get_gs_save_name, which presumably differs from PATH -- update PATH
    # afterwards if the fresh result should become the cached one.
    gs1 = run_gridsearch(X=X,
                         y=y,
                         folds=5,
                         estimator=model,
                         param_grid=parameter_grid,
                         n_jobs=50,
                         scoring=metrics,
                         random_state=RANDOM_STATE,
                         save_name=get_gs_save_name(model_name="SVC_Linear"))
else:
    # NOTE(review): load_gs_from_pickle presumably unpickles the file, and
    # unpickling can execute arbitrary code -- only load pickles produced by
    # this project.
    gs1 = load_gs_from_pickle(pickle_filepath=PATH)

# Tabulate the CV results ordered by weighted F1 and preview them without the
# per-fold standard-deviation columns.
gs1_df = gs_to_clean_df(gs1.cv_results_, sort_metric="mean_test_f1_weighted")
gs1_df.loc[:, [c for c in gs1_df.columns if "std" not in c]].head()

Fitting 5 folds for each of 800 candidates, totalling 4000 fits


In [15]:
# Mean cross-validated score of the selected candidate.
# NOTE(review): with several scoring metrics this refers to whichever metric the
# search was refit on -- presumably f1_weighted, since the recorded value matches
# the top mean_test_f1_weighted in the results table; confirm in run_gridsearch.
gs1.best_score_

0.997342884010411

In [16]:
# Hyperparameters of the selected candidate. Several values of C tie at rank 1
# on weighted F1 in the results table, so this is one of a group of equally
# scoring settings rather than a unique optimum.
gs1.best_params_

{'model__C': 0.8218394177456803, 'model__class_weight': 'balanced'}

In [17]:
# Tabulate the CV results ordered by weighted F1, then preview the table with
# the standard-deviation columns hidden.
gs1_df = gs_to_clean_df(gs1.cv_results_, sort_metric="mean_test_f1_weighted")
non_std_columns = [name for name in gs1_df.columns if "std" not in name]
gs1_df.loc[:, non_std_columns].head()

Unnamed: 0,C,class_weight,params,mean_test_f1_weighted,rank_test_f1_weighted,mean_train_f1_weighted,mean_test_f1_micro,rank_test_f1_micro,mean_train_f1_micro,mean_test_f1_macro,rank_test_f1_macro,mean_train_f1_macro,mean_test_recall_weighted,rank_test_recall_weighted,mean_train_recall_weighted,mean_test_recall_micro,rank_test_recall_micro,mean_train_recall_micro,mean_test_recall_macro,rank_test_recall_macro,mean_train_recall_macro,mean_test_precision_weighted,rank_test_precision_weighted,mean_train_precision_weighted,mean_test_precision_micro,rank_test_precision_micro,mean_train_precision_micro,mean_test_precision_macro,rank_test_precision_macro,mean_train_precision_macro,mean_test_accuracy,rank_test_accuracy,mean_train_accuracy,mean_test_balanced_accuracy,rank_test_balanced_accuracy,mean_train_balanced_accuracy
390,0.901337,balanced,"{'model__C': 0.9013370389517434, 'model__class...",0.997343,1,0.998102,0.997372,1,0.998101,0.944434,1,0.998323,0.997372,1,0.998101,0.997372,1,0.998101,0.91047,92,0.998814,0.997379,1,0.998107,0.997372,1,0.998101,0.997363,230,0.997835,0.997372,1,0.998101,0.91047,92,0.998814
388,0.880769,balanced,"{'model__C': 0.8807692733975462, 'model__class...",0.997343,1,0.998102,0.997372,1,0.998101,0.944434,1,0.998323,0.997372,1,0.998101,0.997372,1,0.998101,0.91047,92,0.998814,0.997379,1,0.998107,0.997372,1,0.998101,0.997363,230,0.997835,0.997372,1,0.998101,0.91047,92,0.998814
386,0.860671,balanced,"{'model__C': 0.8606708472376163, 'model__class...",0.997343,1,0.998102,0.997372,1,0.998101,0.944434,1,0.998323,0.997372,1,0.998101,0.997372,1,0.998101,0.91047,92,0.998814,0.997379,1,0.998107,0.997372,1,0.998101,0.997363,230,0.997835,0.997372,1,0.998101,0.91047,92,0.998814
384,0.841031,balanced,"{'model__C': 0.8410310505352605, 'model__class...",0.997343,1,0.998098,0.997372,1,0.998097,0.944434,1,0.99832,0.997372,1,0.998097,0.997372,1,0.998097,0.91047,92,0.998812,0.997379,1,0.998103,0.997372,1,0.998097,0.997363,230,0.997831,0.997372,1,0.998097,0.91047,92,0.998812
382,0.821839,balanced,"{'model__C': 0.8218394177456803, 'model__class...",0.997343,1,0.998098,0.997372,1,0.998097,0.944434,1,0.99832,0.997372,1,0.998097,0.997372,1,0.998097,0.91047,92,0.998812,0.997379,1,0.998103,0.997372,1,0.998097,0.997363,230,0.997831,0.997372,1,0.998097,0.91047,92,0.998812


In [19]:
# Re-rank the candidates by mean test balanced accuracy (best first) and show the
# top two, to compare against the weighted-F1 ranking used above.
gs1_df.sort_values(by="mean_test_balanced_accuracy", ascending=False).head(2)

Unnamed: 0,C,class_weight,params,mean_test_f1_weighted,std_test_f1_weighted,rank_test_f1_weighted,mean_train_f1_weighted,std_train_f1_weighted,mean_test_f1_micro,std_test_f1_micro,rank_test_f1_micro,mean_train_f1_micro,std_train_f1_micro,mean_test_f1_macro,std_test_f1_macro,rank_test_f1_macro,mean_train_f1_macro,std_train_f1_macro,mean_test_recall_weighted,std_test_recall_weighted,rank_test_recall_weighted,mean_train_recall_weighted,std_train_recall_weighted,mean_test_recall_micro,std_test_recall_micro,rank_test_recall_micro,mean_train_recall_micro,std_train_recall_micro,mean_test_recall_macro,std_test_recall_macro,rank_test_recall_macro,mean_train_recall_macro,std_train_recall_macro,mean_test_precision_weighted,std_test_precision_weighted,rank_test_precision_weighted,mean_train_precision_weighted,std_train_precision_weighted,mean_test_precision_micro,std_test_precision_micro,rank_test_precision_micro,mean_train_precision_micro,std_train_precision_micro,mean_test_precision_macro,std_test_precision_macro,rank_test_precision_macro,mean_train_precision_macro,std_train_precision_macro,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,mean_train_accuracy,std_train_accuracy,mean_test_balanced_accuracy,std_test_balanced_accuracy,rank_test_balanced_accuracy,mean_train_balanced_accuracy,std_train_balanced_accuracy
30,0.014138,balanced,"{'model__C': 0.014137608138073606, 'model__cla...",0.9949,0.000568,656,0.995041,0.000146,0.994099,0.000594,729,0.994278,0.00016,0.834036,0.017218,596,0.845605,0.005053,0.994099,0.000594,729,0.994278,0.00016,0.994099,0.000594,729,0.994278,0.00016,0.964481,0.018451,1,0.997007,7.6e-05,0.996033,0.000673,562,0.996209,0.000145,0.994099,0.000594,729,0.994278,0.00016,0.802243,0.015426,623,0.808196,0.003892,0.994099,0.000594,729,0.994278,0.00016,0.964481,0.018451,1,0.997007,7.6e-05
152,0.057797,balanced,"{'model__C': 0.05779692884153313, 'model__clas...",0.995152,0.001122,640,0.9958,0.000492,0.994727,0.001426,724,0.99563,0.000768,0.870012,0.040113,579,0.934326,0.043971,0.994727,0.001426,724,0.99563,0.000768,0.994727,0.001426,724,0.99563,0.000768,0.96445,0.018773,2,0.997295,0.000357,0.995777,0.000773,604,0.996085,0.000125,0.994727,0.001426,724,0.99563,0.000768,0.837304,0.040606,609,0.903042,0.047482,0.994727,0.001426,724,0.99563,0.000768,0.96445,0.018773,2,0.997295,0.000357
