In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import numpy as np
import joblib
from sklearn.svm import SVC
from sklearn import svm
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

data = pd.read_csv(r"C:\Users\isa\Documents\GitHub\Cyber-Threat-Detection-ML\df_log_clipped.csv")

print(data.columns)

#data.isna().sum()
data.iloc[:, 42].value_counts()


Index(['proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss',
       'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
       'stcpb', 'dtcpb', 'smean', 'dmean', 'trans_depth', 'res_bdy_len',
       'sjit', 'djit', 'Src_pkt_AT', 'Dst_pkt_AT', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm',
       'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'attack_cat', 'label'],
      dtype='object')


label
0    1938908
1      88424
Name: count, dtype: int64

In [7]:

#////////SPLIT df/////////////////////////////
# X = columns 0 through 41 (i.e., first 42 columns)
X = data.iloc[:, 0:41]

# Y = column 42 (i.e., the 43rd column)
Y = data.iloc[:, 42]

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

#//////////////////////////////////////////////////////

In [8]:
#///////HOT ENCODE X (attributes) ////////////

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Fit + transform
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

#/////////////////////////////////////////////////////

In [19]:
clf = LogisticRegression(random_state=42)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

scoring = {
    'f1': 'f1',
    'recall': 'recall',
    'roc_auc': 'roc_auc',
    'balanced_accuracy': 'balanced_accuracy'
}

search_space = [
    {
        'C':[0.1, 1],
        "solver": ["saga"],  # Only use these two for large datasets
        "penalty": ["l2"],  # Only L2 regularization, most commonly used
        "max_iter": [700],  # Reasonable range for max_iter
        "tol": [1e-4]  # Two different tolerances for convergence
    }
]

GS = GridSearchCV(estimator = clf,
                  param_grid = search_space,
                  scoring = 'f1',
                  cv=cv,
                 n_jobs=-1,
                 verbose=2)

In [21]:





# Run GridSearchCV
GS.fit(X_train_preprocessed, Y_train)

# Get best parameters from the grid search
print("Best Parameters: ", GS.best_params_)

# Get the best score (based on 'f1' in your case)
print("Best F1 Score: ", GS.best_score_)

# Get the best model from the grid search
best_model = GS.best_estimator_


y_pred = best_model.predict(X_test_preprocessed)
y_proba = best_model.predict_proba(X_test_preprocessed)[:, 1]

report_dict = classification_report(Y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
print(report_df)


print("Confusion Matrix:\n", confusion_matrix(Y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(Y_test, y_proba))

# Evaluate the best model on the test set
test_score = best_model.score(X_test_preprocessed, Y_test)
print("Test Accuracy with Best Model: ", test_score)


precision = precision_score(Y_test, y_pred, pos_label=1)
recall = recall_score(Y_test, y_pred, pos_label=1)

print("Precision:", precision)
print("Recall:", recall)
#save model
joblib.dump(best_model, 'best_logistic_model.pkl')


Fitting 3 folds for each of 2 candidates, totalling 6 fits




Best Parameters:  {'C': 1, 'max_iter': 700, 'penalty': 'l2', 'solver': 'saga', 'tol': 0.0001}
Best F1 Score:  0.8601805822781324
              precision    recall  f1-score        support
0              0.996389  0.989828  0.993097  387719.000000
1              0.805724  0.921625  0.859786   17748.000000
accuracy       0.986842  0.986842  0.986842       0.986842
macro avg      0.901056  0.955726  0.926442  405467.000000
weighted avg   0.988043  0.986842  0.987262  405467.000000
Confusion Matrix:
 [[383775   3944]
 [  1391  16357]]
Test ROC AUC: 0.9975450172023524
Test Accuracy with Best Model:  0.9868423324216274
Precision: 0.8057238559676864
Recall: 0.9216249718278116


['best_logistic_model.pkl']

SVM Classifier

In [23]:

from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import GridSearchCV
import time


# Hyperparameters for linear kernel svm Grid Search
linear_grid = {
    'C': [0.5, 1, 1.5],
    'penalty': ['l2'],
    "tol": [1e-4]
}

linear_svm = LinearSVC(max_iter=10000, random_state=42, class_weight='balanced')

linear_GS = GridSearchCV(
    estimator=linear_svm,
    param_grid=linear_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=2
)

start_time = time.time()
linear_GS.fit(X_train_preprocessed, Y_train)
elapsed_time = time.time() - start_time

print('Time for training linear SVM: {} seconds'.format(elapsed_time))
print('Best Parameters for linear SVM: ', linear_GS.best_params_)
print('Best F1 Score in Linear SVM: ', linear_GS.best_score_)

# Evaluate the best model on the test set
best_svm_model = linear_GS.best_estimator_
test_score = best_svm_model.score(X_test_preprocessed, Y_test)
print("Test Accuracy with Best Model: ", test_score)

# Get predictions
y_pred = best_svm_model.predict(X_test_preprocessed)

# Print classification report (includes precision, recall, F1)
report_dict = classification_report(Y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
print(report_df)

# Print precision and recall directly
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
print("Precision:", precision)
print("Recall:", recall)

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(Y_test, y_pred))






Fitting 3 folds for each of 3 candidates, totalling 9 fits
Time for training linear SVM: 3687.6403017044067 seconds
Best Parameters for linear SVM:  {'C': 1.5, 'penalty': 'l2', 'tol': 0.0001}
Best F1 Score in Linear SVM:  0.8636551152952592
Test Accuracy with Best Model:  0.9860753156237129
              precision    recall  f1-score        support
0              0.999956  0.985482  0.992666  387719.000000
1              0.759033  0.999042  0.862654   17748.000000
accuracy       0.986075  0.986075  0.986075       0.986075
macro avg      0.879494  0.992262  0.927660  405467.000000
weighted avg   0.989410  0.986075  0.986975  405467.000000
Precision: 0.7590325342465754
Recall: 0.9990421455938697
Confusion Matrix:
 [[382090   5629]
 [    17  17731]]


# 