# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, mean_squared_error

from tqdm import tqdm
import time
import joblib

In [2]:
X_train = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/Botnet/X_train.npy', allow_pickle=True)
y_train = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/Botnet/y_train.npy', allow_pickle=True)

X_test = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/Botnet/X_test.npy', allow_pickle=True)
y_test = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/Botnet/y_test.npy', allow_pickle=True)

#X_val = np.load('data/x_val.npy')
#y_val = np.load('data/y_val.npy')

In [108]:
X_train = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/PortScan/X_train.npy', allow_pickle=True)
y_train = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/PortScan/y_train.npy', allow_pickle=True)

X_test = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/PortScan/X_test.npy', allow_pickle=True)
y_test = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/PortScan/y_test.npy', allow_pickle=True)

In [72]:
X_train = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/DDoS/X_train.npy', allow_pickle=True)
y_train = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/DDoS/y_train.npy', allow_pickle=True)

X_test = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/DDoS/X_test.npy', allow_pickle=True)
y_test = np.load('/home/jovyan/ssl-ids/dataset/preprocessed/CIC-IDS-2017/FRI/DDoS/y_test.npy', allow_pickle=True)

In [29]:
#X_train = np.concatenate((X_train, X_val), axis=0)
#y_train = np.concatenate((y_train, y_val), axis=0)

# Playground

In [109]:
np.unique(y_train, return_counts=True)

(array(['BENIGN', 'PortScan'], dtype=object), array([ 89066, 111201]))

In [110]:
normal_data = X_train[y_train == 'BENIGN'] # normal class

# Helpers

In [111]:
def evaluate_model(model, y_test, y_pred):
    positive_class = 'normal'
    
    accuracy = accuracy_score(y_test, y_pred)
    
    precision = precision_score(y_test, y_pred, pos_label=positive_class, zero_division=True)
    recall = recall_score(y_test, y_pred, pos_label=positive_class, zero_division=True)
    f1 = f1_score(y_test, y_pred, pos_label=positive_class, zero_division=True)
    
    metrics = {
        'model': [model],
        'accuracy': [accuracy],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1],
    }
    
    return metrics

# OCSVM

In [112]:
from sklearn.svm import OneClassSVM

In [113]:
ocsvm = OneClassSVM(kernel='rbf', gamma='auto')

batch_size = 100
n_batches = int(np.ceil(len(normal_data) / batch_size))

for i in tqdm(range(n_batches), desc="Training OCSVM"):
    start = i * batch_size
    end = min((i + 1) * batch_size, len(normal_data))
    batch_data = normal_data[start:end]
    
    if i == 0:
        ocsvm.fit(batch_data)
    else:
        ocsvm.fit(np.vstack((ocsvm.support_vectors_, batch_data)))

Training OCSVM: 100%|██████████| 891/891 [00:32<00:00, 27.03it/s] 


In [None]:
ocsvm = OneClassSVM(kernel='linear', gamma='auto')

start_time = time.time()

ocsvm.fit(normal_data)

training_time = time.time() - start_time
print(f"Training Time: {training_time}")

In [114]:
y_pred_test_ocsvm = ocsvm.predict(X_test)
y_pred_test_ocsvm = np.where(y_pred_test_ocsvm == 1, 'normal', 'anomaly')
y_test_converted_ocsvm = np.where(y_test == 'BENIGN', 'normal', 'anomaly')

In [115]:
evaluate_model("ocsvm", y_test_converted_ocsvm, y_pred_test_ocsvm)

{'model': ['ocsvm'],
 'accuracy': [0.4431045865858868],
 'precision': [0.4441243336469113],
 'recall': [0.9862471166818992],
 'f1': [0.6124511830921203]}

In [116]:
ocsvm_df = pd.DataFrame(evaluate_model("ocsvm", y_test_converted_ocsvm, y_pred_test_ocsvm)) 
ocsvm_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,ocsvm,0.443105,0.444124,0.986247,0.612451


# SGDOCSVM

In [117]:
from sklearn.linear_model import SGDOneClassSVM

In [118]:
sgdocsvm = SGDOneClassSVM()

start_time = time.time()

sgdocsvm.fit(normal_data)

training_time = time.time() - start_time
print(f"Training Time: {training_time}")

Training Time: 0.14925837516784668


In [119]:
y_pred_test_sgdocsvm = sgdocsvm.predict(X_test)
y_pred_test_sgdocsvm = np.where(y_pred_test_sgdocsvm == 1, 'normal', 'anomaly')
y_test_converted_sgdocsvm = np.where(y_test == 'BENIGN', 'normal', 'anomaly')

In [120]:
evaluate_model("sgdocsvm", y_test_converted_sgdocsvm, y_pred_test_sgdocsvm)

{'model': ['sgdocsvm'],
 'accuracy': [0.7884578041865703],
 'precision': [0.9985970124618305],
 'recall': [0.526613570091831],
 'f1': [0.6895765657947227]}

In [121]:
sgdocsvm_df = pd.DataFrame(evaluate_model("sgdocsvm", y_test_converted_sgdocsvm, y_pred_test_sgdocsvm)) 
sgdocsvm_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,sgdocsvm,0.788458,0.998597,0.526614,0.689577


# LOF

In [122]:
from sklearn.neighbors import LocalOutlierFactor

In [123]:
start_time = time.time()

lof = LocalOutlierFactor(n_neighbors=50, contamination=0.001, novelty=True)
lof.fit(normal_data)

training_time = time.time() - start_time
print(f"Training Time: {training_time}")

Training Time: 2.3590829372406006


In [124]:
y_pred_test_lof = lof.predict(X_test)
y_pred_test_lof = np.where(y_pred_test_lof == 1, 'normal', 'anomaly')
y_test_converted_lof = np.where(y_test == 'BENIGN', 'normal', 'anomaly')

In [125]:
evaluate_model("lof", y_test_converted_lof, y_pred_test_lof)

{'model': ['lof'],
 'accuracy': [0.4458231387626704],
 'precision': [0.445979020979021],
 'recall': [0.9992166079122601],
 'f1': [0.6167049437930617]}

In [126]:
lof_df = pd.DataFrame(evaluate_model("lof", y_test_converted_lof, y_pred_test_lof)) 
lof_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,lof,0.445823,0.445979,0.999217,0.616705


# IF

In [127]:
from sklearn.ensemble import IsolationForest

In [128]:
start_time = time.time()

iforest = IsolationForest(n_estimators=200, max_samples=128, contamination=0.001, random_state=21)
iforest.fit(normal_data)

training_time = time.time() - start_time
print(f"Training Time: {training_time}")

Training Time: 11.943435668945312


In [129]:
y_pred_test_if = iforest.predict(X_test)
y_pred_test_if = np.where(y_pred_test_if == 1, 'normal', 'anomaly')
y_test_converted_if = np.where(y_test == "BENIGN", 'normal', 'anomaly')

In [130]:
evaluate_model("if", y_test_converted_if, y_pred_test_if)

{'model': ['if'],
 'accuracy': [0.4456872111538312],
 'precision': [0.44590367765624694],
 'recall': [0.9989119554336946],
 'f1': [0.6165748824714573]}

In [131]:
if_df = pd.DataFrame(evaluate_model("if", y_test_converted_if, y_pred_test_if)) 
if_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,if,0.445687,0.445904,0.998912,0.616575


# PCA Reconstruction

In [132]:
from sklearn.decomposition import PCA

In [133]:
start_time = time.time()

pca = PCA(n_components=10)
pca.fit(normal_data)

training_time = time.time() - start_time
print(f"Training Time: {training_time}")

Training Time: 0.21074199676513672


In [134]:
normal_data_pca = pca.transform(normal_data)
normal_data_reconstructed = pca.inverse_transform(normal_data_pca)

train_reconstruction_error = np.mean((normal_data - normal_data_reconstructed) ** 2, axis=1)

In [135]:
threshold = np.percentile(train_reconstruction_error, 95)

In [136]:
X_test_pca = pca.transform(X_test)
X_test_reconstructed = pca.inverse_transform(X_test_pca)

test_reconstruction_error = np.mean((X_test - X_test_reconstructed) ** 2, axis=1)

In [137]:
test_reconstruction_error

array([0.28893681, 0.11478281, 0.11682281, ..., 0.11938117, 0.32031757,
       0.1151785 ])

In [138]:
y_pred_test_pca = (test_reconstruction_error > threshold).astype(int)
y_pred_test_pca = np.where(y_pred_test_pca == 1, 'normal', 'anomaly')
y_test_converted_pca = np.where(y_test == "BENIGN", 'normal', 'anomaly')

In [139]:
evaluate_model("pca", y_test_converted_pca, y_pred_test_pca)

{'model': ['pca'],
 'accuracy': [0.5736533457610005],
 'precision': [0.918785890073831],
 'recall': [0.048744396570483524],
 'f1': [0.0925772855017358]}

In [140]:
pca_df = pd.DataFrame(evaluate_model("pca", y_test_converted_pca, y_pred_test_pca)) 
pca_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,pca,0.573653,0.918786,0.048744,0.092577


# Results

In [35]:
results_df = pd.concat([ocsvm_df, sgdocsvm_df, lof_df, if_df, pca_df], ignore_index=True)
results_df.to_csv('/home/jovyan/ssl-ids/result/CIC-IDS-2017/FRI/results_cic_ids_2017_fri_botnet.csv', index=False)

results_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,ocsvm,0.985509,0.990203,0.995209,0.9927
1,sgdocsvm,0.683573,0.985976,0.6902,0.811992
2,lof,0.989379,0.990013,0.999353,0.994661
3,if,0.989408,0.990013,0.999383,0.994676
4,pca,0.058519,0.991166,0.049468,0.094233


In [141]:
results_df = pd.concat([ocsvm_df, sgdocsvm_df, lof_df, if_df, pca_df], ignore_index=True)
results_df.to_csv('/home/jovyan/ssl-ids/result/CIC-IDS-2017/FRI/results_cic_ids_2017_fri_portscan.csv', index=False)

results_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,ocsvm,0.443105,0.444124,0.986247,0.612451
1,sgdocsvm,0.788458,0.998597,0.526614,0.689577
2,lof,0.445823,0.445979,0.999217,0.616705
3,if,0.445687,0.445904,0.998912,0.616575
4,pca,0.573653,0.918786,0.048744,0.092577


In [106]:
results_df = pd.concat([ocsvm_df, sgdocsvm_df, lof_df, if_df, pca_df], ignore_index=True)
results_df.to_csv('/home/jovyan/ssl-ids/result/CIC-IDS-2017/FRI/results_cic_ids_2017_fri_ddos.csv', index=False)

results_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,ocsvm,0.606414,0.523785,0.992376,0.685668
1,sgdocsvm,0.81016,0.999494,0.561422,0.718986
2,lof,0.432007,0.432251,0.998691,0.603358
3,if,0.432228,0.432377,0.999203,0.603574
4,pca,0.399591,0.103131,0.050413,0.067721


In [36]:
joblib.dump(ocsvm, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/Botnet/ocsvm.pkl')
joblib.dump(sgdocsvm_df, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/Botnet/sgdocsvm_df.pkl')
joblib.dump(lof, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/Botnet/lof.pkl')
joblib.dump(iforest, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/Botnet/iforest.pkl')
joblib.dump(pca, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/Botnet/pca.pkl')

['/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/Botnet/pca.pkl']

In [142]:
joblib.dump(ocsvm, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/PortScan/ocsvm.pkl')
joblib.dump(sgdocsvm_df, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/PortScan/sgdocsvm_df.pkl')
joblib.dump(lof, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/PortScan/lof.pkl')
joblib.dump(iforest, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/PortScan/iforest.pkl')
joblib.dump(pca, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/PortScan/pca.pkl')

['/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/PortScan/pca.pkl']

In [107]:
joblib.dump(ocsvm, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/DDoS/ocsvm.pkl')
joblib.dump(sgdocsvm_df, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/DDoS/sgdocsvm_df.pkl')
joblib.dump(lof, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/DDoS/lof.pkl')
joblib.dump(iforest, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/DDoS/iforest.pkl')
joblib.dump(pca, '/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/DDoS/pca.pkl')

['/home/jovyan/ssl-ids/model/CIC-IDS-2017/FRI/DDoS/pca.pkl']