In [1]:
import os

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from cuml.dask.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import joblib

# One shared encoder: fitted on the training labels further down and reused
# to decode predictions back to class names.
label_encoder = LabelEncoder()

# Project root is taken to be the directory the kernel was started in.
current_path = os.getcwd()
print(current_path)

/media/maro/Mom0-0/Code/final-projects/Pred-Sus-Act


## Loading Data

In [2]:
# <project root>/data/processed/selected_features.json
selected_features_path = os.path.join(current_path, "data", "processed", "selected_features.json")

# Parse the JSON file into a pandas object.
selected_features = pd.read_json(selected_features_path)

In [3]:
# Load the reduced training set from <current_path>/data/processed/train_reduced.csv.
df = pd.read_csv(os.path.join(current_path,"data", "processed", "train_reduced.csv"))
# Disabled experiment: 20%-per-class subsampling restricted to the selected
# features. The full frame is used as-is instead.
# sampled_df = df#.groupby("label").sample(frac=0.20, random_state=42) # 20% per class
# sampled_df= sampled_df[selected_features]
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(700081, 21)

In [4]:
# Features are every column except the target column 'label'.
X = df.drop(columns=['label'])
y = df['label']
# Replace the label values with integer codes; the fitted encoder is reused
# later to translate codes back into the original label values.
y = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions in the two
# splits are left to chance — consider stratifying on y if classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### User-defined helper functions

In [41]:
from sklearn.metrics import classification_report, confusion_matrix

def generate_metric_report(y_true:np.ndarray, y_pred:np.ndarray, db_save:bool=False)->pd.DataFrame:
    """Build a per-class precision/recall/F1 table expressed in percent.

    Wraps sklearn's ``classification_report`` and returns it as a DataFrame
    with one row per metric (precision, recall, f1-score) and one column per
    class plus the aggregate columns produced by sklearn.

    Args:
        y_true: Ground-truth labels, as integer codes produced by the
            module-level ``label_encoder``.
        y_pred: Predicted labels, in the same encoding as ``y_true``.
        db_save: Currently unused; accepted only so existing callers keep working.

    Returns:
        DataFrame of percentages rounded to 2 decimals. Class columns are
        renamed from their integer code to the original label via
        ``label_encoder``; aggregate columns keep sklearn's names.
    """
    # zero_division=0 yields the same 0.0 value sklearn uses by default for a
    # class that is never predicted, but without emitting a warning per metric.
    report = pd.DataFrame(
        classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    )

    # Drop the last row (support: raw counts, not a ratio), scale to percent
    # and round every cell in one vectorised step.
    report = (report.iloc[:-1, :] * 100).round(2)

    def _display_name(column):
        # Class columns are keyed by the stringified label code ("0", "1", ...);
        # anything that is not numeric is an aggregate column and is kept as is.
        try:
            code = int(float(column))
        except (TypeError, ValueError):
            return column
        return label_encoder.classes_[code]

    # Rename by looking each code up instead of assuming a fixed number of
    # classes, so the report still works when a class is absent from the data.
    report.columns = [_display_name(column) for column in report.columns]
    return report

def generate_confusion_matrix(y_true:np.ndarray, y_pred:np.ndarray)->pd.DataFrame:
    """Return the confusion matrix as a DataFrame labelled with class names.

    Args:
        y_true: Ground-truth labels, as integer codes produced by the
            module-level ``label_encoder``.
        y_pred: Predicted labels, in the same encoding as ``y_true``.

    Returns:
        Square DataFrame with one row (true class) and one column (predicted
        class) for every class known to ``label_encoder``.
    """
    class_names = list(label_encoder.classes_)
    # Passing the full list of codes pins the matrix to one row/column per
    # known class, even when a class is missing from both y_true and y_pred;
    # without it the matrix shrinks and the names no longer line up.
    matrix = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_names)))
    return pd.DataFrame(matrix, index=class_names, columns=class_names)

# Core Models

## SVM

In [6]:
# Support vector classifier from cuML with an RBF kernel, fitted on the training split.
from cuml.svm import SVC
clf_svc = SVC(kernel="rbf")
clf_svc.fit(x_train, y_train)

In [7]:
# Predict on the held-out split and display the per-class metric table.
svc_pred = clf_svc.predict(x_test)
generate_metric_report(y_test, svc_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,67.04,73.8,66.07,67.81,70.62,0.0,72.94,57.56,70.89
recall,45.61,98.26,22.93,7.64,31.47,0.0,72.94,34.32,72.94
f1-score,54.28,84.29,34.05,13.73,43.54,0.0,72.94,38.32,67.57


## KNN

In [6]:
# k-nearest-neighbours classifier from cuML using 11 neighbours, fitted on the training split.
from cuml.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier(n_neighbors=11)
clf_knn.fit(x_train, y_train)

In [7]:
# Predict on the held-out split and display the per-class metric table.
knn_pred = clf_knn.predict(x_test)
generate_metric_report(y_test, knn_pred)

Unnamed: 0,BenignTraffic,DDoS,DoS,MITM,Mirai,Recon,accuracy,macro avg,weighted avg
precision,64.11,90.49,86.41,49.73,91.08,60.19,88.58,73.67,88.14
recall,74.63,96.7,70.5,13.97,94.97,30.33,88.58,63.52,88.58
f1-score,68.97,93.49,77.65,21.81,92.98,40.34,88.58,65.87,87.96


In [43]:
# Confusion matrix for the KNN predictions on the held-out split.
generate_confusion_matrix(y_test, knn_pred)

Unnamed: 0,BenignTraffic,DDoS,DoS,MITM,Mirai,Recon
BenignTraffic,3235,80,155,142,596,127
DDoS,6,88117,2965,0,34,4
DoS,268,8954,22399,5,80,64
MITM,792,13,53,181,210,47
Mirai,206,119,159,21,9719,10
Recon,539,99,190,15,32,381


In [34]:
import numpy as np
# np.unique(..., return_counts=True) yields (sorted unique values, counts);
# only the counts are kept, i.e. the number of held-out rows per label code.
_present_labels, test_label_dist = np.unique(y_test, return_counts=True)
test_label_dist

array([ 4335, 91126, 31770,  1296, 10234,  1256])

## Random Forest

In [44]:
# scikit-learn random forest with 100 trees; random_state fixes the seed for repeatable runs.
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(n_estimators=100, random_state=0)
clf_rf.fit(x_train, y_train)

In [45]:
# Predict on the held-out split and display the per-class metric table.
# NOTE(review): `y_pred` is reassigned by later model sections as well, so its
# value depends on which cell ran last.
y_pred = clf_rf.predict(x_test)
generate_metric_report(y_test, y_pred)

Unnamed: 0,BenignTraffic,DDoS,DoS,MITM,Mirai,Recon,accuracy,macro avg,weighted avg
precision,84.78,94.47,91.75,89.9,99.8,79.36,93.82,90.01,93.77
recall,96.12,97.43,83.44,70.06,99.33,65.21,93.82,85.27,93.82
f1-score,90.1,95.93,87.4,78.75,99.56,71.59,93.82,87.22,93.7


In [47]:
# Confusion matrix for whatever predictions `y_pred` currently holds.
generate_confusion_matrix(y_test, y_pred)

Unnamed: 0,BenignTraffic,DDoS,DoS,MITM,Mirai,Recon
BenignTraffic,4167,1,11,42,2,112
DDoS,7,88788,2325,1,3,2
DoS,49,5185,26510,13,7,6
MITM,282,2,9,908,7,88
Mirai,26,3,15,20,10165,5
Recon,384,2,24,26,1,819


## XGBoost Classifier

In [48]:
# Alternative hyper-parameters kept for reference (not applied below):
#gamma=0, learning_rate=0.03, min_child_weight=7, reg_lambda=0.4, subsample=0.6
import xgboost as xgb

# XGBoost classifier configured for the histogram tree method on a CUDA device.
# Row (subsample) and column (colsample_*) sampling are both enabled.
# NOTE(review): both `random_state` and `seed` are set to 42 — presumably one
# of them is redundant; confirm against the installed xgboost version.
initial_model = xgb.XGBClassifier(n_estimators=1000,
                                  gamma=0,
                                  max_depth=None,
                                  learning_rate=0.01,
                                  num_parallel_tree=10,
                                  subsample=0.7,
                                  colsample_bytree=0.8,
                                  colsample_bylevel=0.8,
                                  base_score=0.5,
                    tree_method= 'hist',device = 'cuda',
                    min_child_weight = 7, reg_lambda =0.1,
                    random_state=42, seed=42
                                  )

# Fit on the training split and keep the value returned by fit() as `init_mod`.
# Early stopping / evaluation arguments are disabled below.
# NOTE(review): the disabled eval_set refers to `X_test`, which is not defined
# in the visible cells (the held-out features are named `x_test`).
init_mod = initial_model.fit(x_train, y_train,
                             # early_stopping_rounds=10,
                             # eval_set=[(X_test, y_test)],
                             # eval_metric='error',
                             verbose=100)


    E.g. tree_method = "hist", device = "cuda"



In [49]:
# Predict on the held-out split with the fitted XGBoost model and display the metric table.
y_pred = init_mod.predict(x_test)
generate_metric_report(y_test, y_pred)


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Unnamed: 0,BenignTraffic,DDoS,DoS,MITM,Mirai,Recon,accuracy,macro avg,weighted avg
precision,84.17,91.18,90.47,89.72,99.71,78.32,91.32,88.93,91.3
recall,95.92,97.39,72.72,68.67,99.32,63.85,91.32,82.98,91.32
f1-score,89.66,94.18,80.63,77.8,99.51,70.35,91.32,85.36,90.99


In [50]:
# Confusion matrix for whatever predictions `y_pred` currently holds.
generate_confusion_matrix(y_test, y_pred)

Unnamed: 0,BenignTraffic,DDoS,DoS,MITM,Mirai,Recon
BenignTraffic,4158,1,13,46,5,112
DDoS,7,88747,2362,0,4,6
DoS,47,8577,23104,15,14,13
MITM,299,1,13,890,6,87
Mirai,31,0,16,19,10164,4
Recon,398,2,31,22,1,802


## Ensemble

In [6]:
# Imports are repeated here so this cell does not depend on the individual
# model sections having been run first.
# NOTE(review): LogisticRegression is imported but not used in this cell.
from sklearn.ensemble import VotingClassifier
from cuml.linear_model import LogisticRegression
from cuml.svm import SVC
from cuml.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Fresh (unfitted) copies of the three base models, configured like the
# standalone SVM / KNN / random-forest sections. The SVC is created with
# probability=True because soft voting combines predicted probabilities.
estimators = [
    ("svc", SVC(kernel="rbf", probability=True)),
    ("knn", KNeighborsClassifier(n_neighbors=11)),
    ("rf", RandomForestClassifier(n_estimators=100, random_state=0))
]

# Soft-voting ensemble over the three estimators.
# NOTE(review): the recorded cell output reports `n_jobs` as an unused keyword
# for a cuML estimator — confirm whether n_jobs=-1 has any effect here.
voting = VotingClassifier(
    n_jobs=-1,
    estimators=estimators,
    voting = "soft"
)

# Fit the ensemble on the training split.
voting.fit(x_train, y_train)

[I] [01:31:21.844904] Unused keyword parameter: n_jobs during cuML estimator initialization


In [34]:
# Predict on the held-out split with the voting ensemble.
y_pred = voting.predict(x_test)

In [43]:
# Per-class metric table for whatever predictions `y_pred` currently holds.
generate_metric_report(y_test, y_pred)

Unnamed: 0,BenignTraffic,DDoS,DoS,MITM,Mirai,Recon,accuracy,macro avg,weighted avg
precision,78.66,90.97,93.19,96.72,95.54,82.89,91.26,89.66,91.41
recall,87.52,98.72,72.49,34.1,97.6,44.75,91.26,72.53,91.26
f1-score,82.86,94.68,81.55,50.43,96.56,58.12,91.26,77.37,90.74


In [None]:
# Confusion matrix for whatever predictions `y_pred` currently holds.
generate_confusion_matrix(y_test, y_pred)

# Submission

In [44]:
# Choose the fitted model to use for the submission.
final_model = voting
# NOTE(review): `submit` is not read anywhere in the visible cells — presumably
# meant to gate the (commented-out) Kaggle upload; confirm or remove.
submit = False

In [45]:
# Load the raw competition test set from <current_path>/data/raw/test.csv.
testing_data = pd.read_csv(os.path.join(current_path, "data", "raw", "test.csv"))

# Keep the row identifiers for the submission file.
# NOTE: `id` shadows the Python builtin; the name is kept because the
# submission cell below reads it.
id = testing_data['Id']

# Select the training feature columns by name, in training order. Dropping
# 'label' explicitly mirrors how X was built and does not rely on the target
# happening to be the last column of the training frame.
feature_columns = df.columns.drop('label')
testing_data = testing_data[feature_columns]
testing_data


Unnamed: 0,flow_time,header_size,packet_duration,overall_rate,src_rate,fin_packets,urg_packets,rst_packets,max_value,value_covariance,fin_flags,syn_flags,rst_flags,psh_flags,ack_flags,protocol_http,protocol_https,protocol_tcp,protocol_udp,protocol_icmp
0,0.000000,54.0,64.00,249.534700,249.534700,0.0,0.0,0.0,54.0,0.000000,0,0,0,0,0,0,0,1,0,0
1,4.466080,108.0,64.00,0.447820,0.447820,0.0,0.0,0.0,54.0,0.000000,0,1,0,0,0,0,0,1,0,0
2,0.000000,54.0,64.00,1.295361,1.295361,0.0,1.0,1.0,54.0,0.000000,0,0,0,1,1,0,0,1,0,0
3,0.000000,0.0,64.00,2.249081,2.249081,0.0,0.0,0.0,42.0,0.000000,0,0,0,0,0,0,0,0,0,1
4,4.413071,108.0,64.00,0.453199,0.453199,0.0,0.0,0.0,54.0,0.000000,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104282,0.000000,54.0,64.00,10.863896,10.863896,0.0,0.0,0.0,54.0,0.000000,0,0,0,0,0,1,0,1,0,0
104283,0.000000,54.0,64.00,0.657968,0.657968,0.0,0.0,0.0,54.0,0.000000,0,0,0,0,0,0,0,1,0,0
104284,0.000000,54.0,64.00,1.356930,1.356930,1.0,0.0,0.0,54.0,0.000000,1,0,1,0,0,0,0,1,0,0
104285,0.000000,0.0,64.00,23.900780,23.900780,0.0,0.0,0.0,42.0,0.000000,0,0,0,0,0,0,0,0,0,1


In [46]:
# Predict with the model picked in the "choose your model" cell, so changing
# `final_model` there actually changes what is submitted.
y_pred = final_model.predict(testing_data)
# !kaggle competitions submit -c csai-253-project-phase-2 -f submission.csv -m "Message"

In [49]:
# Translate the predicted integer codes back to the original label values.
decoded_targets = label_encoder.inverse_transform(y_pred)

# Two-column submission frame: row identifier first, predicted label second.
result = pd.DataFrame({'Target': decoded_targets})
result = result.assign(Id=id)[['Id', 'Target']]

# Write the submission under <current_path>/data and show the predicted class balance.
submission_path = os.path.join(current_path, "data", "submission.csv")
result.to_csv(submission_path, index=False)
print(result['Target'].value_counts())

Target
DDoS             74545
DoS              20369
Mirai             5906
BenignTraffic     2808
Recon              409
MITM               250
Name: count, dtype: int64


In [30]:
# Save the trained ensemble model to 'voting_model.pkl'. The path is relative,
# so the file is written to the current working directory.
joblib.dump(voting, 'voting_model.pkl')


['voting_model.pkl']

In [32]:
# Load the model from the previously saved file, rebinding `voting`.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.

voting = joblib.load('voting_model.pkl')