In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, r2_score, roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score
import itertools
import joblib
import json

In [17]:
# Load model information files
model_1 = open('../models/model_info_Cost Effective Decision Tree (Ingress).json')
model_2 = open('../models/model_info_Cost Effective Decision Tree (Controller).json')

In [18]:
model_info_ingress = json.load(model_1)
model_info_controller = json.load(model_2)

In [19]:
model_info_ingress

{'model_name': 'Cost Effective Decision Tree (Ingress)',
 'training_file': 'MLM_Decision_Tree_Ingress.ipynb',
 'file_name': '../models/model_dtc_ingress.pkl',
 'scaler_file': '../models/scaler_dtc_ingress.pkl',
 'prepared_by': 'Digital Khalid',
 'library': 'Scikit-Learn',
 'classifier': 'Decision Tree Classifier',
 'normalization': 'Standard Scaler (Z-Score)',
 'data_file': '../datasets/mawi_flows.csv',
 'features': ['src_port', 'dst_port', 'protocol', 'pkt_size_1'],
 'target': ['elephant']}

In [22]:
model_ingress = joblib.load(model_info_ingress['file_name'])

In [29]:
# Read data file
input_file = model_info_ingress['data_file']
input_file
flows = pd.read_csv(input_file)

In [53]:
features_ingress = model_info_ingress['features']
target_ingress = model_info_ingress['target']

X_ingress = flows[features_ingress]
y = flows[target_ingress]

In [32]:
scaler_ingress = joblib.load(model_info_ingress['scaler_file'])

In [59]:
X_norm_ingress = scaler_ingress.transform(X_ingress)
X_norm_ingress

array([[-1.48137583,  2.11086118, -0.33874667, -0.11701203],
       [ 0.75371663, -0.59077602,  2.95205852,  1.07173183],
       [ 0.50830622, -0.75146112, -0.33874667, -0.23893448],
       ...,
       [-1.5707218 , -0.12623944, -0.33874667, -0.11701203],
       [ 0.51338111, -0.64920697, -0.33874667, -0.23893448],
       [ 0.77001214,  1.52882759, -0.33874667, -0.19829367]])

In [60]:
ingress_elephant = model_ingress.predict(X_norm_ingress)
ingress_elephant

array([0, 1, 0, ..., 0, 0, 0])

In [61]:
ingress_elephant = pd.Series(ingress_elephant)

In [62]:
ingress_elephant

0          0
1          1
2          0
3          0
4          1
          ..
1518385    0
1518386    0
1518387    0
1518388    0
1518389    0
Length: 1518390, dtype: int64

In [63]:
ingress_elephant = pd.concat([flows, ingress_elephant], axis=1)
ingress_elephant

Unnamed: 0,start_time,end_time,src_ip,dst_ip,protocol,src_port,dst_port,pkt_size_1,pkt_size_2,pkt_size_3,...,7_pkt_max_iat,7_pkt_mean_iat,7_pkt_duration,flow_size,flow_pkt_count,flow_max_iat,flow_mean_iat,flow_duration,elephant,0
0,1688187600,1688187685,149.40.55.233,203.115.138.41,6,1999,54524,66,66,66,...,19,12.4375,44,792,12,19,6.982422e+00,85,0,0
1,1688187600,1688187692,202.11.248.134,52.113.75.222,17,50005,3479,183,191,196,...,0,0.0000,0,841789,4697,1,4.547474e-13,92,1,1
2,1688187600,1688187606,45.100.248.16,202.249.92.162,6,44734,443,54,54,54,...,4,2.5000,6,216,4,4,2.500000e+00,6,0,0
3,1688187600,1688187600,38.88.114.91,163.37.23.124,6,43332,3128,54,0,0,...,0,0.0000,0,54,1,0,0.000000e+00,0,0,0
4,1688187600,1688187600,163.37.117.124,38.186.128.63,6,61554,443,85,54,0,...,0,0.0000,0,139,2,0,0.000000e+00,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518385,1688187692,1688187692,45.197.187.47,163.37.128.119,6,48152,3128,58,0,0,...,0,0.0000,0,58,1,0,0.000000e+00,0,0,0
1518386,1688187692,1688187692,167.248.189.103,163.37.93.253,6,7528,80,58,0,0,...,0,0.0000,0,58,1,0,0.000000e+00,0,0,0
1518387,1688187692,1688187692,202.249.93.185,46.25.179.131,6,80,12256,66,66,0,...,0,0.0000,0,132,2,0,0.000000e+00,0,0,0
1518388,1688187692,1688187692,146.64.20.95,150.161.104.40,6,44843,2375,54,0,0,...,0,0.0000,0,54,1,0,0.000000e+00,0,0,0


In [64]:
ingress_elephant.rename(columns={0: 'ingress_elephant'}, inplace=True)

In [65]:
ingress_elephant['ingress_elephant'].value_counts()

ingress_elephant
0    1452704
1      65686
Name: count, dtype: int64

In [67]:
ingress_elephant = ingress_elephant.query('ingress_elephant == 1')
ingress_elephant

Unnamed: 0,start_time,end_time,src_ip,dst_ip,protocol,src_port,dst_port,pkt_size_1,pkt_size_2,pkt_size_3,...,7_pkt_max_iat,7_pkt_mean_iat,7_pkt_duration,flow_size,flow_pkt_count,flow_max_iat,flow_mean_iat,flow_duration,elephant,ingress_elephant
1,1688187600,1688187692,202.11.248.134,52.113.75.222,17,50005,3479,183,191,196,...,0,0.0,0,841789,4697,1,4.547474e-13,92,1,1
4,1688187600,1688187600,163.37.117.124,38.186.128.63,6,61554,443,85,54,0,...,0,0.0,0,139,2,0,0.000000e+00,0,0,1
5,1688187600,1688187613,103.218.167.178,131.241.192.55,6,443,54447,1063,1126,590,...,0,0.0,0,26214,76,10,6.250315e-01,13,0,1
8,1688187600,1688187620,160.240.216.161,150.161.6.133,6,47714,80,66,66,66,...,0,0.0,0,4429710,67041,1,5.000000e-01,20,1,1
12,1688187600,1688187624,223.204.131.39,150.161.6.133,6,61429,443,54,54,54,...,0,0.0,0,1374766,24719,8,3.250000e+00,24,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518338,1688187692,1688187692,46.25.179.131,202.249.93.173,6,59526,443,54,0,0,...,0,0.0,0,54,1,0,0.000000e+00,0,0,1
1518354,1688187692,1688187692,38.186.128.37,163.37.145.161,6,80,58072,62,0,0,...,0,0.0,0,62,1,0,0.000000e+00,0,0,1
1518368,1688187692,1688187692,150.154.49.113,150.161.111.29,17,443,58289,1392,1392,0,...,0,0.0,0,2784,2,0,0.000000e+00,0,0,1
1518382,1688187692,1688187692,202.11.253.190,20.134.191.129,6,64216,443,66,0,0,...,0,0.0,0,66,1,0,0.000000e+00,0,0,1


In [42]:
model_info_controller

{'model_name': 'Cost Effective Decision Tree (Controller)',
 'training_file': 'MLM_Decision_Tree_Controller.ipynb',
 'file_name': '../models/model_dtc_controller.pkl',
 'scaler_file': '../models/scaler_dtc_controller.pkl',
 'prepared_by': 'Digital Khalid',
 'library': 'Scikit-Learn',
 'classifier': 'Decision Tree Classifier',
 'normalization': 'Standard Scaler (Z-Score)',
 'data_file': '../datasets/flows_ingress_elephant.csv',
 'features': ['src_port',
  'dst_port',
  'protocol',
  'pkt_size_1',
  'pkt_size_2',
  'pkt_size_3',
  'pkt_size_4',
  'pkt_size_5',
  'pkt_size_6',
  'pkt_size_7',
  '7_pkt_size',
  '7_pkt_min_size',
  '7_pkt_max_size',
  '7_pkt_mean_size'],
 'target': ['elephant']}

In [68]:
model_controller = joblib.load(model_info_controller['file_name'])

In [69]:
features_controller = model_info_controller['features']
target_controller = model_info_controller['target']

X_controller = ingress_elephant[features_controller]

In [70]:
scaler_controller = joblib.load(model_info_controller['scaler_file'])

In [71]:
X_norm_controller = scaler_controller.transform(X_controller)
X_norm_controller

array([[ 1.1070318 , -0.84355919,  2.05260191, ...,  0.53224246,
        -0.4343061 , -0.24780138],
       [ 1.56244011, -0.9583822 , -0.48718653, ..., -0.29021012,
        -0.54495971, -0.54683991],
       [-0.84733189,  1.08407559, -0.48718653, ...,  3.51008804,
         1.02711896,  1.85795229],
       ...,
       [-0.84733189,  1.22938192,  2.05260191, ...,  9.19635506,
         0.75796155,  3.00916421],
       [ 1.66740997, -0.9583822 , -0.48718653, ..., -0.20512882,
        -0.56390041, -0.55625089],
       [-0.70706976, -0.90327775,  2.05260191, ..., -0.08459698,
        -0.54695347, -0.51054044]])

In [78]:
controller_elephant = model_controller.predict(X_norm_controller)
controller_elephant

array([0., 0., 1., ..., 0., 0., 0.])

In [79]:
controller_elephant = pd.Series(controller_elephant)

In [74]:
controller_elephant

0        0.0
1        0.0
2        1.0
3        0.0
4        0.0
        ... 
65681    0.0
65682    0.0
65683    0.0
65684    0.0
65685    0.0
Length: 65686, dtype: float64

In [75]:
controller_elephant = pd.concat([ingress_elephant, controller_elephant], axis=1)
controller_elephant

Unnamed: 0,start_time,end_time,src_ip,dst_ip,protocol,src_port,dst_port,pkt_size_1,pkt_size_2,pkt_size_3,...,7_pkt_mean_iat,7_pkt_duration,flow_size,flow_pkt_count,flow_max_iat,flow_mean_iat,flow_duration,elephant,ingress_elephant,0
1,1.688188e+09,1.688188e+09,202.11.248.134,52.113.75.222,17.0,50005.0,3479.0,183.0,191.0,196.0,...,0.0,0.0,841789.0,4697.0,1.0,4.547474e-13,92.0,1.0,1.0,0.0
4,1.688188e+09,1.688188e+09,163.37.117.124,38.186.128.63,6.0,61554.0,443.0,85.0,54.0,0.0,...,0.0,0.0,139.0,2.0,0.0,0.000000e+00,0.0,0.0,1.0,0.0
5,1.688188e+09,1.688188e+09,103.218.167.178,131.241.192.55,6.0,443.0,54447.0,1063.0,1126.0,590.0,...,0.0,0.0,26214.0,76.0,10.0,6.250315e-01,13.0,0.0,1.0,0.0
8,1.688188e+09,1.688188e+09,160.240.216.161,150.161.6.133,6.0,47714.0,80.0,66.0,66.0,66.0,...,0.0,0.0,4429710.0,67041.0,1.0,5.000000e-01,20.0,1.0,1.0,0.0
12,1.688188e+09,1.688188e+09,223.204.131.39,150.161.6.133,6.0,61429.0,443.0,54.0,54.0,54.0,...,0.0,0.0,1374766.0,24719.0,8.0,3.250000e+00,24.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65680,,,,,,,,,,,...,,,,,,,,,,0.0
65681,,,,,,,,,,,...,,,,,,,,,,0.0
65682,,,,,,,,,,,...,,,,,,,,,,0.0
65683,,,,,,,,,,,...,,,,,,,,,,0.0


In [51]:
controller_elephant.rename(columns={0: 'controller_elephant'}, inplace=True)

In [52]:
controller_elephant

Unnamed: 0,start_time,end_time,src_ip,dst_ip,protocol,src_port,dst_port,pkt_size_1,pkt_size_2,pkt_size_3,...,7_pkt_mean_iat,7_pkt_duration,flow_size,flow_pkt_count,flow_max_iat,flow_mean_iat,flow_duration,elephant,ingress_elephant,controller_elephant
0,1688187600,1688187685,149.40.55.233,203.115.138.41,6,1999,54524,66,66,66,...,12.4375,44,792,12,19,6.982422e+00,85,0,0,0.0
1,1688187600,1688187692,202.11.248.134,52.113.75.222,17,50005,3479,183,191,196,...,0.0000,0,841789,4697,1,4.547474e-13,92,1,0,0.0
2,1688187600,1688187606,45.100.248.16,202.249.92.162,6,44734,443,54,54,54,...,2.5000,6,216,4,4,2.500000e+00,6,0,0,0.0
3,1688187600,1688187600,38.88.114.91,163.37.23.124,6,43332,3128,54,0,0,...,0.0000,0,54,1,0,0.000000e+00,0,0,0,0.0
4,1688187600,1688187600,163.37.117.124,38.186.128.63,6,61554,443,85,54,0,...,0.0000,0,139,2,0,0.000000e+00,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518385,1688187692,1688187692,45.197.187.47,163.37.128.119,6,48152,3128,58,0,0,...,0.0000,0,58,1,0,0.000000e+00,0,0,0,0.0
1518386,1688187692,1688187692,167.248.189.103,163.37.93.253,6,7528,80,58,0,0,...,0.0000,0,58,1,0,0.000000e+00,0,0,0,0.0
1518387,1688187692,1688187692,202.249.93.185,46.25.179.131,6,80,12256,66,66,0,...,0.0000,0,132,2,0,0.000000e+00,0,0,0,0.0
1518388,1688187692,1688187692,146.64.20.95,150.161.104.40,6,44843,2375,54,0,0,...,0.0000,0,54,1,0,0.000000e+00,0,0,0,0.0


In [80]:
y_test = y
y_pred = controller_elephant

In [81]:
# Evaluate the model's performance
precision = precision_score(y_test, y_pred, average = 'macro', zero_division=0)
print("Precision:", precision)

recall = recall_score(y_test, y_pred, average = 'macro', zero_division=0)
print("Recall:", recall)

f1 = f1_score(y_test, y_pred, average = 'macro', zero_division=0)
print("F1 Score:", f1)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print(f'False Positive Rate: {fpr[1]}\nTrue Positive Rate: {tpr[1]}\nROC_AUC: {roc_auc}')

ValueError: Found input variables with inconsistent numbers of samples: [1518390, 65686]