# Import

In [1]:
!pip install shap

Collecting shap
  Downloading shap-0.41.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/572.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m563.2/572.6 kB[0m [31m23.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.6/572.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [2]:
import shap
import pandas as pd
import tensorflow as tf
from tensorflow import keras

In [3]:
# Colab-only: mount Google Drive at /content/drive so the files read and written
# under /content/drive/MyDrive in the following cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load dataset & model

In [4]:
# Read the training CSV from the mounted Drive and print its column/dtype summary.
train_csv_path = '/content/drive/MyDrive/Dataset/insdn/binary/train.csv'
df = pd.read_csv(train_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275111 entries, 0 to 275110
Data columns (total 77 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Flow Duration      275111 non-null  float64
 1   Tot Fwd Pkts       275111 non-null  float64
 2   Tot Bwd Pkts       275111 non-null  float64
 3   TotLen Fwd Pkts    275111 non-null  float64
 4   TotLen Bwd Pkts    275111 non-null  float64
 5   Fwd Pkt Len Max    275111 non-null  float64
 6   Fwd Pkt Len Min    275111 non-null  float64
 7   Fwd Pkt Len Mean   275111 non-null  float64
 8   Fwd Pkt Len Std    275111 non-null  float64
 9   Bwd Pkt Len Max    275111 non-null  float64
 10  Bwd Pkt Len Min    275111 non-null  float64
 11  Bwd Pkt Len Mean   275111 non-null  float64
 12  Bwd Pkt Len Std    275111 non-null  float64
 13  Flow Byts/s        275111 non-null  float64
 14  Flow Pkts/s        275111 non-null  float64
 15  Flow IAT Mean      275111 non-null  float64
 16  Fl

In [5]:
# Load the saved Keras MLP from Drive, asking Keras to compile it after loading.
# NOTE(review): the model file is named "..._multi" while the data above comes from the
# "binary" folder — confirm this is the intended model/dataset pairing.
mlp_model_path = '/content/drive/MyDrive/Dataset/insdn/insdn_mlp_multi.h5'
model_mlp = tf.keras.models.load_model(mlp_model_path, compile=True)

In [6]:
# Keep only the rows whose Label is 0 and renumber them from zero.
is_label_zero = df['Label'] == 0
x_normal = df.loc[is_label_zero].reset_index(drop=True)

In [7]:
# Summarise the label-0 subset (row count, columns, dtypes).
x_normal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54797 entries, 0 to 54796
Data columns (total 77 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Flow Duration      54797 non-null  float64
 1   Tot Fwd Pkts       54797 non-null  float64
 2   Tot Bwd Pkts       54797 non-null  float64
 3   TotLen Fwd Pkts    54797 non-null  float64
 4   TotLen Bwd Pkts    54797 non-null  float64
 5   Fwd Pkt Len Max    54797 non-null  float64
 6   Fwd Pkt Len Min    54797 non-null  float64
 7   Fwd Pkt Len Mean   54797 non-null  float64
 8   Fwd Pkt Len Std    54797 non-null  float64
 9   Bwd Pkt Len Max    54797 non-null  float64
 10  Bwd Pkt Len Min    54797 non-null  float64
 11  Bwd Pkt Len Mean   54797 non-null  float64
 12  Bwd Pkt Len Std    54797 non-null  float64
 13  Flow Byts/s        54797 non-null  float64
 14  Flow Pkts/s        54797 non-null  float64
 15  Flow IAT Mean      54797 non-null  float64
 16  Flow IAT Std       547

In [8]:
# Sanity check: 0 should be the only Label value left in the subset.
x_normal['Label'].value_counts()

0    54797
Name: Label, dtype: int64

In [9]:
# Remove the target column so df holds feature columns only.
# errors='ignore' keeps this cell re-runnable: executing it a second time is a no-op
# instead of raising KeyError because 'Label' is already gone.
df = df.drop(columns=['Label'], errors='ignore')

In [10]:
# Remove the target column from the label-0 subset as well.
# errors='ignore' keeps this cell re-runnable: executing it a second time is a no-op
# instead of raising KeyError because 'Label' is already gone.
x_normal = x_normal.drop(columns=['Label'], errors='ignore')

In [11]:
# Column labels of the feature-only frame, kept for labelling derived tables.
feature_list = x_normal.columns

# SHAP

In [12]:
# Draw 100 rows from df with shap.sample to serve as the background set, then wrap
# the MLP in a KernelExplainer built on that background.
background_rows = shap.sample(df, 100)
explainer = shap.KernelExplainer(model_mlp, background_rows)

In [13]:
# First 1500 rows of the label-0 subset; these are the samples to be explained.
normal_1500 = x_normal.iloc[:1500]

In [14]:
# Run the kernel explainer over every selected row (one progress-bar step per row).
shap_values = explainer.shap_values(normal_1500)

  0%|          | 0/1500 [00:00<?, ?it/s]

In [15]:
# Tabulate the SHAP values stored at index 0 of the explainer output, one row per
# explained sample.
# NOTE(review): presumably index 0 is the model output for class 0 — confirm.
# The columns are labelled with the feature names at construction time so that every
# later use of data_df, including any export, carries real headers rather than 0..N-1.
data_df = pd.DataFrame(shap_values[0], columns=feature_list)

In [16]:
# Persist the SHAP table to Drive; the row index is not written.
data_df.to_csv('/content/drive/MyDrive/1500_samples_[0].csv', index=False)

In [17]:
# Label the SHAP columns with the feature names, then preview the first rows.
data_df.columns = feature_list
data_df.head()

Unnamed: 0,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,-0.016213,-0.011424,0.02054,0.010562,-0.002964,0.019846,0.00719,0.0,0.019109,0.015782,...,0.002229,0.0,-0.010306,-0.009783,0.008458,-0.017709,0.008,-0.0454,-0.018068,-0.015668
1,0.002375,0.000739,-0.002176,0.0,0.001997,0.010443,0.015367,0.010324,0.0,0.001907,...,-0.00531,0.0,0.001485,0.001807,0.002048,-0.002969,0.002021,0.0,0.011725,-0.002128
2,0.004361,0.003518,0.003847,0.0,0.0,0.008549,0.02156,0.017555,-0.003446,0.001738,...,0.008333,0.0,-0.004769,0.0,-0.00289,0.0,0.00144,0.0,0.004082,-0.01012
3,0.007202,0.005448,0.0,0.0,-0.004913,0.007874,0.01704,0.022536,0.00277,0.0,...,0.0,0.0,0.0,0.0,0.0,0.001046,0.0,0.0,0.002418,0.0
4,0.009965,-0.00895,0.0,0.0,-0.011063,0.035538,0.003742,0.035933,0.030627,0.0,...,0.007677,0.0,0.0,0.0,0.0,0.001599,0.0,0.0,0.008079,0.0


In [18]:
# Map every explained sample to the column names of its 10 largest SHAP values.
# The loop bound is len(data_df) rather than a literal 1500 so it stays in step with
# however many rows were actually explained.
# Note: nlargest ranks by signed value, so strongly negative contributions never qualify.
data_dict = {}
for idx in range(len(data_df)):
    # Column labels of the 10 largest values in this row, largest first
    top_columns = data_df.iloc[idx].nlargest(10).index.tolist()

    # Key the list by a readable per-sample name
    data_dict['Normal sample ' + str(idx)] = top_columns

In [19]:
# Dump the whole sample -> top-10 mapping for inspection (large output).
print(data_dict)

{'Normal sample 0': ['Pkt Len Max', 'Init Bwd Win Byts', 'Bwd Header Len', 'Fwd IAT Tot', 'Pkt Size Avg', 'Bwd IAT Tot', 'Pkt Len Std', 'SYN Flag Cnt', 'Pkt Len Mean', 'Bwd Pkts/s'], 'Normal sample 1': ['SYN Flag Cnt', 'ACK Flag Cnt', 'FIN Flag Cnt', 'Pkt Size Avg', 'Pkt Len Mean', 'Pkt Len Max', 'Flow Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Std'], 'Normal sample 2': ['SYN Flag Cnt', 'ACK Flag Cnt', 'FIN Flag Cnt', 'Pkt Size Avg', 'Bwd Pkts/s', 'Pkt Len Mean', 'Flow Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Std'], 'Normal sample 3': ['SYN Flag Cnt', 'FIN Flag Cnt', 'ACK Flag Cnt', 'Pkt Len Max', 'Pkt Size Avg', 'Flow Pkts/s', 'Pkt Len Std', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Mean'], 'Normal sample 4': ['Init Bwd Win Byts', 'Fwd Pkt Len Mean', 'FIN Flag Cnt', 'Fwd Pkt Len Max', 'Pkt Len Max', 'Fwd Pkt Len Std', 'Fwd Seg Size Avg', 'Flow Pkts/s', 'ACK Flag Cnt', 'Pkt Len Mean'], 'Normal sample 5': ['SYN Flag Cnt', 'FIN Flag Cnt', 'ACK Flag Cnt', 'Pkt Size Avg', 'Pkt Len M

In [20]:
# Number of samples present in the mapping.
len(data_dict)

1500

# New whitelist: the 38 most frequent top-10 features (each appears in at least 26 of the 1,500 samples)

In [21]:
# One row per sample, one column per rank position ('Feature 1' .. 'Feature 10').
# NOTE(review): this rebinds `df`, which earlier held the training features; cells above
# that read `df` will see this table instead if they are re-run afterwards.
rank_columns = ['Feature ' + str(rank) for rank in range(1, 11)]
df = pd.DataFrame.from_dict(data_dict, orient='index', columns=rank_columns)

In [22]:
# Summarise the top-10 table (row count, columns, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1500 entries, Normal sample 0 to Normal sample 1499
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Feature 1   1500 non-null   object
 1   Feature 2   1500 non-null   object
 2   Feature 3   1500 non-null   object
 3   Feature 4   1500 non-null   object
 4   Feature 5   1500 non-null   object
 5   Feature 6   1500 non-null   object
 6   Feature 7   1500 non-null   object
 7   Feature 8   1500 non-null   object
 8   Feature 9   1500 non-null   object
 9   Feature 10  1500 non-null   object
dtypes: object(10)
memory usage: 128.9+ KB


In [23]:
# Preview the first rows of the top-10 table.
df.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10
Normal sample 0,Pkt Len Max,Init Bwd Win Byts,Bwd Header Len,Fwd IAT Tot,Pkt Size Avg,Bwd IAT Tot,Pkt Len Std,SYN Flag Cnt,Pkt Len Mean,Bwd Pkts/s
Normal sample 1,SYN Flag Cnt,ACK Flag Cnt,FIN Flag Cnt,Pkt Size Avg,Pkt Len Mean,Pkt Len Max,Flow Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Std
Normal sample 2,SYN Flag Cnt,ACK Flag Cnt,FIN Flag Cnt,Pkt Size Avg,Bwd Pkts/s,Pkt Len Mean,Flow Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Std
Normal sample 3,SYN Flag Cnt,FIN Flag Cnt,ACK Flag Cnt,Pkt Len Max,Pkt Size Avg,Flow Pkts/s,Pkt Len Std,Bwd Pkts/s,Pkt Len Min,Pkt Len Mean
Normal sample 4,Init Bwd Win Byts,Fwd Pkt Len Mean,FIN Flag Cnt,Fwd Pkt Len Max,Pkt Len Max,Fwd Pkt Len Std,Fwd Seg Size Avg,Flow Pkts/s,ACK Flag Cnt,Pkt Len Mean


In [24]:
# Persist the top-10 table to Drive; the sample names are written as the index column.
df.to_csv('/content/drive/MyDrive/MLP_top10_of_each_normalfile.csv')

In [25]:
# Count how often each feature name occurs anywhere in the top-10 table.
# A single pass replaces re-scanning the whole frame once per distinct feature. Keys are
# still inserted in column-by-column first-appearance order, so the order of `freq`
# (and of anything built from it) is unchanged.
freq = {}
for column in df.columns:
    for feature in df[column]:
        freq[feature] = freq.get(feature, 0) + 1

# Show the totals, one per line, in insertion order.
for count in freq.values():
    print(count)

1024
1204
699
390
145
82
489
42
196
261
249
340
134
918
1159
256
1027
257
1012
987
178
31
147
881
173
213
842
694
108
21
95
131
132
23
23
9
52
125
45
5
26
31
26
19
6
6
6
8
6
2
4
7
4
7
10
5
9
5
4
3
3
1
3


In [26]:
# Number of distinct features that appear in at least one top-10 list.
len(freq)

63

In [27]:
# Turn the feature -> count mapping into a two-column table and preview it.
freq_df = pd.DataFrame({
    'Feature': list(freq.keys()),
    'Frequency': list(freq.values()),
})
freq_df.head()

Unnamed: 0,Feature,Frequency
0,Pkt Len Max,1024
1,SYN Flag Cnt,1204
2,Init Bwd Win Byts,699
3,Fwd Pkt Len Max,390
4,PSH Flag Cnt,145


In [28]:
# Order the features from most to least frequent (rebinding rather than mutating in place),
# then print the table summary.
freq_df = freq_df.sort_values(by='Frequency', ascending=False)
freq_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63 entries, 1 to 61
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Feature    63 non-null     object
 1   Frequency  63 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ KB


In [29]:
# Show the 24 most frequent features.
freq_df.head(24)

Unnamed: 0,Feature,Frequency
1,SYN Flag Cnt,1204
14,FIN Flag Cnt,1159
16,Pkt Size Avg,1027
0,Pkt Len Max,1024
18,Flow Pkts/s,1012
19,Pkt Len Mean,987
13,ACK Flag Cnt,918
23,Bwd Pkts/s,881
26,Pkt Len Std,842
2,Init Bwd Win Byts,699


In [30]:
# Persist the 38 most frequent features (the first 38 rows of the sorted table).
# The file name now states the number of rows actually written; it previously said
# "top24" while 38 rows were exported.
# NOTE(review): update any downstream reader that still expects MLP_top24_InSDN.csv.
freq_df[:38].to_csv('/content/drive/MyDrive/MLP_top38_InSDN.csv', index=False)

# Detect normal samples

In [31]:
# Whitelist = names of the 38 most frequent features (first column of the sorted table).
whitelist = freq_df['Feature'].head(38).tolist()
print(whitelist)

['SYN Flag Cnt', 'FIN Flag Cnt', 'Pkt Size Avg', 'Pkt Len Max', 'Flow Pkts/s', 'Pkt Len Mean', 'ACK Flag Cnt', 'Bwd Pkts/s', 'Pkt Len Std', 'Init Bwd Win Byts', 'Pkt Len Min', 'Down/Up Ratio', 'Fwd Pkt Len Max', 'Fwd IAT Tot', 'Fwd Pkt Len Std', 'Fwd Seg Size Avg', 'Fwd Pkt Len Mean', 'Fwd IAT Std', 'Flow IAT Max', 'Fwd IAT Max', 'Bwd IAT Tot', 'Idle Max', 'Bwd IAT Max', 'PSH Flag Cnt', 'Bwd PSH Flags', 'Idle Mean', 'Idle Min', 'Bwd IAT Std', 'Bwd Pkt Len Min', 'URG Flag Cnt', 'Flow Byts/s', 'Flow Duration', 'Flow IAT Std', 'Fwd Pkt Len Min', 'Bwd IAT Mean', 'Bwd Header Len', 'Fwd IAT Mean', 'Bwd URG Flags']


In [32]:
# Flag every sample whose top-10 list contains at least one feature outside the whitelist.
allowed_features = set(whitelist)
alert = [
    sample_name
    for sample_name, top_features in data_dict.items()
    if not allowed_features.issuperset(top_features)
]

# Report how many samples were flagged.
print(f"Alert list length:{len(alert)}")

Alert list length:166


In [33]:
# Narrower whitelist = names of the 24 most frequent features (first column of the sorted table).
whitelist = freq_df['Feature'].head(24).tolist()
print(whitelist)

['SYN Flag Cnt', 'FIN Flag Cnt', 'Pkt Size Avg', 'Pkt Len Max', 'Flow Pkts/s', 'Pkt Len Mean', 'ACK Flag Cnt', 'Bwd Pkts/s', 'Pkt Len Std', 'Init Bwd Win Byts', 'Pkt Len Min', 'Down/Up Ratio', 'Fwd Pkt Len Max', 'Fwd IAT Tot', 'Fwd Pkt Len Std', 'Fwd Seg Size Avg', 'Fwd Pkt Len Mean', 'Fwd IAT Std', 'Flow IAT Max', 'Fwd IAT Max', 'Bwd IAT Tot', 'Idle Max', 'Bwd IAT Max', 'PSH Flag Cnt']


In [34]:
# Flag every sample whose top-10 list contains at least one feature outside the whitelist.
allowed_features = set(whitelist)
alert = [
    sample_name
    for sample_name, top_features in data_dict.items()
    if not allowed_features.issuperset(top_features)
]

# Report how many samples were flagged.
print(f"Alert list length:{len(alert)}")

Alert list length:629


## False positives of the XAI whitelist on normal training-set samples
Top 38 features: 166/1500 (≈11.1%)

Top 24 features: 629/1500 (≈41.9%)