In [1]:
pip install pandas numpy networkx matplotlib igraph

Collecting matplotlib
  Downloading matplotlib-3.9.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting igraph
  Downloading igraph-0.11.6-cp39-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.0-cp312-cp312-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.54.1-cp312-cp312-win_amd64.whl.metadata (167 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.7-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.0-py3-none-any.whl.metadata (5.0 kB)
Collecting texttable>=1.6.2 (from igraph)
  Using cached texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading matplotlib-3.9.2-cp312-cp312-win_amd64.whl (7.8 MB)
   ---------------------------------------- 0.0/7.8 MB ? et

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

Data Loading

In [4]:
df = pd.read_csv('./data/NF-ToN-IoT.csv')

In [70]:
print(df.head())

   IPV4_SRC_ADDR  L4_SRC_PORT    IPV4_DST_ADDR  L4_DST_PORT  PROTOCOL  \
0  192.168.1.195        63318   52.139.250.253          443         6   
1   192.168.1.79        57442    192.168.1.255        15600        17   
2   192.168.1.79        57452  239.255.255.250        15600        17   
3  192.168.1.193          138    192.168.1.255          138        17   
4   192.168.1.79        51989    192.168.1.255        15600        17   

   L7_PROTO  IN_BYTES  OUT_BYTES  IN_PKTS  OUT_PKTS  TCP_FLAGS  \
0     91.00       181        165        2         1         24   
1      0.00        63          0        1         0          0   
2      0.00        63          0        1         0          0   
3     10.16       472          0        2         0          0   
4      0.00        63          0        1         0          0   

   FLOW_DURATION_MILLISECONDS  Label  Attack  
0                         327      0  Benign  
1                           0      0  Benign  
2                      

Original Dataset Features

In [5]:
num_benign = len(df[df['Label'] == 0])
num_attack = len(df[df['Label'] == 1])
print("Num_benign: " , num_benign)
print("Num_attack: " , num_attack)
print("Attacks Classes: " , list(df["Attack"].unique()))
print(f"Attack/Benign Ratio: {num_attack/num_benign:.3f}")

percentage_distribution = df['Attack'].value_counts(normalize=True) * 100
print("Distribution of Attacks Classes: " , percentage_distribution)

Num_benign:  212811
Num_attack:  998118
Attacks Classes:  ['Benign', 'dos', 'injection', 'ddos', 'scanning', 'password', 'mitm', 'xss', 'backdoor', 'ransomware']
Attack/Benign Ratio: 4.690
Distribution of Attacks Classes:  Attack
injection     38.354272
ddos          18.698867
Benign        17.574193
password      12.430043
xss            8.251516
scanning       1.718846
dos            1.430142
backdoor       1.424031
mitm           0.106365
ransomware     0.011727
Name: proportion, dtype: float64


Original Dataset Centrality Measures

In [72]:
#Constructing Graph
G = nx.from_pandas_edgelist(df, 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', create_using=nx.DiGraph())
G.remove_nodes_from(list(nx.isolates(G)))

#Computing Graph-level Measures
number_of_nodes = G.number_of_nodes()
number_of_edges = G.number_of_edges()
max_degree = max(dict(G.degree()).values())
avg_degree = sum(dict(G.degree()).values()) / number_of_nodes
transitivity = nx.transitivity(G)
density = nx.density(G)

print(f"Number of nodes: {number_of_nodes}")
print(f"Number of edges: {number_of_edges}")
print(f"Max degree: {max_degree}")
print(f"Average degree: {avg_degree:.2f}")
print(f"Transitivity: {transitivity:.4f}")
print(f"Density: {density:.4f}")

#Computing Communities and Community Measures
import igraph as ig
G1 = ig.Graph.from_networkx(G)
part = G1.community_infomap()

communities = []
for com in part:
    communities.append([G1.vs[node_index]['_nx_name'] for node_index in com])
print(f"==>> number of communities: {len(communities)}")

# Step 1: Map each node to its community
node_to_community = {}
for community_index, community in enumerate(communities):
    for node in community:
        node_to_community[node] = community_index

# Step 2: Count inter-cluster edges efficiently
inter_cluster_edges = 0
for u, v in G.edges():
     if node_to_community[u] != node_to_community[v]:
        inter_cluster_edges += 1

mixing_parameter = inter_cluster_edges / G.number_of_edges()
modularity = nx.community.modularity(G, communities)
print(f"Mixing parameter: {mixing_parameter:.4f}")
print(f"Modularity: {modularity:.4f}")

Number of nodes: 1501
Number of edges: 2021
Max degree: 1168
Average degree: 2.69
Transitivity: 0.0007
Density: 0.0009
==>> number of communities: 7
Mixing parameter: 0.1450
Modularity: 0.1210


Inspecting Density and Transitivity of Communities

In [73]:
subgraph_objects = [G.subgraph(nodes) for nodes in communities]

for i, subgraph in enumerate(subgraph_objects):
    density = nx.density(subgraph)
    transitivity = nx.transitivity(subgraph)
    print(f"Subgraph {i} has density {density} & transitivity {transitivity}" )

Subgraph 0 has density 0.0007663257579388281 & transitivity 0.00011407236684340945
Subgraph 1 has density 0.21428571428571427 & transitivity 0
Subgraph 2 has density 0.28308823529411764 & transitivity 0.07568807339449542
Subgraph 3 has density 0.056666666666666664 & transitivity 0.1511627906976744
Subgraph 4 has density 0.4166666666666667 & transitivity 0
Subgraph 5 has density 0 & transitivity 0
Subgraph 6 has density 0 & transitivity 0


Modification Based on Community Mixing

In [78]:
community0 = communities[0]
community6 = communities[2]
community7 = communities[3]

#Choosing 3 communities to form a new modified dataset
filtered_df = df[
    (df['IPV4_SRC_ADDR'].isin(community0) | df['IPV4_DST_ADDR'].isin(community0)) |
    (df['IPV4_SRC_ADDR'].isin(community6) | df['IPV4_DST_ADDR'].isin(community6)) |
    (df['IPV4_SRC_ADDR'].isin(community7) | df['IPV4_DST_ADDR'].isin(community7)) 
]

#Constructing new graph from these communities
G4 = nx.from_pandas_edgelist(filtered_df, 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', create_using=nx.DiGraph())

In [79]:
#Computing graph-level measures of new modified dataset
number_of_nodes = G4.number_of_nodes()
number_of_edges = G4.number_of_edges()
max_degree = max(dict(G4.degree()).values())
avg_degree = sum(dict(G4.degree()).values()) / number_of_nodes
transitivity = nx.transitivity(G4)
density = nx.density(G4)

print(f"Number of nodes: {number_of_nodes}")
print(f"Number of edges: {number_of_edges}")
print(f"Max degree: {max_degree}")
print(f"Average degree: {avg_degree:.2f}")
print(f"Transitivity: {transitivity:.4f}")
print(f"Density: {density:.4f}")

#Computing communities and community measures of new modified dataset
import igraph as ig
GA = ig.Graph.from_networkx(G4)
part = GA.community_infomap()

communitiesX = []
for com in part:
    communitiesX.append([GA.vs[node_index]['_nx_name'] for node_index in com])
print(f"==>> number of communities: {len(communitiesX)}")

# Step 1: Map each node to its community
node_to_community = {}
for community_index, community in enumerate(communitiesX):
    for node in community:
        node_to_community[node] = community_index

# Step 2: Count inter-cluster edges efficiently
inter_cluster_edges = 0
for u, v in G4.edges():
     if node_to_community[u] != node_to_community[v]:
        inter_cluster_edges += 1
    

mixing_parameter = inter_cluster_edges / G4.number_of_edges()
modularity = nx.community.modularity(G4, communitiesX)
print(f"Mixing parameter: {mixing_parameter:.4f}")
print(f"Modularity: {modularity:.4f}")

Number of nodes: 1497
Number of edges: 2003
Max degree: 1168
Average degree: 2.68
Transitivity: 0.0007
Density: 0.0009
==>> number of communities: 2
Mixing parameter: 0.0210
Modularity: 0.0249


Modified Dataset Features

In [80]:
num_benign = len(filtered_df[filtered_df['Label'] == 0])
num_attack = len(filtered_df[filtered_df['Label'] == 1])

print("Num_benign: " , num_benign)
print("Num_attack: " , num_attack)
print("Attacks Classes: " , list(filtered_df["Attack"].unique()))
print(f"Attack/Benign Ratio: {num_attack/num_benign:.3f}")

percentage_distribution = filtered_df['Attack'].value_counts(normalize=True) * 100
print("Distribution of Attacks Classes: " , percentage_distribution)

Num_benign:  268688
Num_attack:  1108995
Attacks Classes:  ['Benign', 'dos', 'injection', 'ddos', 'scanning', 'password', 'mitm', 'xss', 'backdoor', 'ransomware']
Attack/Benign Ratio: 4.127
Distribution of Attacks Classes:  Attack
injection     34.009202
ddos          23.687960
Benign        19.502890
password      11.345063
xss            7.254499
scanning       1.558196
dos            1.286000
backdoor       1.251885
mitm           0.093998
ransomware     0.010307
Name: proportion, dtype: float64


Extracting Modified Dataset

In [82]:
filtered_df.to_csv('./data/NF-ToN-IoT-Modified.csv', index=False)

In [17]:
filtered_df.to_pickle("NF-UQ-NIDS-Modified.pkl" , compression="zip")