In [None]:
pip install pandas numpy networkx matplotlib igraph

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import igraph as ig

Data Loading

In [2]:
df = pd.read_csv('./data/NF-UNSW-NB15.csv')
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(axis=0, how='any', inplace=True)
df.drop_duplicates(keep ='last', inplace= True, ignore_index= True)

In [3]:
print(df.head())

   IPV4_SRC_ADDR  L4_SRC_PORT  IPV4_DST_ADDR  L4_DST_PORT  PROTOCOL  L7_PROTO  \
0  149.171.126.0        62073     59.166.0.5        56082         6       0.0   
1  149.171.126.2        32284     59.166.0.5         1526         6       0.0   
2  149.171.126.0           21     59.166.0.1        21971         6       1.0   
3     59.166.0.1        23800  149.171.126.0        46893         6       0.0   
4     59.166.0.5        63062  149.171.126.2           21         6       1.0   

   IN_BYTES  OUT_BYTES  IN_PKTS  OUT_PKTS  TCP_FLAGS  \
0      9672        416       11         8         25   
1      1776        104        6         2         25   
2      1842       1236       26        22         25   
3       528       8824       10        12         27   
4      1786       2340       32        34         25   

   FLOW_DURATION_MILLISECONDS  Label  Attack  
0                          15      0  Benign  
1                           0      0  Benign  
2                        1111      

Original Dataset Features

In [4]:
num_benign = len(df[df['Label'] == 0])
num_attack = len(df[df['Label'] == 1])
print("Num_benign: " , num_benign)
print("Num_attack: " , num_attack)
print("Attacks Classes: " , list(df["Attack"].unique()))

percentage_distribution = df['Attack'].value_counts(normalize=True) * 100
print("Distribution of Attacks Classes: " , percentage_distribution)

Num_benign:  1531166
Num_attack:  72212
Attacks Classes:  ['Benign', 'Exploits', 'Reconnaissance', 'DoS', 'Generic', 'Shellcode', 'Backdoor', 'Fuzzers', 'Worms', 'Analysis']
Distribution of Attacks Classes:  Attack
Benign            95.496259
Exploits           1.542182
Fuzzers            1.205580
Reconnaissance     0.765571
Generic            0.346456
DoS                0.314336
Analysis           0.124113
Backdoor           0.110829
Shellcode          0.085133
Worms              0.009542
Name: proportion, dtype: float64


Original Dataset Centrality Measures

In [5]:
#Constructing Graph
G = nx.from_pandas_edgelist(df, 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', create_using=nx.DiGraph())
#G.remove_nodes_from(list(nx.isolates(G)))

#Computing Graph-Level Measures
number_of_nodes = G.number_of_nodes()
number_of_edges = G.number_of_edges()
max_degree = max(dict(G.degree()).values())
avg_degree = sum(dict(G.degree()).values()) / number_of_nodes
transitivity = nx.transitivity(G)
density = nx.density(G)

print(f"Number of nodes: {number_of_nodes}")
print(f"Number of edges: {number_of_edges}")
print(f"Max degree: {max_degree}")
print(f"Average degree: {avg_degree:.2f}")
print(f"Transitivity: {transitivity:.4f}")
print(f"Density: {density:.4f}")

#Computing Communities and Community Measures
import igraph as ig
G1 = ig.Graph.from_networkx(G)
part = G1.community_infomap()

communities = []
for com in part:
    communities.append([G1.vs[node_index]['_nx_name'] for node_index in com])
print(f"==>> number of communities: {len(communities)}")

# Step 1: Map each node to its community
node_to_community = {}
for community_index, community in enumerate(communities):
    for node in community:
        node_to_community[node] = community_index

# Step 2: Count inter-cluster edges efficiently
inter_cluster_edges = 0
for u, v in G.edges():
     if node_to_community[u] != node_to_community[v]:
        inter_cluster_edges += 1
    
mixing_parameter = inter_cluster_edges / G.number_of_edges()
modularity = nx.community.modularity(G, communities)
print(f"Mixing parameter: {mixing_parameter:.4f}")
print(f"Modularity: {modularity:.4f}")

Number of nodes: 44
Number of edges: 294
Max degree: 21
Average degree: 13.36
Transitivity: 0.0000
Density: 0.1554
==>> number of communities: 6
Mixing parameter: 0.0136
Modularity: 0.4415


Modification Based on Centrality Measures

In [86]:
eigenvector_centrality = nx.eigenvector_centrality_numpy(G)
#Sorting nodes by eigenvector centrality
sorted_nodes = sorted(eigenvector_centrality, key=eigenvector_centrality.get, reverse=False)
#Taking top 21% nodes that have highest centrality to form a new modified dataset
top_few_percent_count = int(0.35* len(sorted_nodes))
top_few_percent_nodes = sorted_nodes[:top_few_percent_count]
#Constructing new graph from these nodes
G0 = G.subgraph(top_few_percent_nodes)

Modified Dataset Centrality Measures

In [87]:
#Computing graph-level measures of new modified dataset
number_of_nodes = G0.number_of_nodes()
number_of_edges = G0.number_of_edges()
max_degree = max(dict(G0.degree()).values())
avg_degree = sum(dict(G0.degree()).values()) / number_of_nodes
transitivity = nx.transitivity(G0)
density = nx.density(G0)

print(f"Number of nodes: {number_of_nodes}")
print(f"Number of edges: {number_of_edges}")
print(f"Max degree: {max_degree}")
print(f"Average degree: {avg_degree}")
print(f"Transitivity: {transitivity}")
print(f"Density: {density}")

#Computing communities and community measures of new modified dataset
G2 = ig.Graph.from_networkx(G0)
part = G2.community_infomap()

communities2 = []
for com in part:
    communities2.append([G2.vs[node_index]['_nx_name'] for node_index in com])
print(f"==>> number of communities: {len(communities2)}")
# Step 1: Map each node to its community
node_to_community = {}
for community_index, community in enumerate(communities2):
    for node in community:
        node_to_community[node] = community_index

# Step 2: Count inter-cluster edges efficiently
inter_cluster_edges = 0
for u, v in G0.edges():
    if node_to_community[u] != node_to_community[v]:
        inter_cluster_edges += 1

mixing_parameter = inter_cluster_edges / G0.number_of_edges()
modularity = nx.community.modularity(G0, communities2)
print(f"Mixing parameter: {mixing_parameter}")
print(f"Modularity: {modularity}")

Number of nodes: 15
Number of edges: 24
Max degree: 8
Average degree: 3.2
Transitivity: 0
Density: 0.11428571428571428
==>> number of communities: 5
Mixing parameter: 0.125
Modularity: 0.43923611111111105


Modified Dataset Features

In [88]:
filtered_df = df[ (df['IPV4_SRC_ADDR'].isin(top_few_percent_nodes)) | (df['IPV4_DST_ADDR'].isin(top_few_percent_nodes)) ]
num_benign = len(filtered_df[filtered_df['Label'] == 0])
num_attack = len(filtered_df[filtered_df['Label'] == 1])

print("Num_benign: " , num_benign)
print("Num_attack: " , num_attack)
print("Attacks Classes: " , list(df["Attack"].unique()))

percentage_distribution = df['Attack'].value_counts(normalize=True) * 100
print("Distribution of Attacks Classes: " , percentage_distribution)

Num_benign:  31679
Num_attack:  58441
Attacks Classes:  ['Benign', 'Exploits', 'Reconnaissance', 'DoS', 'Generic', 'Shellcode', 'Backdoor', 'Fuzzers', 'Worms', 'Analysis']
Distribution of Attacks Classes:  Attack
Benign            95.496259
Exploits           1.542182
Fuzzers            1.205580
Reconnaissance     0.765571
Generic            0.346456
DoS                0.314336
Analysis           0.124113
Backdoor           0.110829
Shellcode          0.085133
Worms              0.009542
Name: proportion, dtype: float64


Extracting Modified Dataset

In [89]:
filtered_df.to_csv('./data/NF-UNSW-NB15-Modified.csv', index=False)

In [25]:
filtered_df.to_pickle("./data/NF-ToN-IoT-v2-Modified.pkl")