# In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import torch
import torch_geometric
from torch_geometric.data import Data
import ipaddress
import os

class CICIDSGraphGenerator:
    """Build PyTorch Geometric graphs from a flow table with CICIDS-style columns.

    The frame must provide ``Source IP``, ``Destination IP``, ``Label`` and the
    numeric feature columns listed inside the two ``_*_tensors`` helpers.

    Two representations are produced:

    * edge-flow graph -- hosts are nodes, every flow contributes a forward and
      a reverse edge carrying the flow's features;
    * node-flow graph -- hosts and flows are both nodes, every flow node is
      linked ``source host -> flow -> destination host``.

    All feature matrices are z-scored per feature column.
    """

    # Bounds handed to ``np.random.randint`` when drawing pseudonymous host
    # addresses (inside 10.0.0.0/8; the upper bound is exclusive).
    _IP_POOL_START = 0x0A000000
    _IP_POOL_END = 0x0AFFFFFF

    def __init__(self, dataframe, label_mapping=None):
        """
        Args:
            dataframe (pandas.DataFrame): Flow records; a private copy is kept.
            label_mapping (dict, optional): Explicit ``label -> class index``
                mapping for multi-class mode. Pass the same mapping to every
                generator (e.g. train and test) to get a shared encoding.
                If omitted, the sorted unique labels of ``dataframe`` are used.
        """
        self.df = dataframe.copy()
        self.label_mapping = label_mapping
        self.edge_flow_graph = None
        self.node_flow_graph = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _normalize_features(self, features):
        """
        Z-score ``features`` along dimension 0.

        Args:
            features: Tensor or array-like (NumPy arrays are accepted).

        Returns:
            torch.Tensor: Float tensor with the same shape as ``features``.
        """
        features = torch.as_tensor(features, dtype=torch.float)
        mean = features.mean(dim=0)
        if features.shape[0] > 1:
            std = features.std(dim=0)
        else:
            # The sample standard deviation of a single row is NaN; treat it
            # as zero so a one-row input normalises to zeros instead.
            std = torch.zeros_like(mean)
        return (features - mean) / (std + 1e-8)

    def _map_ips_randomly(self):
        """
        Randomly map IP addresses to avoid potential labeling bias

        Every original address receives its own alias; draws that collide with
        an alias already handed out are rejected and redrawn, so two hosts are
        never merged into one node.

        Returns:
            dict: Mapping of original IPs to randomized IPs

        Raises:
            ValueError: If there are more distinct IPs than addresses in the pool.
        """
        unique_ips = pd.concat([self.df['Source IP'], self.df['Destination IP']]).unique()
        low, high = self._IP_POOL_START, self._IP_POOL_END
        if len(unique_ips) > high - low:
            raise ValueError(
                'Cannot assign unique random IPs: %d hosts but only %d addresses available'
                % (len(unique_ips), high - low)
            )

        random_ip_map = {}
        taken = set()
        for ip in unique_ips:
            candidate = int(np.random.randint(low, high))
            while candidate in taken:
                candidate = int(np.random.randint(low, high))
            taken.add(candidate)
            random_ip_map[ip] = str(ipaddress.IPv4Address(candidate))

        return random_ip_map

    def _encode_labels(self, binary_classification):
        """
        Encode the ``Label`` column as a long tensor, one entry per flow.

        Binary mode maps ``'BENIGN'`` to 0 and every other value to 1, so both
        raw attack names and a pre-collapsed ``'MALICIOUS'`` label work.
        Multi-class mode uses ``self.label_mapping`` when given, otherwise the
        unique labels sorted by their string form (deterministic, unlike
        order-of-appearance).

        Raises:
            ValueError: In multi-class mode, if a label has no class index.
        """
        labels = self.df['Label']
        if binary_classification:
            encoded = (labels != 'BENIGN').astype('int64').to_numpy()
        else:
            mapping = self.label_mapping
            if mapping is None:
                classes = sorted(labels.dropna().unique(), key=str)
                mapping = {label: idx for idx, label in enumerate(classes)}
            mapped = labels.map(mapping)
            missing = mapped.isna()
            if missing.any():
                unknown = sorted(set(labels[missing].astype(str)))
                raise ValueError('Labels without a class index: %s' % unknown)
            encoded = mapped.astype('int64').to_numpy()
        return torch.tensor(encoded, dtype=torch.long).to(self.device)

    def _flow_endpoints(self, ip_to_node):
        """Return (source, destination) node-index arrays, one entry per flow."""
        src = self.df['Source IP'].map(ip_to_node).to_numpy(dtype=np.int64)
        dst = self.df['Destination IP'].map(ip_to_node).to_numpy(dtype=np.int64)
        return src, dst

    def _flow_feature_matrix(self, columns):
        """Return the column-wise normalised ``columns`` of every flow."""
        raw = torch.tensor(self.df[columns].to_numpy(dtype=np.float32))
        return self._normalize_features(raw).to(self.device)

    def _host_feature_matrix(self, src_idx, dst_idx, num_hosts, columns):
        """
        Average ``columns`` over the flows each host takes part in (as source
        or destination), then normalise column-wise across hosts.
        """
        values = self.df[columns].astype('float64')
        # A flow whose source and destination coincide must count only once
        # for that host, so it is not repeated on the destination side.
        distinct = dst_idx != src_idx
        stacked = pd.concat([values, values.loc[distinct]], ignore_index=True)
        owners = np.concatenate([src_idx, dst_idx[distinct]])
        per_host = stacked.groupby(owners).mean().reindex(range(num_hosts))
        return self._normalize_features(per_host.to_numpy()).to(self.device)

    def _interleave_edges(self, first, second):
        """
        Build a ``(2, 2 * n_flows)`` edge index in which column ``2 * i`` is
        ``first[:, i]`` and column ``2 * i + 1`` is ``second[:, i]``.
        """
        paired = np.stack([np.stack(first), np.stack(second)], axis=2)
        return torch.tensor(paired.reshape(2, -1), dtype=torch.long).to(self.device)

    def _edge_flow_tensors(self, binary_classification=True):
        """
        Compute the tensors of the edge-flow graph.

        Returns:
            tuple: ``(x, edge_index, edge_attr, y)``
        """
        # Select edge features
        edge_features = [
            'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
            'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
            'Flow Packets/s', 'Flow Bytes/s',
            'Fwd Packet Length Mean', 'Bwd Packet Length Mean',
            'Protocol'
        ]

        # Node features (based on ports)
        node_features = [
            'Source Port', 'Destination Port',
            'Fwd Header Length', 'Bwd Header Length',
            'min_seg_size_forward'
        ]

        # Randomly map IPs. Sorting the aliases gives a node order that is
        # shuffled relative to the real addresses yet reproducible under
        # ``np.random.seed``.
        ip_map = self._map_ips_randomly()
        hosts = sorted(set(ip_map.values()))
        node_mapping = {host: idx for idx, host in enumerate(hosts)}
        ip_to_node = {ip: node_mapping[alias] for ip, alias in ip_map.items()}
        src, dst = self._flow_endpoints(ip_to_node)

        x = self._host_feature_matrix(src, dst, len(hosts), node_features)

        # Bidirectional edges: forward edge at column 2i, reverse at 2i + 1.
        edge_index = self._interleave_edges((src, dst), (dst, src))
        # Both directions of a flow carry the same attribute vector.
        edge_attr = self._flow_feature_matrix(edge_features).repeat_interleave(2, dim=0)

        y = self._encode_labels(binary_classification)
        return x, edge_index, edge_attr, y

    def _node_flow_tensors(self, binary_classification=True):
        """
        Compute the tensors of the node-flow graph.

        Returns:
            tuple: ``(x, edge_index, y)``
        """
        # Select features for host nodes
        host_features = [
            'Total Fwd Packets', 'Total Backward Packets',
            'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
            'Flow Packets/s', 'Flow Bytes/s',
            'Flow Duration'
        ]

        # Select features for flow nodes
        flow_features = [
            'Fwd Packet Length Mean', 'Bwd Packet Length Mean',
            'Flow IAT Mean', 'Flow IAT Max',
            'Fwd Header Length', 'Bwd Header Length',
            'Protocol'
        ]

        # Hosts are numbered in order of first appearance (deterministic).
        hosts = pd.concat([self.df['Source IP'], self.df['Destination IP']]).unique()
        node_mapping = {ip: idx for idx, ip in enumerate(hosts)}
        flow_node_offset = len(node_mapping)
        src, dst = self._flow_endpoints(node_mapping)

        host_x = self._host_feature_matrix(src, dst, flow_node_offset, host_features)
        flow_x = self._flow_feature_matrix(flow_features)

        # Flow i becomes node ``flow_node_offset + i`` and is wired as
        # source host -> flow (column 2i) and flow -> destination host (2i + 1).
        flow_nodes = flow_node_offset + np.arange(len(self.df), dtype=np.int64)
        edge_index = self._interleave_edges((src, flow_nodes), (flow_nodes, dst))

        y = self._encode_labels(binary_classification)
        # Host and flow feature lists have the same length, so they can share x.
        return torch.cat([host_x, flow_x], dim=0), edge_index, y

    def create_edge_flow_graph(self, binary_classification=True):
        """
        Create Edge-Flow Graph: Hosts as nodes, Flows as edges

        ``y`` holds one label per flow (row order of the frame); edges
        ``2 * i`` and ``2 * i + 1`` are the two directions of flow ``i``.

        Args:
            binary_classification (bool, optional): If True, use binary labels (BENIGN, MALICIOUS). If False, use multi-class labels.

        Returns:
            torch_geometric.data.Data: Graph data for GNN
        """
        x, edge_index, edge_attr, y = self._edge_flow_tensors(binary_classification)
        self.edge_flow_graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
        return self.edge_flow_graph

    def create_node_flow_graph(self, binary_classification=True):
        """
        Create Node-Flow Graph: Hosts as nodes, Flows connecting hosts

        ``x`` lists the host nodes first, followed by one node per flow;
        ``y`` holds one label per flow, i.e. it lines up with the trailing
        ``len(y)`` rows of ``x``.

        Args:
            binary_classification (bool, optional): If True, use binary labels (BENIGN, MALICIOUS). If False, use multi-class labels.

        Returns:
            torch_geometric.data.Data: Graph data for GNN
        """
        x, edge_index, y = self._node_flow_tensors(binary_classification)
        self.node_flow_graph = Data(x=x, edge_index=edge_index, y=y)
        return self.node_flow_graph

    def generate_graphs(self, binary_classification=True):
        """
        Generate both graph types

        Args:
            binary_classification (bool, optional): If True, use binary labels (BENIGN, MALICIOUS). If False, use multi-class labels.

        Returns:
            tuple: Edge-Flow and Node-Flow graphs
        """
        self.create_edge_flow_graph(binary_classification)
        self.create_node_flow_graph(binary_classification)

        return self.edge_flow_graph, self.node_flow_graph







# In [None]:
def split_and_save_graphs(df, save_dir, binary_classification=True):
    """
    Split the flows 80/20, build both graph types for each split and save them.

    Files are written to ``save_dir`` as
    ``{train,test}_{edge,node}_flow_{binary,multiclass}.pt``.

    Args:
        df (pandas.DataFrame): Flow records accepted by ``CICIDSGraphGenerator``.
        save_dir (str): Output directory; created if it does not exist.
        binary_classification (bool, optional): If True, build binary-label
            graphs, otherwise multi-class graphs.
    """
    # Split data into train and test (fixed seed keeps the split reproducible)
    train_df = df.sample(frac=0.8, random_state=42)
    test_df = df.drop(train_df.index)

    # NOTE: each generator derives its multi-class label encoding from the
    # frame it is given, so train and test encodings are only guaranteed to
    # agree when both splits contain the same set of labels.
    train_edge_flow, train_node_flow = CICIDSGraphGenerator(train_df).generate_graphs(binary_classification)
    test_edge_flow, test_node_flow = CICIDSGraphGenerator(test_df).generate_graphs(binary_classification)

    # Save graphs
    os.makedirs(save_dir, exist_ok=True)

    suffix = 'binary' if binary_classification else 'multiclass'
    graphs = {
        'train_edge_flow': train_edge_flow,
        'train_node_flow': train_node_flow,
        'test_edge_flow': test_edge_flow,
        'test_node_flow': test_node_flow,
    }
    for stem, graph in graphs.items():
        # Move to CPU first so the files can be loaded on machines without CUDA.
        torch.save(graph.cpu(), os.path.join(save_dir, '{}_{}.pt'.format(stem, suffix)))

    print("Graphs saved successfully!")


df = pd.read_csv('path_to_your_dataset.csv')

# Collapse every non-benign label to MALICIOUS on a copy only, so the
# multi-class run below still sees the original attack labels.
binary_df = df.assign(Label=df['Label'].where(df['Label'] == 'BENIGN', 'MALICIOUS'))

split_and_save_graphs(binary_df, 'path_to_save_directory', binary_classification=True)

split_and_save_graphs(df, 'path_to_save_directory', binary_classification=False)

# In [None]:
import networkx as nx
import matplotlib.pyplot as plt

def visualize_graph(graph, title, save_path=None):
    """
    Visualize the created graph

    The graph is drawn as an undirected ``networkx.Graph``, so repeated or
    reversed entries of ``edge_index`` appear as a single edge.

    Args:
        graph (torch_geometric.data.Data): The graph to be visualized
        title (str): Title of the plot
        save_path (str, optional): Path to save the plot; if omitted (or
            empty) the plot is shown instead
    """
    G = nx.Graph()
    # Add every node explicitly so isolated nodes are drawn too.
    G.add_nodes_from(range(graph.x.shape[0]))
    G.add_edges_from(graph.edge_index.t().tolist())

    # Plot the graph
    fig = plt.figure(figsize=(12, 8))
    try:
        pos = nx.spring_layout(G)
        nx.draw(G, pos, with_labels=False, node_color='lightblue', edge_color='gray', alpha=0.7)
        plt.title(title)
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path)
        else:
            plt.show()
    finally:
        # Release the figure; otherwise every call leaves one more open figure.
        plt.close(fig)

# Example usage
if __name__ == "__main__":
    # ``map_location`` lets graphs saved from a GPU run load on a CPU-only
    # machine. ``weights_only=False`` is needed on PyTorch >= 2.6, whose default
    # ``weights_only=True`` refuses pickled ``Data`` objects (the keyword
    # requires PyTorch >= 1.13). Only load files you produced yourself:
    # full unpickling can execute arbitrary code.
    load_kwargs = {'map_location': 'cpu', 'weights_only': False}

    # Load the saved graphs
    train_edge_flow = torch.load('path_to_train_edge_flow.pt', **load_kwargs)
    train_node_flow = torch.load('path_to_train_node_flow.pt', **load_kwargs)
    test_edge_flow = torch.load('path_to_test_edge_flow.pt', **load_kwargs)
    test_node_flow = torch.load('path_to_test_node_flow.pt', **load_kwargs)

    # Visualize the graphs
    visualize_graph(train_edge_flow, 'Train Edge-Flow Graph', 'train_edge_flow.png')
    visualize_graph(train_node_flow, 'Train Node-Flow Graph', 'train_node_flow.png')
    visualize_graph(test_edge_flow, 'Test Edge-Flow Graph', 'test_edge_flow.png')
    visualize_graph(test_node_flow, 'Test Node-Flow Graph', 'test_node_flow.png')