## data_preprocessing.py

In [79]:
# data_preprocessing.py
import pandas as pd
import numpy as np
from torch_geometric.data import Data  # Used for graph data structure

class DataPreprocessor:
    def __init__(self, dataset_path, m=3):
        self.dataset_path = dataset_path  # Path to CSV dataset
        self.m = m  # Sequence length (number of graphs per sequence)

    def load_and_preprocess_data(self):
        # Load dataset (adjust columns based on your CSV)
        df = pd.read_csv(self.dataset_path, usecols=['_source_source_ip', '_source_destination_ip', 
                                                      '_source_network_bytes', '_source_@timestamp', 'label'])
        # Convert timestamp to datetime
        df['timestamp'] = pd.to_datetime(df['_source_@timestamp'])
        # Group by minute windows (window_id)
        df['window_id'] = df['timestamp'].dt.floor('T').astype('int64') // 10**9 // 60
        grouped = df.groupby('window_id')
        window_ids = np.array(sorted(grouped.groups.keys()))

        # Create graph data for each window (placeholder logic)
        X, y = [], []
        for wid in window_ids:
            window_df = grouped.get_group(wid)
            # Simplified graph creation: 10 nodes with random features, 20 edges
            # Replace with actual graph creation logic based on your data
            graph = Data(x=np.random.rand(10, 4), edge_index=np.random.randint(0, 10, (2, 20)))
            label = int((window_df['label'] == 'malicious').any())  # Binary label: 1 if malicious
            X.append(graph)
            y.append(label)

        # Create sequences of m graphs
        X_seq, y_seq = [], []
        for k in range(self.m, len(window_ids)):
            seq = X[k - self.m:k]
            X_seq.append(seq)
            y_seq.append(y[k])

        # Split into train (80%), validation (10%), and test (10%) sets
        n = len(X_seq)
        train_end = int(0.8 * n)
        val_end = int(0.9 * n)
        X_train, y_train = X_seq[:train_end], y_seq[:train_end]
        X_val, y_val = X_seq[train_end:val_end], y_seq[train_end:val_end]
        X_test, y_test = X_seq[val_end:], y_seq[val_end:]

        return X_train, y_train, X_val, y_val, X_test, y_test
    
# Explain this code
# This code defines a class DataPreprocessor that preprocesses a dataset for graph-based time series forecasting.
# The class has a constructor that takes the path to a CSV dataset and a sequence length m as input.
# The load_and_preprocess_data method loads the dataset, groups the data into minute windows, creates graph data for each window,
# creates sequences of m graphs, and splits the data into train, validation, and test sets.
# The graph data is created using random features and edges for demonstration purposes, and should be replaced with actual graph creation logic.
# The labels are binary, with a value of 1 indicating a malicious window.
# The method returns the train, validation, and test sets as lists of sequences of graphs and corresponding labels.
# The code uses the pandas library for data manipulation and the torch_geometric library for graph data creation.
# The Data class from torch_geometric is used to represent graph data in PyTorch Geometric.


## GNN.py


In [80]:
# GNN.py
import tensorflow as tf

class GCNLayer(tf.keras.layers.Layer):
    def __init__(self, units, activation='relu'):
        super(GCNLayer, self).__init__()
        # Dense layer with L2 regularization
        print(units)
        self.dense = tf.keras.layers.Dense(units, activation=None, use_bias=False,
                                          kernel_regularizer=tf.keras.regularizers.l2(1e-3))
        self.activation = tf.keras.activations.get(activation)

    def call(self, node_features, adj_norm):
        # GCN operation: A_norm * X * W
        h = tf.sparse.sparse_dense_matmul(adj_norm, node_features)
        h = self.dense(h)
        h = self.activation(h)
        return h

class GNN(tf.keras.Model):
    def __init__(self, hidden_units=64, output_units=32):
        super(GNN, self).__init__()
        self.gcn1 = GCNLayer(hidden_units)
        self.dropout1 = tf.keras.layers.Dropout(0.6)
        self.gcn2 = GCNLayer(output_units)
        self.dropout2 = tf.keras.layers.Dropout(0.6)

    def call(self, inputs):
        # Unpack the input tuple
        node_features, edge_indices, num_nodes = inputs
        
        # Compute the normalized adjacency matrix
        adj_norm = compute_normalized_adjacency(edge_indices, num_nodes)
        
        # Pass node_features and adj_norm to GCN layers
        x = self.gcn1(node_features, adj_norm)
        x = self.dropout1(x)
        x = self.gcn2(x, adj_norm)
        x = self.dropout2(x)
        
        # Optional: Pool node embeddings into a graph embedding
        embedding = tf.reduce_mean(x, axis=0)  # Mean pooling
        return embedding



def compute_normalized_adjacency(edge_indices, num_nodes):
    # Cast num_nodes to int64 for consistency with edge_indices
    num_nodes = tf.cast(num_nodes, tf.int64)
    
    # Create sparse adjacency matrix from edge_indices (already int64)
    adj = tf.sparse.SparseTensor(
        indices=tf.transpose(edge_indices),
        values=tf.ones([tf.shape(edge_indices)[1]], dtype=tf.float32),
        dense_shape=[num_nodes, num_nodes]
    )
    
    # Create sparse identity matrix for self-loops
    identity_indices = tf.stack([tf.range(num_nodes), tf.range(num_nodes)], axis=1)
    identity_values = tf.ones([num_nodes], dtype=tf.float32)
    identity_sparse = tf.sparse.SparseTensor(
        indices=identity_indices,
        values=identity_values,
        dense_shape=[num_nodes, num_nodes]
    )
    
    # Add self-loops to adjacency matrix
    adj = tf.sparse.add(adj, identity_sparse)
    
    # Compute degree normalization
    degree = tf.sparse.reduce_sum(adj, axis=1)
    degree_inv_sqrt = tf.pow(degree + 1e-9, -0.5)
    degree_inv_sqrt = tf.where(tf.math.is_inf(degree_inv_sqrt), 0.0, degree_inv_sqrt)
    
    # Normalize adjacency matrix
    adj_norm = tf.sparse.SparseTensor(
        indices=adj.indices,
        values=adj.values * tf.gather(degree_inv_sqrt, adj.indices[:, 0]) * tf.gather(degree_inv_sqrt, adj.indices[:, 1]),
        dense_shape=adj.dense_shape
    )
    
    return adj_norm

# Explain this code in a simple way that is easy to understand and recall 
# when needed.
# This code defines a custom Graph Convolutional Network (GCN) layer and a
# GCN model using TensorFlow. The GCN layer is defined as a custom layer
# that performs the graph convolution operation on the input node features
# and adjacency matrix. The adjacency matrix is normalized using the degree
# normalization technique. The GCN model is defined as a sequence of GCN layers
# with ReLU activation functions. The model takes input node features, edge
# indices, and the number of nodes in the graph. The normalized adjacency matrix
# is computed using the edge indices and number of nodes, and passed through
# the GCN layers to generate node embeddings. The code also includes utility
# functions to create sparse tensors for the adjacency matrix and normalize it
# using the degree normalization technique.



## LSTM.py

In [81]:
# LSTM.py
import tensorflow as tf

class LSTMModel(tf.keras.Model):
    def __init__(self, input_size=32, hidden_size=64):
        super(LSTMModel, self).__init__()
        # LSTM with L2 regularization
        self.lstm = tf.keras.layers.LSTM(hidden_size, return_sequences=False,
                                        kernel_regularizer=tf.keras.regularizers.l2(1e-2),
                                        recurrent_regularizer=tf.keras.regularizers.l2(1e-2))
        # Output layer with L2 regularization
        self.dense = tf.keras.layers.Dense(1, activation='sigmoid',
                                         kernel_regularizer=tf.keras.regularizers.l2(1e-2))

    def call(self, inputs):
        x = self.lstm(inputs)
        x = self.dense(x)
        return x
    

# This code defines a simple LSTM (Long Short-Term Memory) model using the Keras API
# The model consists of an LSTM layer followed by a dense layer with a sigmoid activation function
# The LSTM layer has a hidden size of 64 and returns only the final output sequence
# Both the LSTM and dense layers have L2 regularization with a regularization strength of 1e-4
# The model takes input sequences of shape (batch_size, sequence_length, input_size)
# and outputs a single prediction for each input sequence in the batch
# The LSTM model is used for sequence data, such as time series or sequential graph data
# The sigmoid activation function in the output layer is suitable for binary classification tasks
# The model can be trained using binary cross-entropy loss and optimized using gradient descent algorithms


## training.py

In [82]:
import tensorflow as tf
import random


class ModelTrainer:
    def __init__(self, X_train, y_train, X_val, y_val, model_path, m=3, num_episodes=50, batch_size=32, hidden_units=128, output_units=32):
        # Initialize training and validation data, model path, and training hyperparameters.
        self.X_train, self.y_train = X_train, y_train
        self.X_val, self.y_val = X_val, y_val
        self.model_path = model_path
        self.m = m
        self.num_episodes = num_episodes
        self.batch_size = batch_size
        # Initialize GNN and LSTM models.
        self.gnn = GNN(hidden_units=hidden_units, output_units=output_units)
        self.lstm = LSTMModel(input_size=32, hidden_size=64)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.00005)
        # Early stopping parameters.
        self.best_val_loss = float('inf')
        self.counter = 0
        self.patience = 2

    def train(self):
        # Training loop over episodes.
        for episode in range(self.num_episodes):
            train_data = list(zip(self.X_train, self.y_train))
            random.shuffle(train_data)
            train_loss = 0
            for i in range(0, len(train_data), self.batch_size):
                batch = train_data[i:i + self.batch_size]
                # Extract sequences and labels.
                batch_X = [seq for seq, _ in batch]
                batch_y = tf.convert_to_tensor([label for _, label in batch], dtype=tf.float32)[:, None]
                with tf.GradientTape() as tape:
                    batch_embeddings = []
                    # Generate embeddings for each sequence using the GNN.
                    for sequence in batch_X:
                        sequence_embeddings = []
                        for graph in sequence:
                            node_features = tf.convert_to_tensor(graph['x'], dtype=tf.float32)
                            edge_indices = tf.convert_to_tensor(graph['edge_index'], dtype=tf.int64)
                            num_nodes = node_features.shape[0]
                            embedding = self.gnn((node_features, edge_indices, num_nodes))
                            sequence_embeddings.append(embedding)
                        batch_embeddings.append(tf.stack(sequence_embeddings))
                    batch_embeddings = tf.stack(batch_embeddings)
                    # LSTM model processes the sequence embeddings.
                    y_pred = self.lstm(batch_embeddings)
                    
                    # ----- Weighted Loss Computation -----
                    # Set class weights; adjust these values based on the class distribution.
                    class_weights = {0: 1.0, 1: 5.0}
                    # Compute binary crossentropy loss.
                    loss = tf.keras.losses.binary_crossentropy(batch_y, y_pred)
                    # Gather weights corresponding to each label.
                    weights = tf.gather(tf.constant([class_weights[0], class_weights[1]]), tf.cast(batch_y, tf.int32))
                    # Apply weights and reduce to mean.
                    loss = loss * weights
                    loss = tf.reduce_mean(loss)
                    # --------------------------------------
                    
                grads = tape.gradient(loss, self.gnn.trainable_variables + self.lstm.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.gnn.trainable_variables + self.lstm.trainable_variables))
                train_loss += tf.reduce_mean(loss).numpy()
            train_loss /= (len(train_data) + self.batch_size - 1) // self.batch_size  # Ceiling division for number of batches

            # Validation loop.
            val_data = list(zip(self.X_val, self.y_val))
            val_loss = 0
            for i in range(0, len(val_data), self.batch_size):
                batch = val_data[i:i + self.batch_size]
                batch_X = [seq for seq, _ in batch]
                batch_y = tf.convert_to_tensor([label for _, label in batch], dtype=tf.float32)[:, None]
                batch_embeddings = []
                for sequence in batch_X:
                    sequence_embeddings = []
                    for graph in sequence:
                        node_features = tf.convert_to_tensor(graph['x'], dtype=tf.float32)
                        edge_indices = tf.convert_to_tensor(graph['edge_index'], dtype=tf.int64)
                        num_nodes = node_features.shape[0]
                        embedding = self.gnn((node_features, edge_indices, num_nodes))
                        sequence_embeddings.append(embedding)
                    batch_embeddings.append(tf.stack(sequence_embeddings))
                batch_embeddings = tf.stack(batch_embeddings)
                y_pred = self.lstm(batch_embeddings)
                loss = tf.keras.losses.binary_crossentropy(batch_y, y_pred)
                weights = tf.gather(tf.constant([class_weights[0], class_weights[1]]), tf.cast(batch_y, tf.int32))
                loss = loss * weights
                loss = tf.reduce_mean(loss)
                val_loss += tf.reduce_mean(loss).numpy()
            val_loss /= (len(val_data) + self.batch_size - 1) // self.batch_size
            print(f"Episode {episode}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                self.counter = 0
                # Save best model weights.
                self.gnn.save_weights(f"{self.model_path}_gnn.h5")
                self.lstm.save_weights(f"{self.model_path}_lstm.h5")
            else:
                self.counter += 1
                if self.counter >= self.patience:
                    print("Early stopping triggered")
                    break

        # Load best model weights after training.
        self.gnn.load_weights(f"{self.model_path}_gnn.h5")
        self.lstm.load_weights(f"{self.model_path}_lstm.h5")



# Explain this code in a simple way that is easy to understand and recall
# when needed.
# This code defines a ModelTrainer class that trains a GNN-LSTM model for graph-based time series forecasting using TensorFlow.
# The class takes training and validation data, model parameters, and training hyperparameters as input.
# The train method trains the model for a specified number of episodes, shuffling the training data and computing the loss for each batch.
# The model is saved at each episode, and the best model is saved based on the validation loss
# Early stopping is implemented to prevent overfitting, with a patience of 10 episodes
# The GNN and LSTM models are trained using binary cross-entropy loss and optimized using the Adam optimizer
# The GNN model is used to generate node embeddings for each graph in the sequence, which are then passed to the LSTM model
# The LSTM model processes the sequence of node embeddings and outputs a single prediction for each input sequence
# The training process is monitored using the training and validation loss, and early stopping is triggered if the validation loss does not improve
# The best model weights are loaded after training is completed, and the training process is terminated
# The trained model can be used for graph-based time series forecasting tasks, such as anomaly detection or event prediction
# The code demonstrates how to train a GNN-LSTM model for graph-based time series forecasting using TensorFlow and custom layers.


## evaluation.py

In [83]:
# evaluation.py
import tensorflow as tf

class ModelEvaluator:
    def __init__(self, gnn, lstm):
        self.gnn = gnn
        self.lstm = lstm

    def evaluate(self, X_val, y_val):
        val_loss = 0
        correct = 0
        total = 0
        for sequence, label in zip(X_val, y_val):
            embeddings = []
            for graph in sequence:
                node_features = tf.convert_to_tensor(graph['x'], dtype=tf.float32)
                edge_indices = tf.convert_to_tensor(graph['edge_index'], dtype=tf.int64)
                num_nodes = node_features.shape[0]
                embedding = self.gnn((node_features, edge_indices, num_nodes))
                embeddings.append(embedding)
            sequence_embeddings = tf.stack(embeddings)[None, :]
            y_pred = self.lstm(sequence_embeddings).numpy()[0][0]
            val_loss += tf.keras.losses.binary_crossentropy([label], [y_pred]).numpy()
            if (y_pred > 0.5) == label:
                correct += 1
            total += 1
        val_loss /= total
        val_accuracy = correct / total
        return val_loss, val_accuracy

## testing.py

In [84]:
# testing.py
import tensorflow as tf
import pandas as pd
import numpy as np
from torch_geometric.data import Data

class ModelTester:
    def __init__(self, model):
        self.gnn, self.lstm = model

    def test(self, X_test, y_test):
        """
        Test the trained model on preprocessed test data.
        
        Args:
            X_test (list): List of sequences, where each sequence is a list of graph dictionaries.
            y_test (list): List of binary labels (0 or 1).
        
        Returns:
            float: Test accuracy.
        """
        correct = 0
        total = 0
        for sequence, label in zip(X_test, y_test):
            embeddings = []
            for graph in sequence:
                node_features = tf.convert_to_tensor(graph['x'], dtype=tf.float32)
                edge_indices = tf.convert_to_tensor(graph['edge_index'], dtype=tf.int64)
                num_nodes = node_features.shape[0]
                embedding = self.gnn((node_features, edge_indices, num_nodes))
                embeddings.append(embedding)
            sequence_embeddings = tf.stack(embeddings)[None, :]  # Add batch dimension
            y_pred = self.lstm(sequence_embeddings).numpy()[0][0]  # Get scalar prediction
            if (y_pred > 0.5) == label:
                correct += 1
            total += 1
        test_accuracy = correct / total
        print(f"Test Accuracy: {test_accuracy:.4f}")
        return test_accuracy

    def preprocess_and_test(self, dataset_path, m=3):
        """
        Preprocess a raw test dataset and test the model on it.
        
        Args:
            dataset_path (str): Path to the raw test dataset CSV file.
            m (int): Number of graphs per sequence (default=3).
        
        Returns:
            float: Test accuracy on the preprocessed dataset.
        """
        # Load dataset (adjust columns based on your CSV structure)
        df = pd.read_csv(dataset_path, usecols=['_source_source_ip', '_source_destination_ip', 
                                                '_source_network_bytes', '_source_@timestamp', 'label'],delimiter=';')
        # Convert timestamp to datetime
        df['timestamp'] = pd.to_datetime(df['_source_@timestamp'])
        # Group by minute windows (window_id)
        df['window_id'] = df['timestamp'].dt.floor('T').astype('int64') // 10**9 // 60
        grouped = df.groupby('window_id')
        window_ids = np.array(sorted(grouped.groups.keys()))

        # Create graph data for each window
        X, y = [], []
        for wid in window_ids:
            window_df = grouped.get_group(wid)
            # Simplified graph creation (replace with your actual graph creation logic)
            graph = Data(x=np.random.rand(10, 4), edge_index=np.random.randint(0, 10, (2, 20)))
            label = int((window_df['label'] == 'malicious').any())  # Binary label: 1 if malicious
            X.append(graph)
            y.append(label)

        # Create sequences of m graphs
        X_seq, y_seq = [], []
        for k in range(m, len(window_ids)):
            seq = X[k - m:k]
            X_seq.append(seq)
            y_seq.append(y[k])

        # Test the preprocessed data using the existing test method
        print(f"Testing on preprocessed dataset from {dataset_path}")
        test_accuracy = self.test(X_seq, y_seq)
        return test_accuracy

## Main.py

In [85]:
# main.py
# import DataPreprocessor
# from training import ModelTrainer
# import evaluation
# import testing

if __name__ == "__main__":
    # Paths (adjust as needed)
    dataset_path = "C:\\Users\\ASUS\\Guidewire_Hackathon\\datasets\\elastic_may2021_malicious_data.csv"
    model_path = "C:\\Users\\ASUS\\Guidewire_Hackathon\\src\\models\\trained_hybrid_model"
    # Preprocess data
    preprocessor = DataPreprocessor(dataset_path, m=3)
    X_train, y_train, X_val, y_val, X_test, y_test = preprocessor.load_and_preprocess_data()
    print("Training label distribution:", np.bincount(y_train))
    print("Validation label distribution:", np.bincount(y_val))
    print("Test label distribution:", np.bincount(y_test))
    # # Train model
    trainer = ModelTrainer(X_train, y_train, X_val, y_val, model_path, m=3, num_episodes=15, batch_size=32,hidden_units=64,output_units=8)
    trainer.train()
    # Evaluate (optional)
    evaluator = ModelEvaluator(trainer.gnn, trainer.lstm)
    val_loss, val_accuracy = evaluator.evaluate(X_val, y_val)
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    # Test
    tester = ModelTester((trainer.gnn, trainer.lstm))
    test_accuracy = tester.test(X_test, y_test)

    new_test_accuracy = tester.preprocess_and_test(r'C:\Users\ASUS\Guidewire_Hackathon\datasets\elastic_may2022_data.csv', m=3)
    print(f"New Test Accuracy: {new_test_accuracy:.4f}")



Training label distribution: [   0 1193]
Validation label distribution: [  0 149]
Test label distribution: [  0 150]
64
8
Episode 0, Train Loss: 3.4330, Val Loss: 3.4058
Episode 1, Train Loss: 3.3759, Val Loss: 3.3412
Episode 2, Train Loss: 3.3025, Val Loss: 3.2567
Episode 3, Train Loss: 3.2064, Val Loss: 3.1464
Episode 4, Train Loss: 3.0815, Val Loss: 3.0035
Episode 5, Train Loss: 2.9195, Val Loss: 2.8192
Episode 6, Train Loss: 2.7135, Val Loss: 2.5868
Episode 7, Train Loss: 2.4573, Val Loss: 2.3039
Episode 8, Train Loss: 2.1526, Val Loss: 1.9741
Episode 9, Train Loss: 1.8062, Val Loss: 1.6120
Episode 10, Train Loss: 1.4406, Val Loss: 1.2467
Episode 11, Train Loss: 1.0895, Val Loss: 0.9157
Episode 12, Train Loss: 0.7867, Val Loss: 0.6488
Episode 13, Train Loss: 0.5533, Val Loss: 0.4534
Episode 14, Train Loss: 0.3878, Val Loss: 0.3161
Validation Loss: 0.0637, Validation Accuracy: 1.0000
Test Accuracy: 1.0000
Testing on preprocessed dataset from C:\Users\ASUS\Guidewire_Hackathon\dataset

In [86]:
# Example inputs
node_features = tf.random.uniform((10, 4), dtype=tf.float32)  # 10 nodes, 4 features
edge_indices = tf.constant([[0, 1, 2, 3], [1, 2, 3, 4]], dtype=tf.int64)  # Example edges
num_nodes = tf.constant(10, dtype=tf.int32)

# Create GNN instance
gnn = GNN(hidden_units=128, output_units=32)

# Call the model
output = gnn((node_features, edge_indices, num_nodes))
print(output.shape)  # Should output (32,) due to mean pooling

128
32
(32,)
