# SPECTRE PROTOTYPE
---

## Dependencies

In [1]:
import json
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from confluent_kafka import Producer, Consumer, KafkaError
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from tensorflow import keras

2023-05-26 20:57:53.157037: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-26 20:57:53.322449: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-26 20:57:53.323620: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load SavedModel

In [2]:
# Location of the exported SavedModel. The path is relative, so it is resolved
# against the kernel's current working directory.
SAVED_MODEL_DIR = "../spectre-ann/Model/DDOS_2/A/SavedModel"

# Restore the SavedModel into this kernel.
model = tf.saved_model.load(SAVED_MODEL_DIR)

2023-05-26 20:57:58.454177: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-26 20:57:58.454485: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


## Data Pipeline

### Data Preprocessing

In [3]:
def prod_data_preprocess(df, n_components=7):
    """Clean, standardise and PCA-reduce a feature DataFrame.

    Steps, in order:
      1. Drop every row that contains NaN, +inf or -inf.
      2. Split off the optional ``'label'`` column.
      3. Standardise the remaining columns with ``StandardScaler``.
      4. Project the standardised data onto ``n_components`` principal
         components.
      5. If a label column was present, binarise it with ``LabelBinarizer``
         and append it as the last column.

    Parameters
    ----------
    df : pd.DataFrame
        Raw input. It is not modified; all work happens on copies.
    n_components : int, default 7
        Number of principal components to keep.

    Returns
    -------
    pd.DataFrame
        Columns ``"Principal component 1"`` .. ``"Principal component N"``,
        followed by ``"label"`` when the input had one. The index is a fresh
        0..n-1 range over the rows that survived cleaning.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.

    Notes
    -----
    NOTE(review): the scaler and the PCA are re-fitted on whatever data is
    passed in. If the model was trained with transformers fitted on the
    training set, those fitted objects should be reused here instead --
    confirm against the training pipeline.
    scikit-learn is expected to reject inputs that leave fewer rows or
    feature columns than ``n_components`` after cleaning.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"

    # Treat +/-inf as missing and drop the affected rows. replace()/dropna()
    # return new frames, so the caller's DataFrame is left untouched.
    df_clean = (
        df.replace([np.inf, -np.inf], np.nan)
        .dropna()
        .reset_index(drop=True)
    )

    # Separate the optional target column from the features.
    if 'label' in df_clean.columns:
        labels = df_clean['label']
        features = df_clean.drop(columns='label')
    else:
        labels = None
        features = df_clean

    # Standardise first: PCA only centres its input, so unscaled columns with
    # large variance would otherwise dominate the components.
    features_scaled = StandardScaler().fit_transform(features)

    # PCA must run on the *scaled* matrix, not on the raw features.
    principal_components = PCA(n_components=n_components).fit_transform(features_scaled)

    component_names = [f"Principal component {i + 1}" for i in range(n_components)]
    df_pc = pd.DataFrame(data=principal_components, columns=component_names)

    if labels is None:
        return df_pc

    # Both frames carry the same fresh RangeIndex, so concat aligns row by row.
    df_final = pd.concat([df_pc, labels], axis=1)

    # NOTE(review): LabelBinarizer yields one column per class once there are
    # more than two classes, which cannot be stored in a single column --
    # this path presumably only sees binary labels; confirm.
    df_final['label'] = LabelBinarizer().fit_transform(df_final['label'])

    return df_final

### Kafka

In [4]:
import socket


def _port_is_open(host, port, timeout=2.0):
    """Return True when a TCP connection to ``host:port`` can be established.

    The socket is opened in a ``with`` block so it is always closed again,
    and a timeout keeps the probe from blocking the cell for long.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.settimeout(timeout)
        # connect_ex returns 0 on success and an errno value otherwise.
        return probe.connect_ex((host, port)) == 0


# Report whether Zookeeper (2181) and Kafka (9092) accept connections locally.
for _service_name, _service_port in (("Zookeeper", 2181), ("Kafka", 9092)):
    if _port_is_open('localhost', _service_port):
        print(f"{_service_name} is running on port {_service_port}")
    else:
        print(f"{_service_name} is not running on port {_service_port}")

Zookeeper is running on port 2181
Kafka is running on port 9092


In [None]:
def kafka_producer(file_path, topic_name):
    """Read a CSV, preprocess it and publish each row to a Kafka topic.

    Every row of the preprocessed DataFrame is sent as one message whose
    value is a JSON object mapping column name to value. JSON is used
    because the consumer in this notebook decodes messages with
    ``json.loads``; a Python ``str(dict)`` repr is not valid JSON.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read with ``pd.read_csv``.
    topic_name : str
        Kafka topic the records are produced to.

    The broker address (``localhost:9092``), the client id and the constant
    message key ``'key'`` are hard-coded.
    """
    # Read the csv file.
    df = pd.read_csv(file_path)

    # Preprocess the data using the prod_data_preprocess function.
    df_preprocessed = prod_data_preprocess(df)

    # One dict per row: {column name: value}.
    records = df_preprocessed.to_dict('records')

    producer = Producer({
        'bootstrap.servers': 'localhost:9092',
        'client.id': 'python-producer'
    })

    for record in records:
        payload = json.dumps(record)
        try:
            producer.produce(topic_name, key='key', value=payload)
        except BufferError:
            # The local send queue is full: wait for outstanding deliveries,
            # then retry this record once.
            producer.flush()
            producer.produce(topic_name, key='key', value=payload)
        # Serve delivery reports so the local queue keeps draining while
        # large files are being sent.
        producer.poll(0)

    # Wait for any outstanding messages to be delivered and delivery reports to be received.
    producer.flush()

In [None]:
def kafka_consumer(topic_name, saved_model_path):
    """Consume JSON feature records from Kafka and print a model prediction for each.

    Subscribes to ``topic_name`` on ``localhost:9092`` (group
    ``python-consumer``, starting from the earliest offset), loads the
    SavedModel at ``saved_model_path`` and then polls forever. For every
    message the JSON object is decoded, its values are passed to the model
    as a single-row batch, and the arg-max over axis 1 of the model output
    is printed.

    Parameters
    ----------
    topic_name : str
        Kafka topic to subscribe to.
    saved_model_path : str
        Directory of the TensorFlow SavedModel to load.

    The loop never returns on its own; it ends only when an exception
    (for example the kernel interrupt) propagates. The consumer is closed
    in that case so it leaves its consumer group cleanly.
    """
    # Create the Kafka consumer and subscribe to the topic.
    consumer = Consumer({
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'python-consumer',
        'auto.offset.reset': 'earliest'
    })
    consumer.subscribe([topic_name])

    # Define the SavedModel configuration and load the model.
    saved_model_config = tf.saved_model.LoadOptions(
        experimental_io_device='/job:localhost'
    )
    saved_model = tf.saved_model.load(saved_model_path, options=saved_model_config)

    def process_message(msg):
        """Decode one message and return the arg-max of the model output."""
        features = json.loads(msg.value())

        # prod_data_preprocess appends a 'label' column when the source CSV
        # has one; it is the target, not an input feature, so drop it.
        features.pop('label', None)
        features_array = np.array(list(features.values()))

        # Add a leading batch axis so the model sees a single-row batch.
        # NOTE(review): the array is float64 by default; if the SavedModel
        # signature expects float32 a cast is needed here -- confirm.
        prediction = saved_model(features_array[tf.newaxis, ...])

        # Index of the largest output per row (printed as "Detection rate").
        return np.argmax(prediction, axis=1)

    try:
        # Poll for new messages in the Kafka topic.
        while True:
            msg = consumer.poll(1.0)

            if msg is None:
                # No message received in the last poll interval.
                continue
            if msg.error():
                # Handle any errors that occurred while polling for messages.
                print(f"Error while consuming message: {msg.error()}")
                continue

            try:
                detection_rate = process_message(msg)
            except (json.JSONDecodeError, UnicodeDecodeError) as exc:
                # One undecodable payload must not stop the whole stream.
                print(f"Skipping malformed message: {exc}")
                continue

            print(f"Detection rate: {detection_rate}")
    finally:
        # Always release the consumer (commits offsets, leaves the group).
        consumer.close()


## Predictions