# SPECTRE PROTOTYPE
---

## `producer.py`

`producer.py` is a Python script that performs the following tasks:

1. Import necessary libraries and modules. 
2. Print a welcome message and wait for 1 second. 
3. Define the prod_datapreprocess function, which performs the following tasks: 
   1. Read a CSV file and create a DataFrame.
   2. Clean the dataset by removing NaN, inf, and -inf values.
   3. Perform feature scaling using StandardScaler.
   4. Apply PCA (Principal Component Analysis) to reduce dimensionality.
   5. Combine the principal components with the original labels.
   6. Perform label binarization for the 'label' column.
   7. Split the result into features (X) and labels (y) and return the feature DataFrame (X).
4. Call the prod_datapreprocess function on a given dataset.
5. Configure the Kafka producer with the necessary settings.
6. Create a Kafka producer instance.
7. Configure and create a Kafka consumer for handshake purposes.
8. Perform a handshake between the producer and consumer, waiting for a 'READY' message.
9. Send a 'READY' message from the producer to the consumer.
10. Iterate through the processed DataFrame (X) and publish each row, serialized as a comma-separated string, to the `detect_anomalies` Kafka topic.
11. Flush the producer to ensure all messages are sent.

In [None]:
# producer.py
# https://www.phind.com/search?cache=cf139efb-38e8-4fb5-9cda-5c67194a11a6

# Import necessary libraries
from confluent_kafka import Producer, Consumer, KafkaError
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.decomposition import PCA
import time

# Show the three-line startup banner, then pause for one second
for banner_line in (
    "==================================",
    "SPECTRE - PRODUCER MODULE",
    "==================================",
):
    print(banner_line)
time.sleep(1)


# Define a function to preprocess the data
def prod_datapreprocess(csv_file, n_components=7, return_labels=False):
    """Load a labelled CSV and reduce its features to principal components.

    Steps, in order:

    1. Read ``csv_file`` and normalise the column names (strip, lower-case,
       spaces -> underscores, parentheses removed).
    2. Drop every row that contains NaN, ``inf`` or ``-inf``.
    3. Separate the ``label`` column from the feature columns.
    4. Standardise the features with ``StandardScaler`` and project them
       with ``PCA``.
    5. Binarize the labels with ``LabelBinarizer``.

    Args:
        csv_file: Anything accepted by ``pandas.read_csv``. After column
            normalisation the data must contain a ``label`` column; every
            other column is passed to ``StandardScaler``.
        n_components: Forwarded to ``PCA(n_components=...)``. Defaults to 7,
            the value that was previously hard-coded.
        return_labels: When true, return ``(X, y)`` instead of only ``X``.

    Returns:
        ``X``, a DataFrame with one ``"Principal component <k>"`` column per
        retained component; or ``(X, y)`` when ``return_labels`` is true,
        where ``y`` is the binarized ``label`` Series.

    Raises:
        KeyError: If there is no ``label`` column after normalisation.
    """
    df = pd.read_csv(csv_file)

    # regex=False makes the literal replacement explicit; '(' and ')' are
    # regex metacharacters and the default of `regex` differs across pandas
    # versions.
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

    # Remove rows with missing or infinite values (the source frame is not
    # modified; dropna() returns a new frame).
    df_cleaned = df.dropna()
    has_infinite = df_cleaned.isin([np.inf, -np.inf]).any(axis=1)
    # drop=True discards the old row index directly. The previous
    # reset_index() + drop('index') sequence would delete a genuine CSV
    # column that happened to be called "index".
    df_cleaned = df_cleaned[~has_infinite].reset_index(drop=True)

    # Keep the label aside before scaling the feature columns
    df_labels = df_cleaned['label']
    df_features = df_cleaned.drop(columns='label')

    # Perform feature scaling
    scaled_values = StandardScaler().fit_transform(df_features)
    df_scaled = pd.DataFrame(data=scaled_values, columns=df_features.columns)

    # Performing PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df_scaled)

    # Name the columns after the number of components PCA actually produced,
    # so non-integer `n_components` settings also get matching headings.
    principal_component_headings = [
        f"Principal component {i + 1}"
        for i in range(principal_components.shape[1])
    ]
    df_pc = pd.DataFrame(data=principal_components, columns=principal_component_headings)

    # Combine the principal components with the original labels
    df_final = pd.concat([df_pc, df_labels], axis=1)

    # Binarize the labels. NOTE(review): the 0/1 assignment follows
    # LabelBinarizer's own class ordering (sorted class names per the
    # scikit-learn docs), not a fixed ANOMALY/BENIGN mapping - confirm
    # against the label names of the dataset in use.
    df_final['label'] = LabelBinarizer().fit_transform(df_final['label'])

    # Split the dataset into features (X) and labels (y)
    X = df_final.drop(['label'], axis=1)
    y = df_final['label']

    if return_labels:
        return X, y
    return X


# Read the CSV file and preprocess the data

# DDoS Attack CSV
X = prod_datapreprocess('/home/aryn/spectre-dev/dataset/CICIDS2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')

# DDoS Prime CSV
#X = prod_datapreprocess('/home/aryn/spectre-dev/dataset/DDoS_Dataset/ddos_balanced/final_dataset.csv')

# Benign CSV
#X = prod_datapreprocess('/home/aryn/spectre-dev/dataset/CICIDS2017/MachineLearningCSV/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv')

# Set the producer configuration
producer_conf = {
    'bootstrap.servers': 'localhost:9092',
    'max.in.flight.requests.per.connection': 1   # At most one in-flight request per broker connection
}

# Create a Kafka producer
producer = Producer(producer_conf)

# Set the handshake consumer configuration
handshake_consumer_conf = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'producer_handshake_group',
    'session.timeout.ms': 6000,
    'auto.offset.reset': 'earliest'
}

# Create a Kafka consumer for the handshake
handshake_consumer = Consumer(handshake_consumer_conf)
handshake_consumer.subscribe(['handshake'])

# Number of empty one-second polls tolerated before giving up
timeout_counter = 0
timeout_limit = 10

# Wait for a 'READY' message on the 'handshake' topic. The try/finally makes
# sure the handshake consumer is closed on success as well as on failure.
try:
    while True:
        msg = handshake_consumer.poll(1.0)
        if msg is None:
            timeout_counter += 1
            if timeout_counter >= timeout_limit:
                print("==================================")
                print("CONNECTION FAILURE")
                print("==================================")
                # SystemExit instead of the site-provided exit() helper, which
                # is not guaranteed to exist in every interpreter setup.
                raise SystemExit(1)
            continue
        if msg.error():
            print(f"Handshake consumer error: {msg.error()}")
        else:
            payload = msg.value()
            # A message may carry no value; treat that as "not READY"
            handshake_msg = payload.decode('utf-8') if payload is not None else None
            if handshake_msg == 'READY':
                print("==================================")
                print("CONNECTION ESTABLISHED")
                print("==================================")
                break
finally:
    handshake_consumer.close()

# Send a ready message to the consumer
producer.produce('handshake', 'READY')

# Publish the preprocessed rows one by one as comma-separated strings
for _, row in X.iterrows():
    serialized_data = ','.join(map(str, row.values))
    print(f"Serialized data: {serialized_data}")
    producer.produce('detect_anomalies', serialized_data)
    # Serve delivery reports as we go instead of letting them accumulate
    # until the final flush().
    producer.poll(0)
    time.sleep(1.5)

# Flush the producer to ensure all messages are sent
producer.flush()

----
## `anomaly_detector.py`

`anomaly_detector.py` is a Python script that performs the following tasks:

1. Import necessary libraries and modules.
2. Print a welcome message and wait for 1 second.
3. Define the on_message function, which performs the following tasks:
   1. Convert the received data to a list of strings using comma as the delimiter.
   2. Append the list of strings to the data buffer.
   3. Check if the data buffer has enough data points.
   4. If the buffer has enough data points, convert the buffer to a NumPy array of floats, predict the anomalies using the pre-trained TensorFlow model, and check if any predicted values are above the threshold.
   5. If an anomaly is detected, print an "ANOMALY" message. Otherwise, print a "BENIGN" message.
   6. Reset the data buffer.
4. Subscribe to the 'detect_anomalies' topic.
5. Consume messages from the 'detect_anomalies' topic and process them using the on_message function.

In [None]:
# anomaly_detector.py
# https://www.phind.com/search?cache=cf139efb-38e8-4fb5-9cda-5c67194a11a6

from confluent_kafka import Consumer, Producer, KafkaError
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
import time

# Print the header for the anomaly detector module
print("==================================")
print("SPECTRE - CONSUMER & ANOMALY DETECTOR MODULE")
print("==================================")
time.sleep(1)

# Load the pre-trained TensorFlow model
model = load_model('/home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS_2/A/spectre_ddos_2_h5.h5')

# Configuration of the consumer that reads the data stream
consumer_conf = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'mygroup',
    'session.timeout.ms': 6000,
    'auto.offset.reset': 'earliest',
    'queued.min.messages': 1  # Keep the locally pre-fetched message queue as small as possible
}

# Create a Kafka consumer instance
consumer = Consumer(consumer_conf)

# Define Kafka producer configuration for handshake with the consumer
handshake_producer_conf = {
    'bootstrap.servers': 'localhost:9092'
}

# Create a Kafka producer instance for handshake and announce readiness
handshake_producer = Producer(handshake_producer_conf)
handshake_producer.produce('handshake', 'READY')
# produce() only enqueues the message; give the client a bounded amount of
# time to actually deliver it before we start waiting for the peer.
handshake_producer.flush(5)

# Define Kafka consumer configuration for handshake with the producer
handshake_consumer_conf = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'consumer_handshake_group',
    'session.timeout.ms': 6000,
    'auto.offset.reset': 'earliest'
}

# Create a Kafka consumer instance for handshake
# NOTE(review): READY is published to and awaited on the same 'handshake'
# topic with 'auto.offset.reset': 'earliest', so this loop may be satisfied
# by this process's own (or an older) READY message - confirm that is intended.
handshake_consumer = Consumer(handshake_consumer_conf)
handshake_consumer.subscribe(['handshake'])

# Number of empty one-second polls tolerated before giving up
timeout_counter = 0
timeout_limit = 10

# Perform handshake with the producer. The try/finally makes sure the
# handshake consumer is closed on success as well as on failure.
try:
    while True:
        msg = handshake_consumer.poll(1.0)
        if msg is None:
            timeout_counter += 1
            if timeout_counter >= timeout_limit:
                print("==================================")
                print("CONNECTION FAILURE")
                print("==================================")
                # SystemExit instead of the site-provided exit() helper, which
                # is not guaranteed to exist in every interpreter setup.
                raise SystemExit(1)
            continue
        if msg.error():
            print(f"Handshake consumer error: {msg.error()}")
        else:
            payload = msg.value()
            # A message may carry no value; treat that as "not READY"
            handshake_msg = payload.decode('utf-8') if payload is not None else None
            if handshake_msg == 'READY':
                print("==================================")
                print("CONNECTION ESTABLISHED")
                print("==================================")
                break
finally:
    handshake_consumer.close()

# Subscribe to the 'detect_anomalies' topic
consumer.subscribe(['detect_anomalies'])

# Initialize the data buffer
received_data_buffer = []

# Buffer incoming feature rows and classify them in fixed-size batches
def on_message(msg):
    """Handle one Kafka message from the 'detect_anomalies' topic.

    The message value is decoded as UTF-8, stripped of surrounding square
    brackets and split on commas into one row of floats, which is appended
    to the module-level ``received_data_buffer``. Once the buffer holds
    ``batch_size`` rows they are passed to ``model.predict`` as one array,
    the verdict is printed and the buffer is emptied.

    A batch is reported as ANOMALY when any predicted value exceeds
    ``threshold``; otherwise it is reported as BENIGN.

    Messages that cannot be decoded or parsed as numbers, or whose number
    of fields differs from the rows already buffered, are reported and
    skipped instead of aborting the consumer.

    Args:
        msg: Message object returned by ``Consumer.poll``.
    """
    global received_data_buffer
    # A higher threshold (e.g. 0.7) reduces the chance of benign data being
    # flagged as an anomaly (false positives) but may miss real anomalies
    # (false negatives); the best value balances the two. One way to pick it
    # is to learn from past data, e.g. from the minimum and maximum
    # deviations, possibly with a scaling factor for flexibility.
    threshold = 0.7  # Set the threshold value for anomaly detection
    batch_size = 7   # Number of buffered rows sent to the model per prediction

    if msg.error():
        print(f"Consumer error: {msg.error()}")
        return

    payload = msg.value()
    if payload is None:
        print("Skipping message without a value")
        return

    try:
        # UnicodeDecodeError is a ValueError subclass, so undecodable bytes
        # and non-numeric fields are both handled by the clause below.
        received_data_str = payload.decode('utf-8')
        received_row = [float(field) for field in received_data_str.strip('[]').split(',')]
    except ValueError:
        print(f"Skipping malformed message: {payload!r}")
        return

    # Every row of a batch must have the same width to form a 2-D array
    if received_data_buffer and len(received_row) != len(received_data_buffer[0]):
        print(f"Skipping message with {len(received_row)} fields, expected {len(received_data_buffer[0])}")
        return

    received_data_buffer.append(received_row)

    if len(received_data_buffer) < batch_size:
        # Debug: report how many rows are buffered so far
        print(f"Received data instances: {len(received_data_buffer)}")
        return

    X_received = np.array(received_data_buffer, dtype=np.float64)
    received_data_buffer = []  # Reset the buffer for the next batch

    prediction = model.predict(X_received)
    print(f'Prediction: {prediction}')

    # Check if there is an anomaly and print the appropriate message
    if np.any(prediction > threshold):
        print("==================================")
        print("ANOMALY")
        print("==================================")
    else:
        print("==================================")
        print("BENIGN")
        print("==================================")

# Poll the subscribed topic until interrupted and hand every message to
# on_message(). The consumer is always closed on the way out so it leaves
# its consumer group cleanly.
try:
    while True:
        msg = consumer.poll(1.0)
        if msg is None:
            continue
        if msg.error():
            print(f"Consumer error: {msg.error()}")
        else:
            on_message(msg)
except KeyboardInterrupt:
    # Ctrl+C is the expected way to stop this endless loop
    print("Consumer stopped by user")
finally:
    consumer.close()

# Data Preprocess Method

In [None]:
# Define a function to preprocess the data
def prod_datapreprocess(csv_file, n_components=7, return_labels=False):
    """Load a labelled CSV and reduce its features to principal components.

    Steps, in order:

    1. Read ``csv_file`` and normalise the column names (strip, lower-case,
       spaces -> underscores, parentheses removed).
    2. Drop every row that contains NaN, ``inf`` or ``-inf``.
    3. Separate the ``label`` column from the feature columns.
    4. Standardise the features with ``StandardScaler`` and project them
       with ``PCA``.
    5. Binarize the labels with ``LabelBinarizer``.

    Args:
        csv_file: Anything accepted by ``pandas.read_csv``. After column
            normalisation the data must contain a ``label`` column; every
            other column is passed to ``StandardScaler``.
        n_components: Forwarded to ``PCA(n_components=...)``. Defaults to 7,
            the value that was previously hard-coded.
        return_labels: When true, return ``(X, y)`` instead of only ``X``.

    Returns:
        ``X``, a DataFrame with one ``"Principal component <k>"`` column per
        retained component; or ``(X, y)`` when ``return_labels`` is true,
        where ``y`` is the binarized ``label`` Series.

    Raises:
        KeyError: If there is no ``label`` column after normalisation.
    """
    df = pd.read_csv(csv_file)

    # regex=False makes the literal replacement explicit; '(' and ')' are
    # regex metacharacters and the default of `regex` differs across pandas
    # versions.
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

    # Remove rows with missing or infinite values (the source frame is not
    # modified; dropna() returns a new frame).
    df_cleaned = df.dropna()
    has_infinite = df_cleaned.isin([np.inf, -np.inf]).any(axis=1)
    # drop=True discards the old row index directly. The previous
    # reset_index() + drop('index') sequence would delete a genuine CSV
    # column that happened to be called "index".
    df_cleaned = df_cleaned[~has_infinite].reset_index(drop=True)

    # Keep the label aside before scaling the feature columns
    df_labels = df_cleaned['label']
    df_features = df_cleaned.drop(columns='label')

    # Perform feature scaling
    scaled_values = StandardScaler().fit_transform(df_features)
    df_scaled = pd.DataFrame(data=scaled_values, columns=df_features.columns)

    # Performing PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df_scaled)

    # Name the columns after the number of components PCA actually produced,
    # so non-integer `n_components` settings also get matching headings.
    principal_component_headings = [
        f"Principal component {i + 1}"
        for i in range(principal_components.shape[1])
    ]
    df_pc = pd.DataFrame(data=principal_components, columns=principal_component_headings)

    # Combine the principal components with the original labels
    df_final = pd.concat([df_pc, df_labels], axis=1)

    # Binarize the labels. NOTE(review): the 0/1 assignment follows
    # LabelBinarizer's own class ordering (sorted class names per the
    # scikit-learn docs), not a fixed ANOMALY/BENIGN mapping - confirm
    # against the label names of the dataset in use.
    df_final['label'] = LabelBinarizer().fit_transform(df_final['label'])

    # Split the dataset into features (X) and labels (y)
    X = df_final.drop(['label'], axis=1)
    y = df_final['label']

    if return_labels:
        return X, y
    return X

In [9]:
import yaml
from yaml.loader import SafeLoader
import streamlit_authenticator as stauth
# Hash the two hard-coded passwords with streamlit-authenticator and print
# the resulting list.
# NOTE(review): plaintext credentials live in the source here - confirm
# these are throwaway values.
plain_passwords = ['admin1', 'password1']
password_hasher = stauth.Hasher(plain_passwords)
hashed_passwords = password_hasher.generate()
print(hashed_passwords)

['$2b$12$RbKiXOZRKYyqx3dBmt4VdeViVTL6EY.yLO5wV1TnUXtgJEtbGhuEu', '$2b$12$DSkNtQJytUsuPwPFjLbrtOdroqBgGnFFyKcHdz0vu0./ZB1Pt6F/O']


In [10]:
import bcrypt

# Hash one hard-coded password with a freshly generated bcrypt salt and
# print the hash as text.
password = "admin1".encode('utf-8')
salt = bcrypt.gensalt()
hashed_bytes = bcrypt.hashpw(password, salt)
hashed_password = hashed_bytes.decode('utf-8')
print(hashed_password)

$2b$12$D0e7eVJNHDIFArw3un.IG.W0Clty3hgALoY56SxCRaW41XzABpCCm


In [11]:
# homepage.py

import streamlit as st
import pandas as pd
import sqlite3
import time
from confluent_kafka.admin import AdminClient
from streamlit_extras.colored_header import colored_header
from streamlit_extras.metric_cards import style_metric_cards
from streamlit import date_input
from datetime import datetime
import streamlit_authenticator as stauth
import plotly.express as px
import altair as alt
import bcrypt

# Open the predictions database; `conn` and `c` are module-level handles
# used by the code further down in this script.
db_path = "/home/aryn/spectre-dev/spectre-code/spectre-ann/prototype/database/predictions.db"
conn = sqlite3.connect(db_path)
c = conn.cursor()

# Page-wide Streamlit settings
page_settings = {
    "page_title": "SPECTRE DDoS Detection Dashboard",
    "page_icon": "🧠",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**page_settings)

# Sidebar: a refresh button, followed by a separated "dangerous" section
# holding the button that clears the database.
sidebar = st.sidebar
sidebar.header("🧠 SPECTRE Options")
sidebar.caption("Refresh SPECTRE Dashboard")
refresh_button = sidebar.button("Refresh Data")
sidebar.divider()
sidebar.subheader("⚠️ DANGEROUS")
sidebar.caption("Careful when using these options")
del_db = sidebar.button("Delete Database Entry")

# Dashboard title and tagline
st.title("🧠 SPECTRE DASHBOARD")
st.caption("A lightweight solution for DDoS Detection")

# Empty placeholder element reserved for the line chart
line_chart_placeholder = st.empty()

# Function to fetch data from the database and display it
def display_predictions():
    """Render the dashboard body from the rows of the ``predictions`` table.

    Uses the module-level SQLite cursor ``c`` to load every prediction and
    the per-result totals, counts anomalies in 5-minute buckets, and draws:
    the attack summary metrics, the Kafka / SPECTRE status expanders, a date
    picker, the log table for the selected date and a line chart of anomaly
    counts for that date.

    The widgets created here have no explicit keys, so Streamlit only
    allows this function to be called once per script run.
    """
    # Local import so this function does not need an extra file-level import
    from confluent_kafka import KafkaException

    # Define Kafka configuration
    kafka_conf = {
        'bootstrap.servers': 'localhost:9092'
    }

    # Create an AdminClient instance
    admin_client = AdminClient(kafka_conf)

    # list_topics() raises KafkaException when the cluster cannot be reached
    # in time; treat that as "not running" instead of crashing the page.
    try:
        metadata = admin_client.list_topics(timeout=5)
        kafka_running = bool(metadata.brokers)
    except KafkaException:
        kafka_running = False

    # Query the predictions from the database
    c.execute('SELECT * FROM predictions')
    predictions = c.fetchall()

    # Convert the predictions to a pandas DataFrame
    predictions_df = pd.DataFrame(predictions, columns=['ID', 'Prediction', 'Result', 'F1 Score', 'Timestamp'])

    # Convert the 'Timestamp' column to a pandas datetime object
    predictions_df['Timestamp'] = pd.to_datetime(predictions_df['Timestamp'])

    # Count rows per 'Result' in 5-minute buckets ('5min' is the
    # non-deprecated spelling of the '5T' alias). An empty table has
    # nothing to group, so it is passed through unchanged.
    if predictions_df.empty:
        grouped_df = predictions_df
    else:
        grouped_df = predictions_df.groupby('Result').resample('5min', on='Timestamp').count()

    # Extract the 'ANOMALY' buckets; fall back to an empty frame with the
    # columns used below when no anomaly has been recorded (for example
    # right after the database was cleared).
    if (predictions_df['Result'] == 'ANOMALY').any():
        anomaly_counts = grouped_df.loc['ANOMALY'].reset_index()
    else:
        anomaly_counts = pd.DataFrame({
            'Timestamp': pd.to_datetime([]),
            'ID': pd.Series(dtype='int64'),
        })

    # Query the count of anomalies and benign results from the database
    c.execute("SELECT Result, COUNT(*) FROM predictions GROUP BY Result")
    count_data = dict(c.fetchall())

    st.divider()

    col_top1, col_top2 = st.columns(2)

    with col_top1:
        colored_header(
            label="Attack Summary",
            description="A summary of attacks that occured",
            color_name="yellow-80",
        )

        # Display the count of anomalies and benign results as metric cards
        met_col1, met_col2 = st.columns(2)
        met_col1.metric(label="DDoS Count",value=count_data.get('ANOMALY', 0))
        met_col2.metric(label="Benign Count",value=count_data.get('BENIGN', 0))
        style_metric_cards(background_color="#191923", border_left_color= "#E59500", border_color="#E59500", border_size_px=2, border_radius_px= 5)

    with col_top2:
        colored_header(
            label="SPECTRE Details",
            description="Overview on SPECTRE",
            color_name="yellow-80",
        )
        kafka_expander = st.expander(label='KAFKA METRICS')
        with kafka_expander:
            with st.container():
                # Report whether the cluster metadata listed any broker
                if kafka_running:
                    st.success("Kafka is running properly!")
                else:
                    st.warning("Kafka is not running properly!")
        status_expander = st.expander(label='SPECTRE Status')
        with status_expander:
            st.write("Version: 2.0")

    st.divider()

    # Date picker that filters both the log table and the chart
    selected_date = date_input("Select a Date", value=datetime.today().date())

    # Create two columns for displaying the table and line chart side by side
    col1, col2 = st.columns(2)

    # Display the table in the first column
    with col1:
        colored_header(
            label="Log Details",
            description="Attack Logs",
            color_name="yellow-80",
        )

        # Filter the DataFrame based on the selected date
        filtered_df = predictions_df[predictions_df['Timestamp'].dt.date == selected_date]

        # Add a select box to choose between the latest 10 results and all results
        table_option = st.selectbox("Choose table display option:", ["Recent 10 Results", "All Results"])

        # Filter the DataFrame based on the selected option
        if table_option == "Recent 10 Results":
            display_df = filtered_df.tail(10)
        else:
            display_df = filtered_df

        # NOTE(review): this changes a global pandas printing option; it
        # presumably has no effect on st.dataframe - confirm before removing.
        pd.set_option('display.max_columns', None)

        # Display the table with a scrollable container and full width
        st.dataframe(display_df[['Timestamp', 'F1 Score', 'Result']], use_container_width=True, hide_index=True)

    # Display the line chart in the second column
    with col2:
        colored_header(
            label="Attack Graph",
            description="This line chart shows the number of anomalies over time",
            color_name="yellow-80",
        )

        # Add a 'Date' column to the anomaly_counts DataFrame
        anomaly_counts['Date'] = anomaly_counts['Timestamp'].dt.date

        # Filter the anomaly_counts DataFrame based on the selected date
        anomaly_counts_filtered = anomaly_counts[anomaly_counts['Date'].astype(str) == str(selected_date)]

        # Debug output. The values are passed to st.write directly; wrapping
        # them in print() made the page show "None" instead of the data.
        st.write("Anomaly Counts Filtered DataFrame:")
        st.write(anomaly_counts_filtered)
        st.write("Timestamp data:", anomaly_counts_filtered['Timestamp'].astype(str).tolist())
        st.write("Anomaly count data:", anomaly_counts_filtered['ID'].tolist())

        st.write("Grouped DataFrame:")
        st.write(grouped_df)

        st.write("Anomaly Counts DataFrame:")
        st.write(anomaly_counts)

        if anomaly_counts_filtered.empty:
            st.warning("No data available for the selected date.")
        else:
            fig = px.line(anomaly_counts_filtered, x='Timestamp', y='ID', title='Anomalies Over Time')
            fig.update_xaxes(title_text='Timestamp')
            fig.update_yaxes(title_text='Anomaly Count')
            st.plotly_chart(fig, use_container_width=True)

    dev_expander = st.expander(label='Developer Area')
    with dev_expander:
        st.header("Welcome to Developer Area")
        st.caption("Components to added")
        
            
    
# Decide what this script run does, based on the sidebar buttons
if refresh_button:
    # Explicit refresh: render once with the current database contents
    display_predictions()
elif del_db:
    # Delete all rows from the predictions table
    c.execute("DELETE FROM predictions")
    conn.commit()
else:
    # Auto-refresh every 2 minutes (120 seconds). The page is rendered once
    # per script run and the whole script is then re-run; calling
    # display_predictions() repeatedly inside one run would register the
    # same key-less widgets twice, which Streamlit rejects.
    display_predictions()
    time.sleep(120)
    # st.rerun is the current API; older releases only ship
    # st.experimental_rerun.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()