In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, Concatenate, Input
from keras.models import Model

In [139]:
# Load dataset
df = pd.read_csv('/kaggle/input/network-traffic-for-dos-detection/dataset.csv')

# Print columns of the DataFrame
print("Columns after loading dataset:")
print(df.columns)

Columns after loading dataset:
Index(['Id', 'IP', 'bot', 'FunctionId', 'functionTrigger', 'timestamp',
       'SubmitTime', 'RTT', 'InvocationDelay', 'ResponseDelay',
       'FunctionDuration', 'ActiveFunctionsAtRequest',
       'ActiveFunctionsAtResponse', 'maxcpu', 'avgcpu', 'p95maxcpu',
       'vmcategory', 'vmcorecountbucket', 'vmmemorybucket'],
      dtype='object')


In [140]:
def calculate_entropy(feature_values):
    # Convert float64 data to integer by rounding
    feature_values = np.round(feature_values).astype(int)
    
    # Count occurrences of each unique value in the feature
    value_counts = np.bincount(feature_values)
    
    # Calculate probability of each unique value
    probabilities = value_counts / len(feature_values)
    
    # Calculate entropy
    entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))  # Adding a small value to avoid log(0)
    
    return entropy

In [141]:
# Function to prepare dataset and calculate entropy
def prepare_dataset(df):
    # Select relevant features
    selected_features = ['RTT', 'InvocationDelay', 'ResponseDelay', 'FunctionDuration', 'ActiveFunctionsAtRequest', 
                         'ActiveFunctionsAtResponse', 'maxcpu', 'avgcpu', 'p95maxcpu', 'vmcorecountbucket', 'vmmemorybucket']
    
    # Normalize selected features if necessary
    scaler = MinMaxScaler()
    df[selected_features] = scaler.fit_transform(df[selected_features])
    
    # Calculate entropy for each data instance
    for feature in selected_features:
        df[feature + '_entropy'] = df.groupby('IP')[feature].transform(lambda x: calculate_entropy(x.values))
    
    # Print columns after adding entropy columns
    print("Columns after adding entropy columns:")
    print(df.columns)
    
    return df


In [142]:
# Function to prepare data for model training
def prepare_data_for_model(df):
    # Convert categorical variables to numerical labels if necessary
    label_encoder = LabelEncoder()
    df['vmcategory'] = label_encoder.fit_transform(df['vmcategory'])
    
    # Select features and target
    selected_features = ['RTT_entropy', 'InvocationDelay_entropy', 'ResponseDelay_entropy', 'FunctionDuration_entropy', 
                         'ActiveFunctionsAtRequest_entropy', 'ActiveFunctionsAtResponse_entropy', 'maxcpu_entropy', 
                         'avgcpu_entropy', 'p95maxcpu_entropy', 'vmcorecountbucket_entropy', 'vmmemorybucket_entropy']
    X = df[selected_features].values
    y = df['bot'].values
    
    # Reshape data for LSTM input
    X = X.reshape(X.shape[0], X.shape[1], 1)
    
    return X, y

In [143]:
# Print columns of the DataFrame
print("Columns after loading dataset:")
print(df.columns)

Columns after loading dataset:
Index(['Id', 'IP', 'bot', 'FunctionId', 'functionTrigger', 'timestamp',
       'SubmitTime', 'RTT', 'InvocationDelay', 'ResponseDelay',
       'FunctionDuration', 'ActiveFunctionsAtRequest',
       'ActiveFunctionsAtResponse', 'maxcpu', 'avgcpu', 'p95maxcpu',
       'vmcategory', 'vmcorecountbucket', 'vmmemorybucket'],
      dtype='object')


In [144]:
# Function to build Bi-LSTM model
def build_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(units=64, return_sequences=True), input_shape=input_shape))
    model.add(Bidirectional(LSTM(units=32)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [145]:
# Prepare dataset and calculate entropy
df = prepare_dataset(df)

Columns after adding entropy columns:
Index(['Id', 'IP', 'bot', 'FunctionId', 'functionTrigger', 'timestamp',
       'SubmitTime', 'RTT', 'InvocationDelay', 'ResponseDelay',
       'FunctionDuration', 'ActiveFunctionsAtRequest',
       'ActiveFunctionsAtResponse', 'maxcpu', 'avgcpu', 'p95maxcpu',
       'vmcategory', 'vmcorecountbucket', 'vmmemorybucket', 'RTT_entropy',
       'InvocationDelay_entropy', 'ResponseDelay_entropy',
       'FunctionDuration_entropy', 'ActiveFunctionsAtRequest_entropy',
       'ActiveFunctionsAtResponse_entropy', 'maxcpu_entropy', 'avgcpu_entropy',
       'p95maxcpu_entropy', 'vmcorecountbucket_entropy',
       'vmmemorybucket_entropy'],
      dtype='object')


In [146]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

function_trigger_mapping = {'notification': 0, 'other_value': 1}

df['functionTrigger'] = df['functionTrigger'].map(function_trigger_mapping)

numerical_columns = ['Id', 'bot', 'FunctionId', 'functionTrigger', 'SubmitTime', 'RTT',
                     'InvocationDelay', 'ResponseDelay', 'FunctionDuration', 'ActiveFunctionsAtRequest',
                     'ActiveFunctionsAtResponse', 'maxcpu', 'avgcpu', 'p95maxcpu', 'vmcorecountbucket',
                     'vmmemorybucket']

# Impute NaN values with a specific value, such as 0
imputer = SimpleImputer(strategy='constant', fill_value=0)
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df[numerical_columns])

In [147]:
# Prepare data for model training
# Split the dataset into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [148]:
# Reshape the input data to match the LSTM input shape
timesteps = 1  # Assuming each sample represents a single timestep
num_features = X_scaled.shape[1]  # Use the scaled dataset for reshaping
X_train = X_train.reshape(X_train.shape[0], timesteps, num_features)
X_val = X_val.reshape(X_val.shape[0], timesteps, num_features)
X_test = X_test.reshape(X_test.shape[0], timesteps, num_features)

In [None]:
# Train the model
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Make predictions on the test set
predictions = model.predict(X_test_reshaped)


Epoch 1/10
 300/3742 [=>............................] - ETA: 10:57 - loss: 0.6110 - accuracy: 0.7059

In [None]:
# Make predictions on the test set
predictions = model.predict(X_test_scaled)
binary_predictions = (predictions > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, binary_predictions)
print("Accuracy:", accuracy)

print("Classification Report:")
print(classification_report(y_test, binary_predictions))
