In [16]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the data
file_path = '../data/dns_queries.csv'  # Replace with your file path if needed
data = pd.read_csv(file_path)

# Set column names exactly as per the description.txt
data.columns = [
    'user_ip', 'domain', 'timestamp', 'attack', 'request', 'len', 'subdomains_count', 
    'w_count', 'w_max', 'entropy', 'w_max_ratio', 'w_count_ratio', 'digits_ratio', 
    'uppercase_ratio', 'time_avg', 'time_stdev', 'size_avg', 'size stdev', 
    'throughput', 'unique', 'entropy_avg', 'entropy_stdev'
]

# Select feature columns for training
feature_columns = [
    'len', 'subdomains_count', 'w_count', 'w_max', 'entropy', 'w_max_ratio', 
    'w_count_ratio', 'digits_ratio', 'uppercase_ratio', 'time_avg', 'time_stdev', 
    'size_avg', 'size stdev', 'throughput', 'unique', 'entropy_avg', 'entropy_stdev'
]

X = data[feature_columns]

# Normalize the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data for training (simulate normal) and testing (with anomalies)
X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)

# X_train and X_test are now ready for use in training and evaluating your model


In [17]:
X_train

array([[1.        , 0.38461538, 0.83739837, ..., 0.        , 0.91507355,
        0.05494378],
       [0.07741935, 0.        , 0.12195122, ..., 0.        , 0.52042884,
        0.1824018 ],
       [0.01935484, 0.        , 0.02439024, ..., 0.        , 0.1830784 ,
        0.15295861],
       ...,
       [0.14193548, 0.07692308, 0.11382114, ..., 0.        , 0.68224278,
        0.06932567],
       [0.2       , 0.07692308, 0.10569106, ..., 0.        , 0.69603026,
        0.49057813],
       [0.01935484, 0.        , 0.04065041, ..., 0.        , 0.12833598,
        0.25411392]])

In [18]:
X_test

array([[0.07741935, 0.        , 0.06504065, ..., 0.        , 0.47456869,
        0.27821786],
       [0.07741935, 0.        , 0.05691057, ..., 0.        , 0.56701341,
        0.16572431],
       [0.14193548, 0.07692308, 0.06504065, ..., 0.        , 0.61173613,
        0.63475459],
       ...,
       [0.01935484, 0.        , 0.02439024, ..., 0.        , 0.15570719,
        0.19588259],
       [0.01935484, 0.        , 0.01626016, ..., 0.        , 0.12833598,
        0.25411392],
       [0.07741935, 0.        , 0.04065041, ..., 0.        , 0.53550976,
        0.18715135]])

In [21]:
from keras.models import Model
from keras.layers import Input, Dense
import numpy as np

In [23]:
# Define the autoencoder model
input_dim = X_train.shape[1]
encoding_dim = int(input_dim / 2)  # Dimension of the encoded representation

In [26]:
# Build the autoencoder architecture
input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
decoder = Dense(input_dim, activation='sigmoid')(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

In [27]:
# Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [28]:
# Train the model
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, shuffle=True, validation_split=0.1)

Epoch 1/50
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 614us/step - loss: 0.0776 - val_loss: 0.0052
Epoch 2/50
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 571us/step - loss: 0.0050 - val_loss: 0.0032
Epoch 3/50
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 553us/step - loss: 0.0033 - val_loss: 0.0022
Epoch 4/50
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 526us/step - loss: 0.0024 - val_loss: 0.0016
Epoch 5/50
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 523us/step - loss: 0.0018 - val_loss: 0.0013
Epoch 6/50
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 547us/step - loss: 0.0014 - val_loss: 0.0011
Epoch 7/50
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 555us/step - loss: 0.0012 - val_loss: 8.8847e-04
Epoch 8/50
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 608us/step - loss: 9.9996e-04 - val_loss: 7.8774e-04
Epoch 9/50


<keras.src.callbacks.history.History at 0x1a98e1c9c40>

In [29]:
# Predict using the autoencoder to get reconstruction errors
reconstructions = autoencoder.predict(X_test)

[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 522us/step


In [30]:
# Calculate reconstruction error for each test sample
reconstruction_errors = np.mean(np.power(X_test - reconstructions, 2), axis=1)

In [31]:
# Set a threshold for anomaly detection (e.g., 95th percentile of training errors)
threshold = np.percentile(reconstruction_errors, 95)

In [33]:
# Classify anomalies
anomalies = reconstruction_errors > threshold
anomalies

array([False, False,  True, ..., False, False, False])

In [34]:
# Print results
print(f"Number of anomalies detected: {np.sum(anomalies)}")

Number of anomalies detected: 423
