In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# 1. Data Loading
#
# Reads the UNSW-NB15 CSV, exposes the attack category under the column name
# 'Attack Type', converts three text-typed columns to integers and builds the
# feature frame (X), the multi-class target (y) and the binary target (y2).

data = pd.read_csv("UNSW-NB15.csv", low_memory=False)

# NOTE(review): only a column spelled exactly 'Label' is removed. If this CSV
# names it differently (e.g. lower-case), it stays in X as a feature -- confirm.
if 'Label' in data.columns:
    data = data.drop(columns='Label')

# errors='raise' keeps the original failure mode when 'attack_cat' is missing.
data = data.rename(columns={'attack_cat': 'Attack Type'}, errors='raise')


# Converting some features to integer, as their content is numeric text

obj_to_int = ['ct_ftp_cmd', 'sport', 'dsport']
for col in obj_to_int:
    # Strip surrounding whitespace so blank cells become empty strings.
    stripped = data[col].astype(str).str.strip()

    # errors='coerce' turns empty strings and any other non-numeric text into
    # NaN, which is then filled with 0. No separate `.replace('', None)` step is
    # used: on older pandas versions that call forward-fills the previous row's
    # value instead of inserting a missing value.
    data[col] = pd.to_numeric(stripped, errors='coerce').fillna(0).astype(int)

X = data.drop('Attack Type', axis=1)
y = data['Attack Type']
# NOTE(review): rows count as normal only when the category is exactly
# "Normal Traffic"; verify that this is how the CSV labels benign rows.
y2 = np.where(data['Attack Type'] == "Normal Traffic", "Normal Traffic", "Attack")

In [9]:
# Label encoding for categorical features
#
# Every object-typed column of X is replaced by integer codes; the fitted
# encoder for each column is kept so the codes can be mapped back later.

from sklearn.preprocessing import LabelEncoder

label_encoders = {}  # column name -> fitted LabelEncoder

object_columns = [name for name in X.columns if X[name].dtypes == 'object']

for name in object_columns:
    encoder = LabelEncoder()
    # Values are cast to str first so mixed-type columns encode uniformly.
    as_text = X[name].astype(str)
    X[name] = encoder.fit_transform(as_text)
    label_encoders[name] = encoder

In [10]:
# Normalize / Scale the Data

from sklearn.preprocessing import MinMaxScaler

# Min-max scaling maps each feature column onto the [0, 1] interval.
# The scaler is fitted on X and then applied to the same frame.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [13]:
# Split Dataset (only use normal data for training)

import numpy as np
from sklearn.model_selection import train_test_split

# Binary target with an explicit mapping: 0 = normal traffic, 1 = attack.
# A LabelEncoder is deliberately not used here: it numbers classes in sorted
# order, which would give "Attack" -> 0 and "Normal Traffic" -> 1 and make the
# `== 0` filter below select the attack rows. The codes are derived from `y`
# (never reassigned), so re-running this cell yields the same result.
y2 = np.where(np.asarray(y == "Normal Traffic"), 0, 1)

# Get only normal traffic (label = 0)
X_normal = X_scaled[y2 == 0]
if len(X_normal) == 0:
    raise ValueError(
        "No rows labelled 'Normal Traffic' were found; cannot build a "
        "normal-only training set."
    )

# Split into train/validation
X_train, X_val = train_test_split(X_normal, test_size=0.2, random_state=42)

In [25]:
# Build and Train Autoencoder
#
# Dense autoencoder (64 -> 32 -> 16 -> 32 -> 64) trained to reproduce X_train;
# X_val is used only to monitor the reconstruction loss.
#
# NOTE(review): the ImportError recorded under this cell (h5t) is raised while
# importing, not by the model code -- it looks like a broken h5py install in
# the environment; confirm and reinstall before re-running.

import tensorflow as tf
from tensorflow.keras import layers, models

# Seed TensorFlow's global random generator so repeated runs are comparable.
tf.random.set_seed(42)

input_dim = X_train.shape[1]

# Encoder: compress the feature vector down to 16 units
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(64, activation='relu')(input_layer)
encoded = layers.Dense(32, activation='relu')(encoded)
encoded = layers.Dense(16, activation='relu')(encoded)

# Decoder: mirror of the encoder; the sigmoid keeps outputs inside (0, 1)
decoded = layers.Dense(32, activation='relu')(encoded)
decoded = layers.Dense(64, activation='relu')(decoded)
output_layer = layers.Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = models.Model(input_layer, output_layer)
autoencoder.compile(optimizer='adam', loss='mse')

# Train (input and target are the same array). The returned History object is
# kept so the loss curves can be inspected, and so the cell does not echo its
# repr as output.
ae_history = autoencoder.fit(X_train, X_train,
                             epochs=20,
                             batch_size=256,
                             validation_data=(X_val, X_val),
                             shuffle=True)

ImportError: DLL load failed while importing h5t: The specified procedure could not be found.

In [None]:
# Calculate Reconstruction Error

# Predict reconstruction on the full dataset
reconstructed = autoencoder.predict(X_scaled)

# Per-row mean squared error between the input and its reconstruction
mse = np.mean(np.square(X_scaled - reconstructed), axis=1)

# Decide threshold using only normal data.
# `y` holds the string attack categories, so comparing it with 0 never matches
# and would leave the percentile to be computed over an empty selection.
# The mask is therefore built from the string label itself.
normal_mask = np.asarray(y == "Normal Traffic")
if not normal_mask.any():
    raise ValueError(
        "No rows labelled 'Normal Traffic' were found; cannot derive an "
        "anomaly threshold."
    )
threshold = np.percentile(mse[normal_mask], 95)  # 95th percentile

# Predict anomaly (1) if error > threshold, otherwise normal (0)
y_pred = (mse > threshold).astype(int)

In [None]:
# 2. Data Preprocessing
#
# Prepares 28x28 grayscale images for the convolutional autoencoder below.

from tensorflow.keras.datasets import mnist

# Load the raw images in this cell: among the cells visible in this notebook
# nothing defines X_test, and X_train holds the scaled tabular rows at this
# point, which are not 28x28 images.
# NOTE(review): MNIST is assumed from the 28x28x1 shape, the /255 scaling and
# the "autoencoder_mnist_model.h5" file name used when saving -- confirm.
(mnist_train_raw, _), (mnist_test_raw, _) = mnist.load_data()

# Normalize pixel values to [0, 1]. Working from the *_raw arrays keeps the
# cell idempotent: re-running it does not divide by 255 a second time.
X_train = mnist_train_raw.astype('float32') / 255.0
X_test = mnist_test_raw.astype('float32') / 255.0

# Reshape for Conv2D input: (samples, height, width, channels)
X_train = X_train.reshape((-1, 28, 28, 1))
X_test = X_test.reshape((-1, 28, 28, 1))


In [None]:
# 3. Model Building (Encoder + Decoder)
#
# Convolutional autoencoder for 28x28x1 inputs. Note that this rebinds the
# names `autoencoder`, `encoded` and `decoded`.

from tensorflow.keras import layers, models

# Input layer
input_img = layers.Input(shape=(28, 28, 1))

# Encoder: two conv + pool stages, spatial size 28 -> 14 -> 7
x = layers.Conv2D(16, (3, 3), activation='relu', padding='same')(input_img)
x = layers.MaxPooling2D((2, 2), padding='same')(x)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
encoded = layers.MaxPooling2D((2, 2), padding='same')(x)

# Decoder: two conv + upsample stages, spatial size 7 -> 14 -> 28
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
x = layers.UpSampling2D((2, 2))(x)
# padding='same' is required here: an unpadded 3x3 convolution would shrink
# 14x14 to 12x12, and the model would then output 24x24 instead of 28x28,
# which cannot be compared with the input during training.
x = layers.Conv2D(16, (3, 3), activation='relu', padding='same')(x)
x = layers.UpSampling2D((2, 2))(x)
decoded = layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

# Combine into an autoencoder model
autoencoder = models.Model(input_img, decoded)


In [None]:
# 4. Model Compilation

# Adam optimizer; binary cross-entropy is the reconstruction loss.
autoencoder.compile(
    loss='binary_crossentropy',
    optimizer='adam',
)

In [None]:
# 5. Model Training

# The same array is passed as input and as target, since the model learns to
# reproduce its input. A tenth of the samples is set aside for validation.
fit_options = {
    'epochs': 10,
    'batch_size': 128,
    'shuffle': True,
    'validation_split': 0.1,
}

history = autoencoder.fit(X_train, X_train, **fit_options)


In [None]:
# 6. Evaluation and Visualization

import matplotlib.pyplot as plt

# Number of test images to show
n = 10

# Reconstruct the first n test images
decoded_imgs = autoencoder.predict(X_test[:n])

# Two-row grid: originals on top, reconstructions underneath
plt.figure(figsize=(20, 4))
for row, images in enumerate((X_test, decoded_imgs)):
    for i in range(n):
        # Subplot positions are 1-based and numbered row by row
        plt.subplot(2, n, row * n + i + 1)
        plt.imshow(images[i].reshape(28, 28), cmap="gray")
        plt.axis('off')
plt.show()


In [None]:
# 7. Model Saving

# Write the trained autoencoder to a file in the working directory.
model_path = "autoencoder_mnist_model.h5"
autoencoder.save(model_path)