In [2]:
import zipfile

train_zip_path = 'kddcup.data.gz.zip'
test_zip_path = 'corrected.gz.zip'

with zipfile.ZipFile(train_zip_path, 'r') as zip_ref:
    zip_ref.extractall('')

with zipfile.ZipFile(test_zip_path, 'r') as zip_ref:
    zip_ref.extractall('')

In [5]:
import numpy as np-
import pandas as pd
from sklearn.preprocessing import LabelEncoder

train_file_path = 'kddcup.data.gz'
test_file_path = 'corrected.gz'

columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", 
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", 
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]

df_train = pd.read_csv(train_file_path, compression='gzip', sep=",", names=columns, index_col=None)

df_test = pd.read_csv(test_file_path, compression='gzip', sep=",", names=columns, index_col=None)

df_train = df_train[df_train["service"] == "http"]
df_train = df_train.drop("service", axis=1)
df_test = df_test[df_test["service"] == "http"]
df_test = df_test.drop("service", axis=1)
columns.remove("service")

label_encoders = {}
for col in df_train.columns:
    if df_train[col].dtype == "object":
        label_encoders[col] = LabelEncoder()
        combined_data = pd.concat([df_train[col], df_test[col]])
        label_encoders[col].fit(combined_data)
        df_train[col] = label_encoders[col].transform(df_train[col])
        df_test[col] = label_encoders[col].transform(df_test[col])

print(df_train.head())
print(df_test.head())

df_train.to_csv('preprocessed_kddcup_train_data.csv', index=False)
df_test.to_csv('preprocessed_kddcup_test_data.csv', index=False)



   duration  protocol_type  flag  src_bytes  dst_bytes  land  wrong_fragment  \
0         0              0     9        215      45076     0               0   
1         0              0     9        162       4528     0               0   
2         0              0     9        236       1228     0               0   
3         0              0     9        233       2032     0               0   
4         0              0     9        239        486     0               0   

   urgent  hot  num_failed_logins  ...  dst_host_srv_count  \
0       0    0                  0  ...                   0   
1       0    0                  0  ...                   1   
2       0    0                  0  ...                   2   
3       0    0                  0  ...                   3   
4       0    0                  0  ...                   4   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                     0.0                     0.0   
1                     1.0               

In [6]:
import pandas as pd

train_data = pd.read_csv('preprocessed_kddcup_train_data.csv')
test_data = pd.read_csv('preprocessed_kddcup_test_data.csv')

X_train = train_data.drop('label', axis=1)
y_train = train_data['label']
X_test = test_data.drop('label', axis=1)
y_test = test_data['label']

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(X_train_scaled)

y_train_pred = iso_forest.predict(X_train_scaled)
y_test_pred = iso_forest.predict(X_test_scaled)

y_train_pred = [1 if x == -1 else 0 for x in y_train_pred]
y_test_pred = [1 if x == -1 else 0 for x in y_test_pred]

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

input_dim = X_train_scaled.shape[1]
input_layer = Input(shape=(input_dim,))
encoder = Dense(32, activation="relu")(input_layer)
encoder = Dense(16, activation="relu")(encoder)
encoder = Dense(8, activation="relu")(encoder)
decoder = Dense(16, activation="relu")(encoder)
decoder = Dense(32, activation="relu")(decoder)
decoder = Dense(input_dim, activation="sigmoid")(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, shuffle=True, validation_split=0.1)

reconstructions = autoencoder.predict(X_train_scaled)
train_loss = tf.keras.losses.mse(reconstructions, X_train_scaled)

threshold = np.mean(train_loss) + np.std(train_loss)
y_train_pred = [1 if x > threshold else 0 for x in train_loss]

reconstructions = autoencoder.predict(X_test_scaled)
test_loss = tf.keras.losses.mse(reconstructions, X_test_scaled)
y_test_pred = [1 if x > threshold else 0 for x in test_loss]


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [11]:
from sklearn.metrics import classification_report, confusion_matrix

print("Isolation Forest Classification Report (Train):")
print(classification_report(y_train, y_train_pred))

print("Isolation Forest Confusion Matrix (Train):")
print(confusion_matrix(y_train, y_train_pred))

print("Isolation Forest Classification Report (Test):")
print(classification_report(y_test, y_test_pred))

print("Isolation Forest Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred))

print("Autoencoder Classification Report (Train):")
print(classification_report(y_train, y_train_pred))

print("Autoencoder Confusion Matrix (Train):")
print(confusion_matrix(y_train, y_train_pred))

print("Autoencoder Classification Report (Test):")
print(classification_report(y_test, y_test_pred))

print("Autoencoder Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred))

Isolation Forest Classification Report (Train):


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00      2203
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00      1801
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00    619046
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00        16
           9       0.00      0.00      0.00         7

    accuracy                           0.00    623091
   macro avg       0.00      0.00      0.00    623091
weighted avg       0.00      0.00      0.00    623091

Isolation Forest Confusion Matrix (Train):
[[     0      0      0      0      0      0      0      0      0]
 [  2193     10      0      0      0      0      0      0      0]
 [     0     13      0      0      0      0      0      0      0]
 [     9   1792      0      0      0      0      0    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00      2203
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00      1801
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00    619046
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00        16
           9       0.00      0.00      0.00         7

    accuracy                           0.00    623091
   macro avg       0.00      0.00      0.00    623091
weighted avg       0.00      0.00      0.00    623091

Autoencoder Confusion Matrix (Train):
[[     0      0      0      0      0      0      0      0      0]
 [  2193     10      0      0      0      0      0      0      0]
 [     0     13      0      0      0      0      0      0      0]
 [     9   1792      0      0      0      0      0      0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
