In [15]:
# Data Manipulation and Processing
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.tree import DecisionTreeClassifier
# Evaluation
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Load Data
train_df = pd.read_csv('KDDTrain+.txt', header=None)
test_df = pd.read_csv('KDDTest+.txt', header=None)


column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label", "difficulty_level"
]
train_df.columns = column_names
test_df.columns = column_names


train_df = train_df.drop('difficulty_level', axis=1)
test_df = test_df.drop('difficulty_level', axis=1)


categorical_cols = ['protocol_type', 'service', 'flag']
label_encoder = LabelEncoder()

for col in categorical_cols:
    train_df[col] = label_encoder.fit_transform(train_df[col])
    test_df[col] = label_encoder.transform(test_df[col])

# Encode Labels (Normal = 0, Anomaly = 1)
train_df['label'] = train_df['label'].apply(lambda x: 0 if x == 'normal' else 1)
test_df['label'] = test_df['label'].apply(lambda x: 0 if x == 'normal' else 1)


X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Dataset Loaded and Preprocessed.")


Dataset Loaded and Preprocessed.


Naive Bayes Model

In [11]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb) * 100)
print(classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 77.14247693399574
              precision    recall  f1-score   support

           0       0.67      0.91      0.78      9711
           1       0.91      0.66      0.77     12833

    accuracy                           0.77     22544
   macro avg       0.79      0.79      0.77     22544
weighted avg       0.81      0.77      0.77     22544



Logistic Regression

In [12]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr) * 100)
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 75.3903477643719
              precision    recall  f1-score   support

           0       0.65      0.93      0.77      9711
           1       0.93      0.62      0.74     12833

    accuracy                           0.75     22544
   macro avg       0.79      0.78      0.75     22544
weighted avg       0.81      0.75      0.75     22544



K Nearest Neighbours

In [13]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn) * 100)
print(classification_report(y_test, y_pred_knn))

KNN Accuracy: 76.75656493967354
              precision    recall  f1-score   support

           0       0.65      0.98      0.78      9711
           1       0.97      0.61      0.75     12833

    accuracy                           0.77     22544
   macro avg       0.81      0.79      0.77     22544
weighted avg       0.83      0.77      0.76     22544



Decision Tree

In [16]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt) * 100)
print(classification_report(y_test, y_pred_dt))

Decision Tree Accuracy: 78.85468417317246
              precision    recall  f1-score   support

           0       0.68      0.97      0.80      9711
           1       0.97      0.65      0.78     12833

    accuracy                           0.79     22544
   macro avg       0.82      0.81      0.79     22544
weighted avg       0.84      0.79      0.79     22544



Random Forest

In [17]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf) * 100)
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 77.06706884315118
              precision    recall  f1-score   support

           0       0.66      0.97      0.78      9711
           1       0.97      0.62      0.75     12833

    accuracy                           0.77     22544
   macro avg       0.81      0.80      0.77     22544
weighted avg       0.83      0.77      0.77     22544



Support Vector Machine

In [18]:
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm) * 100)
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 78.19375443577005
              precision    recall  f1-score   support

           0       0.67      0.98      0.79      9711
           1       0.98      0.63      0.77     12833

    accuracy                           0.78     22544
   macro avg       0.82      0.81      0.78     22544
weighted avg       0.84      0.78      0.78     22544



Artificial Neural Networks

In [19]:
ann_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
ann_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
loss, accuracy = ann_model.evaluate(X_test, y_test)
print("ANN Accuracy:", accuracy * 100)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9564 - loss: 0.1242 - val_accuracy: 0.9917 - val_loss: 0.0278
Epoch 2/5
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9909 - loss: 0.0261 - val_accuracy: 0.9927 - val_loss: 0.0224
Epoch 3/5
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.9923 - loss: 0.0228 - val_accuracy: 0.9932 - val_loss: 0.0223
Epoch 4/5
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9933 - loss: 0.0197 - val_accuracy: 0.9925 - val_loss: 0.0208
Epoch 5/5
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9934 - loss: 0.0188 - val_accuracy: 0.9927 - val_loss: 0.0202
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7845 - loss: 0.8953
ANN Accuracy: 77.95866131782532


In [20]:
ensemble_model = VotingClassifier(
    estimators=[
        ('nb', nb_model),
        ('lr', lr_model),
        ('knn', knn_model),
        ('dt', dt_model),
        ('rf', rf_model),
        ('svm', svm_model)
    ],
    voting='hard'
)

ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)

print("Ensemble Model Accuracy:", accuracy_score(y_test, y_pred_ensemble) * 100)
print(classification_report(y_test, y_pred_ensemble))


Ensemble Model Accuracy: 76.25532292405963
              precision    recall  f1-score   support

           0       0.65      0.97      0.78      9711
           1       0.97      0.60      0.74     12833

    accuracy                           0.76     22544
   macro avg       0.81      0.79      0.76     22544
weighted avg       0.83      0.76      0.76     22544

