### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from tensorflow.keras.layers import LeakyReLU

In [2]:
# Logistic Map for Chaotic Dynamics
def logistic_map(size, r=3.99, seed=0.5):
    chaos = np.zeros(size)
    chaos[0] = seed
    for i in range(1, size):
        chaos[i] = r * chaos[i-1] * (1 - chaos[i-1])
    return chaos

### Load the data

In [3]:
data = pd.read_csv("Dataset/kddcup99.csv")
data.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,lnum_compromised,lroot_shell,lsu_attempted,lnum_root,lnum_file_creations,lnum_shells,lnum_access_files,lnum_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal
1,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal
3,0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal
4,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal


### Encode categorical features

What the categorical features represent in the KDD99 dataset:  
  

1. protocol_type (Network Protocol)  
Represents the transport layer protocol used in the network connection.  

0 → icmp (Internet Control Message Protocol)  
1 → tcp (Transmission Control Protocol)  
2 → udp (User Datagram Protocol)  
  
2. service (Network Service)  
Indicates the network service on the destination (e.g., HTTP, FTP, SMTP).  
The dataset originally contained 66 unique service types (e.g., http, ftp, smtp, etc.), but these were label-encoded as 0-65.  

Some common services:  

http → Web traffic  
ftp → File Transfer Protocol  
smtp → Email sending  
telnet → Remote login  
  
3. flag (Connection Status Flag)  
Represents the connection status between the source and destination.  

0 → OTH (Other)  
1 → REJ (Connection rejected)  
2 → RSTO (Reset by other side)  
3 → RSTOS0 (Reset before establishing connection)  
4 → RSTR (Connection reset)  
5 → S0 (Connection attempt seen but not established)  
6 → S1 (Established but no data exchanged)  
7 → S2 (Partial data exchange)  
8 → S3 (Established and data exchange completed)  
9 → SF (Normal connection with successful data transfer)  
10 → SH (Connection started but not completed)  
  
4. label (Attack Type)  
Represents whether the network connection is normal or an attack.  

0 → normal (No attack)  
1 → attack (Malicious activity detected)  

In [4]:
label_encoder = LabelEncoder()
data['protocol_type'] = label_encoder.fit_transform(data['protocol_type'])
data['service'] = label_encoder.fit_transform(data['service'])
data['flag'] = label_encoder.fit_transform(data['flag'])
data['label'] = label_encoder.fit_transform(data['label'])

### Handle class imbalance

In [5]:
majority = data[data.label == 0]
minority = data[data.label == 1]
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
data = pd.concat([majority, minority_upsampled])

In [6]:
# Feature selection
X = data.drop(columns=['label'])
y = data['label']

In [34]:
X.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'lnum_compromised', 'lroot_shell',
       'lsu_attempted', 'lnum_root', 'lnum_file_creations', 'lnum_shells',
       'lnum_access_files', 'lnum_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')

In [7]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [9]:
# Apply chaotic sequence to enhance input features
chaos_sequence = logistic_map(len(X_train))
X_train = np.multiply(X_train, chaos_sequence[:, np.newaxis])
X_test = np.multiply(X_test, chaos_sequence[:len(X_test), np.newaxis])


In [10]:
# Generative Adversarial Network (GAN) for Synthetic Data
latent_dim = 100
def build_generator():
    model = Sequential([
        Dense(128, input_dim=latent_dim),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(X_train.shape[1], activation='tanh')
    ])
    return model

def build_discriminator():
    model = Sequential([
        Dense(256, input_dim=X_train.shape[1]),
        LeakyReLU(alpha=0.2),
        Dense(128),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid')
    ])
    return model

generator = build_generator()
discriminator = build_discriminator()
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

z = Input(shape=(latent_dim,))
fake_data = generator(z)
discriminator.trainable = False
validity = discriminator(fake_data)
gan = Model(z, validity)
gan.compile(loss='binary_crossentropy', optimizer='adam')

In [11]:
# Train GAN
def train_gan(epochs=10000, batch_size=32):
    for epoch in range(epochs):
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        generated_data = generator.predict(noise)
        real_batch = X_train[np.random.randint(0, X_train.shape[0], batch_size)]
        
        X_combined = np.vstack((real_batch, generated_data))
        y_combined = np.hstack((np.ones(batch_size), np.zeros(batch_size)))
        
        d_loss = discriminator.train_on_batch(X_combined, y_combined)
        g_loss = gan.train_on_batch(noise, np.ones(batch_size))

train_gan()



In [12]:
# Generate new synthetic data
def generate_synthetic_data(n_samples=5000):
    noise = np.random.normal(0, 1, (n_samples, latent_dim))
    return generator.predict(noise)

X_synthetic = generate_synthetic_data()
y_synthetic = np.random.randint(0, 2, size=(X_synthetic.shape[0],))



In [13]:
# Combine synthetic data with training set
X_train = np.vstack((X_train, X_synthetic))
y_train = np.hstack((y_train, y_synthetic))

In [14]:
# Define MLP Model
mlp = Sequential([
    Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
    ])

mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
# Pretrain MLP on KDD dataset
mlp.fit(X_train[:len(y)], y_train[:len(y)], epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b39d514e50>

In [16]:
# Fine-tune on contemporary synthetic data
mlp.fit(X_synthetic, y_synthetic, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b3be004f10>

In [17]:
# XGBoost Model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [18]:
# Hybrid Model Predictions
mlp_pred = (mlp.predict(X_test) > 0.5).astype(int).flatten()
xgb_pred = xgb_model.predict(X_test)




In [20]:
# Hybrid Fusion (Weighted Average)
final_pred = (0.6 * mlp_pred + 0.4 * xgb_pred) > 0.5

In [23]:
# Evaluation
print("Hybrid Model Accuracy:", accuracy_score(y_test, final_pred))

Hybrid Model Accuracy: 0.9365079365079365


In [31]:
# Save models
import joblib

joblib.dump(xgb_model, "xgb_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("models saved")


models saved


In [32]:
# Save MLP Model
mlp.save("mlp_model.h5")
print("model saved!")
print("models saved")

model saved!
models saved


In [33]:
print("Number of features expected by scaler:", scaler.n_features_in_)


Number of features expected by scaler: 41
